# liquidity model preference

## Data engineering and preprocessing

In [25]:
# Import necessary libraries
import os
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Ensure you have the NLTK stopwords downloaded
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/alexchen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/alexchen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [26]:
# Load the CSV files
batch1_labeled_path = "/Users/alexchen/Downloads/Projects/vc-research/URAP VC Research - [Readable] Batch 1 Main.csv"
batch1_labeled = pd.read_csv(batch1_labeled_path)

batch1_lp_path = "/Users/alexchen/Downloads/Projects/vc-research/URAP VC Research - Batch 1 Details.csv"
batch1_lp = pd.read_csv(batch1_lp_path)

In [27]:
# change date column to date type
batch1_labeled["Date"] = pd.to_datetime(batch1_labeled["Date"])

# create multi-index dataframe ordering timesreies data
batch1_labeled_multiindex = batch1_labeled.set_index(["Company Name", "Date"]).sort_index()

# sort by date ascending for each company
batch1_labeled_multiindex = batch1_labeled_multiindex.groupby(level=0, sort=False).apply(
    lambda x: x.sort_index(level=1)
)
batch1_labeled_multiindex.index = batch1_labeled_multiindex.index.droplevel(0)

batch1_labeled_multiindex

Unnamed: 0_level_0,Unnamed: 1_level_0,File Name,Document Type,Contains Liquidity Preference,Priority Order,Number of Common Stocks Issued,Number of Preferred Stock Issued,Total Number of Stocks Issued
Company Name,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
"3-D Marketing Technologies, INC",2003-07-03,16_2003-07-03_Certificates of Incorporation,Certificate of Incorporation,0,,1500,,1500
"3-D Marketing Technologies, INC",2004-01-22,16_2004-01-22_Certificates of Incorporation,Restated Certificate of Incorporation,0,,6250000,,6250000
"3-D Marketing Technologies, INC",2004-07-14,16_2004-07-14_Certificates of Incorporation,Restated Certificate of Incorporation,1,A,20500000,9500000,30000000
"3-D Marketing Technologies, INC",2005-05-18,16_2005-05-18_Certificates of Incorporation,Certificate of Amendment,0,,,,0
"3Jam, INC",2006-04-21,21_2006-04-21_Certificates of Incorporation,Amended and Restated Certificate of Incorporation,1,A,3333333,1333333,4666666
...,...,...,...,...,...,...,...,...
"The 41st Parameter, INC",2007-06-15,28_2007-06-15_Certificates of Incorporation,Amended and Restated Certificate of Incorporation,1,A=B=C,60000000,33952073,93952073
"The 41st Parameter, INC",2009-12-07,28_2009-12-07_Certificates of Incorporation,Certificate of Amendment,0,,66510000,40462073,106972073
"The 41st Parameter, INC",2009-12-17,28_2009-12-17_Certificates of Incorporation,Amended and Restated Certificate of Incorporation,1,A=B=C,81583601,45864172,127447773
"The 41st Parameter, INC",2012-03-16,28_2012-03-16_Certificates of Incorporation,Amended and Restated Certificate of Incorporation,1,A=B=C=D,100000000,57060272,157060272


In [28]:
# change date column to date type
batch1_lp["Date"] = pd.to_datetime(batch1_lp["Date"])

# create multi-index dataframe ordering timesreies data
batch1_lp_multiindex = batch1_lp.set_index(["Company Name", "Date"]).sort_index()

# sort by date ascending for each company
batch1_lp_multiindex = batch1_lp_multiindex.groupby(level=0, sort=False).apply(
    lambda x: x.sort_index(level=1)
)
batch1_lp_multiindex.index = batch1_lp_multiindex.index.droplevel(0)
batch1_lp_multiindex

Unnamed: 0_level_0,Unnamed: 1_level_0,File Name,Document Type,Preferred Stock Type,Order of Priority,Liquidation Preference,Liquidation Multiple,Number of Preferred Stocks Issued,Original Issue Price
Company Name,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
"3-D Marketing Technologies, INC",2004-07-14,16_2004-07-14_Certificates of Incorporation,Restated Certificate of Incorporation,A,1,$0.431469,1,9500000,$0.431469
"3Jam, INC",2006-04-21,21_2006-04-21_Certificates of Incorporation,Amended and Restated Certificate of Incorporation,A,1,$1.000000,1,1333333,$1.000000
"3Point5, INC",2005-10-20,24_2005-10-20_Certificates of Incorporation,Amended and Restated Certificate of Incorporation,A,1,$0.400000,1,2662500,$0.400000
"3Point5, INC",2006-09-14,24_2006-09-14_Certificates of Incorporation,Amended and Restated Certificate of Incorporation,A,1,$0.400000,1,2662500,$0.400000
"3Point5, INC",2006-09-14,24_2006-09-14_Certificates of Incorporation,Amended and Restated Certificate of Incorporation,B,1,$0.600000,1,2600000,$0.600000
...,...,...,...,...,...,...,...,...,...
"The 41st Parameter, INC",2009-12-17,28_2009-12-17_Certificates of Incorporation,Amended and Restated Certificate of Incorporation,C,1,$0.968300,1,22912099,$0.968300
"The 41st Parameter, INC",2012-03-16,28_2012-03-16_Certificates of Incorporation,Amended and Restated Certificate of Incorporation,A,1,$0.442990,1,9819635,$0.442990
"The 41st Parameter, INC",2012-03-16,28_2012-03-16_Certificates of Incorporation,Amended and Restated Certificate of Incorporation,B,1,$0.852850,1,13132438,$0.852850
"The 41st Parameter, INC",2012-03-16,28_2012-03-16_Certificates of Incorporation,Amended and Restated Certificate of Incorporation,C,1,$0.968300,1,16408199,$0.968300


## Part 1. Classify which text files contain liquidation preference or not

In [29]:
# Load the CSV file
batch1_labeled_path = "/Users/alexchen/Downloads/Projects/vc-research/URAP VC Research - [Readable] Batch 1 Main.csv"
batch1_labeled = pd.read_csv(batch1_labeled_path)

batch1_lp_path = "/Users/alexchen/Downloads/Projects/vc-research/URAP VC Research - Batch 1 Details.csv"
batch1_lp = pd.read_csv(batch1_lp_path)

# Load text files and associate them with labels
txt_folder_path = "/Users/alexchen/Downloads/Projects/vc-research/Batch1_text_readable"
text_data = []
labels = []
document_names = []
pd.reset_option('^display.', silent=True)

In [30]:
# Read each file and extract data
for _, row in batch1_labeled.iterrows():
    file_name = row['File Name']
    label = row['Contains Liquidity Preference']
    file_path = os.path.join(txt_folder_path, file_name + ".txt")
    
    if os.path.exists(file_path):
        with open(file_path, "r", encoding="utf-8") as file:
            text = file.read()
            text_data.append(text)
            labels.append(label)
            document_names.append(file_name)  # Append the document name
    else:
        print(f"File not found: {file_path}")

In [31]:
# Split the data into training and test sets, while keeping track of the document names
X_train, X_test, y_train, y_test, train_docs, test_docs = train_test_split(
    text_data, labels, document_names, test_size=0.25, random_state=42
)

In [32]:
# Vectorize the text data using TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=1000)  # Adjust the number of features as needed
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [33]:
# Initialize the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight="balanced")
rf_model.fit(X_train_tfidf, y_train)

In [34]:
# Predict the classes for the test set
y_pred = rf_model.predict(X_test_tfidf)
y_pred_prob = rf_model.predict_proba(X_test_tfidf)

In [35]:
# Create a DataFrame with predictions and confidence scores
predictions_df = pd.DataFrame({
    'Document': test_docs,
    'True Classification': y_test,
    'Predicted Classification': y_pred,
    'Probability of Containing Liquidation Preference Information': y_pred_prob[:, 1],
    'Probability of Not Containing Liquidation Preference Information': y_pred_prob[:, 0]
})
predictions_df

Unnamed: 0,Document,True Classification,Predicted Classification,Probability of Containing Liquidation Preference Information,Probability of Not Containing Liquidation Preference Information
0,28_2009-12-17_Certificates of Incorporation,1,1,1.0,0.0
1,24_2014-08-27_Certificates of Incorporation,1,1,1.0,0.0
2,48_2004-10-19_Certificates of Incorporation,1,1,1.0,0.0
3,49_2008-06-12_Certificates of Incorporation,1,1,1.0,0.0
4,16_2003-07-03_Certificates of Incorporation,0,0,0.06,0.94
5,27_2002-09-23_Certificates of Incorporation,0,0,0.1,0.9
6,28_2009-12-07_Certificates of Incorporation,0,0,0.07,0.93
7,48_2014-03-06_Certificates of Incorporation,0,0,0.42,0.58
8,16_2012-12-17_Certificates of Incorporation,0,0,0.14,0.86
9,34_2010-01-28_Certificates of Incorporation,0,0,0.06,0.94


In [36]:
# display summary of classification results
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00        11

    accuracy                           1.00        23
   macro avg       1.00      1.00      1.00        23
weighted avg       1.00      1.00      1.00        23



In [37]:
# Filter the predictions to only include documents with liquidation preference (Predicted Classification == 1)
predictions_with_lp = predictions_df[predictions_df['Predicted Classification'] == 1]

## Part 2. Remove all text except for areas that contain liquidation preference

In [51]:
# import other packages
from sentence_transformers import SentenceTransformer, util
from nltk.tokenize import sent_tokenize
import re

In [52]:
# Initialize the semantic model for sentence embeddings
semantic_model = SentenceTransformer('all-MiniLM-L6-v2')

# Example phrases for liquidation preferences
example_phrases = [
    """In the event of any Liquidation Event (as defined below), either voluntary
    or involuntary, the holders of Preferred Stock shall be entitled to receive, prior and in preference
    to any distribution of the proceeds of such Liquidation Event (the “Proceeds”) to the holders of
    Common Stock by reason of their ownership thereof, an amount per share equal to the sum of the
    applicable Original Issue Price (as defined below) for such Preferred Stock, plus declared but
    unpaid dividends on such share.""",
    """Series A Original Purchase Price” shall be $0.40 per share of Series A Preferred Stock,
    subject to appropriate adjustment in the event of any stock dividend, stock split, combination or
    other similar recapitalization with respect to the Series A Preferred Stock.""",
    """Series B Original Purchase Price” shall be the purchase price at which each share of
    Series B Preferred Stock was originally issued, subject to appropriate adjustment in the event of
    any stock dividend, stock split, combination or other similar recapitalization with respect to the
    Series B Preferred Stock.""",
    """Series C Original Purchase Price” shall be $1.74417185 per share of Series C Preferred
    Stock, subject to appropriate adjustment in the event of any stock dividend, stock split,
    combination or other similar recapitalization with respect to the Series C Preferred Stock.""",
    """Series D Original Purchase Price” shall be $3.27374825 per share of Series D Preferred
    Stock, subject to appropriate adjustment in the event of any stock dividend, stock split,
    combination or other similar recapitalization with respect to the Series D Preferred Stock.""",
    """Liquidation Preference” shall mean $0.44299 per share for the Series A Preferred
    Stock, $0.85285 per share for the Serics B Preferred Stock and $0.9683 per share for the Series C Preferred
    Stock (in each case, subject to adjustment from time to time for Recapitalizations of the Preferred Stock as set
    forth elsewhere herein).""",
    """"Original Series B-3 Issue Price” shall mean
    $0.3524 per share for each share of the Series B-3 Preferred Stock (as adjusted for any dividends,
    zcombinations, splits, recapitalizations and the like after the filing date hereof).""",
    """"Original Series B-2 Issue Price” shall mean $0.3524 per share for each share of the Series B-2 Preferred Stock (as adjusted
    for any dividends, combinations, splits, recapitalizations and the like after the filing date hereof).""",
    """“Original Series B-1 Issue Price” shall mean
    $0.5496 per share for each share of the Series B-1 Preferred Stock (as adjusted for any dividends,
    combinations, splits, recapitalizations and the like after the filing date hereof).""",
    """"“Original Series A-1 Issue Price” shall
    mean $0.137146 per share for each share of the Series A-1 Preferred Stock (as adjusted for any
    dividends, combinations, splits, recapitalizations and the like after the filing date hereof).""",
    """the holders of shares of Senior Preferred Stock then outstanding shall be entitled to be paid out of the assets of the
    Corporation available for distribution to its stockholders, before any payment shall be made to the holders
    of Series B-1 Preferred Stock, Common Stock or any other class or series of stock ranking on liquidation
    junior to the Senior Preferred Stock by reason of their ownership thereof, an amount equal to $1.00 per
    share in the case of the Series A Preferred Stock, $1.50 per share in the case of the Series A-1 Preferred
    Stock, $2.078192 per share in the case of the Series B Preferred Stock, $3.371016 per share in the case of
    the Series C Preferred Stock, $6.56063 per share in the case of the Series D Preferred Stock and $16.5923
    per share in the case of Series E Preferred Stock (subject in each case to appropriate adjustment in the event
    of any stock dividend, stock split, combination or similar recapitalization affecting such shares), in each
    case plus any dividends declared but unpaid thereon (the amounts payable to holders of Series A Preferred
    Stock, Series A-1 Preferred Stock, Series B Preferred Stock, Series C Preferred Stock and Series E
    Preferred Stock pursuant to this sentence are hereinafter referred to as the "Series A Liquidation Amount,"
    the "Series A-1 Liquidation Amount, the "Series B Liquidation Amount," the "Series C Liquidation
    Amount," and the "Series E Liquidation Amount," respectively).""",
    """“Liquidation Preference” shall mean $0.60 per share for the Series A Preferred
    Stock, $1.40 per share for the Series B Preferred Stock and $2.00 per share for the Series C Preferred
    Stock (each subject to adjustment from time to time for Recapitalizations).""",
    """the holders of the Series A, Series B,Series C, Series D and Series E Preferred Stock shall be entitled to receive, prior and in
    preference to any distribution of any of the assets of the Corporation to the holders of Common
    Stock by reason of their ownership thereof, an amount per share equal to (i) $1.25 per share (as
    adjusted for stock splits, stock dividends, reclassification and the like) for each share of Series A
    Preferred Stock held by them at the closing of such transaction, plus all declared but unpaid
    dividends payable on the Series A Preferred Stock, (il) $1.847 per share (as adjusted for stock
    splits, stock dividends, reclassification and the like) for each share of Series B Preferred Stock
    held by them at the closing of such transaction, plus all declared but unpaid dividends payable on
    the Series B Preferred Stock, (iii) $2.38 per share (as adjusted for stock splits, stock dividends,
    reclassification and the like) for each share of Series C Preferred Stock held by them at the
    closing of such transaction, plus all declared but unpaid dividends payable on the Series C
    Preferred Stock, (iv) $3.547 per share (as adjusted for stock splits, stock dividends,
    reclassification and the like) for each share of Series D Preferred Stock held by them at the
    closing of such transaction, plus all declared but unpaid dividends payable on the Series D
    Preferred Stock and (v) $5.10 per share (as adjusted for stock splits, stock dividends,
    reclassification and the like) for each share of Series E Preferred Stock held by them at the
    closing of such transaction, plus all declared but unpaid dividends payable on the Series E
    Preferred Stock.""",
    

    "$0.624136 per share for each share of the Series B Preferred Stock",
    "$0.47455 per share for each share of the Series C Preferred Stock",
    "$0.60 per share for the Series A Preferred Stock", 
    "$1.40 per share for the Series B Preferred Stock",
    "$2.00 per share for the Series C Preferred Stock",
    "$3.14 per share of Series F Preferred Stock",
    "$1.00 per share in the case of the Series A Preferred Stock",
    "$1.50 per share in the case of the Series A-1 Preferred Stock",
    "$2.078192 per share in the case of the Series B Preferred Stock",
    "$3.33 per share in the case of the Series B-1 Preferred Stock",
    "$3.371016 per share in the case of the Series C Preferred Stock",
    "$6.56063 per share in the case of the Series D Preferred Stock",
    "1.25 per share (as adjusted for stock splits, stock dividends, reclassification and the like) for each share of Series A Preferred Stock",
    "$1.847 per share (as adjusted for stock splits, stock dividends, reclassification and the like) for each share of Series B Preferred Stock",
    "$2.38 per share (as adjusted for stock splits, stock dividends, reclassification and the like) for each share of Series C Preferred Stock",
    "$3.547 per share (as adjusted for stock splits, stock dividends, reclassification and the like) for each share of Series D Preferred Stock",
    "$5.10 per share (as adjusted for stock splits, stock dividends, reclassification and the like) for each share of Series E Preferred Stock",
    "Liquidation Preference shall mean $0.40 per share for the Series A Preferred Stock and $0.60 per share for the Series B Preferred Stock (subject to adjustment from time to time for Recapitalizations as set forth elsewhere herein).",
    "Liquidation Preference shall mean $0.57 per share for the Series A Preferred Stock (as appropriately adjusted for any Recapitalization and as otherwise set forth elsewhere herein)."
]

# Precompute embeddings for the example phrases
example_embeddings = semantic_model.encode(example_phrases, convert_to_tensor=True)

In [53]:
# Regex patterns for extracting monetary values and stock types
money_pattern = r"\$\d+(?:\.\d+)?"
stock_pattern = r"Series\s[A-Z0-9\-]+(?:\s[A-Z0-9\-]+)*\sPreferred Stock"

In [54]:
# Function to extract relevant financial entities
def extract_entities(sentence):
    money_entities = [float(match.group(0).replace('$', '')) for match in re.finditer(money_pattern, sentence)]
    stock_entities = [match.group(0) for match in re.finditer(stock_pattern, sentence)]
    return money_entities, stock_entities

In [76]:
# Function to process a single document
def process_document(file_path):
    with open(file_path, "r", encoding="utf-8") as file:
        doc_text = file.read()
    
    # Preprocess text: replace newlines with spaces
    doc_text = doc_text.replace("\n", " ")
    
    sentences = sent_tokenize(doc_text)  # Improved sentence segmentation
    results = []

    for sentence in sentences:
        sentence_embedding = semantic_model.encode(sentence, convert_to_tensor=True)
        cosine_scores = util.pytorch_cos_sim(sentence_embedding, example_embeddings)
        max_score = cosine_scores.max().item()
        
        money_entities, stock_entities = extract_entities(sentence)
        if money_entities and stock_entities:
            results.append({
                'Document': os.path.basename(file_path),
                'Sentence': sentence.strip(),
                'Liquidation Value': money_entities,
                'Stock Type': stock_entities,
                'Similarity Score': max_score
            })
    
    return results

In [88]:
import networkx as nx

def extract_liquidation_order(sentences, stock_types):
    preference_graph = nx.DiGraph()
    
    # Initialize nodes for all stock types
    for stock in stock_types:
        preference_graph.add_node(stock)
    
    # Define priority phrases
    priority_phrases = ["prior and in preference to", "before any payment shall be made to", "only after distributions to"]
    
    # Extract priority relationships
    for sentence in sentences:
        for phrase in priority_phrases:
            if phrase in sentence:
                mentioned_stocks = [stock for stock in stock_types if stock in sentence]
                if len(mentioned_stocks) >= 2:
                    priority_stock, junior_stock = mentioned_stocks[:2]  # Assume first stock is senior
                    preference_graph.add_edge(priority_stock, junior_stock)
    
    # Topological sorting and grouping by levels
    if nx.is_directed_acyclic_graph(preference_graph):
        levels = []
        while preference_graph.nodes:
            current_level = [node for node in preference_graph.nodes if preference_graph.in_degree(node) == 0]
            if not current_level:
                break  # Prevent infinite loops if there's an issue
            
            levels.append(current_level)
            preference_graph.remove_nodes_from(current_level)
    else:
        levels = [stock_types]  # No clear order found, return all stocks in one level
    
    return levels

def format_liquidation_order(levels):
    """ Convert list of stock levels into a readable string representation. """
    return " > ".join([f"({', '.join(level)})" if len(level) > 1 else level[0] for level in levels])

In [89]:
# Function to process all documents sequentially (naive approach)
def process_all_documents(predictions_with_lp, txt_folder_path):
    all_results = []

    for _, row in predictions_with_lp.iterrows():
        file_path = os.path.join(txt_folder_path, row['Document'] + ".txt")
        if os.path.exists(file_path):
            results = process_document(file_path)
            all_results.extend(results)

    # Convert to DataFrame
    relevant_df = pd.DataFrame(all_results)

    # Group by Document and aggregate results
    relevant_df_grouped = relevant_df.groupby('Document').agg(
        {'Sentence': list,
         'Liquidation Value': lambda x: [item for sublist in x for item in sublist],  # Flatten lists
         'Stock Type': lambda x: list(set(item for sublist in x for item in sublist)),  # Unique stock types
         'Similarity Score': list}).reset_index()
    
    # Compute liquidation preference order
    relevant_df_grouped['Liquidation Preference Order'] = relevant_df_grouped.apply(
        lambda row: format_liquidation_order(extract_liquidation_order(row['Sentence'], row['Stock Type'])), axis=1
    )

    return relevant_df_grouped

In [90]:
# run on documents containing liquidation preference 
relevant_df_grouped = process_all_documents(predictions_with_lp, txt_folder_path)
relevant_df_grouped

Unnamed: 0,Document,Sentence,Liquidation Value,Stock Type,Similarity Score,Liquidation Preference Order
0,16_2006-03-09_Certificates of Incorporation.txt,[The total number of shares of preferred stock...,"[0.0001, 0.0345, 0.0499, 0.431469, 0.624136]","[Series A Preferred Stock, Series B Preferred ...","[0.7458530068397522, 0.6745826601982117, 0.787...","(Series A Preferred Stock, Series B Preferred ..."
1,21_2006-04-21_Certificates of Incorporation.txt,[The total number of shares of Preferred Stock...,"[0.01, 0.08, 1.0, 1.0, 1.0, 4.0, 25.0, 500.0]",[Series A Preferred Stock],"[0.7874149084091187, 0.8245527744293213, 0.817...",Series A Preferred Stock
2,24_2014-08-27_Certificates of Incorporation.txt,[FOURTH: The total number of shares of all cla...,"[0.001, 0.001, 0.4, 1.74417185, 3.27374825]","[Series A Preferred Stock, Series B Preferred ...","[0.7669166326522827, 0.9969063997268677, 0.998...","(Series A Preferred Stock, Series B Preferred ..."
3,28_2009-12-17_Certificates of Incorporation.txt,[ARTICLE IV The total number of shares of sto...,"[0.001, 0.001, 0.44299, 0.85285, 0.9683, 0.035...","[Series A Preferred Stock, Series B Preferred ...","[0.6840397715568542, 0.7750880122184753, 0.719...","(Series A Preferred Stock, Series B Preferred ..."
4,35_2018-02-23_Certificates of Incorporation.txt,[“Original Series B-3 Issue Price” shall mean ...,"[0.3524, 0.3524, 0.5496, 0.137146, 1.3068]","[Series B-2 Preferred Stock, Series A-2 Prefer...","[0.997136116027832, 0.9991233944892883, 1.0000...","(Series B-2 Preferred Stock, Series A-2 Prefer..."
5,49_2008-05-06_Certificates of Incorporation.txt,"[(b) The Corporation shall not declare, pay or...","[1.0, 1.5, 2.078192, 3.33, 3.371016, 6.56063, ...","[Series E Preferred Stock, Series C Preferred ...","[0.8470276594161987, 0.8398562669754028, 0.958...","(Series E Preferred Stock, Series C Preferred ..."
6,49_2008-06-12_Certificates of Incorporation.txt,"[(b) The Corporation shall not declare, pay or...","[1.0, 1.5, 2.078192, 3.33, 3.371016, 6.56063, ...","[Series E Preferred Stock, Series C Preferred ...","[0.8494455814361572, 0.8776410818099976, 0.830...","(Series E Preferred Stock, Series D Preferred ..."
7,63_2007-05-24_Certificates of Incorporation.txt,"[For purposes of this ARTICLE V, the following...","[0.6, 1.4, 2.0, 0.048, 0.112, 0.16, 0.6, 1.4, ...","[Series A Preferred Stock, Series B Preferred ...","[0.8038737773895264, 0.7286333441734314, 0.988...","(Series A Preferred Stock, Series B Preferred ..."
8,92_2007-12-20_Certificates of Incorporation.txt,"[entitling the holder thereof to receive, dire...","[0.1, 0.14776, 0.1904, 0.28376, 0.408, 1.25, 1...","[Series E Preferred Stock, Series C Preferred ...","[0.8277464509010315, 0.9435732960700989, 0.782...","(Series E Preferred Stock, Series D Preferred ..."


In [91]:
relevant_df_grouped['Sentence'][0]

['The total number of shares of preferred stock authorized to be issued is 14,270,662, par value $0.0001 per share (the “Preferred Stock”), of which 9,270,662 shares are designated as “Series A Preferred Stock” and 5,000,000 shares are designated as “Series B Preferred Stock.”  B.',
 'For purposes of this subsection 1(a), “Dividend Rate” shall mean $0.0345 per annum for each share of Series A Preferred Stock and $0.0499 per annum for each share of Series B Preferred Stock (each as adjusted for any stock splits, stock dividends, combinations, subdivisions, recapitalizations or the like).',
 'For purposes of this Restated Certificate of Incorporation, “Original Issue Price” shall mean $0.431469 per share for each share of the Series A Preferred Stock and $0.624136 per share for each share of the Series B Preferred Stock (each as adjusted for any  GDS VF&H\\668819.2 2 stock splits, stock dividends, combinations, subdivisions, recapitalizations or the like with respect to such Preferred St

Approach: 
1. Scrape all company names, title of certificate, and date
2. For each company, if the latest title of certificate is not "Certificate of Incorporation", "Restated Certificate of Incorporation", or "Amended and Restated Certificate of Incorporation", then find the latest certificate that is and save both the latest file and the closes one that meets previous criteria; else save the latest certificate
3. Find the liquidation preferences for each document pertaining to each company based on most recent COI and supplemental documents if necessary

In [96]:
relevant_df_grouped['Liquidation Preference Order'][4]

'(Series B-2 Preferred Stock, Series A-2 Preferred Stock, Series B-3 Preferred Stock, Series A-1 Preferred Stock, Series B-1 Preferred Stock) > Series A- 2 Preferred Stock'