# liquidity model preference

## Data engineering and preprocessing

In [None]:
# Import necessary libraries
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import nltk

# Ensure you have the NLTK stopwords downloaded
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/alexchen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [23]:
# Load the CSV files
batch1_labeled_path = "/Users/alexchen/Downloads/Projects/vc-research/URAP VC Research - [Readable] Batch 1 Main.csv"
batch1_labeled = pd.read_csv(batch1_labeled_path)

batch1_lp_path = "/Users/alexchen/Downloads/Projects/vc-research/URAP VC Research - Batch 1 Details.csv"
batch1_lp = pd.read_csv(batch1_lp_path)

In [24]:
# Show high-level dataframe and detailed datafarme
batch1_labeled
# batch1_lp

Unnamed: 0,File Name,Company Name,Document Type,Date,Contains Liquidity Preference,Priority Order,Number of Common Stocks Issued,Number of Preferred Stock Issued,Total Number of Stocks Issued
0,16_2003-07-03_Certificates of Incorporation,"3-D Marketing Technologies, INC",Certificate of Incorporation,2003-07-03,0,,1500,,1500
1,16_2004-01-22_Certificates of Incorporation,"3-D Marketing Technologies, INC",Restated Certificate of Incorporation,2004-01-22,0,,6250000,,6250000
2,16_2004-07-14_Certificates of Incorporation,"3-D Marketing Technologies, INC",Restated Certificate of Incorporation,2004-07-14,1,A,20500000,9500000,30000000
3,16_2005-05-18_Certificates of Incorporation,"3-D Marketing Technologies, INC",Certificate of Amendment,2005-05-18,0,,,,0
4,16_2006-03-09_Certificates of Incorporation,"M-Factor, INC",Restated Certificate of Incorporation,2006-03-09,1,A=B,25000000,14270662,39270662
...,...,...,...,...,...,...,...,...,...
85,92_2004-11-23_Certificates of Incorporation,"Access Closure, INC",Amended and Restated Certificate of Incorporation,2004-11-23,1,A=B=C,19375000,11839309,31214309
86,92_2007-12-20_Certificates of Incorporation,"Access Closure, INC",Amended and Restated Certificate of Incorporation,2007-12-20,1,A=B=C=D=E,42000000,28443627,70443627
87,92_2010-02-23_Certificates of Incorporation,"Access Closure, INC",Amended and Restated Certificate of Incorporation,2010-02-23,1,A=B=C=D=E,49000000,32325882,81325882
88,100_2007-02-22_Certificates of Incorporation,"Acclarent, INC",Amended and Restated Certificate of Incorporation,2007-02-22,1,A=B=C,60000000,38416115,98416115


In [25]:
batch1_labeled['Document Type'].unique()


array(['Certificate of Incorporation',
       'Restated Certificate of Incorporation',
       'Certificate of Amendment', 'Certificate of Merger',
       'Certificate of Conversion', 'Certificate of Cancellation',
       'Amended and Restated Certificate of Incorporation',
       'Articles of Incorporation',
       'Amended and Restated Articles of Incorporation',
       'Certificate of Correction'], dtype=object)

In [26]:
# change date column to date type
batch1_labeled["Date"] = pd.to_datetime(batch1_labeled["Date"])

# create multi-index dataframe ordering timesreies data
batch1_labeled_multiindex = batch1_labeled.set_index(["Company Name", "Date"]).sort_index()

# sort by date ascending for each company
batch1_labeled_multiindex = batch1_labeled_multiindex.groupby(level=0, sort=False).apply(
    lambda x: x.sort_index(level=1)
)
batch1_labeled_multiindex.index = batch1_labeled_multiindex.index.droplevel(0)

In [27]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
#pd.reset_option('^display.', silent=True)

batch1_labeled_multiindex

Unnamed: 0_level_0,Unnamed: 1_level_0,File Name,Document Type,Contains Liquidity Preference,Priority Order,Number of Common Stocks Issued,Number of Preferred Stock Issued,Total Number of Stocks Issued
Company Name,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
"3-D Marketing Technologies, INC",2003-07-03,16_2003-07-03_Certificates of Incorporation,Certificate of Incorporation,0,,1500.0,,1500
"3-D Marketing Technologies, INC",2004-01-22,16_2004-01-22_Certificates of Incorporation,Restated Certificate of Incorporation,0,,6250000.0,,6250000
"3-D Marketing Technologies, INC",2004-07-14,16_2004-07-14_Certificates of Incorporation,Restated Certificate of Incorporation,1,A,20500000.0,9500000.0,30000000
"3-D Marketing Technologies, INC",2005-05-18,16_2005-05-18_Certificates of Incorporation,Certificate of Amendment,0,,,,0
"3Jam, INC",2006-04-21,21_2006-04-21_Certificates of Incorporation,Amended and Restated Certificate of Incorporation,1,A,3333333.0,1333333.0,4666666
"3Point5, INC",2004-12-01,24_2004-12-01_Certificates of Incorporation,Certificate of Incorporation,0,,5000000.0,,5000000
"3Point5, INC",2005-10-20,24_2005-10-20_Certificates of Incorporation,Amended and Restated Certificate of Incorporation,1,A,7000000.0,2662500.0,9662500
"3Point5, INC",2006-09-14,24_2006-09-14_Certificates of Incorporation,Amended and Restated Certificate of Incorporation,1,A=B,9100000.0,5262500.0,14362500
"3Point5, INC",2007-06-28,24_2007-06-28_Certificates of Incorporation,Certificate of Amendment,0,,15000000.0,7762500.0,22762500
"3Point5, INC",2008-08-21,24_2008-08-21_Certificates of Incorporation,Certificate of Amendment,1,A=B,17000000.0,9380000.0,26380000


In [28]:
# Create a dictionary where each company's data is a separate DataFrame
company_timeseries_dict = {
    company: df.sort_values("Date").reset_index(drop=True)  # Sort each company's data by date
    for company, df in batch1_labeled.groupby("Company Name", sort=False)
}

In [29]:
# change date column to date type
batch1_lp["Date"] = pd.to_datetime(batch1_lp["Date"])

# create multi-index dataframe ordering timesreies data
batch1_lp_multiindex = batch1_lp.set_index(["Company Name", "Date"]).sort_index()

# sort by date ascending for each company
batch1_lp_multiindex = batch1_lp_multiindex.groupby(level=0, sort=False).apply(
    lambda x: x.sort_index(level=1)
)
batch1_lp_multiindex.index = batch1_lp_multiindex.index.droplevel(0)
batch1_lp_multiindex

Unnamed: 0_level_0,Unnamed: 1_level_0,File Name,Document Type,Preferred Stock Type,Order of Priority,Liquidation Preference,Liquidation Multiple,Number of Preferred Stocks Issued,Original Issue Price
Company Name,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
"3-D Marketing Technologies, INC",2004-07-14,16_2004-07-14_Certificates of Incorporation,Restated Certificate of Incorporation,A,1,$0.431469,1,9500000.0,$0.431469
"3Jam, INC",2006-04-21,21_2006-04-21_Certificates of Incorporation,Amended and Restated Certificate of Incorporation,A,1,$1.000000,1,1333333.0,$1.000000
"3Point5, INC",2005-10-20,24_2005-10-20_Certificates of Incorporation,Amended and Restated Certificate of Incorporation,A,1,$0.400000,1,2662500.0,$0.400000
"3Point5, INC",2006-09-14,24_2006-09-14_Certificates of Incorporation,Amended and Restated Certificate of Incorporation,A,1,$0.400000,1,2662500.0,$0.400000
"3Point5, INC",2006-09-14,24_2006-09-14_Certificates of Incorporation,Amended and Restated Certificate of Incorporation,B,1,$0.600000,1,2600000.0,$0.600000
"3Point5, INC",2008-08-21,24_2008-08-21_Certificates of Incorporation,Certificate of Amendment,A,1,$0.400000,1,2562500.0,$0.400000
"3Point5, INC",2008-08-21,24_2008-08-21_Certificates of Incorporation,Certificate of Amendment,B,1,,1,6717000.0,
"3VR Security, INC",2005-12-22,27_2005-12-22_Certificates of Incorporation,Amended and Restated Articles of Incorporation,A,1,$0.570000,1,9525000.0,$0.570000
"3VR Security, INC",2005-12-22,27_2005-12-22_Certificates of Incorporation,Amended and Restated Articles of Incorporation,B,1,$1.090000,1,9500000.0,$1.090000
"3VR Security, INC",2006-08-30,27_2006-08-30_Certificates of Incorporation,Certificate of Amendment,A,1,$0.570000,1,9525000.0,$0.570000


## Part 1. Classify which text files contain liquidation preference or not

In [30]:
# Load the CSV file
batch1_labeled_path = "/Users/alexchen/Downloads/Projects/vc-research/URAP VC Research - [Readable] Batch 1 Main.csv"
batch1_labeled = pd.read_csv(batch1_labeled_path)

batch1_lp_path = "/Users/alexchen/Downloads/Projects/vc-research/URAP VC Research - Batch 1 Details.csv"
batch1_lp = pd.read_csv(batch1_lp_path)

# Load text files and associate them with labels
txt_folder_path = "/Users/alexchen/Downloads/Projects/vc-research/Batch1_text_readable"
text_data = []
labels = []
document_names = []
pd.reset_option('^display.', silent=True)


In [31]:
# display labels of batch1 text files
batch1_labeled

Unnamed: 0,File Name,Company Name,Document Type,Date,Contains Liquidity Preference,Priority Order,Number of Common Stocks Issued,Number of Preferred Stock Issued,Total Number of Stocks Issued
0,16_2003-07-03_Certificates of Incorporation,"3-D Marketing Technologies, INC",Certificate of Incorporation,2003-07-03,0,,1500,,1500
1,16_2004-01-22_Certificates of Incorporation,"3-D Marketing Technologies, INC",Restated Certificate of Incorporation,2004-01-22,0,,6250000,,6250000
2,16_2004-07-14_Certificates of Incorporation,"3-D Marketing Technologies, INC",Restated Certificate of Incorporation,2004-07-14,1,A,20500000,9500000,30000000
3,16_2005-05-18_Certificates of Incorporation,"3-D Marketing Technologies, INC",Certificate of Amendment,2005-05-18,0,,,,0
4,16_2006-03-09_Certificates of Incorporation,"M-Factor, INC",Restated Certificate of Incorporation,2006-03-09,1,A=B,25000000,14270662,39270662
...,...,...,...,...,...,...,...,...,...
85,92_2004-11-23_Certificates of Incorporation,"Access Closure, INC",Amended and Restated Certificate of Incorporation,2004-11-23,1,A=B=C,19375000,11839309,31214309
86,92_2007-12-20_Certificates of Incorporation,"Access Closure, INC",Amended and Restated Certificate of Incorporation,2007-12-20,1,A=B=C=D=E,42000000,28443627,70443627
87,92_2010-02-23_Certificates of Incorporation,"Access Closure, INC",Amended and Restated Certificate of Incorporation,2010-02-23,1,A=B=C=D=E,49000000,32325882,81325882
88,100_2007-02-22_Certificates of Incorporation,"Acclarent, INC",Amended and Restated Certificate of Incorporation,2007-02-22,1,A=B=C,60000000,38416115,98416115


In [32]:
# display details of batch1 preferred stocks
batch1_lp

Unnamed: 0,File Name,Company Name,Document Type,Date,Preferred Stock Type,Order of Priority,Liquidation Preference,Liquidation Multiple,Number of Preferred Stocks Issued,Original Issue Price
0,16_2004-07-14_Certificates of Incorporation,"3-D Marketing Technologies, INC",Restated Certificate of Incorporation,2004-07-14,A,1,$0.431469,1,9500000,$0.431469
1,16_2006-03-09_Certificates of Incorporation,"M-Factor, INC",Restated Certificate of Incorporation,2006-03-09,A,1,$0.431469,1,9270662,$0.431469
2,16_2006-03-09_Certificates of Incorporation,"M-Factor, INC",Restated Certificate of Incorporation,2006-03-09,B,1,$0.624136,1,5000000,$0.624136
3,16_2007-05-16_Certificates of Incorporation,"M-Factor, INC",Restated Certificate of Incorporation,2007-05-16,A,2,$0.431469,1,9270662,$0.431469
4,16_2007-05-16_Certificates of Incorporation,"M-Factor, INC",Restated Certificate of Incorporation,2007-05-16,B,1,$0.624136,1,3204429,$0.624136
...,...,...,...,...,...,...,...,...,...,...
198,100_2007-02-22_Certificates of Incorporation,"Acclarent, INC",Amended and Restated Certificate of Incorporation,2007-02-22,C,1,$3.218000,1,11000000,
199,100_2008-12-03_Certificates of Incorporation,"Acclarent, INC",Amended and Restated Certificate of Incorporation,2008-12-03,A,2,$1.040000,1,14071484,
200,100_2008-12-03_Certificates of Incorporation,"Acclarent, INC",Amended and Restated Certificate of Incorporation,2008-12-03,B,2,$2.100000,1,13357773,
201,100_2008-12-03_Certificates of Incorporation,"Acclarent, INC",Amended and Restated Certificate of Incorporation,2008-12-03,C,2,$3.218000,1,10875321,


In [33]:
# Read each file and extract data
for _, row in batch1_labeled.iterrows():
    file_name = row['File Name']
    label = row['Contains Liquidity Preference']
    file_path = os.path.join(txt_folder_path, file_name + ".txt")
    
    if os.path.exists(file_path):
        with open(file_path, "r", encoding="utf-8") as file:
            text = file.read()
            text_data.append(text)
            labels.append(label)
            document_names.append(file_name)  # Append the document name
    else:
        print(f"File not found: {file_path}")

In [34]:
# Split the data into training and test sets, while keeping track of the document names
X_train, X_test, y_train, y_test, train_docs, test_docs = train_test_split(
    text_data, labels, document_names, test_size=0.25, random_state=42
)

In [35]:
# Vectorize the text data using TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=1000)  # Adjust the number of features as needed
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [36]:
# Initialize the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight="balanced")
rf_model.fit(X_train_tfidf, y_train)

In [37]:
# Predict the classes for the test set
y_pred = rf_model.predict(X_test_tfidf)
y_pred_prob = rf_model.predict_proba(X_test_tfidf)

In [38]:
# Create a DataFrame with predictions and confidence scores
predictions_df = pd.DataFrame({
    'Document': test_docs,
    'True Classification': y_test,
    'Predicted Classification': y_pred,
    'Probability of Containing Liquidation Preference Information': y_pred_prob[:, 1],
    'Probability of Not Containing Liquidation Preference Information': y_pred_prob[:, 0]
})
predictions_df

Unnamed: 0,Document,True Classification,Predicted Classification,Probability of Containing Liquidation Preference Information,Probability of Not Containing Liquidation Preference Information
0,28_2009-12-17_Certificates of Incorporation,1,1,1.0,0.0
1,24_2014-08-27_Certificates of Incorporation,1,1,1.0,0.0
2,48_2004-10-19_Certificates of Incorporation,1,1,1.0,0.0
3,49_2008-06-12_Certificates of Incorporation,1,1,1.0,0.0
4,16_2003-07-03_Certificates of Incorporation,0,0,0.06,0.94
5,27_2002-09-23_Certificates of Incorporation,0,0,0.1,0.9
6,28_2009-12-07_Certificates of Incorporation,0,0,0.07,0.93
7,48_2014-03-06_Certificates of Incorporation,0,0,0.42,0.58
8,16_2012-12-17_Certificates of Incorporation,0,0,0.14,0.86
9,34_2010-01-28_Certificates of Incorporation,0,0,0.06,0.94


In [39]:
# display summary of classification results
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00        11

    accuracy                           1.00        23
   macro avg       1.00      1.00      1.00        23
weighted avg       1.00      1.00      1.00        23



## Part 2. Remove all text except for areas that contain liquidation preference

In [40]:
# import other packages
from sentence_transformers import SentenceTransformer, util
import re



In [41]:
# Initialize the semantic model for sentence embeddings
semantic_model = SentenceTransformer('all-MiniLM-L6-v2')

# Example phrases for liquidation preferences
example_phrases = [
    "$0.624136 per share for each share of the Series B Preferred Stock ",
    "$0.47455 per share for each share of the Series C Preferred Stock",
    "$0.60 per share for the Series A Preferred Stock", 
    "$1.40 per share for the Series B Preferred Stock",
    "$2.00 per share for the Series C Preferred Stock",
    "$3.14 per share of Series F Preferred Stock",
    "$1.00 per share in the case of the Series A Preferred Stock",
    "$1.50 per share in the case of the Series A-1 Preferred Stock",
    "$2.078192 per share in the case of the Series B Preferred Stock",
    "$3.33 per share in the case of the Series B-1 Preferred Stock",
    "$3.371016 per share in the case of the Series C Preferred Stock",
    "$6.56063 per share in the case of the Series D Preferred Stock",
    "1.25 per share (as adjusted for stock splits, stock dividends, reclassification and the like) for each share of Series A Preferred Stock",
    "$1.847 per share (as adjusted for stock splits, stock dividends, reclassification and the like) for each share of Series B Preferred Stock",
    "$2.38 per share (as adjusted for stock splits, stock dividends, reclassification and the like) for each share of Series C Preferred Stock",
    "$3.547 per share (as adjusted for stock splits, stock dividends, reclassification and the like) for each share of Series D Preferred Stock",
    "$5.10 per share (as adjusted for stock splits, stock dividends, reclassification and the like) for each share of Series E Preferred Stock",
    "Liquidation Preference shall mean $0.40 per share for the Series A Preferred Stock and $0.60 per share for the Series B Preferred Stock (subject to adjustment from time to time for Recapitalizations as set forth elsewhere herein).",
    "Liquidation Preference shal} mean $0.57 per share for the Series A Preferred Stock (as appropriately adjusted for any Recapitalization and as otherwise set forth elsewhere herein)."
]

# Precompute embeddings for the example phrases
example_embeddings = semantic_model.encode(example_phrases, convert_to_tensor=True)

In [42]:
# Function to extract money and stock entities using regex
def extract_entities(sentence):
    # Regex patterns to detect money values and stock names
    money_pattern = r"\$\d+(\.\d+)?"
    stock_pattern = r"Series\s[A-Z0-9\-]+(?:\s[A-Z0-9\-]+)*\sPreferred Stock"
    
    money_entities = [float(match.group(0).replace('$', '')) for match in re.finditer(money_pattern, sentence)]
    stock_entities = [match.group(0) for match in re.finditer(stock_pattern, sentence)]
    
    return money_entities, stock_entities

In [43]:
# Define the function to extract relevant chunk and score based on similarity
def extract_relevant_chunk_and_score(doc_text, example_embeddings, threshold=0.6):
    sentences = doc_text.split(".")  # Split document into sentences
    results = []  # List to store sentences with relevant info

    for sentence in sentences:
        sentence_embedding = semantic_model.encode(sentence, convert_to_tensor=True)
        cosine_scores = util.pytorch_cos_sim(sentence_embedding, example_embeddings)
        
        # Extract the maximum similarity score for the sentence
        max_score = cosine_scores.max().item()
        
        # If similarity score is above the threshold, extract relevant entities
        if max_score > threshold:
            money_entities, stock_entities = extract_entities(sentence)
            
            if money_entities and stock_entities:  # Only store if there are relevant entities
                results.append({
                    'Sentence': sentence.strip(),
                    'Liquidation Value': money_entities,
                    'Stock Type': stock_entities,
                    'Similarity Score': max_score
                })
    
    return results

In [44]:
txt_folder_path = '/Users/alexchen/Downloads/Projects/vc-research/Batch1_text_readable'

# Create a DataFrame to store the results
all_results = []

# Loop over each document in predictions_df
for _, row in predictions_df.iterrows():
    if row['Predicted Classification'] == 1:  # If document predicted to contain liquidation preference info
        file_name = row['Document']
        
        # Retrieve the corresponding text from the file
        file_path = os.path.join(txt_folder_path, file_name + ".txt")
        with open(file_path, "r", encoding="utf-8") as file:
            doc_text = file.read()
        
        # Extract all relevant sentences and their scores
        relevant_results = extract_relevant_chunk_and_score(doc_text, example_embeddings)
        
        # For each relevant result, append to the all_results list
        for result in relevant_results:
            result['Document'] = file_name  # Add document name to each result
            all_results.append(result)

# Convert results to DataFrame
relevant_df = pd.DataFrame(all_results)

# Group by Document and aggregate the results under the same document
relevant_df_grouped = relevant_df.groupby('Document').agg(
    {'Sentence': lambda x: list(x),
     'Liquidation Value': lambda x: [item for sublist in x for item in sublist],  # Flattening lists of liquidation values
     'Stock Type': lambda x: [item for sublist in x for item in sublist],  # Flattening lists of stock types
     'Similarity Score': lambda x: list(x)}).reset_index()

In [45]:
pd.set_option('display.max_rows', None)
relevant_df_grouped

Unnamed: 0,Document,Sentence,Liquidation Value,Stock Type,Similarity Score
0,16_2006-03-09_Certificates of Incorporation,[0345 per annum for each share of Series A Pre...,"[0.0, 0.0]","[Series A Preferred Stock, Series A Preferred ...","[0.8080525398254395, 0.8664486408233643]"
1,21_2006-04-21_Certificates of Incorporation,[Liquidation Preference\n\n(a) In the event of...,"[1.0, 1.0, 1.0, 4.0, 25.0]","[Series A Preferred Stock, Series A Preferred ...","[0.7676219940185547, 0.8134998083114624, 0.810..."
2,28_2009-12-17_Certificates of Incorporation,[44299 per share for the Series A Preferred St...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]","[Series A Preferred Stock, Series B Preferred ...","[0.9339544773101807, 0.919361412525177, 0.8691..."
3,35_2018-02-23_Certificates of Incorporation,[(e) Upon the completion of the distributions ...,[1.0],[Series A-2 Preferred Stock],[0.6359134912490845]
4,49_2008-05-06_Certificates of Incorporation,[00 per share in the case of the Series A Pref...,"[1.0, 2.0, 3.0, 3.0, 6.0, 16.0, 1.0, 1.0, 6.0,...","[Series A Preferred Stock, Series A-1 Preferre...","[0.9411284923553467, 0.9153479337692261, 0.935..."
5,49_2008-06-12_Certificates of Incorporation,[00 per share in the case of the Series A Pref...,"[1.0, 2.0, 3.0, 3.0, 16.0, 416.0, 1.0, 1.0, 3....","[Series A Preferred Stock, Series A-1 Preferre...","[0.9411284923553467, 0.9153479337692261, 0.935..."
6,63_2007-05-24_Certificates of Incorporation,[60 per share for the Series A Preferred Stock...,"[1.0, 2.0, 0.0, 0.0, 2.0, 2.0]","[Series A Preferred Stock, Series B Preferred ...","[0.9351432919502258, 0.9134662747383118, 0.901..."
7,92_2007-12-20_Certificates of Incorporation,"[10 per share\n(as adjusted for stock splits, ...","[0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 2.0, 3.0, 5.0, ...","[Series A Preferred Stock, Series B Preferred ...","[0.8445618152618408, 0.8855015635490417, 0.788..."


In [46]:
relevant_df_grouped['Sentence'][5]

['00 per share in the case of the Series A Preferred Stock, $1',
 '50 per share in the case of the\nSeries A-1 Preferred Stock, $2',
 '078192 per share in the case of the Series B Preferred Stock, $3',
 '33 per share\nin the case of the Series B-1 Preferred Stock, $3',
 '56063 per share in the case of the Series D Preferred Stock and $16',
 '(a) In the event of any voluntary or involuntary liquidation, dissolution or winding up of the\nCorporation (a "Liquidation Event") in which the amounts available for distribution to stockholders (the\n"Liquidation Proceeds") are less than or equal to $416,500,000 (the "Liquidation Trigger"), the holders of\nshares of Senior Preferred Stock then outstanding shall be entitled to be paid out of the assets of the\nCorporation available for distribution to its stockholders, before any payment shall be made to the holders\nof Series B-1 Preferred Stock, Common Stock or any other class or series of stock ranking on liquidation\njunior to the Senior Prefe

- amendments: supplemental to original charter
- restatements: new version of charter with the amendment in it
- consider organization of data now because difference between amendments and restatements
- reorganize data now to consider and adjust for unique commpanies/amendments and restatements

Approach: 
1. Scrape all company names, title of certificate, and date
2. For each company, if the latest title of certificate is not "Certificate of Incorporation", "Restated Certificate of Incorporation", or "Amended and Restated Certificate of Incorporation", then find the latest certificate that is and save both the latest file and the closes one that meets previous criteria; else save the latest certificate
3. Find the liquidation preferences for each document pertaining to each company based on most recent COI and supplemental documents if necessary

Questions:
- What is the ultimate metric we want to capture? (liquidation preference for each unique company?)
- How and where are we storing all this data? 

In [47]:
relevant_df_grouped['Similarity Score'][0]

[0.8080525398254395, 0.8664486408233643]