# liquidity model preference

## Data engineering and preprocessing

In [1]:
# Import necessary libraries
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Ensure you have the NLTK stopwords downloaded
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/alexchen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Load the CSV files
batch1_labeled_path = "/Users/alexchen/Downloads/Projects/vc-research/URAP VC Research - [Readable] Batch 1 Main.csv"
batch1_labeled = pd.read_csv(batch1_labeled_path)

batch1_lp_path = "/Users/alexchen/Downloads/Projects/vc-research/URAP VC Research - Batch 1 Details.csv"
batch1_lp = pd.read_csv(batch1_lp_path)

In [6]:
# Show high-level dataframe and detailed datafarme
batch1_labeled
# batch1_lp

Unnamed: 0,File Name,Company Name,Document Type,Date,Contains Liquidity Preference,Priority Order,Number of Common Stocks Issued,Number of Preferred Stock Issued,Total Number of Stocks Issued
0,16_2003-07-03_Certificates of Incorporation,"3-D Marketing Technologies, INC",Certificate of Incorporation,2003-07-03,0,,1500,,1500
1,16_2004-01-22_Certificates of Incorporation,"3-D Marketing Technologies, INC",Restated Certificate of Incorporation,2004-01-22,0,,6250000,,6250000
2,16_2004-07-14_Certificates of Incorporation,"3-D Marketing Technologies, INC",Restated Certificate of Incorporation,2004-07-14,1,A,20500000,9500000,30000000
3,16_2005-05-18_Certificates of Incorporation,"3-D Marketing Technologies, INC",Certificate of Amendment,2005-05-18,0,,,,0
4,16_2006-03-09_Certificates of Incorporation,"M-Factor, INC",Restated Certificate of Incorporation,2006-03-09,1,A=B,25000000,14270662,39270662
...,...,...,...,...,...,...,...,...,...
85,92_2004-11-23_Certificates of Incorporation,"Access Closure, INC",Amended and Restated Certificate of Incorporation,2004-11-23,1,A=B=C,19375000,11839309,31214309
86,92_2007-12-20_Certificates of Incorporation,"Access Closure, INC",Amended and Restated Certificate of Incorporation,2007-12-20,1,A=B=C=D=E,42000000,28443627,70443627
87,92_2010-02-23_Certificates of Incorporation,"Access Closure, INC",Amended and Restated Certificate of Incorporation,2010-02-23,1,A=B=C=D=E,49000000,32325882,81325882
88,100_2007-02-22_Certificates of Incorporation,"Acclarent, INC",Amended and Restated Certificate of Incorporation,2007-02-22,1,A=B=C,60000000,38416115,98416115


In [4]:
batch1_labeled['Document Type'].unique()


array(['Certificate of Incorporation',
       'Restated Certificate of Incorporation',
       'Certificate of Amendment', 'Certificate of Merger',
       'Certificate of Conversion', 'Certificate of Cancellation',
       'Amended and Restated Certificate of Incorporation',
       'Articles of Incorporation',
       'Amended and Restated Articles of Incorporation',
       'Certificate of Correction'], dtype=object)

In [7]:
# change date column to date type
batch1_labeled["Date"] = pd.to_datetime(batch1_labeled["Date"])

# create multi-index dataframe ordering timesreies data
batch1_labeled_multiindex = batch1_labeled.set_index(["Company Name", "Date"]).sort_index()

# sort by date ascending for each company
batch1_labeled_multiindex = batch1_labeled_multiindex.groupby(level=0, sort=False).apply(
    lambda x: x.sort_index(level=1)
)
batch1_labeled_multiindex.index = batch1_labeled_multiindex.index.droplevel(0)

In [8]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
#pd.reset_option('^display.', silent=True)

batch1_labeled_multiindex

Unnamed: 0_level_0,Unnamed: 1_level_0,File Name,Document Type,Contains Liquidity Preference,Priority Order,Number of Common Stocks Issued,Number of Preferred Stock Issued,Total Number of Stocks Issued
Company Name,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
"3-D Marketing Technologies, INC",2003-07-03,16_2003-07-03_Certificates of Incorporation,Certificate of Incorporation,0,,1500.0,,1500
"3-D Marketing Technologies, INC",2004-01-22,16_2004-01-22_Certificates of Incorporation,Restated Certificate of Incorporation,0,,6250000.0,,6250000
"3-D Marketing Technologies, INC",2004-07-14,16_2004-07-14_Certificates of Incorporation,Restated Certificate of Incorporation,1,A,20500000.0,9500000.0,30000000
"3-D Marketing Technologies, INC",2005-05-18,16_2005-05-18_Certificates of Incorporation,Certificate of Amendment,0,,,,0
"3Jam, INC",2006-04-21,21_2006-04-21_Certificates of Incorporation,Amended and Restated Certificate of Incorporation,1,A,3333333.0,1333333.0,4666666
"3Point5, INC",2004-12-01,24_2004-12-01_Certificates of Incorporation,Certificate of Incorporation,0,,5000000.0,,5000000
"3Point5, INC",2005-10-20,24_2005-10-20_Certificates of Incorporation,Amended and Restated Certificate of Incorporation,1,A,7000000.0,2662500.0,9662500
"3Point5, INC",2006-09-14,24_2006-09-14_Certificates of Incorporation,Amended and Restated Certificate of Incorporation,1,A=B,9100000.0,5262500.0,14362500
"3Point5, INC",2007-06-28,24_2007-06-28_Certificates of Incorporation,Certificate of Amendment,0,,15000000.0,7762500.0,22762500
"3Point5, INC",2008-08-21,24_2008-08-21_Certificates of Incorporation,Certificate of Amendment,1,A=B,17000000.0,9380000.0,26380000


In [9]:
# Create a dictionary where each company's data is a separate DataFrame
company_timeseries_dict = {
    company: df.sort_values("Date").reset_index(drop=True)  # Sort each company's data by date
    for company, df in batch1_labeled.groupby("Company Name", sort=False)
}

In [10]:
# change date column to date type
batch1_lp["Date"] = pd.to_datetime(batch1_lp["Date"])

# create multi-index dataframe ordering timesreies data
batch1_lp_multiindex = batch1_lp.set_index(["Company Name", "Date"]).sort_index()

# sort by date ascending for each company
batch1_lp_multiindex = batch1_lp_multiindex.groupby(level=0, sort=False).apply(
    lambda x: x.sort_index(level=1)
)
batch1_lp_multiindex.index = batch1_lp_multiindex.index.droplevel(0)
batch1_lp_multiindex

Unnamed: 0_level_0,Unnamed: 1_level_0,File Name,Document Type,Preferred Stock Type,Order of Priority,Liquidation Preference,Liquidation Multiple,Number of Preferred Stocks Issued,Original Issue Price
Company Name,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
"3-D Marketing Technologies, INC",2004-07-14,16_2004-07-14_Certificates of Incorporation,Restated Certificate of Incorporation,A,1,$0.431469,1,9500000.0,$0.431469
"3Jam, INC",2006-04-21,21_2006-04-21_Certificates of Incorporation,Amended and Restated Certificate of Incorporation,A,1,$1.000000,1,1333333.0,$1.000000
"3Point5, INC",2005-10-20,24_2005-10-20_Certificates of Incorporation,Amended and Restated Certificate of Incorporation,A,1,$0.400000,1,2662500.0,$0.400000
"3Point5, INC",2006-09-14,24_2006-09-14_Certificates of Incorporation,Amended and Restated Certificate of Incorporation,A,1,$0.400000,1,2662500.0,$0.400000
"3Point5, INC",2006-09-14,24_2006-09-14_Certificates of Incorporation,Amended and Restated Certificate of Incorporation,B,1,$0.600000,1,2600000.0,$0.600000
"3Point5, INC",2008-08-21,24_2008-08-21_Certificates of Incorporation,Certificate of Amendment,A,1,$0.400000,1,2562500.0,$0.400000
"3Point5, INC",2008-08-21,24_2008-08-21_Certificates of Incorporation,Certificate of Amendment,B,1,,1,6717000.0,
"3VR Security, INC",2005-12-22,27_2005-12-22_Certificates of Incorporation,Amended and Restated Articles of Incorporation,A,1,$0.570000,1,9525000.0,$0.570000
"3VR Security, INC",2005-12-22,27_2005-12-22_Certificates of Incorporation,Amended and Restated Articles of Incorporation,B,1,$1.090000,1,9500000.0,$1.090000
"3VR Security, INC",2006-08-30,27_2006-08-30_Certificates of Incorporation,Certificate of Amendment,A,1,$0.570000,1,9525000.0,$0.570000


## Part 1. Classify which text files contain liquidation preference or not

In [11]:
# Load the CSV file
batch1_labeled_path = "/Users/alexchen/Downloads/Projects/vc-research/URAP VC Research - [Readable] Batch 1 Main.csv"
batch1_labeled = pd.read_csv(batch1_labeled_path)

batch1_lp_path = "/Users/alexchen/Downloads/Projects/vc-research/URAP VC Research - Batch 1 Details.csv"
batch1_lp = pd.read_csv(batch1_lp_path)

# Load text files and associate them with labels
txt_folder_path = "/Users/alexchen/Downloads/Projects/vc-research/Batch1_text_readable"
text_data = []
labels = []
document_names = []
pd.reset_option('^display.', silent=True)


In [12]:
# display labels of batch1 text files
batch1_labeled

Unnamed: 0,File Name,Company Name,Document Type,Date,Contains Liquidity Preference,Priority Order,Number of Common Stocks Issued,Number of Preferred Stock Issued,Total Number of Stocks Issued
0,16_2003-07-03_Certificates of Incorporation,"3-D Marketing Technologies, INC",Certificate of Incorporation,2003-07-03,0,,1500,,1500
1,16_2004-01-22_Certificates of Incorporation,"3-D Marketing Technologies, INC",Restated Certificate of Incorporation,2004-01-22,0,,6250000,,6250000
2,16_2004-07-14_Certificates of Incorporation,"3-D Marketing Technologies, INC",Restated Certificate of Incorporation,2004-07-14,1,A,20500000,9500000,30000000
3,16_2005-05-18_Certificates of Incorporation,"3-D Marketing Technologies, INC",Certificate of Amendment,2005-05-18,0,,,,0
4,16_2006-03-09_Certificates of Incorporation,"M-Factor, INC",Restated Certificate of Incorporation,2006-03-09,1,A=B,25000000,14270662,39270662
...,...,...,...,...,...,...,...,...,...
85,92_2004-11-23_Certificates of Incorporation,"Access Closure, INC",Amended and Restated Certificate of Incorporation,2004-11-23,1,A=B=C,19375000,11839309,31214309
86,92_2007-12-20_Certificates of Incorporation,"Access Closure, INC",Amended and Restated Certificate of Incorporation,2007-12-20,1,A=B=C=D=E,42000000,28443627,70443627
87,92_2010-02-23_Certificates of Incorporation,"Access Closure, INC",Amended and Restated Certificate of Incorporation,2010-02-23,1,A=B=C=D=E,49000000,32325882,81325882
88,100_2007-02-22_Certificates of Incorporation,"Acclarent, INC",Amended and Restated Certificate of Incorporation,2007-02-22,1,A=B=C,60000000,38416115,98416115


In [13]:
# display details of batch1 preferred stocks
batch1_lp

Unnamed: 0,File Name,Company Name,Document Type,Date,Preferred Stock Type,Order of Priority,Liquidation Preference,Liquidation Multiple,Number of Preferred Stocks Issued,Original Issue Price
0,16_2004-07-14_Certificates of Incorporation,"3-D Marketing Technologies, INC",Restated Certificate of Incorporation,2004-07-14,A,1,$0.431469,1,9500000,$0.431469
1,16_2006-03-09_Certificates of Incorporation,"M-Factor, INC",Restated Certificate of Incorporation,2006-03-09,A,1,$0.431469,1,9270662,$0.431469
2,16_2006-03-09_Certificates of Incorporation,"M-Factor, INC",Restated Certificate of Incorporation,2006-03-09,B,1,$0.624136,1,5000000,$0.624136
3,16_2007-05-16_Certificates of Incorporation,"M-Factor, INC",Restated Certificate of Incorporation,2007-05-16,A,2,$0.431469,1,9270662,$0.431469
4,16_2007-05-16_Certificates of Incorporation,"M-Factor, INC",Restated Certificate of Incorporation,2007-05-16,B,1,$0.624136,1,3204429,$0.624136
...,...,...,...,...,...,...,...,...,...,...
198,100_2007-02-22_Certificates of Incorporation,"Acclarent, INC",Amended and Restated Certificate of Incorporation,2007-02-22,C,1,$3.218000,1,11000000,
199,100_2008-12-03_Certificates of Incorporation,"Acclarent, INC",Amended and Restated Certificate of Incorporation,2008-12-03,A,2,$1.040000,1,14071484,
200,100_2008-12-03_Certificates of Incorporation,"Acclarent, INC",Amended and Restated Certificate of Incorporation,2008-12-03,B,2,$2.100000,1,13357773,
201,100_2008-12-03_Certificates of Incorporation,"Acclarent, INC",Amended and Restated Certificate of Incorporation,2008-12-03,C,2,$3.218000,1,10875321,


In [14]:
# Read each file and extract data
for _, row in batch1_labeled.iterrows():
    file_name = row['File Name']
    label = row['Contains Liquidity Preference']
    file_path = os.path.join(txt_folder_path, file_name + ".txt")
    
    if os.path.exists(file_path):
        with open(file_path, "r", encoding="utf-8") as file:
            text = file.read()
            text_data.append(text)
            labels.append(label)
            document_names.append(file_name)  # Append the document name
    else:
        print(f"File not found: {file_path}")

In [15]:
# Split the data into training and test sets, while keeping track of the document names
X_train, X_test, y_train, y_test, train_docs, test_docs = train_test_split(
    text_data, labels, document_names, test_size=0.25, random_state=42
)

In [16]:
# Vectorize the text data using TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=1000)  # Adjust the number of features as needed
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [17]:
# Initialize the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight="balanced")
rf_model.fit(X_train_tfidf, y_train)

In [18]:
# Predict the classes for the test set
y_pred = rf_model.predict(X_test_tfidf)
y_pred_prob = rf_model.predict_proba(X_test_tfidf)

In [19]:
# Create a DataFrame with predictions and confidence scores
predictions_df = pd.DataFrame({
    'Document': test_docs,
    'True Classification': y_test,
    'Predicted Classification': y_pred,
    'Probability of Containing Liquidation Preference Information': y_pred_prob[:, 1],
    'Probability of Not Containing Liquidation Preference Information': y_pred_prob[:, 0]
})
predictions_df

Unnamed: 0,Document,True Classification,Predicted Classification,Probability of Containing Liquidation Preference Information,Probability of Not Containing Liquidation Preference Information
0,28_2009-12-17_Certificates of Incorporation,1,1,1.0,0.0
1,24_2014-08-27_Certificates of Incorporation,1,1,1.0,0.0
2,48_2004-10-19_Certificates of Incorporation,1,1,1.0,0.0
3,49_2008-06-12_Certificates of Incorporation,1,1,1.0,0.0
4,16_2003-07-03_Certificates of Incorporation,0,0,0.06,0.94
5,27_2002-09-23_Certificates of Incorporation,0,0,0.1,0.9
6,28_2009-12-07_Certificates of Incorporation,0,0,0.07,0.93
7,48_2014-03-06_Certificates of Incorporation,0,0,0.42,0.58
8,16_2012-12-17_Certificates of Incorporation,0,0,0.14,0.86
9,34_2010-01-28_Certificates of Incorporation,0,0,0.06,0.94


In [20]:
# display summary of classification results
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00        11

    accuracy                           1.00        23
   macro avg       1.00      1.00      1.00        23
weighted avg       1.00      1.00      1.00        23



## Part 2. Remove all text except for areas that contain liquidation preference

In [24]:
# import other packages
from sentence_transformers import SentenceTransformer, util

ModuleNotFoundError: No module named 'sentence_transformers'

In [37]:
semantic_model = SentenceTransformer('all-MiniLM-L6-v2')

# Define example phrases for liquidation preferences for semantic similarity
example_phrases = [
    "$0.624136 per share for each share of the Series B Preferred Stock ",
    "$0.47455 per share for each share of the Series C Preferred Stock",
    "$0.60 per share for the Series A Preferred Stock", 
    "$1.40 per share for the Series B Preferred Stock",
    "$2.00 per share for the Series C Preferred Stock",
    "$3.14 per share of Series F Preferred Stock",
    "$1.00 per share in the case of the Series A Preferred Stock",
    "$1.50 per share in the case of the Series A-1 Preferred Stock",
    "$2.078192 per share in the case of the Series B Preferred Stock",
    "$3.33 per share in the case of the Series B-1 Preferred Stock",
    "$3.371016 per share in the case of the Series C Preferred Stock",
    "$6.56063 per share in the case of the Series D Preferred Stock",
    "1.25 per share (as adjusted for stock splits, stock dividends, reclassification and the like) for each share of Series A Preferred Stock",
    "$1.847 per share (as adjusted for stock splits, stock dividends, reclassification and the like) for each share of Series B Preferred Stock",
    "$2.38 per share (as adjusted for stock splits, stock dividends, reclassification and the like) for each share of Series C Preferred Stock",
    "$3.547 per share (as adjusted for stock splits, stock dividends, reclassification and the like) for each share of Series D Preferred Stock",
    "$5.10 per share (as adjusted for stock splits, stock dividends, reclassification and the like) for each share of Series E Preferred Stock,"
]

# Precompute embeddings for the example phrases
example_embeddings = semantic_model.encode(example_phrases, convert_to_tensor=True)

In [38]:
# Function to extract relevant chunks and compute similarity scores
def extract_relevant_chunks_and_scores(doc_text, example_embeddings, threshold=0.6):
    sentences = doc_text.split(".")  # Split the document into sentences
    relevant_chunks = []
    similarity_scores = []

    for sentence in sentences:
        sentence_embedding = semantic_model.encode(sentence, convert_to_tensor=True)
        cosine_scores = util.pytorch_cos_sim(sentence_embedding, example_embeddings)
        
        # Extract maximum similarity score for this sentence
        max_score = cosine_scores.max().item()
        
        # If the similarity score is above the threshold, consider it relevant
        if max_score > threshold:
            relevant_chunks.append(sentence.strip())
            similarity_scores.append(max_score)

    return relevant_chunks, similarity_scores

In [39]:
# Create a DataFrame to store the results
results = []

# Loop over each document predicted to contain liquidity preference information
for _, row in predictions_df.iterrows():
    if row['Predicted Classification'] == 1:  # Document predicted to contain liquidity preference information
        file_name = row['Document']
        
        # Retrieve the corresponding text from the training set
        file_path = os.path.join(txt_folder_path, file_name + ".txt")
        with open(file_path, "r", encoding="utf-8") as file:
            doc_text = file.read()
        
        # Extract relevant chunks and similarity scores
        relevant_chunks, similarity_scores = extract_relevant_chunks_and_scores(doc_text, example_embeddings)
        
        # Store the results in the DataFrame
        results.append({
            'Document': file_name,
            'Relevant text': relevant_chunks,
            'Similarity Score': similarity_scores
        })

In [40]:
# Convert results to a DataFrame
relevant_df = pd.DataFrame(results)
relevant_df

Unnamed: 0,Document,Relevant text,Similarity Score
0,35_2007-06-20_Certificates of Incorporation,[The total number of shares of all classes of ...,"[0.6020973920822144, 0.6805183291435242, 0.736..."
1,21_2006-04-21_Certificates of Incorporation,[The total number of shares of Common\nStock t...,"[0.6060568690299988, 0.6506026387214661, 0.725..."
2,43_2005-10-31_Certificates of Incorporation,[78204\nshares of Common Stock of the Corporat...,"[0.6401557922363281, 0.7011102437973022, 0.739..."
3,24_2014-08-27_Certificates of Incorporation,[001 par value per share\n(“Common Stock”) and...,"[0.7099118232727051, 0.7106842398643494, 0.739..."
4,16_2006-03-09_Certificates of Incorporation,[The total number of shares of\ncommon stock a...,"[0.6076784729957581, 0.7061084508895874, 0.686..."
5,34_2008-09-29_Certificates of Incorporation,"[001 per share (the\n“Common Stock”), The tota...","[0.6967094540596008, 0.7127175331115723, 0.739..."
6,81_2010-06-10_Certificates of Incorporation,"[001 per share (the “Common Stock”), 26,069,98...","[0.7630836963653564, 0.8245648741722107, 0.743..."
7,28_2009-12-17_Certificates of Incorporation,"[001 par value per share, and 45,864,172 share...","[0.7541942596435547, 0.6897203922271729, 0.933..."
8,81_2007-10-23_Certificates of Incorporation,[FOURTH: The total number of shares of all cla...,"[0.61246657371521, 0.7660892605781555, 0.82706..."
9,92_2007-12-20_Certificates of Incorporation,"[0001 per\nshare, Forty-Two Million (42,000,00...","[0.6430851221084595, 0.6086916923522949, 0.603..."


In [45]:
relevant_df['Relevant text'][0]

['The total number of shares of all classes of capital stock that\nthe Company shall have authority to issue is thirty million eight hundred nineteen thousand two\nhundred fifty-two (30,819,252) of which twenty million (20,000,000) shares, par value of one-\ntenth of one cent ($0',
 '001) per share, shall be Common Stock (the “Common Stock”) and ten\nmillion eight hundred nineteen thousand two hundred fifty-two (10,819,252) shares, par value of\none-tenth of one cent ($0',
 '001) per share, shall be Preferred Stock (the “Preferred Stock”)',
 'The Preferred Stock shall be divided into series',
 'The first series shall be designated\n“Series A Preferred Stock” and shall consist of one million seventeen thousand one hundred\nthirty-seven (1,017,137) shares',
 'The second series shall be designated “Series B Preferred\nStock” and shall consist of four million fifty thousand three hundred seventy-five (4,050,375)\nshares',
 'The third series shall be designated “Series C Preferred Stock” an

In [47]:
relevant_df['Similarity Score'][0]

[0.6020973920822144,
 0.6805183291435242,
 0.7363404631614685,
 0.6869385838508606,
 0.7333046197891235,
 0.7947565317153931,
 0.7641100883483887,
 0.7186769247055054,
 0.6766209602355957,
 0.7433762550354004,
 0.609413743019104,
 0.6905837059020996,
 0.7475104331970215,
 0.609413743019104,
 0.6075901389122009,
 0.6731525659561157,
 0.745444118976593,
 0.609413743019104,
 0.6051792502403259,
 0.6917006373405457,
 0.8156101703643799,
 0.8610067367553711,
 0.8502562642097473,
 0.6878525018692017,
 0.792844831943512,
 0.6154287457466125,
 0.6323366761207581,
 0.6646072864532471,
 0.6592193245887756,
 0.6634230017662048,
 0.7529460787773132,
 0.6234710216522217,
 0.6669771671295166,
 0.6229426264762878,
 0.6049442291259766,
 0.6143702268600464,
 0.6177152991294861,
 0.6322834491729736,
 0.6888585686683655,
 0.6095964908599854,
 0.6153066158294678]

In [49]:
# Get the relevant text and similarity scores for row 0
relevant_text_row_0 = relevant_df['Relevant text'][0]
similarity_scores_row_0 = relevant_df['Similarity Score'][0]

# Find the index of the maximum similarity score
max_score_index = similarity_scores_row_0.index(max(similarity_scores_row_0))

# Get the corresponding text chunk with the maximum similarity score
max_similarity_text = relevant_text_row_0[max_score_index]
max_similarity_score = similarity_scores_row_0[max_score_index]
max_similarity_text

'16 for each outstanding share of Series C Preferred Stock (the “Original Series C\nIssue Price”) and an amount per share equal to $1'

## Part 3. Extract the specific liquidation preference of each document

In [19]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Load the Legal-BERT model for NER
tokenizer = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")
model = AutoModelForTokenClassification.from_pretrained("nlpaueb/legal-bert-base-uncased")
nlp_legal = pipeline("ner", model=model, tokenizer=tokenizer)

# Initialize Sentence-BERT for semantic similarity
semantic_model = SentenceTransformer('all-MiniLM-L6-v2')

# Example liquidation preference phrases for semantic search
example_phrases = [
    "0 20 for each share of Series A Preferred",
    "0 34 for each share of Series B Preferred",
    "Liquidation Preference shall mean 0 40 per share for the Series A Preferred Stock",
    "Series A Preferred Stock by reason of their ownership thereof an amount per share equal to the sum of A 1 00",
    "Series A Original Purchase Price shall be 0 40 per share of Series A Preferred Stock",
    "Series C Original Purchase Price shall be 1 74417185 per share of Series C Preferred Stock"
]

# Precompute embeddings for the example phrases
example_embeddings = semantic_model.encode(example_phrases)

MAX_SEQ_LENGTH = 512

def is_relevant_semantic(chunk, threshold=0.7):
    """
    Identify if the chunk is contextually similar to liquidation preference examples.
    """
    chunk_embedding = semantic_model.encode(chunk)
    similarity_scores = util.cos_sim(chunk_embedding, example_embeddings)
    max_similarity = similarity_scores.max().item()
    return max_similarity >= threshold

def extract_liquidation_preferences(chunk):
    """
    Extracts exact liquidation preferences from a given text chunk.
    Uses both Legal-BERT NER and Sentence-BERT for relevance matching.
    """
    extracted_preferences = []

    # Use Legal-BERT's NER to detect relevant legal entities
    ner_results = nlp_legal(chunk)
    
    # Filter entities that are related to money, stock, or terms relevant to liquidation
    money_entities = [result['word'] for result in ner_results if result['entity'] in ["PER", "ORG", "LOC"]]  # Modify based on the entity list from legal-BERT
    stock_entities = [result['word'] for result in ner_results if result['entity'] in ["ORG", "PRODUCT"]]

    # If money and stock-related entities are detected, append the chunk as a potential liquidation preference
    if money_entities and stock_entities:
        extracted_preferences.append({
            'money': money_entities,
            'stock': stock_entities
        })

    # Use Sentence-BERT for contextual matching with predefined liquidation preference phrases
    if is_relevant_semantic(chunk):
        extracted_preferences.append(chunk)

    return extracted_preferences

# Example processing for document chunks
data = []

# Assuming dataset_df has columns 'Document' and 'Relevant Chunks'
for _, row in dataset_df.iterrows():
    file_name = row['Document']
    relevant_chunks = row['Relevant Chunks']
    
    preferences = []
    for chunk in relevant_chunks:
        preferences.extend(extract_liquidation_preferences(chunk))
    
    # Remove duplicates by checking if the preference already exists
    unique_preferences = []
    for pref in preferences:
        if pref not in unique_preferences:
            unique_preferences.append(pref)
    
    data.append({"Document": file_name, "Liquidation Preferences": unique_preferences})

# Create a DataFrame with the extracted liquidation preferences
preferences_df = pd.DataFrame(data)

# Display the resulting DataFrame
preferences_df


Some weights of BertForTokenClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Unnamed: 0,Document,Liquidation Preferences
0,35_2007-06-20_Certificates of Incorporation,[es shall be designated “Series B Preferred\nS...
1,21_2006-04-21_Certificates of Incorporation,[]
2,43_2005-10-31_Certificates of Incorporation,[of the Corporation into such number of fully ...
3,24_2014-08-27_Certificates of Incorporation,[ made to the holders of Series B Preferred St...
4,16_2006-03-09_Certificates of Incorporation,[]
5,34_2008-09-29_Certificates of Incorporation,"[tock shall mean\n$0.834375 per share, the “Or..."
6,81_2010-06-10_Certificates of Incorporation,[o\nand including the date full payment shall ...
7,28_2009-12-17_Certificates of Incorporation,"[3,601 shares of Common Stock, $0.001 par valu..."
8,81_2007-10-23_Certificates of Incorporation,[had such share been converted into Common\nSt...


In [29]:
preferences_df['Liquidation Preferences'][3]

[' made to the holders of Series B Preferred Stock, Series\nA Preferred Stock or Common Stock, an amount per share of Series C Preferred Stock held by\nsuch holder equal to the Series C Original Purchase Price (which amount shall be subject to\nequitable adjustment whenever there shall occur a stock dividend, stock split, combination of\nshares, reclassification or other similar event with respect to the Series C Preferred Stock) plus\nan amount equal to all accrued and/or declared and unpaid dividends on the Seri',
 'if the amount a holder of Preferred Stock would receive\nwith respect to such shares would be greater if such shares were converted to Common Stock\nimmediately prior to such liquidation, dissolution, winding up or Deemed Liquidation Event, the\nholder of such shares will be paid that higher amount in lieu of payments called for by\nsubsections 2(a), 2(b) and 2(c).\n\n(iii) |The amount in the aggregate that the holders of Series D Preferred\nStock are entitled to receive 

- amendments: supplemental to original charter
- restatements: new version of charter with the amendment in it
- consider organization of data now because difference between amendments and restatements
- reorganize data now to consider and adjust for unique commpanies/amendments and restatements

Approach: 
1. Scrape all company names, title of certificate, and date
2. For each company, if the latest title of certificate is not "Certificate of Incorporation", "Restated Certificate of Incorporation", or "Amended and Restated Certificate of Incorporation", then find the latest certificate that is and save both the latest file and the closes one that meets previous criteria; else save the latest certificate
3. Find the liquidation preferences for each document pertaining to each company based on most recent COI and supplemental documents if necessary

Questions:
- What is the ultimate metric we want to capture? (liquidation preference for each unique company?)
- How and where are we storing all this data? 