# liquidity model preference

## Part 1. Classify which text files contain liquidation preference or not

In [1]:
# Import necessary libraries
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Ensure you have the NLTK stopwords downloaded
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/alexchen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Load the CSV file
csv_path = "/Users/alexchen/Downloads/Projects/vc-research/URAP VC Research - [Readable] Batch 1 Main.csv"
df = pd.read_csv(csv_path)

# Load text files and associate them with labels
txt_folder_path = "/Users/alexchen/Downloads/Projects/vc-research/Batch1_text_readable"
text_data = []
labels = []
document_names = []

In [3]:
df

Unnamed: 0,Document,Contains Liquidity Preference,Priority Order,Number of Common Stocks Issued,Number of Preferred Stock Issued,Total Number of Stocks Issued,Common Stock Par Value Per Share,Preferred Stock Par Value Per Share
0,16_2003-07-03_Certificates of Incorporation,0,,1500,,1500,$0.0000,
1,16_2004-01-22_Certificates of Incorporation,0,,6250000,,6250000,$0.0001,
2,16_2004-07-14_Certificates of Incorporation,1,A,20500000,9500000,30000000,$0.0001,$0.0001
3,16_2005-05-18_Certificates of Incorporation,0,,,,0,,
4,16_2006-03-09_Certificates of Incorporation,1,A=B,25000000,14270662,39270662,$0.0001,$0.0001
...,...,...,...,...,...,...,...,...
84,92_2004-11-23_Certificates of Incorporation,1,A=B=C,19375000,11839309,31214309,$0.0001,$0.0001
85,92_2007-12-20_Certificates of Incorporation,1,A=B=C=D=E,42000000,28443627,70443627,$0.0001,$0.0001
86,92_2010-02-23_Certificates of Incorporation,1,A=B=C=D=E,49000000,32325882,81325882,$0.0001,$0.0001
87,100_2007-02-22_Certificates of Incorporation,1,A=B=C,60000000,38416115,98416115,$0.0010,$0.0010


In [4]:
# Read each file and extract data
for _, row in df.iterrows():
    file_name = row['Document']
    label = row['Contains Liquidity Preference']
    file_path = os.path.join(txt_folder_path, file_name + ".txt")
    
    if os.path.exists(file_path):
        with open(file_path, "r", encoding="utf-8") as file:
            text = file.read()
            text_data.append(text)
            labels.append(label)
            document_names.append(file_name)  # Append the document name
    else:
        print(f"File not found: {file_path}")

In [5]:
# Split the data into training and test sets, while keeping track of the document names
X_train, X_test, y_train, y_test, train_docs, test_docs = train_test_split(
    text_data, labels, document_names, test_size=0.2, random_state=42
)

In [6]:
# Vectorize the text data using TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=1000)  # Adjust the number of features as needed
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [7]:
# Initialize the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight="balanced")
rf_model.fit(X_train_tfidf, y_train)

In [8]:
# Predict the classes for the test set
y_pred = rf_model.predict(X_test_tfidf)
y_pred_prob = rf_model.predict_proba(X_test_tfidf)

In [9]:
# Create a DataFrame with predictions and confidence scores
predictions_df = pd.DataFrame({
    'Document': test_docs,
    'True Classification': y_test,
    'Predicted Classification': y_pred,
    'Probability of Containing Liquidation Preference Information': y_pred_prob[:, 1],
    'Probability of Not Containing Liquidation Preference Information': y_pred_prob[:, 0]
})
predictions_df

Unnamed: 0,Document,True Classification,Predicted Classification,Probability of Containing Liquidation Preference Information,Probability of Not Containing Liquidation Preference Information
0,35_2007-06-20_Certificates of Incorporation,1,1,1.0,0.0
1,48_2004-10-08_Certificates of Incorporation,0,0,0.0,1.0
2,27_2006-08-23_Certificates of Incorporation,0,0,0.02,0.98
3,21_2006-04-21_Certificates of Incorporation,1,1,0.94,0.06
4,43_2005-10-31_Certificates of Incorporation,1,1,0.96,0.04
5,16_2003-07-03_Certificates of Incorporation,0,0,0.13,0.87
6,48_2014-03-06_Certificates of Incorporation,0,0,0.35,0.65
7,24_2009-06-12_Certificates of Incorporation,0,0,0.0,1.0
8,16_2012-12-17_Certificates of Incorporation,0,0,0.04,0.96
9,24_2014-08-27_Certificates of Incorporation,1,1,1.0,0.0


In [10]:
# display summary of classification results
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         9
           1       1.00      1.00      1.00         9

    accuracy                           1.00        18
   macro avg       1.00      1.00      1.00        18
weighted avg       1.00      1.00      1.00        18



## Part 2. Remove all text except for areas that contain liquidation preference

In [11]:
# import other packages
import spacy
from sentence_transformers import SentenceTransformer, util

In [12]:
# Load spaCy's NER model and Sentence-BERT model
nlp = spacy.load("en_core_web_sm")
semantic_model = SentenceTransformer('all-MiniLM-L6-v2')

# Define example phrases for liquidation preferences for semantic similarity
example_phrases = [
    "$0.624136 per share for each share of the Series B Preferred Stock ",
    "$0.47455 per share for each share of the Series C Preferred Stock",
    "$0.60 per share for the Series A Preferred Stock", 
    "$1.40 per share for the Series B Preferred Stock",
    "$2.00 per share for the Series C Preferred Stock",
    "$3.14 per share of Series F Preferred Stock",
    "$1.00 per share in the case of the Series A Preferred Stock",
    "$1.50 per share in the case of the Series A-1 Preferred Stock",
    "$2.078192 per share in the case of the Series B Preferred Stock",
    "$3.33 per share in the case of the Series B-1 Preferred Stock",
    "$3.371016 per share in the case of the Series C Preferred Stock",
    "$6.56063 per share in the case of the Series D Preferred Stock",
    "1.25 per share (as adjusted for stock splits, stock dividends, reclassification and the like) for each share of Series A Preferred Stock",
    "$1.847 per share (as adjusted for stock splits, stock dividends, reclassification and the like) for each share of Series B Preferred Stock",
    "$2.38 per share (as adjusted for stock splits, stock dividends, reclassification and the like) for each share of Series C Preferred Stock",
    "$3.547 per share (as adjusted for stock splits, stock dividends, reclassification and the like) for each share of Series D Preferred Stock",
    "$5.10 per share (as adjusted for stock splits, stock dividends, reclassification and the like) for each share of Series E Preferred Stock,"
]

# Precompute embeddings for the example phrases
example_embeddings = semantic_model.encode(example_phrases)

MAX_SEQ_LENGTH = 512


In [13]:
# Function to check relevance using NER
def is_relevant_ner(chunk):
    """
    Identify if the chunk contains relevant entities using NER.
    """
    doc = nlp(chunk)
    for ent in doc.ents:
        if ent.label_ in ["MONEY", "LAW", "ORDINAL"]:  # Focus on monetary/legal terms
            return True
    return False

In [14]:
# Function to check relevance using Semantic Search
def is_relevant_semantic(chunk, threshold=0.7):
    """
    Identify if the chunk is contextually similar to liquidation preference examples.
    """
    chunk_embedding = semantic_model.encode(chunk)
    similarity_scores = util.cos_sim(chunk_embedding, example_embeddings)
    max_similarity = similarity_scores.max().item()
    return max_similarity >= threshold

In [15]:
# Function to filter chunks based on combined NER and semantic search
def filter_relevant_chunks(chunks):
    """
    Filter chunks using both NER and semantic similarity.
    """
    relevant_chunks = []
    for chunk in chunks:
        if is_relevant_ner(chunk) or is_relevant_semantic(chunk):
            relevant_chunks.append(chunk)
    return relevant_chunks

In [16]:
# Prepare the final dataset with relevant chunks stored as a list
data = []

# Process only documents predicted to contain liquidation preferences (Predicted Classification = 1)
for _, row in predictions_df.iterrows():
    file_name = row['Document']
    label = row['Predicted Classification']

    # Only process documents where the predicted label is 1
    if label == 1:
        file_path = os.path.join(txt_folder_path, file_name + ".txt")
        
        if os.path.exists(file_path):
            with open(file_path, "r", encoding="utf-8") as file:
                text = file.read()
                
                # Split the text into chunks
                chunks = []
                for i in range(0, len(text), MAX_SEQ_LENGTH):
                    chunks.append(text[i:i + MAX_SEQ_LENGTH])
                
                # Filter the chunks to retain only relevant ones
                relevant_chunks = filter_relevant_chunks(chunks)
                
                # Add a single entry for the document with relevant chunks as a list
                data.append({'Document': file_name, 'Relevant Chunks': relevant_chunks})


In [17]:
# Create a DataFrame from the filtered chunks
dataset_df = pd.DataFrame(data)
dataset_df

Unnamed: 0,Document,Relevant Chunks
0,35_2007-06-20_Certificates of Incorporation,[ns of the Certificate of Incorporation of the...
1,21_2006-04-21_Certificates of Incorporation,"[ Board of Directors of this Corporation, file..."
2,43_2005-10-31_Certificates of Incorporation,"[ust 20, 2004.\n\nARTICLE ]\n\nThe name of the..."
3,24_2014-08-27_Certificates of Incorporation,"[ed on May 5, 2011, and as further amended on\..."
4,16_2006-03-09_Certificates of Incorporation,[cers of this corporation to solicit the conse...
5,34_2008-09-29_Certificates of Incorporation,[oration to solicit the consent of the\nstockh...
6,81_2010-06-10_Certificates of Incorporation,[ich the Corporation has\nauthority to issuc i...
7,28_2009-12-17_Certificates of Incorporation,[ engage in any lawful act or activity for whi...
8,81_2007-10-23_Certificates of Incorporation,[nature of the business or purposes to be cond...


In [18]:
dataset_df['Relevant Chunks'][7]

[' engage in any lawful act or activity for which corporations may\nbe organized under the Delaware General Corporation Law (*“DGCL”).\n\nARTICLE Ill\n\nThe address of the Corporation’s reyistered office in the State of Delaware is 1209 Orange Street,\nCity of Wilmington, County of New Castle, 19801. The name of the registered ayent at such address is The\nCorporation Trust Company.\n\nARTICLE IV\n\nThe total number of shares of stock that the Corporation shall have authority to issue is 127,447,773,\nconsisting of 81,58',
 '3,601 shares of Common Stock, $0.001 par value per share, and 45,864,172 shares of\nPreferred Stock, $0.001 par value per share, 9,819,635 of which shall be designated “Series A Preferred\nStock”, 13,132,438 of which shal! be designated “Series B Preferred Stock” and 22,912,099 of which shall\nbe designated “Series C Preferred Stock”. Fractional shares of Common Stock heretofore issued shall be\nrounded upwards to the nearest whole nuinber and no further fractional

## Part 3. Extract the specific liquidation preference of each document

In [19]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Load the Legal-BERT model for NER
tokenizer = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")
model = AutoModelForTokenClassification.from_pretrained("nlpaueb/legal-bert-base-uncased")
nlp_legal = pipeline("ner", model=model, tokenizer=tokenizer)

# Initialize Sentence-BERT for semantic similarity
semantic_model = SentenceTransformer('all-MiniLM-L6-v2')

# Example liquidation preference phrases for semantic search
example_phrases = [
    "0 20 for each share of Series A Preferred",
    "0 34 for each share of Series B Preferred",
    "Liquidation Preference shall mean 0 40 per share for the Series A Preferred Stock",
    "Series A Preferred Stock by reason of their ownership thereof an amount per share equal to the sum of A 1 00",
    "Series A Original Purchase Price shall be 0 40 per share of Series A Preferred Stock",
    "Series C Original Purchase Price shall be 1 74417185 per share of Series C Preferred Stock"
]

# Precompute embeddings for the example phrases
example_embeddings = semantic_model.encode(example_phrases)

MAX_SEQ_LENGTH = 512

def is_relevant_semantic(chunk, threshold=0.7):
    """
    Identify if the chunk is contextually similar to liquidation preference examples.
    """
    chunk_embedding = semantic_model.encode(chunk)
    similarity_scores = util.cos_sim(chunk_embedding, example_embeddings)
    max_similarity = similarity_scores.max().item()
    return max_similarity >= threshold

def extract_liquidation_preferences(chunk):
    """
    Extracts exact liquidation preferences from a given text chunk.
    Uses both Legal-BERT NER and Sentence-BERT for relevance matching.
    """
    extracted_preferences = []

    # Use Legal-BERT's NER to detect relevant legal entities
    ner_results = nlp_legal(chunk)
    
    # Filter entities that are related to money, stock, or terms relevant to liquidation
    money_entities = [result['word'] for result in ner_results if result['entity'] in ["PER", "ORG", "LOC"]]  # Modify based on the entity list from legal-BERT
    stock_entities = [result['word'] for result in ner_results if result['entity'] in ["ORG", "PRODUCT"]]

    # If money and stock-related entities are detected, append the chunk as a potential liquidation preference
    if money_entities and stock_entities:
        extracted_preferences.append({
            'money': money_entities,
            'stock': stock_entities
        })

    # Use Sentence-BERT for contextual matching with predefined liquidation preference phrases
    if is_relevant_semantic(chunk):
        extracted_preferences.append(chunk)

    return extracted_preferences

# Example processing for document chunks
data = []

# Assuming dataset_df has columns 'Document' and 'Relevant Chunks'
for _, row in dataset_df.iterrows():
    file_name = row['Document']
    relevant_chunks = row['Relevant Chunks']
    
    preferences = []
    for chunk in relevant_chunks:
        preferences.extend(extract_liquidation_preferences(chunk))
    
    # Remove duplicates by checking if the preference already exists
    unique_preferences = []
    for pref in preferences:
        if pref not in unique_preferences:
            unique_preferences.append(pref)
    
    data.append({"Document": file_name, "Liquidation Preferences": unique_preferences})

# Create a DataFrame with the extracted liquidation preferences
preferences_df = pd.DataFrame(data)

# Display the resulting DataFrame
preferences_df


Some weights of BertForTokenClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Unnamed: 0,Document,Liquidation Preferences
0,35_2007-06-20_Certificates of Incorporation,[es shall be designated “Series B Preferred\nS...
1,21_2006-04-21_Certificates of Incorporation,[]
2,43_2005-10-31_Certificates of Incorporation,[of the Corporation into such number of fully ...
3,24_2014-08-27_Certificates of Incorporation,[ made to the holders of Series B Preferred St...
4,16_2006-03-09_Certificates of Incorporation,[]
5,34_2008-09-29_Certificates of Incorporation,"[tock shall mean\n$0.834375 per share, the “Or..."
6,81_2010-06-10_Certificates of Incorporation,[o\nand including the date full payment shall ...
7,28_2009-12-17_Certificates of Incorporation,"[3,601 shares of Common Stock, $0.001 par valu..."
8,81_2007-10-23_Certificates of Incorporation,[had such share been converted into Common\nSt...


In [29]:
preferences_df['Liquidation Preferences'][3]

[' made to the holders of Series B Preferred Stock, Series\nA Preferred Stock or Common Stock, an amount per share of Series C Preferred Stock held by\nsuch holder equal to the Series C Original Purchase Price (which amount shall be subject to\nequitable adjustment whenever there shall occur a stock dividend, stock split, combination of\nshares, reclassification or other similar event with respect to the Series C Preferred Stock) plus\nan amount equal to all accrued and/or declared and unpaid dividends on the Seri',
 'if the amount a holder of Preferred Stock would receive\nwith respect to such shares would be greater if such shares were converted to Common Stock\nimmediately prior to such liquidation, dissolution, winding up or Deemed Liquidation Event, the\nholder of such shares will be paid that higher amount in lieu of payments called for by\nsubsections 2(a), 2(b) and 2(c).\n\n(iii) |The amount in the aggregate that the holders of Series D Preferred\nStock are entitled to receive 

- process pdf conversion on all batches
- get larger training set
- create confidence on classifying if document has liquidation preference or not
- excel spreadsheet with all relevant text relating to liquidation preference, priority of liquidation preference, each class of stock, and original issue price
- check if chat gpt can identify from spreadsheet