# liquidity model preference

## Organize CSVs into time-series dataframe

In [1]:
# Imports
import os
import re
import logging
import numpy as np
import pandas as pd
from collections import defaultdict
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer, InputExample, losses, util
from torch.utils.data import DataLoader
import nltk
import torch



In [2]:
# Setup
nltk.download('punkt')
nltk.download('stopwords')
logging.basicConfig(level=logging.WARNING)

[nltk_data] Downloading package punkt to /Users/alexchen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/alexchen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# --- Paths ---
batch1_labeled_path = "/Users/alexchen/Downloads/Projects/vc-research/URAP VC Research - [Readable] Batch 1 Main.csv"
batch1_lp_path = "/Users/alexchen/Downloads/Projects/vc-research/URAP VC Research - Batch 1 Details.csv"
txt_folder_path = "/Users/alexchen/Downloads/Projects/vc-research/Batch1_text_readable"

In [4]:
# --- Load data ---
batch1_labeled = pd.read_csv(batch1_labeled_path)
batch1_lp = pd.read_csv(batch1_lp_path)

## Part 1. Classify which text files contain liquidation preference or not

In [5]:
# --- Preprocessing ---
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\n+', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def prepare_multiindex(df, date_col="Date", group_cols=["Company Name"]):
    df[date_col] = pd.to_datetime(df[   date_col])
    df = df.set_index(group_cols + [date_col]).sort_index()
    df = df.groupby(level=0, sort=False).apply(lambda x: x.sort_index(level=1))
    df.index = df.index.droplevel(0)
    return df

batch1_labeled_multiindex = prepare_multiindex(batch1_labeled)
batch1_lp_multiindex = prepare_multiindex(batch1_lp)

In [6]:
# --- Load and preprocess text data ---
text_data, labels, document_names = [], [], []
for _, row in batch1_labeled.iterrows():
    file_name = row['File Name']
    label = row['Contains Liquidity Preference']
    file_path = os.path.join(txt_folder_path, file_name + ".txt")
    if os.path.exists(file_path):
        with open(file_path, "r", encoding="utf-8") as file:
            text = clean_text(file.read())
            text_data.append(text)
            labels.append(label)
            document_names.append(file_name)
    else:
        logging.warning(f"File not found: {file_path}")

In [7]:
# --- Split and vectorize text ---
X_train, X_test, y_train, y_test, train_docs, test_docs = train_test_split(
    text_data, labels, document_names, test_size=0.25, stratify=labels, random_state=42
)

vectorizer = TfidfVectorizer(ngram_range=(1,3), max_features=2000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

bert_model = SentenceTransformer('all-MiniLM-L6-v2')
X_train_bert = [bert_model.encode(sent_tokenize(doc)).mean(axis=0) for doc in X_train]
X_test_bert = [bert_model.encode(sent_tokenize(doc)).mean(axis=0) for doc in X_test]

X_train_combined = np.hstack([X_train_tfidf.toarray(), np.array(X_train_bert)])
X_test_combined = np.hstack([X_test_tfidf.toarray(), np.array(X_test_bert)])

In [8]:
# --- Train RandomForest ---
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

rf_model = GridSearchCV(RandomForestClassifier(random_state=42, class_weight="balanced"), param_grid, cv=5, n_jobs=-1)
rf_model.fit(X_train_combined, y_train)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [9]:
# --- Evaluate Model ---
y_pred = rf_model.best_estimator_.predict(X_test_combined)
y_pred_prob = rf_model.best_estimator_.predict_proba(X_test_combined)

predictions_df = pd.DataFrame({
    'Document': test_docs,
    'True Classification': y_test,
    'Predicted Classification': y_pred,
    'Confidence Score': y_pred_prob[:, 1]
})

In [10]:
predictions_df

Unnamed: 0,Document,True Classification,Predicted Classification,Confidence Score
0,48_2013-12-06_Certificates of Incorporation,1,1,0.995
1,27_2004-08-17_Certificates of Incorporation,0,0,0.0
2,27_2006-08-30_Certificates of Incorporation,1,1,0.985
3,24_2004-12-01_Certificates of Incorporation,0,0,0.075
4,16_2015-04-22_Certificates of Incorporation,0,0,0.015
5,16_2007-05-16_Certificates of Incorporation,1,1,1.0
6,81_2010-03-17_Certificates of Incorporation,0,0,0.015
7,81_2010-06-10_Certificates of Incorporation,1,1,0.995
8,35_2018-02-23_Certificates of Incorporation,1,1,1.0
9,16_2009-01-20_Certificates of Incorporation,1,1,1.0


## Label and extract relevant sentences

In [11]:
# Load base BERT model
bert_model = SentenceTransformer('all-MiniLM-L6-v2')

# --- Define Property Tags and Heuristics ---
PROPERTY_TAGS = ['Company Name', 'Date', 'Document Type', 'Preferred Stocks', 'Priority Order', 'Liquidation Value']
KEYWORDS = {
    'Company Name': ["certificate of incorporation", "incorporated", "corporation", "company name"],
    'Date': ["filed", "effective date", r"\d{2}/\d{2}/\d{4}"],
    'Document Type': ["certificate of amendment", "articles of incorporation", "amended and restated"],
    'Preferred Stocks': ["preferred stock", "series a", "series b"],
    'Priority Order': ["prior and in preference", "ranking junior", "paid before"],
    'Liquidation Value': ["liquidation preference", "liquidation value", "entitled to", "distribution", r"\$[0-9]+\.?[0-9]*"]
}

In [12]:
# --- Heuristic Labeling Function ---
def label_sentences_heuristically(folder_path):
    data = []
    for filename in os.listdir(folder_path):
        if filename.endswith(".txt"):
            with open(os.path.join(folder_path, filename), 'r', encoding='utf-8', errors='ignore') as f:
                text = f.read().replace("\n", " ")
                sentences = sent_tokenize(text)
                for sentence in sentences:
                    tags = []
                    for tag, keywords in KEYWORDS.items():
                        if any(re.search(kw, sentence, re.IGNORECASE) for kw in keywords):
                            tags.append(tag)
                    if tags:
                        data.append({"Filename": filename, "Sentence": sentence, "Labels": ", ".join(tags)})
    return pd.DataFrame(data)

In [13]:
# --- Build Training Examples ---
def build_training_examples(labeled_df):
    examples = []
    for _, row in labeled_df.iterrows():
        for tag in PROPERTY_TAGS:
            sentence = str(row['Sentence'])
            label = 1 if tag in row['Labels'] else 0
            examples.append(InputExample(texts=[sentence, tag], label=float(label)))
    return examples

In [14]:
# --- Fine-Tune the Model ---
def fine_tune_bert_model(bert_model, training_examples):
    train_dataloader = DataLoader(training_examples, shuffle=True, batch_size=32)  # larger batch size = fewer steps
    train_loss = losses.CosineSimilarityLoss(model=bert_model)
    bert_model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        epochs=1,                    # just 1 epoch to reduce training time
        warmup_steps=5,              # smaller warmup
        show_progress_bar=True       # feedback for long runs
    )
    return bert_model

In [15]:
# --- Classify Sentences by Tag ---
def classify_sentences(sentences, model, threshold=0.5):
    results = defaultdict(list)
    for sentence in sentences:
        for tag in PROPERTY_TAGS:
            score = util.cos_sim(model.encode(sentence), model.encode(tag))[0][0].item()
            if score >= threshold:
                results[tag].append((sentence, score))
    return results

In [16]:
# --- Extract Tagged Sentences from a Document ---
def extract_relevant_sentences_from_document(text, model, threshold=0.5):
    sentences = sent_tokenize(text.replace('\n', ' '))
    return classify_sentences(sentences, model, threshold)

In [17]:
# --- Process a Directory of Text Files ---
def process_directory_with_model(folder_path, model, threshold=0.5):
    records = []
    for filename in os.listdir(folder_path):
        if filename.endswith(".txt"):
            with open(os.path.join(folder_path, filename), 'r', encoding='utf-8', errors='ignore') as f:
                text = f.read()
                results = extract_relevant_sentences_from_document(text, model, threshold)
                record = {"Filename": filename}
                for tag in PROPERTY_TAGS:
                    sentences = results.get(tag, [])
                    record[tag] = "; ".join([s for s, _ in sentences])
                records.append(record)
    return pd.DataFrame(records)

In [18]:
# --- Heuristically Label, Fine-Tune, and Apply ---
folder = "/Users/alexchen/Downloads/Projects/vc-research/Batch1_text_readable"
labeled_data = label_sentences_heuristically(folder)
examples = build_training_examples(labeled_data)
fine_tuned_model = fine_tune_bert_model(bert_model, examples)
extracted_df = process_directory_with_model(folder, fine_tuned_model)
extracted_df

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0569
1000,0.0329


Unnamed: 0,Filename,Company Name,Date,Document Type,Preferred Stocks,Priority Order,Liquidation Value
0,45_2008-01-17_Certificates of Incorporation.txt,State of Delaware Secre of State Division of C...,State of Delaware Secre of State Division of C...,FILED 01:42 PM 01/17/2008 AMENDED AND RESTATED...,The Company is authorized to issue two classes...,,The Preferred Stock shall have a par value of ...
1,16_2015-04-22_Certificates of Incorporation.txt,State of Delaware Secreta pe arate = Division ...,Delivered 05:19 PM 04/22/2015 FILED 05:05 PM 0...,,,,
2,28_2009-12-17_Certificates of Incorporation.txt,State of Delaware Secre of State Division of C...,State of Delaware Secre of State Division of C...,State of Delaware Secre of State Division of C...,ARTICLE IV The total number of shares of stoc...,,ARTICLE IV The total number of shares of stoc...
3,34_2010-01-28_Certificates of Incorporation.txt,of State Division of Corporations Delivered 02...,of State Division of Corporations Delivered 02...,"Article TV, Paragraph A. of the Certificate of...",Authorization of Stock.; This corporation is a...,,Authorization of Stock.; The total number of s...
4,27_2006-08-23_Certificates of Incorporation.txt,A0b493b7 10cs | 1Ho78 cake A | 1 Office of the...,"Dated: August 23, 2006",A0b493b7 10cs | 1Ho78 cake A | 1 Office of the...,Authorized Shares.; The Company is authorized ...,,Authorized Shares.; The total number of shares...
...,...,...,...,...,...,...,...
85,28_2007-06-15_Certificates of Incorporation.txt,State of Delaware Secre of State Division of C...,State of Delaware Secre of State Division of C...,State of Delaware Secre of State Division of C...,"and 33,952,073 shares of Preferred Stock, $0.0...",,ARTICLE IV The total number of shares of stoc...
86,49_2007-01-23_Certificates of Incorporation.txt,State of Delaware Secretary of State Division ...,State of Delaware Secretary of State Division ...,State of Delaware Secretary of State Division ...,FOURTH: The total number of shares of all clas...,,FOURTH: The total number of shares of all clas...
87,16_2006-03-09_Certificates of Incorporation.txt,State of Delaware Secretary of State Division ...,State of Delaware Secretary of State Division ...,State of Delaware Secretary of State Division ...,"ARTICLE IV A, Authorization of Stock.; This co...",,"ARTICLE IV A, Authorization of Stock.; The tot..."
88,16_2003-07-03_Certificates of Incorporation.txt,State of Delaware Secretary of State Division ...,State of Delaware Secretary of State Division ...,,,,


In [19]:
extracted_df['Liquidation Value'][54]

'FOURTH: The total number of shares of all classes of stock which the Corporation has authority to issue is ] 78,090,388 shares, consisting of 104,013,161 shares of Common Stock, par value $.001 per share (the “Common Stock”), 26,069,980 shares of Series A Convertible Preferred Stock, par value $.001 per share (the “Series A Preferred Stock”), 16,944,378 shares of Series B Convertible Preferred Stock, par value $.001] per share (the “Series B Preferred Stock”’), 11,923,077 shares of Series C Convertible Preferred Stock, par value $.001 per share (the “Series C Stock”), 2,014,652 shares of Series C-1 Convertible Preferred Stock, par value $.001 per share  28664817_9 (the “Series C-} Stock” and, together with the Series C Stock, the “Series C Preferred Stock”), 955,414 shares of Series D Convertible Preferred Stock, par value $.001 per share (the “Series D Stock”), 2,802,548 shares of Series D-1 Convertible Preferred Stock, par value $.001 per share (the “Series D-1 Stock” and, together 

In [20]:
extracted_df.to_csv('Extracted Sentences - Batch 1.csv', index=False)

ChatGPT Prompt:

Based on the strings in each of the cells, isolate just the desired information as describe below: 
File Name: Do not modify values in this column
Company Name: Identify and extract the company's name as a string type (Example: "3VR Security INC.", "The 41st Parameter INC.", etc.)
Date: Identify and extract the date when the article was filed as a datetime type (Example: "FILED 10:43 AM 06/28/2007", "FILED 05:05 PM 06/10/2010", etc.)
Document Type: Identify and extract the type of document that was submitted as a string type (Example: "Certificate of Incorporation", "Amended and Restated Certificate of Incorporation", etc.) 
Preferred Stock: Identify and extract the unique types of preferred shares as a list of strings (Example: ['Series A', 'Series B', 'Series C', 'Series D'])
Liquidation Value: Identify and extract the dollar liquidation amount for each preferred stock as a list of floats; the length of the list should be the same length as the list for preferred stocks; if the liquidation preference is the original issue price use that value (Example: [0.431469, 0.624136, 0.474550, 0.152430])

Return the result after this extraction in the form of a dataframe and then export as a CSV

## Extract desired information from labeled sentences

In [None]:
# # Load base BERT model
# bert_model = SentenceTransformer('all-MiniLM-L6-v2')

# # --- Define Property Tags ---
# PROPERTY_TAGS = ['Company Name', 'Date', 'Document Type', 'Preferred Stocks', 'Priority Order', 'Liquidation Value']

# # --- Build Training Examples ---
# def build_training_examples(labeled_df):
#     examples = []
#     for _, row in labeled_df.iterrows():
#         for tag in PROPERTY_TAGS:
#             sentence = str(row['Sentence'])
#             label = 1 if tag in row['Labels'] else 0
#             examples.append(InputExample(texts=[sentence, tag], label=float(label)))
#     return examples

# # --- Fine-Tune the Model ---
# def fine_tune_bert_model(bert_model, training_examples):
#     train_dataloader = DataLoader(training_examples, shuffle=True, batch_size=16)
#     train_loss = losses.CosineSimilarityLoss(model=bert_model)
#     bert_model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=2, warmup_steps=10)
#     return bert_model

# # --- Classify Sentences by Tag ---
# def classify_sentences(sentences, model, threshold=0.5):
#     results = defaultdict(list)
#     for sentence in sentences:
#         for tag in PROPERTY_TAGS:
#             score = util.cos_sim(model.encode(sentence), model.encode(tag))[0][0].item()
#             if score >= threshold:
#                 results[tag].append((sentence, score))
#     return results

# # --- Extract Tagged Sentences from a Document ---
# def extract_relevant_sentences_from_document(text, model, threshold=0.5):
#     sentences = sent_tokenize(text.replace('\n', ' '))
#     return classify_sentences(sentences, model, threshold)

# # --- Process a Directory of Text Files ---
# def process_directory_with_model(folder_path, model, threshold=0.5):
#     records = []
#     for filename in os.listdir(folder_path):
#         if filename.endswith(".txt"):
#             with open(os.path.join(folder_path, filename), 'r', encoding='utf-8', errors='ignore') as f:
#                 text = f.read()
#                 results = extract_relevant_sentences_from_document(text, model, threshold)
#                 record = {"Filename": filename}
#                 for tag in PROPERTY_TAGS:
#                     sentences = results.get(tag, [])
#                     record[tag] = "; ".join([s for s, _ in sentences])
#                 records.append(record)
#     return pd.DataFrame(records)

# # --- Fine-Tune and Apply the Model ---
# labeled_data = pd.read_csv("/Users/alexchen/Downloads/Projects/vc-research/labeled_sentences.csv")
# examples = build_training_examples(labeled_data)
# fine_tuned_model = fine_tune_bert_model(bert_model, examples)
# folder = "/Users/alexchen/Downloads/Projects/vc-research/Batch1_text_readable"
# extracted_df = process_directory_with_model(folder, fine_tuned_model)
# print(extracted_df)

focus on extracting just relevant text right now, not structured in a dataframe
move on to other provisions and getting the relevant text areas
