In [1]:
# Section 1: Imports and Configuration
import re
from pathlib import Path
import pandas as pd
import torch
from torch.utils.data import Dataset as TorchDataset
from sklearn.model_selection import train_test_split
from collections import defaultdict

from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
    Trainer,
    TrainingArguments,
    pipeline
)



In [2]:
# Paths and constants
MAIN_CSV   = Path("/Users/alexchen/Downloads/Projects/vc-research/VC Research (Batch 2) - Batch 2 Main.csv")
KEY_CSV    = Path("/Users/alexchen/Downloads/Projects/vc-research/VC Research (Batch 2) - Key for Data.csv")
TXT_DIR    = Path("/Users/alexchen/Downloads/Projects/vc-research/Batch2_text_readable")
MODEL_NAME = "allenai/scibert_scivocab_uncased"

In [3]:
# Section 2: Data Loading
df_main = pd.read_csv(MAIN_CSV)
df_main.columns = df_main.columns.str.strip()

def find_filename_column(cols):
    for candidate in ['File Name']:
        if candidate in cols:
            return candidate
    for col in cols:
        if 'file' in col.lower():
            return col
    if 'Unnamed: 0' in cols:
        return 'Unnamed: 0'
    raise KeyError(f"No filename column found. Columns: {cols}")

file_col = find_filename_column(df_main.columns)
df_main['fname'] = (
    df_main[file_col].astype(str).str.strip().apply(lambda x: Path(x).stem + '.txt')
)

txt_files = list(TXT_DIR.glob("*.txt"))
texts = {p.name: p.read_text() for p in txt_files}
df = df_main[df_main['fname'].isin(texts)]

In [4]:
# Section 3: Generate Character Spans
def find_span(text: str, value: str):
    idx = text.find(value)
    return (idx, idx + len(value)) if idx >= 0 else None

examples = []
fields = ['Company Name', 'Date', 'Document Type',
          'Preferred Stocks', 'Priority Order', 'Liquidation Value']
for _, row in df.iterrows():
    doc = texts[row['fname']]
    starts, ends, labels = [], [], []
    for field in fields:
        val = row.get(field)
        if pd.isna(val):
            continue
        tokens = [v.strip() for v in str(val).split(',')] if ',' in str(val) else [str(val).strip()]
        for tok in tokens:
            span = find_span(doc, tok)
            if span:
                s, e = span
                starts.append(s); ends.append(e)
                labels.append(field.replace(' ', '_'))
    if starts:
        examples.append({
            'text': doc,
            'span_starts': starts,
            'span_ends': ends,
            'span_labels': labels
        })

In [5]:
# Section 4: Tokenization & Dataset Preparation
config    = AutoConfig.from_pretrained(MODEL_NAME)
real_max  = config.max_position_embeddings or 512
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

unique_fields = sorted({lab for ex in examples for lab in ex['span_labels']})
bio_labels    = ['O'] + [f'{p}-{fld}' for fld in unique_fields for p in ('B','I')]
label2id      = {lab: i for i, lab in enumerate(bio_labels)}
id2label      = {i: lab for lab, i in label2id.items()}

encodings = []
for ex in examples:
    enc = tokenizer(
        ex['text'],
        padding='max_length',
        truncation=True,
        max_length=real_max,
        return_offsets_mapping=True
    )
    labels = [label2id['O']] * real_max
    offsets = enc.pop('offset_mapping')
    for s, e, fld in zip(ex['span_starts'], ex['span_ends'], ex['span_labels']):
        for i, (off_s, off_e) in enumerate(offsets):
            if off_s >= e: break
            if off_e <= s: continue
            tag = 'B' if off_s == s else 'I'
            labels[i] = label2id[f'{tag}-{fld}']
    enc['labels'] = labels
    encodings.append(enc)

class NERDataset(TorchDataset):
    def __init__(self, encs): self.encs = encs
    def __len__(self): return len(self.encs)
    def __getitem__(self, idx): return {k: torch.tensor(v) for k, v in self.encs[idx].items()}

train_encs, eval_encs = train_test_split(encodings, test_size=0.1, random_state=42)
train_dataset = NERDataset(train_encs)
eval_dataset  = NERDataset(eval_encs)

In [6]:
# Section 5: Model Initialization & Training
model = AutoModelForTokenClassification.from_pretrained(
    MODEL_NAME, num_labels=len(bio_labels), id2label=id2label, label2id=label2id
)
data_collator = DataCollatorForTokenClassification(tokenizer)
training_args = TrainingArguments(
    output_dir='out_ner',
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    logging_dir='logs',
    do_train=True,
    do_eval=True,
    logging_steps=50,
    save_steps=100
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)
trainer.train()

Some weights of BertForTokenClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
50,0.0871


TrainOutput(global_step=84, training_loss=0.06028160169011071, metrics={'train_runtime': 87.0483, 'train_samples_per_second': 3.825, 'train_steps_per_second': 0.965, 'total_flos': 87014179998720.0, 'train_loss': 0.06028160169011071, 'epoch': 3.0})

In [7]:
# Section 6: Evaluation & Inference
tokenizer.model_max_length = real_max

# 6.1 Quantitative Evaluation
eval_results = trainer.evaluate(eval_dataset=eval_dataset)
print("=== Trainer.evaluate() results ===")
print(eval_results)

pred_logits, true_label_ids, _ = trainer.predict(eval_dataset)
pred_label_ids = torch.argmax(torch.tensor(pred_logits), dim=-1).tolist()
true_tags = [[id2label[id_] for id_ in seq] for seq in true_label_ids]
pred_tags = [[id2label[id_] for id_ in seq] for seq in pred_label_ids]

print("\n=== Span-level Metrics ===")
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report
print("Precision:", precision_score(true_tags, pred_tags))
print("Recall:   ", recall_score(true_tags, pred_tags))
print("F1:       ", f1_score(true_tags, pred_tags))
print("\n", classification_report(true_tags, pred_tags))

# 6.2 Qualitative Inference via extract_fields()
ner_pipe = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
    device=0 if torch.cuda.is_available() else -1
)

def extract_fields(text: str) -> dict:
    entities = ner_pipe(text)
    fields = defaultdict(list)
    for ent in entities:
        eg = ent.get("entity_group", ent.get("entity"))
        if not eg: continue
        parts = eg.split("-", 1)
        fld = parts[1] if len(parts)==2 else parts[0]
        fields[fld].append(ent["word"])
    return {fld: " ".join(tokens) for fld, tokens in fields.items()}

print("\n=== Qualitative Inference Results ===")
for fname, raw_text in texts.items():
    print(f"--- {fname} ---")
    res = extract_fields(raw_text)
    if not res:
        print(" (no entities found)\n")
    else:
        for fld, val in res.items():
            print(f" {fld}: {val}")
        print()

=== Trainer.evaluate() results ===
{'eval_loss': 0.01835673674941063, 'eval_runtime': 1.0327, 'eval_samples_per_second': 12.588, 'eval_steps_per_second': 3.873, 'epoch': 3.0}


Device set to use mps:0



=== Span-level Metrics ===
Precision: 0.6486486486486487
Recall:    0.8275862068965517
F1:        0.7272727272727273

                precision    recall  f1-score   support

 Company_Name       0.65      0.87      0.74        23
Document_Type       0.67      0.67      0.67         6

    micro avg       0.65      0.83      0.73        29
    macro avg       0.66      0.77      0.70        29
 weighted avg       0.65      0.83      0.73        29


=== Qualitative Inference Results ===
--- 816_2020-12-21_Certificates of Incorporation.txt ---
 Company_Name: automattic inc. prosp .

--- 503_2009-08-13_Certificates of Incorporation.txt ---
 (no entities found)

--- 625_2002-05-01_Certificates of Incorporation.txt ---
 Company_Name: aravo solutions inc.

--- 513_2007-10-05_Certificates of Incorporation.txt ---
 Company_Name: anagran inc

--- 774_2003-09-16_Certificates of Incorporation.txt ---
 Company_Name: .

--- 364_2009-08-25_Certificates of Incorporation.txt ---
 Company_Name: alimer

In [8]:
# Section 7: Build True vs. Predicted DataFrame
candidate_fields = [
    'Company Name', 'Date', 'Document Type',
    'Preferred Stocks', 'Priority Order', 'Liquidation Value'
]
orig_fields  = [f for f in candidate_fields if f in df.columns]
label_fields = [f.replace(' ', '_') for f in orig_fields]

file_names, _ = train_test_split(df['fname'].tolist(), test_size=0.9, random_state=42)
eval_files    = file_names

rows = []
for fname in eval_files:
    true_row = df[df['fname'] == fname].iloc[0]
    pred     = extract_fields(texts[fname])
    row = {'fname': fname}
    for orig, lab in zip(orig_fields, label_fields):
        row[f"{lab}_true"] = true_row.get(orig, "")
        row[f"{lab}_pred"] = pred.get(lab, "")
    rows.append(row)

df_eval = pd.DataFrame(rows)
display(df_eval)

Unnamed: 0,fname,Company_Name_true,Company_Name_pred,Date_true,Date_pred,Document_Type_true,Document_Type_pred
0,223_2007-08-03_Certificates of Incorporation.txt,,advion biosciences inc. .,,,,
1,192_2005-09-27_Certificates of Incorporation.txt,"Advanced BioHealing, Inc.",advanced biohealing inc. .,2005-09-27,,Amended and Restated Certificate of Incorporation,
2,189_2005-12-20_Certificates of Incorporation.txt,"Adspace Networks, Inc.",. .,2005-12-20,,Amended and Restated Certificate of Incorporation,
3,200_2008-08-22_Certificates of Incorporation.txt,"Advanced Electron Beams, Inc.",advanced electron beams inc.,2008-08-22,,Amended and Restated Certificate of Incorporation,amended and restated certificate of incorporation
4,188_2010-11-08_Certificates of Incorporation.txt,"Semantic Sugar, Inc.",semantic sugar inc.,2010-11-08,,Certificate of Amendment to the Restated Certi...,
5,181_2007-10-29_Certificates of Incorporation.txt,"Adknowledge, Inc.",adknowledge inc.,2007-10-29,,Amended and Restated Certificate of Incorporat...,
6,200_2013-01-30_Certificates of Incorporation.txt,"Advanced Electron Beams, Inc.",advanced electron beams inc.,2013-01-30,,Certificate of Dissolution,
7,234_2012-02-14_Certificates of Incorporation.txt,,aerohive networks inc. ##oh networks,,,,
8,136_2007-02-14_Certificates of Incorporation.txt,"Actmis Pharamaceuticals, Inc.",. inc.,2007-02-14,,Amended and Restated Certificate of Incorporation,
9,169_2011-08-16_Certificates of Incorporation.txt,"Adchemy, Inc.",adchemy inc.,2011-08-16,,Amended and Restated Certificate of Incorporation,
