In [4]:
import os
import json
from google.cloud import storage
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset, Dataset
import torch
import pandas as pd
from tqdm import tqdm
import re

In [5]:
os.environ["HF_DATASETS_CACHE"] = "/mnt/disks/data/hf_cache"
os.environ["HF_DATASETS_OFFLINE"] = "1"  # Optional: if you're only working locally
os.environ["TRANSFORMERS_CACHE"] = "/mnt/disks/data/transformers_cache"
os.environ["TMPDIR"] = "/mnt/disks/data/tmp"

In [6]:
# Detect number of GPUs
gpus = torch.cuda.device_count()
print(f"Number of GPUs available: {gpus}")
print(torch.cuda.get_device_name(0))
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

Number of GPUs available: 1
NVIDIA A100-SXM4-40GB


In [7]:
# Setup
BUCKET_NAME = "diss_market_data"
MODEL_PREFIX = "finbert-finetuned_onlyfilings/"
MODEL_LOCAL_DIR = "finbert_onlyfilings"
# DATASET_PREFIX = "edgar-corpus-full/"
DATASET_LOCAL_DIR = "Data/10K_combined_dataset.csv"
# CIK_PATH = "./Data/FILINGS_METADATA.csv"
OUTPUT_CSV_PATH = "./Data/aggregated_sentiment_scores.csv"
SECTIONS = ["section_1A", "section_7"]
# YEARS = set(["2006","2007","2008","2009","2010","2011","2012","2013"])
CHUNK_SIZE = 512
CHUNK_STRIDE = 128

In [10]:
# ------------------------ Load Tickers ---------------------------
# metadata = pd.read_csv(CIK_PATH)
# CIK_SET = set(metadata['CIK'].astype(str).str.strip())

In [11]:
# ---------------------- GCS Download Helpers ---------------------
def download_from_gcs(bucket_name, prefix, local_dir):
    client = storage.Client()
    bucket = client.bucket(bucket_name)
    blobs = bucket.list_blobs(prefix=prefix)

    for blob in blobs:
        if blob.name.endswith('/'):  # Skip folders
            continue
        rel_path = os.path.relpath(blob.name, prefix)
        local_path = os.path.join(local_dir, rel_path)
        os.makedirs(os.path.dirname(local_path), exist_ok=True)
        blob.download_to_filename(local_path)

# Download model if not present
if not os.path.exists(os.path.join(MODEL_LOCAL_DIR, "pytorch_model.bin")):
    download_from_gcs(BUCKET_NAME, MODEL_PREFIX, MODEL_LOCAL_DIR)

# Download dataset if not present
# if not os.listdir(DATASET_LOCAL_DIR):
#     download_from_gcs(BUCKET_NAME, DATASET_PREFIX, DATASET_LOCAL_DIR)

In [12]:
# ------------------------ Load Model -----------------------------
tokenizer = AutoTokenizer.from_pretrained(MODEL_LOCAL_DIR)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_LOCAL_DIR).to(DEVICE)
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30873, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

# ---------------------- Sentiment Logic --------------------------

In [22]:
def get_section_chunks(text, max_len=512, stride=128):
    tokens = tokenizer(text, truncation=False, padding=False, return_tensors='pt')['input_ids'][0]
    chunks = [tokens[i:i+max_len] for i in range(0, len(tokens), max_len - stride)]
    return [tokenizer.decode(chunk, skip_special_tokens=True) for chunk in chunks]

In [23]:
def batched_predict(chunks):
    inputs = tokenizer(chunks, padding='max_length', truncation=True, max_length=512, return_tensors="pt")
    inputs = {k: v.to(DEVICE) for k, v in inputs.items()}  # ✅ ensure proper device
    with torch.no_grad():
        outputs = model(**inputs)
    probs = torch.nn.functional.softmax(outputs.logits, dim=1).cpu().numpy()
    labels = probs.argmax(axis=1)
    label_map = ['negative', 'neutral', 'positive']
    results = []
    for i, score in enumerate(probs):
        label = label_map[labels[i]]
        scaled_score = max(score) * (1 if label == 'positive' else -1 if label == 'negative' else 0)
        results.append((scaled_score, label))
    return results

In [26]:
splits = ["train", "test", "validation"]

In [8]:
dataset = load_dataset(DATASET_LOCAL_DIR)

Resolving data files:   0%|          | 0/66 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/54 [00:00<?, ?it/s]

In [None]:
# # assume dataset['train'] is a list of dicts all sharing the same keys
# keys = dataset['train'][0].keys()

# for key in keys:
#     for example in dataset['train']:
#         val = example.get(key)
#         # skip empty strings (and non‑string values, if you only care about str)
#         if isinstance(val, str) and val != '':
#             print(f"{key!r}: {val!r}")
#             break
#     else:
#         # no non-empty string found for this key
#         print(f"{key!r}: (all values empty)")


In [129]:
def process_record(record):
    cik = record.get('cik', '').strip()
    year = record.get('year', '')

    if cik not in CIK_SET or year not in YEARS:
        return []
    
    results = []
    for section in SECTIONS:
        # print(section)
        text = record.get(section, '').strip()
        if not text:
            # print(f'No Text: {text} :: {section}')
            continue

        chunks = get_section_chunks(text)
        if not chunks:
            continue

        sentiments = batched_predict(chunks)
        scores = [s for s, _ in sentiments]
        labels = [l for _, l in sentiments]

        avg_score = sum(scores) / len(scores)
        majority_label = max(set(labels), key=labels.count)

        results.append({
            'CIK': cik,
            'Type': '10K',
            'Year': year,
            'Section': section,
            'Sentiment Score': avg_score,
            'Sentiment Label': majority_label
        })
    return results

In [130]:
# ------------------ Load & Stream Dataset -------------------------
splits = ["train", "test", "validation"]
all_results = []

for split in splits:
    ds = dataset[split]
    for record in tqdm(ds, desc=f"Processing {split}"):
        results = process_record(record)
        all_results.extend(results)
        if len(all_results) == 1:
            print(all_results)

Processing train: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 176289/176289 [40:14<00:00, 73.02it/s]
Processing test: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 22036/22036 [04:57<00:00, 74.06it/s]
Processing validation: 100%|████████████████████████████████████████████████████████████████████████████████████████████| 22050/22050 [05:21<00:00, 68.61it/s]


In [132]:
len(all_results)

14255

In [135]:
# ------------------ Save Results ----------------------------------
df = pd.DataFrame(all_results)
df.to_csv(OUTPUT_CSV_PATH, index=False)

## 10-Q

In [13]:
# Setup
DATASET_LOCAL_DIR = "./Data/10Q_combined_dataset.csv"
OUTPUT_CSV_PATH = "./Data/aggregated_sentiment_scores_10Q.csv"
SECTIONS = ["part_1_item_2", "part_2_item_1A"]

In [14]:
# Load Dataset
dataset = pd.read_csv(DATASET_LOCAL_DIR)
dataset.head()

Unnamed: 0,cik,company,filing_type,filing_date,period_of_report,sic,state_of_inc,state_location,fiscal_year_end,filing_html_index,...,part_1_item_3,part_1_item_4,part_2,part_2_item_1,part_2_item_1A,part_2_item_2,part_2_item_3,part_2_item_4,part_2_item_5,part_2_item_6
0,1000045,NICHOLAS FINANCIAL INC,10-Q,2007-02-14,2006-12-31,6153.0,FL,FL,331.0,https://www.sec.gov/Archives/edgar/data/100004...,...,ITEM 3.\nQUANTITATIVE AND QUALITATIVE DISCLOSU...,ITEM 4.\nCONTROLS AND PROCEDURES\nEvaluation o...,PART II- OTHER INFORMATION\nITEM 1A.\nRISK FAC...,,ITEM 1A.\nRISK FACTORS\nIn addition to the oth...,,,,,ITEM 6.\nEXHIBITS\nSee exhibit index following...
1,1000230,OPTICAL CABLE CORP,10-Q,2007-03-19,2007-01-31,3357.0,VA,VA,1031.0,https://www.sec.gov/Archives/edgar/data/100023...,...,Item 3. Quantitative and Qualitative Disclosur...,Item 4. Controls and Procedures\nOur managemen...,PART II. OTHER INFORMATION\nItem 1A. Risk Fact...,,Item 1A. Risk Factors\nThere are a number of b...,,,,,Item 6. Exhibits\nThe exhibits listed on the E...
2,1000697,WATERS CORP /DE/,10-Q,2007-05-04,2007-03-31,3826.0,DE,MA,1231.0,https://www.sec.gov/Archives/edgar/data/100069...,...,Item 3: Quantitative and Qualitative Disclosur...,Item 4: Controls and Procedures\n(a)\nEvaluati...,Part II: Other Information\nItem 1: Legal Proc...,Item 1: Legal Proceedings\nThere have been no ...,Item 1A: Risk Factors\nPlease read “Risk facto...,Item 2: Unregistered Sales of Equity Securitie...,Item 3: Defaults Upon Senior Securities\nNot A...,Item 4: Submission of Matters to a Vote of Sec...,Item 5: Other Information\nNot Applicable,Item 6: Exhibits\nExhibit\nNumber\nDescription...
3,1000697,WATERS CORP /DE/,10-Q,2007-08-03,2007-06-30,3826.0,DE,MA,1231.0,https://www.sec.gov/Archives/edgar/data/100069...,...,Item 3: Quantitative and Qualitative Disclosur...,Item 4: Controls and Procedures\n(a) Evaluatio...,Part II: Other Information\nItem 1: Legal Proc...,Item 1: Legal Proceedings\nThere have been no ...,Item 1A: Risk Factors\nPlease read “Risk facto...,Item 2: Unregistered Sales of Equity Securitie...,Item 3: Defaults Upon Senior Securities\nNot A...,Item 4: Submission of Matters to a Vote of Sec...,Item 5: Other Information\nNot Applicable,Item 6: Exhibits\nExhibit\nNumber\nDescription...
4,1000697,WATERS CORP /DE/,10-Q,2007-11-02,2007-09-29,3826.0,DE,MA,1231.0,https://www.sec.gov/Archives/edgar/data/100069...,...,Item 3: Quantitative and Qualitative Disclosur...,Item 4: Controls and Procedures\n(a) Evaluatio...,Part II: Other Information\nItem 1: Legal Proc...,Item 1: Legal Proceedings\nThere have been no ...,Item 1A: Risk Factors\nPlease read “Risk facto...,Item 2: Unregistered Sales of Equity Securitie...,Item 3: Defaults Upon Senior Securities\nNot A...,Item 4: Submission of Matters to a Vote of Sec...,Item 5: Other Information\nNot Applicable,Item 6: Exhibits\nExhibit\nNumber\nDescription...


In [32]:
dataset.columns

Index(['cik', 'company', 'filing_type', 'filing_date', 'period_of_report',
       'sic', 'state_of_inc', 'state_location', 'fiscal_year_end',
       'filing_html_index', 'htm_filing_link', 'complete_text_filing_link',
       'filename', 'part_1', 'part_1_item_1', 'part_1_item_2', 'part_1_item_3',
       'part_1_item_4', 'part_2', 'part_2_item_1', 'part_2_item_1A',
       'part_2_item_2', 'part_2_item_3', 'part_2_item_4', 'part_2_item_5',
       'part_2_item_6'],
      dtype='object')

In [14]:
# ----------- Main Scoring Function for 10-Q, 8-K and 10-K -----------
def process_dataframe(df):
    results = []

    for _, row in tqdm(df.iterrows(), total=len(df)):
        cik = str(row['cik']).strip()
        filing_date = row['filing_date']
        filing_type = row['filing_type']

        for section in SECTIONS:
            text = row.get(section, '')
            if not isinstance(text, str) or not text.strip():
                continue

            chunks = get_section_chunks(text)
            if not chunks:
                continue

            sentiments = batched_predict(chunks)
            scores = [s for s, _ in sentiments]
            labels = [l for _, l in sentiments]

            avg_score = sum(scores) / len(scores)
            majority_label = max(set(labels), key=labels.count)

            results.append({
                'CIK': cik,
                'Type': filing_type,
                'Filing Date': filing_date,
                'Section': section,
                'Sentiment Score': avg_score,
                'Sentiment Label': majority_label
            })

    return pd.DataFrame(results)

In [37]:
# ----------- Calling Function -----------
results_10q = process_dataframe(dataset)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7235/7235 [33:45<00:00,  3.57it/s]


In [40]:
results_10q.head()

Unnamed: 0,CIK,Type,Filing Date,Section,Sentiment Score,Sentiment Label
0,1000045,10-Q,2007-02-14,part_1_item_2,-0.217572,negative
1,1000045,10-Q,2007-02-14,part_2_item_1A,0.887953,positive
2,1000230,10-Q,2007-03-19,part_1_item_2,-0.209962,negative
3,1000230,10-Q,2007-03-19,part_2_item_1A,0.999332,positive
4,1000697,10-Q,2007-05-04,part_1_item_2,-0.155652,negative


In [41]:
# ----------- Saving Result -----------
results_10q.to_csv(OUTPUT_CSV_PATH, index=False)

## 8-K

In [51]:
# Setup
DATASET_LOCAL_DIR = "./Data/8K_combined_dataset.csv"
OUTPUT_CSV_PATH = "./Data/aggregated_sentiment_scores_8K.csv"
SECTIONS = [
    "item_2.02",  # Results of Operations and Financial Condition
    "item_8.01",  # Other Events
    "item_1.01",  # Entry into a Material Definitive Agreement
    "item_2.01",  # Completion of Acquisition or Disposition of Assets
    "item_5.02",  # Departure/Election of Directors or Officers; Compensatory Arrangements
    "item_2.05",  # Costs Associated with Exit or Disposal Activities
    "item_1.03",  # Bankruptcy or Receivership
]

In [48]:
# Load Dataset
dataset = pd.read_csv(DATASET_LOCAL_DIR)
dataset.head()

  dataset = pd.read_csv(DATASET_LOCAL_DIR)


Unnamed: 0,cik,company,filing_type,filing_date,period_of_report,sic,state_of_inc,state_location,fiscal_year_end,filing_html_index,...,item_5.07,item_5.08,item_6.01,item_6.02,item_6.03,item_6.04,item_6.05,item_7.01,item_8.01,item_9.01
0,1000045,NICHOLAS FINANCIAL INC,8-K,2007-01-29,2007-01-29,6153.0,FL,FL,331.0,https://www.sec.gov/Archives/edgar/data/100004...,...,,,,,,,,,,Item 9.01 Financial Statements and Exhibits\nE...
1,1000180,SANDISK CORP,8-K,2007-02-21,2007-02-15,3572.0,DE,CA,1231.0,https://www.sec.gov/Archives/edgar/data/100018...,...,,,,,,,,,,
2,1000180,SANDISK CORP,8-K,2007-01-30,2007-01-30,3572.0,DE,CA,1231.0,https://www.sec.gov/Archives/edgar/data/100018...,...,,,,,,,,,,Item 9.01 Financial Statements and Exhibits\n(...
3,1000209,MEDALLION FINANCIAL CORP,8-K,2007-01-17,2007-01-16,6199.0,DE,NY,1231.0,https://www.sec.gov/Archives/edgar/data/100020...,...,,,,,,,,,,ITEM 9.01. FINANCIAL STATEMENTS AND EXHIBITS.\...
4,1000209,MEDALLION FINANCIAL CORP,8-K,2007-03-19,2007-03-19,6199.0,DE,NY,1231.0,https://www.sec.gov/Archives/edgar/data/100020...,...,,,,,,,,,ITEM 8.01\nOTHER EVENTS\nThe Company has adopt...,ITEM 9.01\nFINANCIAL STATEMENTS AND EXHIBITS.\...


In [49]:
dataset.columns

Index(['cik', 'company', 'filing_type', 'filing_date', 'period_of_report',
       'sic', 'state_of_inc', 'state_location', 'fiscal_year_end',
       'filing_html_index', 'htm_filing_link', 'complete_text_filing_link',
       'filename', 'item_1.01', 'item_1.02', 'item_1.03', 'item_1.04',
       'item_1.05', 'item_2.01', 'item_2.02', 'item_2.03', 'item_2.04',
       'item_2.05', 'item_2.06', 'item_3.01', 'item_3.02', 'item_3.03',
       'item_4.01', 'item_4.02', 'item_5.01', 'item_5.02', 'item_5.03',
       'item_5.04', 'item_5.05', 'item_5.06', 'item_5.07', 'item_5.08',
       'item_6.01', 'item_6.02', 'item_6.03', 'item_6.04', 'item_6.05',
       'item_7.01', 'item_8.01', 'item_9.01'],
      dtype='object')

In [50]:
# ----------- Calling Function -----------
results_8k = process_dataframe(dataset)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 39121/39121 [10:15<00:00, 63.54it/s]


In [52]:
# ----------- Saving Result -----------
results_8k.to_csv(OUTPUT_CSV_PATH, index=False)

# 10-K

In [20]:
# Setup
DATASET_LOCAL_DIR = "./Data/10K_combined_dataset.csv"
OUTPUT_CSV_PATH = "./Data/aggregated_sentiment_scores_10K_new.csv"
SECTIONS = ["item_1A", "item_7"]

In [16]:
# Load Dataset
dataset = pd.read_csv(DATASET_LOCAL_DIR)
dataset.head()

Unnamed: 0,cik,company,filing_type,filing_date,period_of_report,sic,state_of_inc,state_location,fiscal_year_end,filing_html_index,...,item_9A,item_9B,item_9C,item_10,item_11,item_12,item_13,item_14,item_15,item_16
0,1000697,WATERS CORP /DE/,10-K,2007-03-01,2006-12-31,3826,DE,MA,1231,https://www.sec.gov/Archives/edgar/data/100069...,...,Item 9a:\nControls and Procedures\n(a)\nEvalua...,Item 9b:\nOther Information\nNone.\nPART III,,"Item 10:\nDirectors, Executive Officers and Co...",Item 11:\nExecutive Compensation\nThis informa...,Item 12:\nSecurity Ownership of Certain Benefi...,Item 13:\nCertain Relationships and Related Tr...,Item 14:\nPrincipal Accountant Fees and Servic...,Item 15:\nExhibits and Financial Statement Sch...,
1,1000697,WATERS CORP /DE/,10-K,2008-02-29,2007-12-31,3826,DE,MA,1231,https://www.sec.gov/Archives/edgar/data/100069...,...,Item 9A:\nControls and Procedures\nEvaluation ...,Item 9B:\nOther Information\nNone.\nPART III,,"Item 10:\nDirectors, Executive Officers and Co...",Item 11:\nExecutive Compensation\nThis informa...,Item 12:\nSecurity Ownership of Certain Benefi...,Item 13:\nCertain Relationships and Related Tr...,Item 14:\nPrincipal Accountant Fees and Servic...,Item 15:\nExhibits and Financial Statement Sch...,
2,1000697,WATERS CORP /DE/,10-K,2009-02-27,2008-12-31,3826,DE,MA,1231,https://www.sec.gov/Archives/edgar/data/100069...,...,Item 9A:\nControls and Procedures\nEvaluation ...,Item 9B:\nOther Information\nNone.\nPART III,,"Item 10:\nDirectors, Executive Officers and Co...",Item 11:\nExecutive Compensation\nThis informa...,Item 12:\nSecurity Ownership of Certain Benefi...,Item 13:\nCertain Relationships and Related Tr...,Item 14:\nPrincipal Accountant Fees and Servic...,Item 15:\nExhibits and Financial Statement Sch...,
3,1000697,WATERS CORP /DE/,10-K,2010-02-26,2009-12-31,3826,DE,MA,1231,https://www.sec.gov/Archives/edgar/data/100069...,...,Item 9A:\nControls and Procedures\nEvaluation ...,Item 9B:\nOther Information\nNone.\nPART III,,"Item 10:\nDirectors, Executive Officers and Co...",Item 11:\nExecutive Compensation\nThis informa...,Item 12:\nSecurity Ownership of Certain Benefi...,Item 13:\nCertain Relationships and Related Tr...,Item 14:\nPrincipal Accountant Fees and Servic...,"Item 15:\nExhibits, Financial Statement Schedu...",
4,1000697,WATERS CORP /DE/,10-K,2011-02-25,2010-12-31,3826,DE,MA,1231,https://www.sec.gov/Archives/edgar/data/100069...,...,Item 9A:\nControls and Procedures\nEvaluation ...,Item 9B:\nOther Information\nNone.\nPART III,,"Item 10:\nDirectors, Executive Officers and Co...",Item 11:\nExecutive Compensation\nThis informa...,Item 12:\nSecurity Ownership of Certain Benefi...,Item 13:\nCertain Relationships and Related Tr...,Item 14:\nPrincipal Accountant Fees and Servic...,"Item 15:\nExhibits, Financial Statement Schedu...",


In [24]:
# ----------- Calling Function -----------
results_10k = process_dataframe(dataset)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2341/2341 [18:58<00:00,  2.06it/s]


In [25]:
# ----------- Saving Result -----------
results_8k.to_csv(OUTPUT_CSV_PATH, index=False)

NameError: name 'results_8k' is not defined