In [None]:
import os
import json
from google.cloud import storage
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset, Dataset
import torch
import pandas as pd
from tqdm import tqdm
import re

In [None]:
# Point the Hugging Face caches and the temp directory at the data disk.
# NOTE(review): the first cell imports `datasets` and `transformers` before these
# variables are set — confirm the libraries still pick them up, or move this cell first.
os.environ.update({
    "HF_DATASETS_CACHE": "/mnt/disks/data/hf_cache",
    "HF_DATASETS_OFFLINE": "1",  # Optional: if you're only working locally
    "TRANSFORMERS_CACHE": "/mnt/disks/data/transformers_cache",
    "TMPDIR": "/mnt/disks/data/tmp",
})

In [None]:
# Detect the available GPUs and choose the device used for inference.
gpus = torch.cuda.device_count()
print(f"Number of GPUs available: {gpus}")
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
if DEVICE == "cuda":
    # Only query the device name when CUDA is usable; the call raises otherwise,
    # which previously aborted this cell before the CPU fallback was assigned.
    print(torch.cuda.get_device_name(0))
else:
    print("CUDA not available; falling back to CPU.")

In [None]:
# Setup: GCS locations, local paths, sections to score, and chunking parameters.
BUCKET_NAME = "diss_market_data"
MODEL_PREFIX = "finbert-finetuned_onlyfilings/"  # object prefix of the model inside the bucket
MODEL_LOCAL_DIR = "finbert_onlyfilings"          # local directory the model is downloaded to
# DATASET_PREFIX = "edgar-corpus-full/"
DATASET_LOCAL_DIR = "Data/10K_combined_dataset.csv"
# CIK_PATH = "./Data/FILINGS_METADATA.csv"
OUTPUT_CSV_PATH = "./Data/aggregated_sentiment_scores.csv"
SECTIONS = [
    "section_1A",
    "section_7",
]
# YEARS = set(["2006","2007","2008","2009","2010","2011","2012","2013"])
# NOTE(review): get_section_chunks carries its own 512/128 defaults and does not
# read these two constants — confirm whether they are meant to be passed in.
CHUNK_SIZE, CHUNK_STRIDE = 512, 128

In [None]:
# ------------------------ Load Tickers ---------------------------
# metadata = pd.read_csv(CIK_PATH)
# CIK_SET = set(metadata['CIK'].astype(str).str.strip())

In [None]:
# ---------------------- GCS Download Helpers ---------------------
def download_from_gcs(bucket_name, prefix, local_dir):
    """Download every object listed under ``prefix`` in a GCS bucket into ``local_dir``.

    Each object's path relative to ``prefix`` is recreated below ``local_dir``.

    Args:
        bucket_name: Name of the bucket to read from.
        prefix: Object-name prefix passed to ``list_blobs``.
        local_dir: Local directory that receives the downloaded files.

    Returns:
        None. Files are written to disk as a side effect.
    """
    client = storage.Client()
    bucket = client.bucket(bucket_name)

    for blob in bucket.list_blobs(prefix=prefix):
        if blob.name.endswith('/'):  # Skip folder placeholder objects
            continue

        rel_path = os.path.relpath(blob.name, prefix)
        # A prefix without a trailing slash also lists sibling objects (e.g. "foo"
        # matches "foobar/x"); their relative path starts with ".." and would be
        # written outside local_dir, so they are skipped.
        if rel_path == os.pardir or rel_path.startswith(os.pardir + os.sep):
            continue

        local_path = os.path.join(local_dir, rel_path)
        parent_dir = os.path.dirname(local_path)
        if parent_dir:
            # os.makedirs("") raises, which happens when the target has no parent component.
            os.makedirs(parent_dir, exist_ok=True)
        blob.download_to_filename(local_path)

# Fetch the model from GCS unless its weights file already exists locally.
# NOTE(review): only "pytorch_model.bin" is checked — confirm the uploaded model
# uses that file name, otherwise the download is repeated on every run.
model_weights_path = os.path.join(MODEL_LOCAL_DIR, "pytorch_model.bin")
model_is_cached = os.path.exists(model_weights_path)
if not model_is_cached:
    download_from_gcs(BUCKET_NAME, MODEL_PREFIX, MODEL_LOCAL_DIR)

# Download dataset if not present
# if not os.listdir(DATASET_LOCAL_DIR):
#     download_from_gcs(BUCKET_NAME, DATASET_PREFIX, DATASET_LOCAL_DIR)

In [None]:
# ------------------------ Load Model -----------------------------
# Load the tokenizer and classifier from the local model directory, move the
# classifier to the selected device, and put it in evaluation mode.
tokenizer = AutoTokenizer.from_pretrained(MODEL_LOCAL_DIR)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_LOCAL_DIR)
model = model.to(DEVICE)
model.eval()

# ---------------------- Sentiment Logic --------------------------

In [None]:
def get_section_chunks(text, max_len=512, stride=128):
    """Split ``text`` into overlapping chunks measured in tokenizer tokens.

    The text is tokenized once without special tokens, cut into windows that
    overlap by ``stride`` tokens, and each window is decoded back to a string.
    Windows hold ``max_len`` minus the tokenizer's special-token count so that a
    chunk normally still fits in ``max_len`` tokens once special tokens are
    added again when the chunk is re-tokenized for prediction.

    Args:
        text: Raw section text.
        max_len: Maximum number of tokens per chunk, special tokens included.
        stride: Number of tokens shared by two consecutive chunks.

    Returns:
        List of decoded chunk strings; empty when ``text`` produces no tokens.

    Raises:
        ValueError: If ``max_len``/``stride`` leave no room for content or no
            forward step between windows.
    """
    window = max_len - tokenizer.num_special_tokens_to_add(pair=False)
    step = window - stride
    if window <= 0 or step <= 0:
        raise ValueError(
            f"Invalid chunking parameters: max_len={max_len}, stride={stride}"
        )

    # Plain Python list of ids; special tokens are excluded so they do not
    # occupy slots in the first and last window.
    token_ids = tokenizer(
        text, add_special_tokens=False, truncation=False, padding=False
    )['input_ids']

    chunks = []
    for start in range(0, len(token_ids), step):
        piece = tokenizer.decode(token_ids[start:start + window], skip_special_tokens=True)
        if piece.strip():
            chunks.append(piece)
        # Once a window reaches the end of the text, any further window would be
        # fully contained in this one and would be scored twice.
        if start + window >= len(token_ids):
            break
    return chunks

In [None]:
def batched_predict(chunks, batch_size=32):
    """Classify text chunks with the loaded model, ``batch_size`` chunks at a time.

    Args:
        chunks: List of text strings to classify.
        batch_size: Number of chunks sent through the model per forward pass.
            Bounds memory use for sections that produce many chunks.

    Returns:
        List of ``(scaled_score, label)`` tuples, one per chunk and in input
        order. ``scaled_score`` is the winning class probability, signed
        positive for 'positive', negative for 'negative', and 0 for 'neutral'.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    if not chunks:
        # The tokenizer cannot encode an empty batch.
        return []

    # NOTE(review): index-to-label order is hard-coded — confirm it matches the
    # label encoding the fine-tuned model was trained with.
    label_map = ['negative', 'neutral', 'positive']
    sign_by_label = {'positive': 1, 'negative': -1, 'neutral': 0}

    results = []
    for start in range(0, len(chunks), batch_size):
        batch = chunks[start:start + batch_size]
        inputs = tokenizer(batch, padding='max_length', truncation=True, max_length=512, return_tensors="pt")
        inputs = {k: v.to(DEVICE) for k, v in inputs.items()}  # ✅ ensure proper device
        with torch.no_grad():
            logits = model(**inputs).logits
        probs = torch.nn.functional.softmax(logits, dim=1).cpu().numpy()

        for row in probs:
            best = int(row.argmax())
            label = label_map[best]
            results.append((float(row[best]) * sign_by_label[label], label))
    return results

In [None]:
# Dataset split names iterated by the processing loop.
splits = [
    "train",
    "test",
    "validation",
]

In [None]:
# Load the dataset located at DATASET_LOCAL_DIR with Hugging Face `datasets`.
# NOTE(review): DATASET_LOCAL_DIR names a single CSV file, while the processing loop
# indexes "train"/"test"/"validation" splits — confirm this call resolves the local
# CSV and yields those splits (the CSV loader form is
# `load_dataset("csv", data_files=...)`).
dataset = load_dataset(DATASET_LOCAL_DIR)

In [None]:
# # assume dataset['train'] is a list of dicts all sharing the same keys
# keys = dataset['train'][0].keys()

# for key in keys:
#     for example in dataset['train']:
#         val = example.get(key)
#         # skip empty strings (and non‑string values, if you only care about str)
#         if isinstance(val, str) and val != '':
#             print(f"{key!r}: {val!r}")
#             break
#     else:
#         # no non-empty string found for this key
#         print(f"{key!r}: (all values empty)")


In [None]:
def process_record(record):
    """Score every configured section of one filing record.

    Each section listed in ``SECTIONS`` is split into chunks, classified, and
    aggregated into a mean signed score and a majority label.

    Filtering: when the notebook defines ``CIK_SET`` and/or ``YEARS``, records
    whose CIK or year is not in them are skipped. When a filter is not defined
    (its definition is currently commented out), that filter is not applied
    instead of raising ``NameError``.

    Args:
        record: Mapping with at least ``'cik'``, ``'year'`` and the section
            fields named in ``SECTIONS``.

    Returns:
        List of dicts with keys ``'CIK'``, ``'Type'``, ``'Year'``, ``'Section'``,
        ``'Sentiment Score'`` and ``'Sentiment Label'``; one per section that
        has text. Empty when the record is filtered out or has no text.
    """
    from collections import Counter  # local import keeps this cell self-contained

    raw_cik = record.get('cik', '')
    # The value may be numeric or None; normalise before stripping.
    cik = '' if raw_cik is None else str(raw_cik).strip()
    year = record.get('year', '')

    cik_filter = globals().get('CIK_SET')
    year_filter = globals().get('YEARS')
    if cik_filter is not None and cik not in cik_filter:
        return []
    if year_filter is not None and year not in year_filter and str(year) not in year_filter:
        return []

    results = []
    for section in SECTIONS:
        text = record.get(section, '')
        # Missing values may be None or non-string; treat them as no text.
        if not isinstance(text, str) or not text.strip():
            continue

        chunks = get_section_chunks(text.strip())
        if not chunks:
            continue

        sentiments = batched_predict(chunks)
        if not sentiments:
            continue
        scores = [s for s, _ in sentiments]
        labels = [l for _, l in sentiments]

        avg_score = sum(scores) / len(scores)
        # Counter breaks ties by first occurrence, so the result is reproducible;
        # max over a set depended on set iteration order.
        majority_label = Counter(labels).most_common(1)[0][0]

        results.append({
            'CIK': cik,
            'Type': '10K',
            'Year': year,
            'Section': section,
            'Sentiment Score': avg_score,
            'Sentiment Label': majority_label
        })
    return results

In [None]:
# ------------------ Load & Stream Dataset -------------------------
# Run process_record over every record of every split and collect the rows.
splits = ["train", "test", "validation"]
all_results = []
sample_printed = False  # ensures the sanity-check sample is printed exactly once

for split in splits:
    ds = dataset[split]
    for record in tqdm(ds, desc=f"Processing {split}"):
        results = process_record(record)
        all_results.extend(results)
        # Show the first produced row once. The previous length check re-printed
        # on every filtered record and never fired if the first record gave two rows.
        if results and not sample_printed:
            print(all_results[:1])
            sample_printed = True

In [None]:
# Number of per-section result rows collected in all_results.
len(all_results)

In [None]:
# ------------------ Save Results ----------------------------------
# Turn the collected rows into a DataFrame and write it to OUTPUT_CSV_PATH.
df = pd.DataFrame(data=all_results)
df.to_csv(path_or_buf=OUTPUT_CSV_PATH, index=False)

## 10-Q

In [None]:
# Setup for the 10-Q run: input CSV, output CSV, and the sections to score.
SECTIONS = [
    "part_1_item_2",
    "part_2_item_1A",
]
DATASET_LOCAL_DIR = "./Data/10Q_combined_dataset.csv"
OUTPUT_CSV_PATH = "./Data/aggregated_sentiment_scores_10Q.csv"

In [None]:
# Read the CSV at DATASET_LOCAL_DIR into a DataFrame and preview its first rows.
dataset = pd.read_csv(filepath_or_buffer=DATASET_LOCAL_DIR)
dataset.head(5)

In [None]:
# Inspect the column names of the loaded frame.
dataset.columns

In [None]:
# ----------- Main Scoring Function for 10-Q, 8-K and 10-K -----------
def process_dataframe(df, sections=None):
    """Score the configured sections of every filing row in ``df``.

    Each section's text is split into chunks, classified, and aggregated into
    a mean signed score and a majority label.

    Args:
        df: DataFrame with ``'cik'``, ``'filing_date'`` and ``'filing_type'``
            columns plus one text column per section.
        sections: Column names to score. Defaults to the notebook-level
            ``SECTIONS`` at call time; pass it explicitly to avoid depending
            on which setup cell ran last.

    Returns:
        DataFrame with columns ``'CIK'``, ``'Type'``, ``'Filing Date'``,
        ``'Section'``, ``'Sentiment Score'`` and ``'Sentiment Label'``; one row
        per (filing, section) that has text.
    """
    from collections import Counter  # local import keeps this cell self-contained

    if sections is None:
        sections = SECTIONS

    results = []

    for _, row in tqdm(df.iterrows(), total=len(df)):
        cik = str(row['cik']).strip()
        filing_date = row['filing_date']
        filing_type = row['filing_type']

        for section in sections:
            text = row.get(section, '')
            # Missing cells are NaN (a float), so anything non-string counts as no text.
            if not isinstance(text, str) or not text.strip():
                continue

            chunks = get_section_chunks(text)
            if not chunks:
                continue

            sentiments = batched_predict(chunks)
            if not sentiments:
                continue
            scores = [s for s, _ in sentiments]
            labels = [l for _, l in sentiments]

            avg_score = sum(scores) / len(scores)
            # Counter breaks ties by first occurrence, so the result is reproducible;
            # max over a set depended on set iteration order.
            majority_label = Counter(labels).most_common(1)[0][0]

            results.append({
                'CIK': cik,
                'Type': filing_type,
                'Filing Date': filing_date,
                'Section': section,
                'Sentiment Score': avg_score,
                'Sentiment Label': majority_label
            })

    return pd.DataFrame(results)

In [None]:
# ----------- Calling Function -----------
# Score the 10-Q sections of the loaded frame.
results_10q = process_dataframe(df=dataset)

In [None]:
# Preview the first rows of the 10-Q results frame.
results_10q.head()

In [None]:
# ----------- Saving Result -----------
# Write the 10-Q scores to OUTPUT_CSV_PATH without the index column.
results_10q.to_csv(path_or_buf=OUTPUT_CSV_PATH, index=False)

## 8-K

In [None]:
# Setup for the 8-K run: the items to score, then the input and output CSV paths.
SECTIONS = [
    "item_2.02",  # Results of Operations and Financial Condition
    "item_8.01",  # Other Events
    "item_1.01",  # Entry into a Material Definitive Agreement
    "item_2.01",  # Completion of Acquisition or Disposition of Assets
    "item_5.02",  # Departure/Election of Directors or Officers; Compensatory Arrangements
    "item_2.05",  # Costs Associated with Exit or Disposal Activities
    "item_1.03",  # Bankruptcy or Receivership
]
DATASET_LOCAL_DIR, OUTPUT_CSV_PATH = (
    "./Data/8K_combined_dataset.csv",
    "./Data/aggregated_sentiment_scores_8K.csv",
)

In [None]:
# Read the CSV at DATASET_LOCAL_DIR into a DataFrame and preview its first rows.
dataset = pd.read_csv(filepath_or_buffer=DATASET_LOCAL_DIR)
dataset.head(5)

In [None]:
# Inspect the column names of the loaded frame.
dataset.columns

In [None]:
# ----------- Calling Function -----------
# Score the 8-K items of the loaded frame.
results_8k = process_dataframe(df=dataset)

In [None]:
# ----------- Saving Result -----------
# Write the 8-K scores to OUTPUT_CSV_PATH without the index column.
results_8k.to_csv(path_or_buf=OUTPUT_CSV_PATH, index=False)

## 10-K

In [None]:
# Setup for the 10-K run: input CSV, output CSV, and the sections to score.
SECTIONS = [
    "item_1A",
    "item_7",
]
DATASET_LOCAL_DIR = "./Data/10K_combined_dataset.csv"
OUTPUT_CSV_PATH = "./Data/aggregated_sentiment_scores_10K_new.csv"

In [None]:
# Read the CSV at DATASET_LOCAL_DIR into a DataFrame and preview its first rows.
dataset = pd.read_csv(filepath_or_buffer=DATASET_LOCAL_DIR)
dataset.head(5)

In [None]:
# ----------- Calling Function -----------
# Score the 10-K sections of the loaded frame.
results_10k = process_dataframe(df=dataset)

In [None]:
# ----------- Saving Result -----------
# Write the 10-K scores to OUTPUT_CSV_PATH. This cell previously saved
# results_8k, which put the 8-K scores into the 10-K output file.
results_10k.to_csv(OUTPUT_CSV_PATH, index=False)