In [29]:
import os
from typing import Tuple

import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from torch.optim import AdamW          # ← NEW: AdamW now comes from torch.optim
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    get_linear_schedule_with_warmup,  # ← keep this from transformers
)

## DATASET

In [30]:
class EtfSentimentDataset(Dataset):
    """
    Dataset for ETF sentiment regression.

    Each item pairs the text "<ticker> : <title>" with a 4-dim float target
    [sentiment_score, sentiment_positive, sentiment_negative, sentiment_neutral].

    Column names are lowercase and must match the DataFrame exactly.
    """

    # Columns concatenated into the model input text.
    TEXT_COLUMNS = ("ticker", "title")
    # Regression targets, in the order they appear in the label vector.
    LABEL_COLUMNS = (
        "sentiment_score",
        "sentiment_positive",
        "sentiment_negative",
        "sentiment_neutral",
    )

    def __init__(self, df: pd.DataFrame, tokenizer, max_length: int = 128):
        """
        Args:
            df: frame holding every column in TEXT_COLUMNS and LABEL_COLUMNS.
            tokenizer: callable invoked as
                tokenizer(text, padding=..., truncation=..., max_length=...,
                return_tensors="pt"); its result is indexed with
                "input_ids" and "attention_mask".
            max_length: length passed to the tokenizer for padding/truncation.

        Raises:
            KeyError: if any required column is absent (all missing names are
                reported at once).
        """
        # Fail early with the complete list instead of pandas' one-at-a-time KeyError.
        required = (*self.TEXT_COLUMNS, *self.LABEL_COLUMNS)
        missing = [col for col in required if col not in df.columns]
        if missing:
            raise KeyError(
                f"EtfSentimentDataset is missing columns: {missing}. "
                f"Available columns: {list(df.columns)}"
            )

        self.tokenizer = tokenizer
        self.max_length = max_length

        combined = df["ticker"].astype(str) + " : " + df["title"].astype(str)
        self.texts = combined.tolist()

        # Array of shape (N, 4), one row of targets per article.
        self.labels = df[list(self.LABEL_COLUMNS)].astype(float).values

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx: int):
        text = self.texts[idx]
        label_vec = self.labels[idx]   # shape: (4,)

        # Tokenization happens lazily, one sample per access.
        enc = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )

        return {
            # squeeze(0) removes the leading dim of the tokenizer output
            # (expected to be a batch dim of 1 with return_tensors="pt").
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            # 4-dim float32 tensor
            "labels": torch.tensor(label_vec, dtype=torch.float),
        }


## DATA LOADING

In [31]:
def load_article_csv(csv_path: str, ticker: str) -> pd.DataFrame:
    """
    Load one ETF's article CSV and prepare it for training.

    Steps: validate columns, apply the QQQ date cut-off, drop rows with
    missing title/sentiment values, drop non-ASCII titles for XLI, and add a
    constant "ticker" column.

    Args:
        csv_path: path (or anything pd.read_csv accepts) of the article file.
        ticker: ETF symbol; also selects the ticker-specific filters.

    Returns:
        A new DataFrame with the surviving rows plus a "ticker" column.

    Raises:
        KeyError: if a required column is missing ("date" is required only
            for QQQ, the one ticker whose filter reads it).
    """
    df = pd.read_csv(csv_path)

    required_cols = [
        "title",
        "sentiment_score",
        "sentiment_positive",
        "sentiment_negative",
        "sentiment_neutral",
    ]
    # Validate before any column access so a missing "date" produces the same
    # descriptive error as the other columns instead of a bare KeyError('date').
    needed_cols = required_cols + (["date"] if ticker == "QQQ" else [])

    missing = [c for c in needed_cols if c not in df.columns]
    if missing:
        raise KeyError(
            f"Missing columns in {csv_path}: {missing}\n"
            f"Available columns: {list(df.columns)}"
        )

    # QQQ only: keep articles dated 2021-01-01 or later.
    # NOTE(review): the column is compared as loaded (no date parsing), so this
    # assumes string dates that sort chronologically - confirm against the data.
    if ticker == "QQQ":
        df = df[df["date"] >= "2021-01-01"]

    # Drop rows with a missing title or any missing sentiment value.
    df = df.dropna(subset=required_cols)

    def is_basic_english(s: str) -> bool:
        # Crude language filter: keep only titles made entirely of ASCII characters.
        return isinstance(s, str) and s.isascii()

    # Applied to XLI only; to use it for every ticker, remove the `if`.
    if ticker == "XLI":
        # astype(bool) keeps the mask boolean even when the frame is empty, so
        # the indexing below is always a row filter.
        ascii_mask = df["title"].map(is_basic_english).astype(bool)
        df = df[ascii_mask]

    # assign() returns a new frame rather than writing into a filtered slice,
    # which is the pattern that triggers pandas' SettingWithCopyWarning.
    return df.assign(ticker=ticker)


def create_dataloaders(
    df: pd.DataFrame,
    tokenizer,
    batch_size: int = 16,
    max_length: int = 128,
    val_split: float = 0.1,
    seed: int = 42,
) -> Tuple[DataLoader, DataLoader]:
    """
    Wrap the frame in an EtfSentimentDataset and split it into train/val loaders.

    Args:
        df: article frame handed to EtfSentimentDataset.
        tokenizer: tokenizer handed to EtfSentimentDataset.
        batch_size: batch size for both loaders.
        max_length: token length handed to EtfSentimentDataset.
        val_split: fraction of samples held out for validation, in [0, 1).
        seed: seed for the train/val split so the same rows land in the same
            partition on every run.

    Returns:
        (train_loader, val_loader); the train loader shuffles, the val loader
        does not.

    Raises:
        ValueError: if val_split is outside [0, 1) or the dataset is empty.
    """
    if not 0.0 <= val_split < 1.0:
        raise ValueError(f"val_split must be in [0, 1), got {val_split}")

    dataset = EtfSentimentDataset(df, tokenizer, max_length=max_length)

    n_total = len(dataset)
    if n_total == 0:
        raise ValueError("Cannot create dataloaders from an empty dataset.")

    val_size = int(n_total * val_split)
    # int() truncates to 0 on small datasets; keep one validation sample when
    # a split was requested and a training sample would still remain, so the
    # validation loader is not silently empty.
    if val_split > 0 and val_size == 0 and n_total > 1:
        val_size = 1
    train_size = n_total - val_size

    # A dedicated generator makes the split reproducible without touching
    # the global torch RNG state.
    split_rng = torch.Generator().manual_seed(seed)
    train_ds, val_ds = random_split(
        dataset, [train_size, val_size], generator=split_rng
    )

    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False)
    return train_loader, val_loader


## MODEL BUILDING

In [32]:
def build_model(model_name: str = "bert-base-uncased"):
    """
    Load a pretrained tokenizer and a 4-output sequence model for regression.

    Args:
        model_name: checkpoint name or path passed to both `from_pretrained` calls.

    Returns:
        (tokenizer, model), with the model config's problem_type set to "regression".
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # One output per target: score, positive, negative, neutral.
    regressor = AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=4
    )
    regressor.config.problem_type = "regression"

    return tokenizer, regressor

## TRAIN/EVAL LOOPS

In [33]:
def train_one_epoch(model, loader, optimizer, scheduler, device):
    """
    Run one optimisation pass over `loader`.

    For every batch: zero the gradients, run the forward pass with labels,
    backpropagate `outputs.loss`, then step the optimizer and the scheduler.

    Returns:
        The mean of the per-batch losses (sum of batch losses / number of batches).
    """
    model.train()
    batch_losses = []
    tensor_keys = ("input_ids", "attention_mask", "labels")

    for batch in loader:
        optimizer.zero_grad()

        # Move exactly the tensors the model consumes onto the target device.
        inputs = {key: batch[key].to(device) for key in tensor_keys}

        loss = model(**inputs).loss
        loss.backward()
        optimizer.step()
        scheduler.step()  # stepped per batch, not per epoch

        batch_losses.append(loss.item())

    return sum(batch_losses, 0.0) / len(loader)


def evaluate(model, loader, device):
    """
    Score the model on `loader` without tracking gradients.

    Args:
        model: callable returning an object with `.loss` and `.logits` when
            given input_ids, attention_mask and labels.
        loader: iterable of batches with "input_ids", "attention_mask" and
            "labels" tensors.
        device: device the batch tensors are moved to.

    Returns:
        (mean_loss, mean_mae), both averaged per sample across the loader.

    Raises:
        ValueError: if the loader yields no samples (for example an empty
            validation split), instead of an opaque ZeroDivisionError.
    """
    model.eval()
    total_loss = 0.0
    total_mae = 0.0
    n = 0

    with torch.no_grad():
        for batch in loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)  # shape: (B, 4)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels,
            )

            loss = outputs.loss              # treated as a per-batch mean
            preds = outputs.logits           # shape: (B, 4)

            # Re-weight by batch size so a smaller final batch does not skew
            # the per-sample average.
            batch_size = labels.size(0)
            total_loss += loss.item() * batch_size

            # MAE per sample over the label dims, summed over the batch.
            batch_mae = torch.abs(preds - labels).mean(dim=1)  # (B,)
            total_mae += batch_mae.sum().item()

            n += batch_size

    if n == 0:
        raise ValueError(
            "evaluate() received a loader with no samples; "
            "cannot compute validation metrics."
        )

    return total_loss / n, total_mae / n

## High-level train & inference

In [34]:
def train_model_for_etf(
    csv_path: str,
    ticker: str,
    model_name: str = "bert-base-uncased",
    batch_size: int = 16,
    lr: float = 2e-5,
    epochs: int = 3,
    max_length: int = 128,
    output_dir: str = "./etf_sentiment_model_XLY",
):
    """
    Fine-tune a sentiment regressor on a single ETF's article file.

    Loads the CSV, builds the tokenizer/model and the train/val loaders, then
    trains for `epochs` epochs with AdamW and a linear warmup schedule (warmup
    is 10% of all steps). After each epoch the model and tokenizer are written
    to `output_dir` whenever the validation loss improves on the best so far.

    Call once per ETF with its own csv_path, ticker and output_dir.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Device:", device)

    # --- data + model -----------------------------------------------------
    articles = load_article_csv(csv_path, ticker)
    tokenizer, model = build_model(model_name)
    train_loader, val_loader = create_dataloaders(
        articles,
        tokenizer,
        batch_size=batch_size,
        max_length=max_length,
    )
    model.to(device)

    # --- optimisation setup -----------------------------------------------
    optimizer = AdamW(model.parameters(), lr=lr)
    n_steps = len(train_loader) * epochs
    n_warmup = int(0.1 * n_steps)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=n_warmup,
        num_training_steps=n_steps,
    )

    # --- training loop ----------------------------------------------------
    lowest_val_loss = float("inf")

    for epoch in range(1, epochs + 1):
        print(f"\nEpoch {epoch}/{epochs}")

        train_loss = train_one_epoch(
            model, train_loader, optimizer, scheduler, device
        )
        val_loss, val_mae = evaluate(model, val_loader, device)

        print(f"Train loss: {train_loss:.4f}")
        print(f"Val loss: {val_loss:.4f} | MAE: {val_mae:.4f}")

        # Checkpoint only on a strict improvement of the validation loss.
        if not (val_loss < lowest_val_loss):
            continue

        lowest_val_loss = val_loss
        os.makedirs(output_dir, exist_ok=True)
        model.save_pretrained(output_dir)
        tokenizer.save_pretrained(output_dir)
        print("Saved best model to", output_dir)

    print("Done.")


def load_trained_model(model_dir: str):
    """
    Load a saved tokenizer and model from `model_dir`, ready for inference.

    Returns:
        (tokenizer, model, device): the model has problem_type "regression",
        sits on CUDA when available (otherwise CPU), and is in eval mode.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    regressor = AutoModelForSequenceClassification.from_pretrained(model_dir)
    regressor.config.problem_type = "regression"

    regressor.to(device)
    regressor.eval()

    return tokenizer, regressor, device


def predict_sentiment_for_text(
    text: str,
    ticker: str,
    model_dir: str,
    max_length: int = 128,
):
    """
    Predict the four sentiment values for one piece of text.

    The model in `model_dir` is loaded on every call, the input is formatted
    as "<ticker> : <text>" and the four outputs are mapped, in order, to the
    keys below.

    Returns a dict with:
      - score
      - positive
      - negative
      - neutral
    """
    tokenizer, model, device = load_trained_model(model_dir)

    encoded = tokenizer(
        f"{ticker} : {text}",
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )

    with torch.no_grad():
        outputs = model(
            input_ids=encoded["input_ids"].to(device),
            attention_mask=encoded["attention_mask"].to(device),
        )

    # logits: shape (1, 4) -> plain list of 4 numbers
    vec = outputs.logits.squeeze(0).tolist()

    output_names = ("score", "positive", "negative", "neutral")
    return {name: float(vec[pos]) for pos, name in enumerate(output_names)}


if __name__ == "__main__":
    # Usage examples only: everything below is commented out, so running this
    # cell trains nothing and loads nothing. Uncomment the snippet you need.
    #
    # Example: train on your XLY file
    # train_model_for_etf(
    #     csv_path="data/raw/gdelt_articles_XLY.csv",
    #     ticker="XLY",
    #     output_dir="./etf_sentiment_model_XLY",
    # )

    # Example: inference after training (model_dir must match the output_dir
    # used for training)
    # s = predict_sentiment_for_text(
    #     "I think Trump will lower tariffs",
    #     ticker="XLY",
    #     model_dir="./etf_sentiment_model_XLY",
    # )
    # print("Predicted sentiment:", s)
    pass

## TRAINING RUNS & TESTING

In [35]:
# Tickers to train, one model per ticker. Add more symbols to the list to
# train further ETFs in the same run.
etf_tickers = ["QQQ"]

for ticker in etf_tickers:
    print(f"\n========== Training model for {ticker} ==========")

    # Input file and checkpoint folder are both derived from the ticker.
    csv_path = f"../data/raw/gdelt_articles_{ticker}.csv"
    output_dir = f"./sentiment_score_{ticker}"

    train_model_for_etf(
        csv_path=csv_path,
        ticker=ticker,
        model_name="bert-base-uncased",
        batch_size=16,   # tune to available memory
        lr=2e-5,
        epochs=2,        # raise to 3-4 if validation loss is still falling
        max_length=128,
        output_dir=output_dir,
    )


Device: cpu


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/2
Train loss: 0.0448
Val loss: 0.0124 | MAE: 0.0717
Saved best model to ./sentiment_score_QQQ

Epoch 2/2
Train loss: 0.0112
Val loss: 0.0093 | MAE: 0.0592
Saved best model to ./sentiment_score_QQQ
Done.


In [None]:
# Quick look at the raw XLI article file: load it unfiltered and show its
# (rows, columns) shape.
df = pd.read_csv("../data/raw/gdelt_articles_XLI.csv")
df.shape

In [None]:
# Single-ETF run for XLY. The checkpoint goes to ./sentiment_score, which is
# the folder the prediction cell below reads from.
ticker = "XLY"
csv_path = "../data/raw/gdelt_articles_XLY.csv"  # path is relative to the notebook's working directory

train_model_for_etf(
    csv_path=csv_path,
    ticker=ticker,
    model_name="bert-base-uncased",
    batch_size=16,
    lr=2e-5,
    epochs=2,          # kept small for a quick test run
    max_length=128,
    output_dir="./sentiment_score",
)

Device: cpu


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/2
Train loss: 0.0661
Val loss: 0.0332 | MAE: 0.1274
Saved best model to ./sentiment_score

Epoch 2/2
Train loss: 0.0371
Val loss: 0.0303 | MAE: 0.1172
Saved best model to ./sentiment_score
Done.


In [None]:
# Smoke-test inference: score one hand-written headline with the model saved
# in ./sentiment_score and print the four predicted values.
result = predict_sentiment_for_text(
    text="again more gains for clients",
    ticker="XLY",
    model_dir="./sentiment_score",
)
print(result)

{'score': -0.006985791027545929, 'positive': 0.07823057472705841, 'negative': 0.03691700100898743, 'neutral': 0.8261308073997498}


In [None]:
# Show the absolute location of the saved-model folder, resolved against the
# current working directory.
os.path.abspath("./sentiment_score")

'/Users/tiamathur/ETF-Sentiment-TechTreks/notebooks/sentiment_score'