# 🧠 Finance QA + Sentiment LLM (Training Pipeline)
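This notebook fine-tunes a small pre-trained Transformer (DistilBERT) as a single 3-class sequence classifier on the combined data from two tasks:
- Financial Question Answering (QA), with each answer encoded as an integer class label
- Financial Sentiment Classification (negative / neutral / positive)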

Runs on Google Colab free tier (8GB RAM).

In [None]:
# 📦 Install dependencies
# `-q` keeps pip's output quiet. The first command installs the libraries that
# the import cell of this notebook uses (transformers, datasets, scikit-learn,
# pandas).
!pip install -q transformers datasets scikit-learn pandas
# NOTE(review): neither accelerate nor bitsandbytes is imported directly in the
# cells visible here. Presumably accelerate is required by the Trainer backend;
# confirm whether bitsandbytes is still needed.
!pip install -q accelerate bitsandbytes

In [None]:
# 📁 Mount Google Drive if needed (optional)
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# 🔧 Imports
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score
import pandas as pd
from datasets import Dataset, concatenate_datasets

In [None]:
# 📊 Load the datasets
# Read both CSV files from /content/data into pandas DataFrames:
# the QA examples first, then the sentiment examples.
qa_df, sent_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/data/financial_qa_dataset.csv",
        "/content/data/sentiment_dataset.csv",
    )
)

In [None]:
# 🧼 Preprocess QA data
# The question becomes the model input ("text") and the answer becomes the
# integer class label ("label").
qa_df = qa_df.rename(columns={"question": "text", "answer": "label"})
qa_df["label"] = qa_df["label"].astype(int)
# NOTE(review): the classifier in this notebook is created with num_labels=3;
# confirm that every QA label falls in 0..2.

# 🧼 Preprocess sentiment data
sent_df = sent_df.rename(columns={"sentence": "text", "sentiment": "label"})
label_map = {"negative": 0, "neutral": 1, "positive": 2}
raw_sentiment = sent_df["label"]
# Normalise case/whitespace before mapping so "Positive " still resolves.
sent_df["label"] = raw_sentiment.astype(str).str.strip().str.lower().map(label_map)
# `Series.map` turns unknown values into NaN, which would silently produce
# float labels; fail loudly here instead of deep inside training.
unmapped = sent_df["label"].isna()
if unmapped.any():
    unknown_values = sorted(set(raw_sentiment[unmapped].astype(str)))
    raise ValueError(
        f"Unrecognised sentiment labels (expected one of {sorted(label_map)}): "
        f"{unknown_values}"
    )
sent_df["label"] = sent_df["label"].astype(int)

# 🧩 Combine and shuffle
# Keep only the two columns the model consumes so that extra CSV columns do
# not leak into the training dataset, and seed the shuffle for reproducibility.
full_df = (
    pd.concat(
        [qa_df[["text", "label"]], sent_df[["text", "label"]]],
        ignore_index=True,
    )
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
dataset = Dataset.from_pandas(full_df)

In [None]:
# ✂️ Tokenization
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(example):
    """Tokenize the "text" column of a batch of rows.

    Called by ``Dataset.map`` with ``batched=True``, so ``example["text"]``
    is a list of strings. Sequences longer than the model's maximum length
    are truncated.

    No padding is applied here: padding every row to the maximum length
    wastes memory and compute on short sentences. The Trainer in this
    notebook receives the tokenizer, so each batch is padded to its own
    longest sequence at collation time.
    """
    return tokenizer(example["text"], truncation=True)

tokenized_dataset = dataset.map(tokenize, batched=True)

In [None]:
# 📦 Load pre-trained model
# Instantiate the checkpoint named by `model_name` with a
# sequence-classification head that outputs three classes.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,
)

In [None]:
# ⚙️ Training setup
import inspect

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`
# and no longer accept the old keyword. The install cell does not pin a
# version, so pick whichever name the installed TrainingArguments exposes.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

# Evaluate once per epoch; checkpoints and logs go under /content.
args = TrainingArguments(
    output_dir="/content/checkpoints",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir="/content/logs",
    logging_steps=10,
    **{_eval_strategy_key: "epoch"},
)

def compute_metrics(eval_pred):
    """Score one evaluation pass.

    ``eval_pred`` unpacks into (predictions, labels). The predicted class is
    the argmax over the last axis of the predictions. Returns a dict with
    the accuracy and the weighted-average F1 score.
    """
    logits, gold = eval_pred
    predicted_classes = logits.argmax(axis=-1)
    metrics = {}
    metrics["accuracy"] = accuracy_score(gold, predicted_classes)
    metrics["f1"] = f1_score(gold, predicted_classes, average="weighted")
    return metrics

In [None]:
# 🏋️ Train!
import inspect

# Hold out 10% of the rows for evaluation. Evaluating on rows that are also
# in the training set reports leaked, overly optimistic metrics, and a fixed
# `select(range(100))` fails on datasets with fewer than 100 rows.
# The seed keeps the split reproducible across runs.
split = tokenized_dataset.train_test_split(test_size=0.1, seed=42)

# Newer transformers releases take the tokenizer as `processing_class` and
# deprecate the `tokenizer` keyword; older releases only know `tokenizer`.
_tokenizer_key = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=split["train"],
    eval_dataset=split["test"],
    compute_metrics=compute_metrics,
    **{_tokenizer_key: tokenizer},
)
trainer.train()