# Model Training: Fine-Tuning DistilRoBERTa

## Overview
We use the synthetic data generated in Notebook 1 (`1_Data_Prep_Synthetic.ipynb`) to fine-tune a `distilroberta-base` model.
Fine-tuning teaches the model to classify financial sentences as **Specific (1)** or **Vague (0)**.

## Steps
1. Load the tokenizer and model.
2. Tokenize the synthetic dataset.
3. Train using the Hugging Face `Trainer` API, evaluating accuracy after every epoch.
4. Save the fine-tuned model to `../models/gw_finetuned`.

In [None]:
import os

import evaluate
import numpy as np
from datasets import load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    set_seed,
)

# Turn off Weights & Biases logging so the training output stays uncluttered
os.environ.update(WANDB_DISABLED="true")

In [None]:
# Read the train/eval CSV splits written by 1_Data_Prep_Synthetic.ipynb
data_files = dict(
    train="../inputs/train_synthetic.csv",
    test="../inputs/eval_synthetic.csv",
)
dataset = load_dataset("csv", data_files=data_files)

# Print the loaded splits as a quick sanity check
print("Dataset loaded:", dataset, sep="\n")

In [None]:
# Load Tokenizer
# `model_name` is reused further down when the classification model is loaded,
# so the tokenizer and the model come from the same checkpoint.
model_name = "distilroberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_func(examples):
    """Tokenize the "text" entry of a batch of examples.

    Sequences are truncated and padded with ``padding="max_length"``. No
    explicit ``max_length`` is passed, so the tokenizer's own maximum applies.
    NOTE(review): that maximum is presumably much longer than a single
    sentence; dynamic padding would be cheaper but requires a padding
    collator in the Trainer -- confirm before changing.

    Args:
        examples: Batch mapping handed over by ``dataset.map``; it must
            contain a "text" entry.

    Returns:
        The tokenizer output for the batch.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True, padding="max_length")
    return encoded

# Apply tokenization
# batched=True passes `tokenize_func` a batch of rows per call rather than one row.
tokenized_datasets = dataset.map(tokenize_func, batched=True)

In [None]:
# Setup Evaluation Metric: Accuracy
# Loaded once here; `compute_metrics` below reuses this object on every call.
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score one evaluation pass with the accuracy metric.

    Args:
        eval_pred: Pair of ``(logits, labels)``; it is unpacked into exactly
            two values.

    Returns:
        The result of ``metric.compute`` for the predicted versus reference
        labels.
    """
    scores, gold_labels = eval_pred
    # The predicted class is the index of the largest logit along the last axis
    predicted_labels = np.argmax(scores, axis=-1)
    return metric.compute(references=gold_labels, predictions=predicted_labels)

In [None]:
# Training Arguments
# Using small batch sizes to ensure it runs smoothly
args = TrainingArguments(
    output_dir="../models/checkpoints",
    eval_strategy="epoch",  # evaluate on the eval split after every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    use_cpu=False,  # Will use MPS (Mac GPU) if available automatically
    # Switch off all logging integrations explicitly. The WANDB_DISABLED
    # environment variable is deprecated in recent transformers releases,
    # so it should not be the only thing keeping the output clean.
    report_to="none",
)

# Seed the RNGs *before* building the model. The classification head on top
# of the pretrained encoder is newly initialised, and Trainer applies
# `args.seed` only when it is constructed, which is after the model exists.
# Without this, runs are not reproducible.
set_seed(args.seed)

# Load Model
# num_labels=2 gives the model a two-class classification head.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
)

In [None]:
# Run Training
# The training arguments set eval_strategy="epoch", so the eval split is
# scored after each epoch during this call.
print("Starting training...")
trainer.train()

In [None]:
# Save the model for further analysis in 3_Greenwashing_Analysis.ipynb
save_path = "../models/gw_finetuned"
# Save the model and the tokenizer into the same directory so both can be
# loaded from `save_path` later.
trainer.save_model(save_path)
tokenizer.save_pretrained(save_path)

print(f"Fine-tuned model saved to {save_path}")