In [None]:
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from datasets import Dataset
warnings.filterwarnings('ignore')

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import LinearSVC
from sklearn.utils import resample
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
)
import torch
from datasets import Dataset



In [None]:
from notebooks.EDA import cleaning_and_processing_testdata, cleaning_and_processing, clean_tweet

In [None]:
# @title
# Training workbook with one sheet of annotated tweets per candidate.
# NOTE(review): '/content/...' is a Colab-style absolute path; adjust it when
# running outside Colab.
TRAINING_XLSX = '/content/training-Obama-Romney-tweets.xlsx'

# Passing a list of sheet names makes pandas open and parse the workbook once and
# return a {sheet_name: DataFrame} dict, instead of re-reading the file per sheet.
candidate_sheets = pd.read_excel(TRAINING_XLSX, sheet_name=['Obama', 'Romney'])

#Please : i renamed the dfs to add clarity
obama_df = candidate_sheets['Obama']
romney_df = candidate_sheets['Romney']

# NOTE(review): train_roberta_for_candidate reads the `label` and `clean_tweet`
# columns of these frames. Nothing in this cell creates them; presumably the
# cleaning helpers imported from notebooks.EDA are meant to be applied here.
# Confirm before running on a fresh kernel.


In [None]:
def train_roberta_for_candidate(candidate: str, df=None):
    """
    Fine-tune the Cardiff Twitter RoBERTa sentiment model on one candidate's tweets.

    Please : call this with 'Obama' or 'Romney'.
    It will:
      - use the corresponding df (obama_df or romney_df) unless `df` is given
      - keep rows labelled -1/0/1 that have a non-missing `clean_tweet`
      - make a stratified 80/20 train/validation split (random_state=42)
      - fine-tune Cardiff twitter RoBERTa
      - print eval metrics + classification report (-1,0,1)
      - save the model + tokenizer in ./models/roberta_<candidate>_cardiff

    Parameters
    ----------
    candidate : str
        'Obama' or 'Romney'; case and surrounding whitespace are ignored.
        Also used to name the checkpoint and save directories.
    df : pandas.DataFrame, optional
        Frame to train on instead of the notebook-level obama_df / romney_df.
        Must contain the `label` and `clean_tweet` columns.

    Returns
    -------
    tuple
        ``(trainer, eval_results)``: the fitted ``Trainer`` and the dict
        returned by ``trainer.evaluate()`` on the validation split.

    Raises
    ------
    ValueError
        If `candidate` is not 'Obama' or 'Romney', or if no usable rows remain
        after filtering.
    KeyError
        If the frame lacks the `label` or `clean_tweet` column.
    """
    # Local import: only needed to probe the installed Trainer signature below.
    import inspect

    name = candidate.strip().capitalize()
    if name not in ("Obama", "Romney"):
        raise ValueError("candidate must be 'Obama' or 'Romney'")

    # An explicitly passed frame wins; otherwise fall back to the notebook globals.
    if df is not None:
        base_df = df
    elif name == "Obama":
        base_df = obama_df
    else:
        base_df = romney_df

    # Fail early with an actionable message instead of a bare KeyError further
    # down when the frame has not been through the preprocessing step.
    missing_cols = [col for col in ("label", "clean_tweet") if col not in base_df.columns]
    if missing_cols:
        raise KeyError(
            f"{name} frame is missing column(s) {missing_cols}; "
            "run the cleaning/preprocessing step before training"
        )

    print(f"\n===== Training RoBERTa (Cardiff) for {name} tweets =====\n")

    #Helps us map our dataset labels to/from roberta labels
    label_to_id = {-1: 0, 0: 1, 1: 2}
    id_to_label = {v: k for k, v in label_to_id.items()}

    #keep only labels -1,0,1 that also have text.
    # Without the notna() filter, astype(str) below would turn a missing tweet
    # into the literal training text "nan".
    labelled = base_df["label"].isin([-1, 0, 1])
    has_text = base_df["clean_tweet"].notna()
    n_missing_text = int((labelled & ~has_text).sum())
    if n_missing_text:
        print(f"Skipping {n_missing_text} labelled {name} rows with a missing clean_tweet")

    exp_df = base_df[labelled & has_text].copy()
    if exp_df.empty:
        raise ValueError(
            f"no {name} rows with a label in (-1, 0, 1) and a non-missing clean_tweet"
        )
    exp_df["label_id"] = exp_df["label"].map(label_to_id).astype(int)

    #texts and numeric labels as plain lists for the split / Dataset below
    texts = exp_df["clean_tweet"].astype(str).tolist()
    labels = exp_df["label_id"].tolist()

    #Training and validation split (stratified so class ratios match in both parts)
    X_train, X_val, y_train, y_val = train_test_split(
        texts,
        labels,
        test_size=0.2,
        random_state=42,
        stratify=labels,
    )

    train_ds = Dataset.from_dict({"text": X_train, "label": y_train})
    val_ds   = Dataset.from_dict({"text": X_val,   "label": y_val})

    #load RoBERTa tokenizer
    model_path = "cardiffnlp/twitter-roberta-base-sentiment"
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    def tokenize_batch(batch):
        """Tokenize a batch of texts, padded/truncated to a fixed length."""
        return tokenizer(
            batch["text"],
            padding="max_length",
            truncation=True,
            max_length=96,  # Please : adjust later if needed
        )

    #apply to train and validation sets
    train_ds_tokenized = train_ds.map(tokenize_batch, batched=True)
    val_ds_tokenized   = val_ds.map(tokenize_batch, batched=True)

    #format for PyTorch
    train_ds_tokenized = train_ds_tokenized.remove_columns(["text"])
    val_ds_tokenized   = val_ds_tokenized.remove_columns(["text"])

    train_ds_tokenized.set_format("torch")
    val_ds_tokenized.set_format("torch")

    #Metrics function for Trainer
    def compute_metrics(eval_pred):
        """Return accuracy and macro precision/recall/F1 for a (logits, labels) pair."""
        logits, labels_np = eval_pred
        preds = np.argmax(logits, axis=-1)

        acc = accuracy_score(labels_np, preds)
        precision, recall, f1, _ = precision_recall_fscore_support(
            labels_np, preds, average="macro", zero_division=0
        )
        return {
            "accuracy": acc,
            "macro_f1": f1,
            "macro_precision": precision,
            "macro_recall": recall,
        }

    #load RoBERTa model
    model = AutoModelForSequenceClassification.from_pretrained(
        model_path,
        num_labels=3,  # classes: 0,1,2 corresponding to -1,0,1
    )

    #Training Arguments
    training_args = TrainingArguments(
        output_dir=f"./roberta_{name.lower()}_cardiff",   #Please : folder where checkpoints/logs are saved
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="macro_f1",
        num_train_epochs=4,                    #will change later in tuning
        per_device_train_batch_size=16,
        per_device_eval_batch_size=32,
        learning_rate=2e-5,
        weight_decay=0.01,
        logging_steps=50,
        report_to="none",
    )

    # `processing_class` supersedes the deprecated `tokenizer` argument of Trainer
    # in recent transformers releases; use whichever this installation accepts.
    trainer_params = inspect.signature(Trainer.__init__).parameters
    tokenizer_kwarg = "processing_class" if "processing_class" in trainer_params else "tokenizer"

    #Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_ds_tokenized,
        eval_dataset=val_ds_tokenized,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=1)],
        **{tokenizer_kwarg: tokenizer},
    )

    #Train and evaluate
    #Please : this tunes the Cardiff twitter roberta model on {name} tweets
    trainer.train()

    eval_results = trainer.evaluate()
    print(f"==== RoBERTa (Cardiff) {name} eval metrics ====")
    print(eval_results)

    #Detailed classification report in terms of -1,0,1
    pred_output = trainer.predict(val_ds_tokenized)
    logits = pred_output.predictions
    pred_ids = np.argmax(logits, axis=-1)

    # y_val holds the label_id values (0,1,2) from the split above;
    # map both sides back to -1,0,1 using id_to_label
    true_labels = [id_to_label[int(i)] for i in y_val]
    pred_labels = [id_to_label[int(i)] for i in pred_ids]

    print(f"\n==== RoBERTa (Cardiff) {name} - classification report (labels -1,0,1) ====\n")
    print(classification_report(true_labels, pred_labels))

    #Save the fine-tuned model for later
    save_dir = f"./models/roberta_{name.lower()}_cardiff"
    trainer.save_model(save_dir)
    tokenizer.save_pretrained(save_dir)
    print(f"\nSaved fine-tuned {name} RoBERTa model to: {save_dir}")

    return trainer, eval_results


In [None]:
# Fine-tune on the Obama tweets; keep the fitted Trainer and its validation metrics.
trainer_obama, obama_results = train_roberta_for_candidate(candidate="Obama")

In [None]:
# Fine-tune on the Romney tweets; keep the fitted Trainer and its validation metrics.
trainer_romney, romney_results = train_roberta_for_candidate(candidate="Romney")