# **Final report with test:**

In [1]:
import os, re, html, torch, numpy as np, pandas as pd
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
import torch
from transformers import DataCollatorWithPadding
import torch.nn as nn

In [2]:
#collab
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [27]:
# === Load data ===
test_df = pd.read_csv('/content/drive/MyDrive/deep_learning/val_processed.csv', encoding='latin1')


In [15]:
# --------- config ----------
MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment-latest"
MODEL_DIR  = "/content/drive/MyDrive/deep_learning"

# Map model files to the preprocessing they were trained with
MODEL_PREPROC = {
    "best_model0.pt": "clean",
    "best_model2.pt": "clean",
    "best_model3.pt": "clean",
    "best_model4.pt": "clean",
    "best_model5.pt": "light",
    "best_model6.pt": "light"# trained on light/dirty pipeline
}

# Your label order
ordered_labels = ['Extremely Negative', 'Negative', 'Neutral', 'Positive', 'Extremely Positive']
label2id = {lbl:i for i,lbl in enumerate(ordered_labels)}
id2label = {i:lbl for lbl,i in label2id.items()}

In [28]:
test_df["label"] = test_df["Sentiment"].map(label2id)

In [29]:
def clean_for_cardiffnlp(text):
    if pd.isnull(text):
        return ""

    tokens = []
    for t in text.split(" "):
        if t.startswith("@") and len(t) > 1:
            tokens.append("@user")
        elif t.startswith("http"):
            tokens.append("http")
        else:
            tokens.append(t)
    text = " ".join(tokens)

    # Normalize common COVID variants to "covid"
    text = re.sub(r"\b(coronaviruspandemic|covid[_\s-]*2019|covid[_\s-]*19|covid2019|coronavirus2019|coronavirus|corona)\b", "covid", text, flags=re.IGNORECASE)

    # Decode HTML entities
    text = html.unescape(text)

    # Normalize whitespace and repeated punctuation (optional)
    text = re.sub(r"\s+", " ", text).strip()
    text = re.sub(r"(\.\s*){2,}", ". ", text)
    text = re.sub(r"([!?]){2,}", r"\1", text)
    text = re.sub(r"(\?\s+){2,}", "?", text)
    text = re.sub(r"(\!\s+){2,}", "!", text)

    return text


In [30]:
# Build a light/dirty column for test alongside your existing ProcessedTweet
test_df["ProcessedTweet_light"] = test_df["OriginalTweet"].apply(clean_for_cardiffnlp)

In [31]:
BASE_CHECKPOINT = "cardiffnlp/twitter-roberta-base-sentiment-latest"
LABEL_COL = "label"
BATCH_SIZE = 128
NUM_WORKERS = 2
PIN_MEMORY = True


# ==== tokenizer + collator ====
tokenizer = AutoTokenizer.from_pretrained(BASE_CHECKPOINT, use_fast=True)
collator = DataCollatorWithPadding(tokenizer=tokenizer)

# ==== Class Dataset ====
class TextClsDataset(Dataset):
    def __init__(self, df, text_col, label_col):
        self.texts = df[text_col].astype(str).tolist()
        self.labels = df[label_col].astype(int).tolist()

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        enc = tokenizer(
            self.texts[idx],
            truncation=True,
            max_length=128,   # לשנות אם צריך
            return_tensors=None
        )
        enc["labels"] = self.labels[idx]
        return enc

# prepere datasets
test_ds_clean = TextClsDataset(test_df, text_col="ProcessedTweet",        label_col=LABEL_COL)
test_ds_light = TextClsDataset(test_df, text_col="ProcessedTweet_light",  label_col=LABEL_COL)

# DataLoaders
test_loader_clean = DataLoader(
    test_ds_clean,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=NUM_WORKERS,
    pin_memory=PIN_MEMORY,
    collate_fn=collator
)

test_loader_light = DataLoader(
    test_ds_light,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=NUM_WORKERS,
    pin_memory=PIN_MEMORY,
    collate_fn=collator
)

In [20]:
# ============ CUSTOM ROBERTA WITH DROPOUT ============
class RobertaWithDropout(nn.Module):
    """
    Custom RoBERTa model with configurable dropout
    """
    def __init__(self, model_name, num_labels, dropout_rate=0.1):
        super(RobertaWithDropout, self).__init__()

        self.roberta = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout_rate)
        self.classifier = nn.Linear(self.roberta.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask=None):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return type('obj', (object,), {'logits': logits})()


FOR REGULAR Hipper Farameters Fine Tuning (only optuna):

In [32]:
MODEL_DIR = "/content/drive/MyDrive/deep_learning"

# Map -> preprocessing
MODEL_PREPROC = {
    "best_model0.pt": "clean",
    "best_model2.pt": "clean",
    "best_model3.pt": "clean",
    "best_model4.pt": "clean",
    "best_model5.pt": "light",
    "best_model6.pt": "light",
}

# Eval Function
def evaluate_model(model, dataloader, device):
    model.eval()
    all_preds, all_labels = [], []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Report
    print("Classification Report:")
    print(classification_report(
        all_labels, all_preds,
        target_names=['Extremely Negative', 'Extremely Positive', 'Negative', 'Neutral', 'Positive']
    ))

    return {
        "accuracy": accuracy_score(all_labels, all_preds),
        "f1": f1_score(all_labels, all_preds, average='macro'),
        "precision": precision_score(all_labels, all_preds, average='macro'),
        "recall": recall_score(all_labels, all_preds, average='macro')
    }

# Eval All Models
def evaluate_all_models(model_dir, model_preproc_map, test_loader_clean, test_loader_light, device):
    results = {}

    for model_file, preproc_type in model_preproc_map.items():
        print(f"\n=== Evaluating {model_file} ({preproc_type}) ===")

        model_path = os.path.join(model_dir, model_file)
        model = RobertaWithDropout(
        "cardiffnlp/twitter-roberta-base-sentiment-latest",
        num_labels=5,
        dropout_rate=0.2)
        model.load_state_dict(torch.load(model_path, map_location=device))
        model.to(device)

        if preproc_type == "clean":
            test_loader = test_loader_clean
        elif preproc_type == "light":
            test_loader = test_loader_light
        else:
            raise ValueError(f"Unknown preprocessing type: {preproc_type}")


        metrics = evaluate_model(model, test_loader, device)
        results[model_file] = metrics


    print("\n=== Summary of All Models ===")
    for model_file, metrics in results.items():
        print(f"{model_file}: "
              f"Acc={metrics['accuracy']:.4f}, "
              f"F1={metrics['f1']:.4f}, "
              f"Precision={metrics['precision']:.4f}, "
              f"Recall={metrics['recall']:.4f}")

    return results


In [33]:
results = evaluate_all_models(MODEL_DIR, MODEL_PREPROC, test_loader_clean, test_loader_light, device)


=== Evaluating best_model0.pt (clean) ===
Classification Report:
                    precision    recall  f1-score   support

Extremely Negative       0.85      0.85      0.85      1096
Extremely Positive       0.85      0.78      0.81      1983
          Negative       0.84      0.89      0.86      1543
           Neutral       0.84      0.87      0.86      2285
          Positive       0.90      0.88      0.89      1325

          accuracy                           0.85      8232
         macro avg       0.85      0.86      0.85      8232
      weighted avg       0.85      0.85      0.85      8232


=== Evaluating best_model2.pt (clean) ===
Classification Report:
                    precision    recall  f1-score   support

Extremely Negative       0.84      0.91      0.87      1096
Extremely Positive       0.85      0.83      0.84      1983
          Negative       0.90      0.87      0.88      1543
           Neutral       0.84      0.87      0.86      2285
          Positive      

FOR API Hipper Farameters Fine Tuning:

In [34]:
import os
import torch
import torch.nn as nn
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, DataCollatorWithPadding
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report

# ===== config =====
MODEL_DIR = "/content/drive/MyDrive/deep_learning"
BASE_CHECKPOINT = "cardiffnlp/twitter-roberta-base-sentiment-latest"
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# מודלים להערכה (רק API)
MODEL_PREPROC = {
    "HF_best_model_stage1.pt": "clean",
    "HF_best_model_stage2.pt": "clean",# trained on light/dirty pipeline
    "HF_best_model_stage3.pt": "light",
}

ORDERED_LABELS = ['Extremely Negative', 'Negative', 'Neutral', 'Positive', 'Extremely Positive']
LABEL2ID = {l:i for i,l in enumerate(ORDERED_LABELS)}

# ===== tokenizer + collator =====
tokenizer = AutoTokenizer.from_pretrained(BASE_CHECKPOINT, use_fast=True)
collator = DataCollatorWithPadding(tokenizer=tokenizer)

# ===== preprocessing functions =====
def preprocess_clean(t: str) -> str:
    import re, html
    if not isinstance(t, str): return ""
    toks=[]
    for w in t.split(" "):
        if w.startswith("@") and len(w)>1: toks.append("@user")
        elif w.startswith("http"): toks.append("http")
        else: toks.append(w)
    t=" ".join(toks)
    t=re.sub(r"\b(coronaviruspandemic|covid[_\s-]*2019|covid[_\s-]*19|covid2019|coronavirus2019|coronavirus|corona)\b",
             "covid", t, flags=re.IGNORECASE)
    t=html.unescape(t)
    t=re.sub(r"\s+"," ",t).strip()
    return t

def preprocess_light(t: str) -> str:
    import re
    if not isinstance(t, str): return ""
    t=re.sub(r'https?://\S+','HTTPURL',t)
    t=re.sub(r'@\w+','@USER',t)
    t=re.sub(r'\s+',' ',t).strip()
    return t

# ===== dataset =====
class TDataset(Dataset):
    def __init__(self, texts, labels, max_len=128):
        self.texts=texts; self.labels=labels; self.max_len=max_len
    def __len__(self): return len(self.texts)
    def __getitem__(self, i):
        enc = tokenizer(self.texts[i], truncation=True, max_length=self.max_len, return_tensors="pt")
        item = {k:v.squeeze(0) for k,v in enc.items()}
        item["labels"] = torch.tensor(self.labels[i], dtype=torch.long)
        return item

# ===== model class =====
class RobertaWithDropout(nn.Module):
    def __init__(self, model_name, num_labels=5, dropout_rate=0.2):
        super().__init__()
        self.roberta = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout_rate)
        self.classifier = nn.Linear(self.roberta.config.hidden_size, num_labels)
    def forward(self, input_ids, attention_mask=None, labels=None):
        out = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        cls = out.last_hidden_state[:,0,:]
        logits = self.classifier(self.dropout(cls))
        return {"logits": logits}

# ===== evaluation =====
def evaluate_model(model, dataloader):
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in dataloader:
            ids = batch["input_ids"].to(DEVICE)
            mask = batch.get("attention_mask")
            if mask is not None:
                mask = mask.to(DEVICE)
            labels = batch["labels"].cpu().numpy()
            logits = model(ids, attention_mask=mask)["logits"]
            preds = torch.argmax(logits, dim=1).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels)
    print(classification_report(all_labels, all_preds, target_names=ORDERED_LABELS, digits=4))
    return {
        "accuracy": accuracy_score(all_labels, all_preds),
        "f1": f1_score(all_labels, all_preds, average='macro'),
        "precision": precision_score(all_labels, all_preds, average='macro'),
        "recall": recall_score(all_labels, all_preds, average='macro')
    }

# ===== run all API models =====
test_df["label"] = test_df["Sentiment"].map(LABEL2ID)

for model_file, preproc_type in MODEL_PREPROC.items():
    print(f"\n=== Evaluating {model_file} ({preproc_type}) ===")
    if preproc_type == "clean":
        texts = [preprocess_clean(x) for x in test_df["OriginalTweet"].astype(str)]
    else:
        texts = [preprocess_light(x) for x in test_df["OriginalTweet"].astype(str)]
    labels = test_df["label"].tolist()

    ds = TDataset(texts, labels)
    dl = DataLoader(ds, batch_size=128, shuffle=False, collate_fn=collator)

    model = RobertaWithDropout(BASE_CHECKPOINT, num_labels=len(ORDERED_LABELS), dropout_rate=0.2)
    model.load_state_dict(torch.load(os.path.join(MODEL_DIR, model_file), map_location=DEVICE))
    model.to(DEVICE)

    metrics = evaluate_model(model, dl)
    print(f"Acc={metrics['accuracy']:.4f}, F1={metrics['f1']:.4f}, "
          f"Precision={metrics['precision']:.4f}, Recall={metrics['recall']:.4f}")



=== Evaluating HF_best_model_stage1.pt (clean) ===
                    precision    recall  f1-score   support

Extremely Negative     0.8355    0.8942    0.8638      1096
          Negative     0.8527    0.8114    0.8315      1983
           Neutral     0.8873    0.8678    0.8775      1543
          Positive     0.8337    0.8425    0.8380      2285
Extremely Positive     0.8597    0.8785    0.8690      1325

          accuracy                         0.8524      8232
         macro avg     0.8538    0.8589    0.8560      8232
      weighted avg     0.8527    0.8524    0.8523      8232

Acc=0.8524, F1=0.8560, Precision=0.8538, Recall=0.8589

=== Evaluating HF_best_model_stage2.pt (clean) ===
                    precision    recall  f1-score   support

Extremely Negative     0.8834    0.8504    0.8666      1096
          Negative     0.8347    0.8427    0.8386      1983
           Neutral     0.8347    0.9067    0.8692      1543
          Positive     0.8557    0.8280    0.8416      22