# **Final report with test:**

In [None]:
import os, re, html, torch, numpy as np, pandas as pd
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
import torch
from pathlib import Path
from transformers import DataCollatorWithPadding
import torch.nn as nn

In [None]:
# #collab
# from google.colab import drive
# drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Prefer the GPU when PyTorch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [None]:
# === Load data ===
# Colab variant, kept for reference:
#test_df = pd.read_csv('/content/drive/MyDrive/deep_learning/val_processed.csv', encoding='latin1')
import pandas as pd

# Walk up from the notebook's working directory to locate the data folder.
current_dir = Path.cwd()             # expected to be models/bert-base-uncased/
models_dir = current_dir.parent      # models/
project_root = models_dir.parent     # project root
data_dir = models_dir / 'data'

# Read the three processed splits (test, train, validation) in that order.
test_df, train_df, eval_df = (
    pd.read_csv(data_dir / csv_name, encoding='latin1')
    for csv_name in ('test_processed.csv', 'train_processed.csv', 'val_processed.csv')
)

print(f"Test shape: {test_df.shape}")

In [None]:
# --------- config ----------
# Colab variant: MODEL_DIR = "/content/drive/MyDrive/deep_learning"
MODEL_NAME = "bert-base-uncased"
MODEL_DIR = data_dir  # checkpoints are looked up in the same folder as the CSV splits

# Checkpoint file name -> name of the preprocessing it was trained with.
MODEL_PREPROC = dict.fromkeys(
    (
        "best_BERT_base_uncasedmodel1.pt",
        "best_BERT_base_uncasedmodel2.pt",
        "best_BERT_base_uncasedmodel3.pt",
    ),
    "clean",
)

# Sentiment classes; a label's position in this list is its integer class id.
ordered_labels = ['Extremely Negative', 'Negative', 'Neutral', 'Positive', 'Extremely Positive']
label2id = dict(zip(ordered_labels, range(len(ordered_labels))))
id2label = dict(enumerate(ordered_labels))


In [None]:
# Encode the sentiment strings as integer class ids. Series.map leaves NaN for
# any value that is not a key of label2id, which would otherwise only surface
# later as a cast error when the dataset converts the column to int, so stop
# here and show the offending values.
test_df["label"] = test_df["Sentiment"].map(label2id)
_unmapped_sentiments = test_df.loc[test_df["label"].isna(), "Sentiment"].unique().tolist()
if _unmapped_sentiments:
    raise ValueError(f"Unmapped Sentiment values in test_df: {_unmapped_sentiments}")

In [None]:
def clean_for_cardiffnlp(text):
    """Lightly normalise a raw tweet.

    Steps, in order: mask user mentions as ``@user`` and links as ``http``
    (decided per space-separated token), fold common COVID spellings into
    ``covid``, decode HTML entities, collapse whitespace, then squash runs of
    repeated ``.``, ``!`` and ``?``.

    Args:
        text: Tweet text; null values (``None`` / ``NaN``) are accepted.

    Returns:
        The normalised string, or ``""`` when ``text`` is null.
    """
    if pd.isnull(text):
        return ""

    def _mask(token):
        # A lone "@" is not treated as a mention and passes through unchanged.
        if token.startswith("@") and len(token) > 1:
            return "@user"
        if token.startswith("http"):
            return "http"
        return token

    text = " ".join(_mask(token) for token in text.split(" "))

    # Normalize common COVID variants to "covid"
    covid_variants = r"\b(coronaviruspandemic|covid[_\s-]*2019|covid[_\s-]*19|covid2019|coronavirus2019|coronavirus|corona)\b"
    text = re.sub(covid_variants, "covid", text, flags=re.IGNORECASE)

    # Decode HTML entities
    text = html.unescape(text)

    # Collapse whitespace first, then apply the punctuation rules in sequence.
    text = re.sub(r"\s+", " ", text).strip()
    punctuation_rules = (
        (r"(\.\s*){2,}", ". "),
        (r"([!?]){2,}", r"\1"),
        (r"(\?\s+){2,}", "?"),
        (r"(\!\s+){2,}", "!"),
    )
    for pattern, replacement in punctuation_rules:
        text = re.sub(pattern, replacement, text)

    return text


In [None]:
# Derive the lightly cleaned text from the raw tweets and store it next to the
# existing ProcessedTweet column.
test_df["ProcessedTweet_light"] = test_df["OriginalTweet"].map(clean_for_cardiffnlp)

In [None]:
BASE_CHECKPOINT = "bert-base-uncased"
LABEL_COL = "label"
BATCH_SIZE = 128
NUM_WORKERS = 2
# Pinned (page-locked) host memory only helps host-to-GPU copies. On a CPU-only
# run DataLoader warns about pin_memory=True and ignores it, so follow CUDA
# availability, like the device selection does.
PIN_MEMORY = torch.cuda.is_available()


# ==== tokenizer + collator ====
# Fast tokenizer for the base checkpoint; the collator pads the variable-length
# encodings produced by the dataset when it assembles a batch.
tokenizer = AutoTokenizer.from_pretrained(BASE_CHECKPOINT, use_fast=True)
collator = DataCollatorWithPadding(tokenizer=tokenizer)

# ==== Class Dataset ====
class TextClsDataset(Dataset):
    """Map-style dataset that tokenizes one text per item, on access.

    Args:
        df: DataFrame holding the text and label columns.
        text_col: Name of the text column; values are cast to ``str``.
        label_col: Name of the label column; values are cast to ``int``.
        max_length: Truncation length in tokens (default 128).
        tokenizer_override: Callable used to encode a text. When ``None`` the
            module-level ``tokenizer`` that exists at construction time is used.
    """

    def __init__(self, df, text_col, label_col, max_length=128, tokenizer_override=None):
        self.texts = df[text_col].astype(str).tolist()
        self.labels = df[label_col].astype(int).tolist()
        self.max_length = max_length  # change if needed
        # Bind the tokenizer now rather than looking up the global on every
        # item: a later cell rebinds the module-level `tokenizer`, and an
        # existing dataset should keep encoding with the one it was built for.
        self.tokenizer = tokenizer if tokenizer_override is None else tokenizer_override

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # No padding here; sequences are padded per batch by the collate function.
        enc = self.tokenizer(
            self.texts[idx],
            truncation=True,
            max_length=self.max_length,
            return_tensors=None
        )
        enc["labels"] = self.labels[idx]
        return enc

# Two datasets over the same test rows: one reads the ProcessedTweet column,
# the other the ProcessedTweet_light column.
test_ds_clean = TextClsDataset(test_df, text_col="ProcessedTweet", label_col=LABEL_COL)
test_ds_light = TextClsDataset(test_df, text_col="ProcessedTweet_light", label_col=LABEL_COL)

# Both loaders share every setting; shuffle=False keeps the row order.
_loader_kwargs = dict(
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=NUM_WORKERS,
    pin_memory=PIN_MEMORY,
    collate_fn=collator,
)
test_loader_clean = DataLoader(test_ds_clean, **_loader_kwargs)
test_loader_light = DataLoader(test_ds_light, **_loader_kwargs)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
# Use the same model class as in training
class _ModelOutput:
    """Minimal result holder exposing ``.logits``."""

    def __init__(self, logits):
        self.logits = logits


class ConfigurableBertModel(nn.Module):
    """Pretrained encoder followed by dropout and a linear classification head.

    The attribute names ``encoder``, ``dropout`` and ``classifier`` determine
    the state-dict keys, so they must stay as they are for saved checkpoints
    to load.

    Args:
        model_name: Checkpoint name or path passed to ``AutoModel.from_pretrained``.
        num_labels: Number of output classes.
        dropout_rate: Dropout probability applied to the first-token state.
    """

    def __init__(self, model_name: str, num_labels: int, dropout_rate: float = 0.1):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout_rate)
        self.classifier = nn.Linear(self.encoder.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask=None):
        """Return an object whose ``.logits`` holds one row of class scores per input."""
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0]  # CLS token
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        # Return a plain instance rather than creating a new class with
        # type(...) on every call: class objects are part of reference cycles,
        # so the logits stored on them are released only by the cyclic garbage
        # collector instead of as soon as the caller drops the result.
        return _ModelOutput(logits)



For regular hyperparameter fine-tuning (Optuna only):

In [None]:
# Colab variant: MODEL_DIR = "/content/drive/MyDrive/deep_learning"
MODEL_DIR = data_dir
# Checkpoint file name -> preprocessing variant to evaluate it with.
MODEL_PREPROC = dict.fromkeys(
    (
        "best_BERT_base_uncasedmodel1.pt",
        "best_BERT_base_uncasedmodel2.pt",
        "best_BERT_base_uncasedmodel3.pt",
    ),
    "clean",
)

# Evaluation function
def evaluate_model(model, dataloader, device):
    """Run inference over ``dataloader`` and report classification metrics.

    Prints a per-class classification report and returns the overall scores.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            whose result exposes ``.logits``.
        dataloader: Iterable of batches containing ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.
        device: Device the inputs are moved to before the forward pass.

    Returns:
        dict with ``accuracy`` and macro-averaged ``f1``, ``precision`` and
        ``recall``.
    """
    class_names = ['Extremely Negative', 'Negative', 'Neutral', 'Positive', 'Extremely Positive']

    model.eval()
    all_preds, all_labels = [], []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1)

            all_preds.extend(preds.cpu().numpy())
            # Labels are only compared on the host, so they never need to
            # visit the device.
            all_labels.extend(batch['labels'].cpu().numpy())

    # Classification report. `labels` is passed explicitly so the report still
    # works when a class is absent from both the targets and the predictions;
    # without it the number of observed classes would not match `target_names`
    # and scikit-learn raises.
    print("Classification Report:")
    print(classification_report(
        all_labels, all_preds,
        labels=list(range(len(class_names))),
        target_names=class_names
    ))

    return {
        "accuracy": accuracy_score(all_labels, all_preds),
        "f1": f1_score(all_labels, all_preds, average='macro'),
        "precision": precision_score(all_labels, all_preds, average='macro'),
        "recall": recall_score(all_labels, all_preds, average='macro')
    }


# Evaluate all saved models
def evaluate_all_models(model_dir, model_preproc_map, test_loader_clean, device, test_loader_light=None):
    """Evaluate every checkpoint in ``model_preproc_map`` and print a summary.

    Args:
        model_dir: Folder containing the checkpoint files.
        model_preproc_map: Mapping of checkpoint file name to the preprocessing
            variant (``"clean"`` or ``"light"``) it must be evaluated with.
        test_loader_clean: DataLoader used for ``"clean"`` checkpoints.
        device: Device used for loading the weights and for inference.
        test_loader_light: DataLoader used for ``"light"`` checkpoints. When
            omitted, a module-level ``test_loader_light`` is used if one exists.

    Returns:
        dict mapping each checkpoint file name to the metrics dict returned by
        ``evaluate_model``.

    Raises:
        ValueError: If a preprocessing variant is unknown or no loader is
            available for it.
    """
    if test_loader_light is None:
        # Backward compatibility: the light loader used to be read straight
        # from the notebook's global namespace.
        test_loader_light = globals().get("test_loader_light")
    loaders = {"clean": test_loader_clean, "light": test_loader_light}

    results = {}

    for model_file, preproc_type in model_preproc_map.items():
        print(f"\n=== Evaluating {model_file} ({preproc_type}) ===")

        # Pick the loader before building the model so that a bad mapping
        # fails before any weights are loaded.
        if preproc_type not in loaders:
            raise ValueError(f"Unknown preprocessing type: {preproc_type}")
        test_loader = loaders[preproc_type]
        if test_loader is None:
            raise ValueError(f"No DataLoader available for preprocessing type: {preproc_type}")

        model_path = os.path.join(model_dir, model_file)

        # IMPORTANT: use ConfigurableBertModel, same as in training
        model = ConfigurableBertModel(
            model_name="bert-base-uncased",
            num_labels=5,
            dropout_rate=0.2  # you can change if needed
        )
        # NOTE(security): torch.load unpickles the file; only load checkpoints
        # from a trusted source.
        model.load_state_dict(torch.load(model_path, map_location=device))
        model.to(device)

        results[model_file] = evaluate_model(model, test_loader, device)

    print("\n=== Summary of All Models ===")
    for model_file, metrics in results.items():
        print(f"{model_file}: "
              f"Acc={metrics['accuracy']:.4f}, "
              f"F1={metrics['f1']:.4f}, "
              f"Precision={metrics['precision']:.4f}, "
              f"Recall={metrics['recall']:.4f}")

    return results


In [None]:
# Score every checkpoint listed in MODEL_PREPROC on the test loader.
results = evaluate_all_models(
    model_dir=MODEL_DIR,
    model_preproc_map=MODEL_PREPROC,
    test_loader_clean=test_loader_clean,
    device=device,
)


=== Evaluating best_BERT_base_uncasedmodel1.pt (clean) ===
Classification Report:
                    precision    recall  f1-score   support

Extremely Negative       0.88      0.90      0.89       592
          Negative       0.86      0.83      0.84      1041
           Neutral       0.82      0.87      0.85       619
          Positive       0.82      0.84      0.83       947
Extremely Positive       0.89      0.84      0.87       599

          accuracy                           0.85      3798
         macro avg       0.85      0.86      0.85      3798
      weighted avg       0.85      0.85      0.85      3798


=== Evaluating best_BERT_base_uncasedmodel2.pt (clean) ===
Classification Report:
                    precision    recall  f1-score   support

Extremely Negative       0.86      0.89      0.87       592
          Negative       0.85      0.85      0.85      1041
           Neutral       0.94      0.86      0.90       619
          Positive       0.83      0.83      0.83 

For API hyperparameter fine-tuning:

In [None]:
import os
import re
import torch
import torch.nn as nn
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, DataCollatorWithPadding
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report

# ==============================
# Config
# ==============================
BASE_CHECKPOINT = "bert-base-uncased"
MODEL_DIR = data_dir  # the checkpoint files below are loaded from this folder
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Saved checkpoints to evaluate, in reporting order.
MODEL_FILES = [
    "HF_best_model_stage1.pt",
    "HF_best_model_stage2.pt",
    "HF_best_model_stage3.pt",
    "HF_best_model_stage3b.pt",
]

# A label's position in ORDERED_LABELS is its integer class id.
ORDERED_LABELS = ['Extremely Negative', 'Negative', 'Neutral', 'Positive', 'Extremely Positive']
LABEL2ID = dict(zip(ORDERED_LABELS, range(len(ORDERED_LABELS))))

# ==============================
# Tokenizer (marker strings are NOT registered as special tokens)
# ==============================
# Load the tokenizer once; note that this rebinds the notebook-level
# `tokenizer` and `collator` names.
tokenizer = AutoTokenizer.from_pretrained(BASE_CHECKPOINT, use_fast=True)
# Marker strings inserted by the preprocessing below. They are listed here but
# not passed to tokenizer.add_special_tokens: the models evaluated in this
# notebook are created from the stock checkpoint without resizing the
# embedding matrix, so newly added token ids would fall outside it.
# NOTE(review): confirm that training tokenized the markers the same way.
specials = {"additional_special_tokens": ["<httpurl>", "<user>", "<hashtag>", "<emoji>"]}
collator = DataCollatorWithPadding(tokenizer=tokenizer)


# ==============================
# Preprocessing (match training)
#   - URLs  -> <httpurl>
#   - @user -> <user>
#   - hashtags -> prefix with <hashtag> (keep the hashtag token)
#   - emojis -> prefix with <emoji> before the emoji cluster
# ==============================
# One or more consecutive characters from the listed emoji/symbol ranges.
_EMOJI_RE = re.compile(
    r'['
    r'\U0001F1E0-\U0001F1FF'  # flags
    r'\U0001F300-\U0001F5FF'  # symbols & pictographs
    r'\U0001F600-\U0001F64F'  # emoticons
    r'\U0001F680-\U0001F6FF'  # transport & map
    r'\U0001F700-\U0001F77F'
    r'\U0001F780-\U0001F7FF'
    r'\U0001F800-\U0001F8FF'
    r'\U0001F900-\U0001F9FF'
    r'\U0001FA00-\U0001FA6F'
    r'\U0001FA70-\U0001FAFF'
    r'\u2600-\u26FF\u2700-\u27BF'
    r']+'
)

def preprocess_with_markers(t: str) -> str:
    """Insert marker strings for URLs, mentions, hashtags and emojis.

    URLs become ``<httpurl>`` and mentions become ``<user>``; hashtags and
    emoji runs are kept and prefixed with ``<hashtag> `` / ``<emoji> ``.
    Whitespace is collapsed to single spaces and trimmed at the end.

    Args:
        t: Raw tweet text.

    Returns:
        The marked-up text, or ``""`` when ``t`` is not a string.
    """
    if not isinstance(t, str):
        return ""
    t = re.sub(r'https?://\S+', '<httpurl>', t)
    t = re.sub(r'@\w+', '<user>', t)
    # Add "<hashtag> " before hashtags that don't already have it. The marker
    # is inserted with a trailing space, so the "already marked" check has to
    # cover both "<hashtag>#tag" and "<hashtag> #tag"; checking only the former
    # would prefix an already-marked hashtag a second time.
    t = re.sub(r'(?<!<hashtag>)(?<!<hashtag> )#\w+', lambda m: f"<hashtag> {m.group(0)}", t)
    # add "<emoji> " before emoji clusters
    t = _EMOJI_RE.sub(lambda m: f"<emoji> {m.group(0)}", t)
    t = re.sub(r'\s+', ' ', t).strip()
    return t

# ==============================
# Dataset
# ==============================
class TDataset(Dataset):
    """Dataset over parallel lists of texts and integer labels.

    Each item is tokenized on access with the module-level ``tokenizer`` and
    returned as a dict of tensors plus a ``labels`` tensor of dtype long.
    """

    def __init__(self, texts, labels, max_len=128):
        self.texts, self.labels, self.max_len = texts, labels, max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, i):
        encoded = tokenizer(
            self.texts[i],
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt",
        )
        # The tokenizer output carries a leading dimension for the single
        # text; drop it so each value is per-example.
        item = {}
        for key, value in encoded.items():
            item[key] = value.squeeze(0)
        item["labels"] = torch.tensor(self.labels[i], dtype=torch.long)
        return item

# ==============================
# Model (same head you trained)
# ==============================
class BertWithDropout(nn.Module):
    """Pretrained backbone with dropout and a linear head on the first token.

    The attribute names ``backbone``, ``dropout`` and ``classifier`` define the
    state-dict keys of the saved checkpoints.
    """

    def __init__(self, model_name, num_labels=5, dropout_rate=0.2):
        super().__init__()
        self.backbone = AutoModel.from_pretrained(model_name)
        hidden_size = self.backbone.config.hidden_size
        self.dropout = nn.Dropout(dropout_rate)
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask=None, labels=None, token_type_ids=None):
        """Return ``{"logits": ...}``; ``labels`` is accepted but not used."""
        encoded = self.backbone(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        first_token = encoded.last_hidden_state[:, 0, :]
        regularised = self.dropout(first_token)
        return {"logits": self.classifier(regularised)}

# ==============================
# Evaluation loop
# ==============================
def evaluate_model(model, dataloader):
    """Run inference over ``dataloader`` on ``DEVICE`` and report metrics.

    Note: this definition replaces the earlier three-argument
    ``evaluate_model(model, dataloader, device)`` in the notebook namespace.

    Args:
        model: Module whose call returns a mapping with a ``"logits"`` entry.
        dataloader: Iterable of batches containing ``input_ids`` and
            ``labels``, and optionally ``attention_mask`` / ``token_type_ids``.

    Returns:
        dict with ``accuracy`` and macro-averaged ``f1``, ``precision`` and
        ``recall``.
    """
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in dataloader:
            ids   = batch["input_ids"].to(DEVICE)
            mask  = batch.get("attention_mask")
            ttids = batch.get("token_type_ids")  # BERT supplies these
            if mask  is not None:  mask  = mask.to(DEVICE)
            if ttids is not None: ttids = ttids.to(DEVICE)

            labels = batch["labels"].cpu().numpy()
            logits = model(ids, attention_mask=mask, token_type_ids=ttids)["logits"]
            preds  = torch.argmax(logits, dim=1).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels)

    # `labels` is passed explicitly so the report still works when a class is
    # absent from both the targets and the predictions; without it the number
    # of observed classes would not match `target_names` and scikit-learn raises.
    print(classification_report(
        all_labels, all_preds,
        labels=list(range(len(ORDERED_LABELS))),
        target_names=ORDERED_LABELS,
        digits=4,
    ))
    return {
        "accuracy":  accuracy_score(all_labels, all_preds),
        "f1":        f1_score(all_labels, all_preds, average='macro'),
        "precision": precision_score(all_labels, all_preds, average='macro'),
        "recall":    recall_score(all_labels, all_preds, average='macro'),
    }

# ==============================
# Prepare test data and run
#   Expects: test_df with columns ["OriginalTweet","Sentiment"]
# ==============================
test_df["label"] = test_df["Sentiment"].map(LABEL2ID)
# Series.map leaves NaN for any sentiment that is not a key of LABEL2ID. Stop
# here with the offending values instead of letting NaN flow into the label
# tensors built by the dataset.
_unknown_sentiments = test_df.loc[test_df["label"].isna(), "Sentiment"].unique().tolist()
if _unknown_sentiments:
    raise ValueError(f"Unmapped Sentiment values in test_df: {_unknown_sentiments}")

# Markers are added to the raw tweets; astype(str) means missing tweets reach
# the preprocessing as the string "nan".
texts  = [preprocess_with_markers(x) for x in test_df["OriginalTweet"].astype(str)]
labels = test_df["label"].tolist()
ds = TDataset(texts, labels)
dl = DataLoader(ds, batch_size=128, shuffle=False, collate_fn=collator)

for model_file in MODEL_FILES:
    print(f"\n=== Evaluating {model_file} ===")
    model = BertWithDropout("bert-base-uncased", num_labels=len(ORDERED_LABELS), dropout_rate=0.2)

    # NOTE(security): torch.load unpickles the file; only load checkpoints from
    # a trusted source. strict=True makes any key or shape mismatch between
    # the checkpoint and the model an error.
    state = torch.load(os.path.join(MODEL_DIR, model_file), map_location=DEVICE)
    model.load_state_dict(state, strict=True)
    model.to(DEVICE)

    metrics = evaluate_model(model, dl)
    print(f"Acc={metrics['accuracy']:.4f}, F1(macro)={metrics['f1']:.4f}, "
          f"Precision={metrics['precision']:.4f}, Recall={metrics['recall']:.4f}")


=== Evaluating HF_best_model_stage1.pt ===
                    precision    recall  f1-score   support

Extremely Negative     0.8484    0.9172    0.8815       592
          Negative     0.8780    0.8021    0.8384      1041
           Neutral     0.7746    0.9047    0.8346       619
          Positive     0.8463    0.8374    0.8418       947
Extremely Positive     0.9104    0.8314    0.8691       599

          accuracy                         0.8502      3798
         macro avg     0.8515    0.8586    0.8531      3798
      weighted avg     0.8538    0.8502    0.8502      3798

Acc=0.8502, F1(macro)=0.8531, Precision=0.8515, Recall=0.8586

=== Evaluating HF_best_model_stage2.pt ===
                    precision    recall  f1-score   support

Extremely Negative     0.9155    0.8598    0.8868       592
          Negative     0.8279    0.8828    0.8545      1041
           Neutral     0.8513    0.8691    0.8601       619
          Positive     0.8198    0.8405    0.8300       947
Extrem