# Imports

In [None]:
import torch
from torch import nn, optim
import pandas as pd
import numpy as np
import kagglehub
from kagglehub import KaggleDatasetAdapter
import matplotlib.pyplot as plt, seaborn as sns
import matplotlib.ticker as mtick
from wordcloud import WordCloud
import nltk, re, string, warnings, textwrap, datetime as dt
nltk.download('stopwords')
sns.set_theme(style="whitegrid")
warnings.filterwarnings("ignore")
import re
from collections import Counter
!pip install optuna
import optuna, wandb
from torch.utils.data import Dataset as DS, DataLoader
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.model_selection import train_test_split
from huggingface_hub import upload_file, login, notebook_login, HfApi
from datasets import Dataset, Features, Sequence, Value
from transformers import (
    AutoTokenizer, AutoConfig, AutoModelForSequenceClassification,
    TrainingArguments, Trainer, EarlyStoppingCallback
)
from transformers.trainer_callback import TrainerCallback
import os

# Setting Up GPU connection

In [None]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

# Generic "Full-Code" Fine-Tuning (Change names when comments say so)

## Data Loading

In [None]:
DATA_DIR = "Data"
PREPROCESS_VERSION = ""   # change to 1,2 or 3 for other preprocessing versions

# Integer class ids for the five sentiment labels, ordered from most negative to most positive.
label_map = {
    'Extremely Negative': 0,
    'Negative': 1,
    'Neutral': 2,
    'Positive': 3,
    'Extremely Positive': 4,
}

# Read each split and normalise the column names to "text" / "label" in one chain.
train_df = (
    pd.read_csv(f"{DATA_DIR}/Corona_NLP_train{PREPROCESS_VERSION}.csv", encoding="latin-1")
    .rename(columns={"OriginalTweet": "text", "Sentiment": "label"})
)
test_df = (
    pd.read_csv(f"{DATA_DIR}/Corona_NLP_test{PREPROCESS_VERSION}.csv", encoding="latin-1")
    .rename(columns={"OriginalTweet": "text", "Sentiment": "label"})
)

# Replace the sentiment strings with their integer ids.
train_df["label"] = train_df["label"].map(label_map).astype(int)
test_df["label"] = test_df["label"].map(label_map).astype(int)

train_df.head()

In [None]:
# Hold out 20% of the training data for evaluation, keeping the label
# distribution identical in both parts (stratified, fixed seed).
train_df, eval_df = train_test_split(
    train_df,
    test_size=0.2,
    random_state=42,
    stratify=train_df['label'],
)

print(f"Train size: {len(train_df)}")
print(f"Eval size: {len(eval_df)}")

## Defining the custom Dataset

In [None]:
class TweetDataset(DS):
    """Map-style dataset that tokenizes one tweet on every access.

    Args:
        dataframe: Table with a ``'text'`` column and a ``'label'`` column.
        tokenizer: Callable invoked as
            ``tokenizer(text, truncation=True, padding='max_length',
            max_length=..., return_tensors='pt')`` whose result is indexable
            by ``'input_ids'`` and ``'attention_mask'``.
        max_len: Length every example is padded / truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.texts = dataframe['text'].tolist()
        self.labels = dataframe['label'].tolist()
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized example at ``idx`` as a dict of tensors.

        Returns:
            Dict with ``'input_ids'``, ``'attention_mask'`` and ``'labels'``
            (the label as a ``torch.long`` scalar tensor).
        """
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt',
        )
        return {
            # Drop ONLY the leading batch axis added by return_tensors='pt'.
            # A bare squeeze() would also collapse the sequence axis when
            # max_len == 1 and hand the DataLoader 0-d tensors.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long),
        }

## Early-stopping helper function

In [None]:
def early_stop_check(patience, best_val_accuracy, best_val_accuracy_epoch, current_val_accuracy, current_val_accuracy_epoch):
    """Update the best-so-far validation accuracy and decide whether to stop.

    Returns a tuple ``(best_val_accuracy, best_val_accuracy_epoch, early_stop_flag)``.
    A strictly higher accuracy becomes the new best and never triggers a stop.
    Otherwise the best values are returned unchanged and the flag is True once
    more than ``patience`` epochs separate the current epoch from the best one.
    """
    # Improvement: the current epoch becomes the reference point.
    if current_val_accuracy > best_val_accuracy:
        return current_val_accuracy, current_val_accuracy_epoch, False

    # No improvement: stop only when the gap to the best epoch exceeds patience.
    epochs_since_best = current_val_accuracy_epoch - best_val_accuracy_epoch
    should_stop = True if epochs_since_best > patience else False
    return best_val_accuracy, best_val_accuracy_epoch, should_stop

## Main training function

In [None]:
def train_model_with_hyperparams(model, train_loader, val_loader, optimizer, criterion, epochs, patience, trial, hf_repo_id, hf_folder):
    """Train ``model`` with early stopping, log every epoch to W&B and upload the best weights.

    Relies on the module-level ``device``, ``wandb``, ``early_stop_check`` and
    ``upload_file`` names.

    Args:
        model: Network called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``.logits``.
        train_loader: Iterable of batches; each batch is indexed by
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'``.
        val_loader: Same batch layout as ``train_loader``; used for validation.
        optimizer: Optimizer stepping the model's parameters.
        criterion: Loss called as ``criterion(logits, labels)``.
        epochs: Maximum number of epochs.
        patience: Forwarded to ``early_stop_check``.
        trial: Object exposing ``.number``; used to name the uploaded file.
        hf_repo_id: Hugging Face repository that receives the weights.
        hf_folder: Folder inside the repository for the uploaded file.

    Returns:
        The best validation accuracy observed (0.0 if it was never exceeded).
    """
    best_val_accuracy = 0.0
    best_val_accuracy_epoch = 0
    early_stop_flag = False
    best_model_state = None
    model_save_path = "best_model.pt"

    for epoch in range(1, epochs + 1):
        ###  Training loop  ###
        model.train()  # Enable training mode
        train_loss = 0.0
        total_train_samples = 0
        correct_train_predictions = 0

        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()  # Reset gradients
            outputs = model(input_ids, attention_mask=attention_mask)  # Forward pass
            logits = outputs.logits  # Raw, un-normalised class scores
            loss = criterion(logits, labels)

            loss.backward()  # Backward pass
            optimizer.step()  # Update weights

            # Weight the batch loss by batch size so the epoch average is per-sample.
            batch_size = input_ids.size(0)
            train_loss += loss.item() * batch_size
            total_train_samples += batch_size
            correct_train_predictions += (logits.argmax(dim=1) == labels).sum().item()

        train_loss /= total_train_samples
        train_accuracy = correct_train_predictions / total_train_samples

        ###  Validation loop  ###
        model.eval()  # Enable evaluation mode
        val_loss = 0.0
        total_val_samples = 0
        correct_val_predictions = 0

        all_val_labels = []
        all_val_preds = []

        with torch.no_grad():  # Disable gradient computation
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(input_ids, attention_mask=attention_mask)
                logits = outputs.logits
                loss = criterion(logits, labels)
                preds = logits.argmax(dim=1)

                batch_size = input_ids.size(0)
                val_loss += loss.item() * batch_size
                total_val_samples += batch_size
                correct_val_predictions += (preds == labels).sum().item()

                all_val_labels.extend(labels.cpu().numpy())
                all_val_preds.extend(preds.cpu().numpy())

        # Calculate metrics
        val_loss /= total_val_samples
        val_accuracy = correct_val_predictions / total_val_samples
        val_precision = precision_score(all_val_labels, all_val_preds, average='weighted')
        val_recall = recall_score(all_val_labels, all_val_preds, average='weighted')
        val_f1 = f1_score(all_val_labels, all_val_preds, average='weighted')

        # Check for early stopping
        best_val_accuracy, best_val_accuracy_epoch, early_stop_flag = early_stop_check(patience, best_val_accuracy, best_val_accuracy_epoch, val_accuracy, epoch)

        # Snapshot the weights of the best epoch.
        # state_dict() hands back references to the live parameters, which later
        # optimizer steps overwrite in place; without cloning, the "best" state
        # would silently become the weights of the LAST epoch. Cloning to CPU
        # also avoids holding a second copy of the model in GPU memory.
        if val_accuracy == best_val_accuracy:
            best_model_state = {
                name: tensor.detach().cpu().clone()
                for name, tensor in model.state_dict().items()
            }

        # Log this epoch's metrics to Weights & Biases
        wandb.log({
            "Epoch": epoch,
            "Train Loss": train_loss,
            "Train Accuracy": train_accuracy,
            "Validation Loss": val_loss,
            "Validation Accuracy": val_accuracy,
            "Validation Precision": val_precision,
            "Validation Recall": val_recall,
            "Validation F1": val_f1})

        if early_stop_flag:
            break  # Early-stopping condition met: leave the training loop

    if best_model_state is not None:
        torch.save(best_model_state, model_save_path)  # Save locally
        # Push the best model to Hugging Face Hub (best-effort: a failed upload
        # must not discard the result of the trial).
        try:
            upload_file(
                path_or_fileobj=model_save_path,
                path_in_repo=f"{hf_folder}/best_model_trial_{trial.number}.pt",
                repo_id=hf_repo_id,
                commit_message=f"Upload best model from trial {trial.number}",
                # SECURITY: never hard-code access tokens in source. Read the
                # token from the environment; None lets huggingface_hub fall back
                # to the credentials stored by login()/notebook_login(). Any
                # token that was ever committed to this file must be revoked.
                token=os.environ.get("HF_TOKEN"),
            )
            print(f"Successfully pushed best model of trial {trial.number} to {hf_repo_id}")
        except Exception as e:
            print(f"Error pushing model to Hugging Face Hub: {e}")

    return best_val_accuracy

## Optuna objective function

In [None]:
# Objective Function for Optuna
def objective(trial, hf_repo_id, model_name, wandb_project_name, hf_folder):
    """Run one Optuna trial of the manual fine-tuning pipeline.

    Samples hyperparameters, builds the model and data loaders from the
    module-level ``train_df`` / ``eval_df``, freezes the backbone except for its
    last ``num_layers`` encoder layers, trains via
    ``train_model_with_hyperparams`` and tracks the run in Weights & Biases.

    Args:
        trial: Optuna trial used to sample hyperparameters.
        hf_repo_id: Hugging Face repository that receives the best weights.
        model_name: Checkpoint name passed to ``from_pretrained``.
        wandb_project_name: W&B project the run is logged under.
        hf_folder: Folder inside the Hugging Face repository.

    Returns:
        Best validation accuracy reached in this trial (the value to maximize).
    """
    # Hyperparameter suggestions (suggest_loguniform is deprecated in Optuna;
    # suggest_float(..., log=True) samples from the same log-uniform range).
    learning_rate = trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True)
    weight_decay = trial.suggest_float("weight_decay", 1e-5, 1e-2, log=True)
    patience = trial.suggest_int("patience", 2, 5)
    batch_size = trial.suggest_categorical("batch_size", [16, 32, 64])
    num_layers = trial.suggest_int("num_layers", 0, 2)
    max_len = trial.suggest_categorical("max_len", [64, 128])
    classifier_dropout = trial.suggest_float("dropout_prob", 0.1, 0.5)  # Classifier dropout as a HP
    label_smoothing = trial.suggest_float("label_smoothing", 0.0, 0.2)  # Label smoothing as a HP

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # num_labels=5 because the label map has five sentiment classes.
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5, ignore_mismatched_sizes=True)

    # Apply the sampled dropout where the architecture actually uses it.
    # RoBERTa-style heads keep it at `classifier.dropout`; DeBERTa's classifier
    # is a plain Linear layer and the dropout lives at `model.dropout`, so
    # assigning `classifier.dropout` there would create an unused attribute.
    if isinstance(getattr(model.classifier, "dropout", None), nn.Module):
        model.classifier.dropout = nn.Dropout(classifier_dropout)
    elif isinstance(getattr(model, "dropout", None), nn.Module):
        model.dropout = nn.Dropout(classifier_dropout)
    else:
        print(f"Warning: no classifier dropout module found on {model_name}; 'dropout_prob' has no effect")
    model = model.to(device)

    train_dataset = TweetDataset(train_df, tokenizer, max_len)
    val_dataset = TweetDataset(eval_df, tokenizer, max_len)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # `base_model` resolves to the architecture-specific backbone
    # (`model.deberta`, `model.roberta`, ...), so switching checkpoints works.
    backbone = model.base_model
    encoder_layers = backbone.encoder.layer

    for param in backbone.parameters():    # Freeze the whole backbone
        param.requires_grad = False
    # Unfreeze the last `num_layers` encoder layers. Slicing from
    # len - num_layers (rather than -num_layers) keeps num_layers == 0 empty.
    for param in encoder_layers[len(encoder_layers) - num_layers:].parameters():
        param.requires_grad = True
    for param in model.classifier.parameters():    # Unfreeze the classifier
        param.requires_grad = True

    # Define optimizer and loss function; only trainable parameters are handed
    # to the optimizer (frozen ones never receive gradients anyway).
    criterion = nn.CrossEntropyLoss(label_smoothing=label_smoothing)
    trainable_params = [param for param in model.parameters() if param.requires_grad]
    optimizer = optim.Adam(trainable_params, lr=learning_rate, weight_decay=weight_decay)

    # Initialize Weights & Biases - the config holds the properties of this trial.
    # NOTE: the misspelled key "classifier_drouput" is kept as-is so runs stay
    # comparable with those already logged under that key.
    wandb.init(project=wandb_project_name,
               config={
        "learning_rate": learning_rate,
        "weight_decay": weight_decay,
        "patience": patience,
        "batch_size": batch_size,
        "num_layers": num_layers,
        "max_len": max_len,
        "classifier_drouput": classifier_dropout,
        "label_smoothing": label_smoothing,
        "architecture": model_name},
        name=f"trial_{trial.number}")  # Run name shown on the W&B platform

    try:
        # Train the model and get the best validation accuracy
        best_val_accuracy = train_model_with_hyperparams(model, train_loader, val_loader, optimizer, criterion, epochs=20, patience=patience, trial=trial, hf_repo_id=hf_repo_id, hf_folder=hf_folder)
    finally:
        # Close the run even when training raises, so the next trial starts clean.
        wandb.finish()

    return best_val_accuracy  # Objective value to maximize

## Optuna Study aiming at maximizing model's accuracy

In [None]:
# Experiment identifiers for the manual ("full-code") fine-tuning study
model_name = "microsoft/deberta-v3-base"    # Change to cardiffnlp/twitter-roberta-base-sentiment-latest to run the Twitter-RoBERTa model
hf_repo_id = "CarmelKron/ADV_DL_Project"    # repo ID
hf_folder = "Full_FT_DeBERTa-v3-base"   # Change accordingly
wandb_project_name = "DeBERTa-v3-base_FT_Full"    # Change accordingly


def run_full_code_trial(trial):
    """Adapter giving Optuna a single-argument objective bound to the settings above."""
    return objective(trial, hf_repo_id, model_name, wandb_project_name, hf_folder)


# Optuna Study: the objective returns validation accuracy, so maximize it.
study = optuna.create_study(direction="maximize")
study.optimize(run_full_code_trial, n_trials=20)

# Generic HF Fine-Tuning (Change names when comments say so)

## Data Loading

In [None]:
DATA_DIR = "Data"
PREPROCESS_VERSION = ""   # change to 1,2 or 3 for other preprocessing versions
train_df = pd.read_csv(f"{DATA_DIR}/Corona_NLP_train{PREPROCESS_VERSION}.csv", encoding="latin-1")
test_df  = pd.read_csv(f"{DATA_DIR}/Corona_NLP_test{PREPROCESS_VERSION}.csv",  encoding="latin-1")

# Compare as strings so the check fires whether the version was set as 3 or "3"
# (the default above is a string, so a bare `== 3` is easy to miss).
if str(PREPROCESS_VERSION) == "3":
    # Count rows where the 'Narratives' column is not an empty list
    non_empty_narratives_count = train_df[train_df['Narratives'] != '[]'].shape[0]
    print(f"Number of rows with non-empty narratives: {non_empty_narratives_count}")

# Normalise the column names used by the rest of the notebook.
train_df = train_df.rename(columns={"OriginalTweet":"text", "Sentiment":"label"})
test_df  = test_df.rename(columns={"OriginalTweet":"text", "Sentiment":"label"})

# Integer class ids for the five sentiment labels.
label_map = {'Extremely Negative':0,'Negative':1,'Neutral':2,
             'Positive':3,'Extremely Positive':4}

train_df["label"] = train_df.label.map(label_map).astype(int)
test_df["label"]  = test_df.label.map(label_map).astype(int)

train_df.head()

In [None]:
# Hold out 20% of the training data for evaluation, keeping the label
# distribution identical in both parts (stratified, fixed seed).
train_df, eval_df = train_test_split(
    train_df,
    test_size=0.2,
    random_state=42,
    stratify=train_df['label'],
)

print(f"Train size: {len(train_df)}")
print(f"Eval size: {len(eval_df)}")

## Fine-Tuning

In [None]:
# Setup environment
# The W&B project is set through the environment before logging in; keep this
# order when editing the cell.
os.environ["WANDB_PROJECT"] = "HF_FT_Carmel_Orig_Data"    # Change accordingly
wandb.login()
# Checkpoint used for both the tokenizer below and model_init().
model_ckpt = "microsoft/deberta-v3-base"    # Change to cardiffnlp/twitter-roberta-base-sentiment-latest to run the Twitter-RoBERTa model

# Keep raw datasets (we'll tokenize per trial)
# Only the "text" and "label" columns are carried over from the dataframes.
train_raw_ds = Dataset.from_pandas(train_df[["text", "label"]])
eval_raw_ds  = Dataset.from_pandas(eval_df[["text", "label"]])

# One tokenizer instance, read as a global by optuna_objective in every trial.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

def compute_metrics(eval_pred):
    """Turn a ``(logits, labels)`` pair into accuracy and weighted F1.

    The predicted class of each row is the argmax of its logits along axis 1.
    Returns a dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=1)

    accuracy = accuracy_score(labels, predicted_classes)
    weighted_f1 = f1_score(labels, predicted_classes, average="weighted")
    return {"accuracy": accuracy, "f1": weighted_f1}

# Holds the trial currently being run; assigned by optuna_objective.
optuna_trial = None


def model_init():
    """Build a fresh 5-label sequence classifier from ``model_ckpt``."""
    return AutoModelForSequenceClassification.from_pretrained(
        model_ckpt,
        num_labels=5,
        ignore_mismatched_sizes=True,
    )

def optuna_objective(trial, wandb_project_name, hf_repo_id, hf_folder):
    """Run one Optuna trial of the Hugging Face ``Trainer`` fine-tuning pipeline.

    Samples hyperparameters, tokenizes the module-level raw datasets with the
    sampled sequence length, trains with early stopping, uploads the trial's
    output directory to the Hugging Face Hub and returns the evaluation accuracy.

    Relies on the module-level ``tokenizer``, ``train_raw_ds``, ``eval_raw_ds``,
    ``model_init`` and ``compute_metrics``; also stores ``trial`` in the global
    ``optuna_trial``.

    Args:
        trial: Optuna trial used to sample hyperparameters.
        wandb_project_name: W&B project the run is logged under.
        hf_repo_id: Hugging Face repository that receives the trial folder.
        hf_folder: Folder inside the Hugging Face repository.

    Returns:
        The ``"eval_accuracy"`` entry of ``trainer.evaluate()``.
    """
    global optuna_trial
    optuna_trial = trial

    # Sample hyperparameters
    max_len = trial.suggest_categorical("max_seq_length", [64, 128])
    learning_rate = trial.suggest_float("learning_rate", 5e-6, 1e-4, log=True)
    weight_decay = trial.suggest_float("weight_decay", 0.01, 0.3)
    batch_size = trial.suggest_categorical("per_device_train_batch_size", [16, 32, 64])
    warmup_ratio = trial.suggest_float("warmup_ratio", 0.01, 0.2)
    lr_scheduler_type = trial.suggest_categorical("lr_scheduler_type", ["linear", "cosine", "cosine_with_restarts"])
    adam_beta1 = trial.suggest_float("adam_beta1", 0.85, 0.95)
    adam_beta2 = trial.suggest_float("adam_beta2", 0.98, 0.999)
    adam_epsilon = trial.suggest_float("adam_epsilon", 1e-8, 1e-6, log=True)
    label_smoothing_factor = trial.suggest_float("label_smoothing_factor", 0.0, 0.2)
    gradient_accumulation_steps = trial.suggest_categorical("gradient_accumulation_steps", [1, 2, 4])

    # Tokenize per trial, because max_len is itself a sampled hyperparameter.
    def tokenize(batch):
        return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=max_len)

    train_ds = train_raw_ds.map(tokenize, batched=True)
    eval_ds = eval_raw_ds.map(tokenize, batched=True)
    train_ds.set_format("torch", columns=["input_ids", "attention_mask", "label"])
    eval_ds.set_format("torch", columns=["input_ids", "attention_mask", "label"])

    output_dir = f"./trial_{trial.number}"
    args = TrainingArguments(
        lr_scheduler_type=lr_scheduler_type,
        adam_beta1=adam_beta1,
        adam_beta2=adam_beta2,
        adam_epsilon=adam_epsilon,
        label_smoothing_factor=label_smoothing_factor,
        gradient_accumulation_steps=gradient_accumulation_steps,
        output_dir=output_dir,
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=20,
        warmup_ratio=warmup_ratio,
        eval_strategy="epoch",
        save_strategy="best",
        load_best_model_at_end=True,
        save_total_limit=1,
        metric_for_best_model="eval_accuracy",
        logging_strategy="epoch",
        report_to="wandb",
        disable_tqdm=False,
        run_name=f"trial_{trial.number}",
    )

    # Start a dedicated W&B run for this trial, closing the previous one.
    wandb.init(
        project=wandb_project_name,
        name=f"trial_{trial.number}",
        reinit="finish_previous",
    )

    trainer = Trainer(
        model_init=model_init,
        args=args,
        train_dataset=train_ds,
        eval_dataset=eval_ds,
        # `tokenizer=` is deprecated on Trainer; `processing_class=` is its
        # replacement in the transformers releases that accept
        # save_strategy="best".
        processing_class=tokenizer,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=4)],
    )

    trainer.train()

    # Push manually. Best-effort: a failed upload must not throw away a trial
    # that has already finished training.
    try:
        api = HfApi()
        api.upload_folder(
            folder_path=output_dir,
            path_in_repo=f"./{hf_folder}/trial_{trial.number}",
            repo_id=hf_repo_id,
            commit_message=f"Upload best model from trial {trial.number}",
            repo_type="model",
            # SECURITY: never hard-code access tokens in source. Read the token
            # from the environment; None lets huggingface_hub fall back to the
            # credentials stored by login()/notebook_login(). Any token that was
            # ever committed to this file must be revoked.
            token=os.environ.get("HF_TOKEN"),
        )
    except Exception as e:
        print(f"Error pushing model to Hugging Face Hub: {e}")

    return trainer.evaluate()["eval_accuracy"]

# Experiment identifiers for the Hugging Face Trainer study
hf_repo_id = "CarmelKron/ADV_DL_Project"    # repo ID
hf_folder = "HF_FT_Carmel_Orig_Data"   # Change accordingly
wandb_project_name = "HF_FT_Carmel_Orig_Data"   # Change accordingly


def run_hf_trial(trial):
    """Adapter giving Optuna a single-argument objective bound to the settings above."""
    return optuna_objective(trial, wandb_project_name, hf_repo_id, hf_folder)


# Run Optuna: the objective returns evaluation accuracy, so maximize it.
study = optuna.create_study(direction="maximize")
study.optimize(run_hf_trial, n_trials=15)

# Close the W&B run left open by the last trial.
wandb.finish()

# Print all trials summary
print("All trial results:")
for t in study.trials:
    print(f"Trial {t.number} — Accuracy: {t.value:.4f} — Params: {t.params}")

best_trial_number = study.best_trial.number
print(f"\n✅ Best Trial: {best_trial_number} — Accuracy: {study.best_value:.4f}")