In [None]:
# =====================================
# üöÄ Optimized BERT Fine-Tuning for Sentiment Analysis (with Evaluation)
# =====================================
import torch
import numpy as np
import pandas as pd
import random, re
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    get_linear_schedule_with_warmup, DataCollatorWithPadding
)
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, Dataset
from torch.optim import AdamW
from tqdm.auto import tqdm
from torch.cuda.amp import GradScaler, autocast

# --------------------
# 1Ô∏è‚É£ Setup & Reproducibility
# --------------------
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using:", device)

# --------------------
# 2Ô∏è‚É£ Data Preparation
# --------------------
df = pd.read_csv('./smileannotationsfinal.csv', names=['id', 'text', 'category']).set_index('id')

# Clean categories
df = df[~df.category.str.contains(r'\|', regex=True)]
df = df[df.category != 'nocode']

# Label encoding
label_dict = {cat: idx for idx, cat in enumerate(df.category.unique())}
df['label'] = df.category.replace(label_dict)

# Clean text
def clean_text(text):
    if isinstance(text, str):
        text = re.sub(r"http\S+|@\w+|#\w+", "", text)
        return re.sub(r"\s+", " ", text).strip()
    return "[EMPTY]"

df['text'] = df['text'].apply(clean_text)
df = df[df['text'].str.strip() != ""]

# Train/Val Split
X_train, X_val, y_train, y_val = train_test_split(
    df.index.values,
    df.label.values,
    test_size=0.15,
    random_state=seed_val,
    stratify=df.label.values
)

df['data_type'] = 'not_set'
df.loc[X_train, 'data_type'] = 'train'
df.loc[X_val, 'data_type'] = 'val'

# --------------------
# 3Ô∏è‚É£ Tokenization (Dynamic Padding)
# --------------------
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def encode_data(texts):
    return tokenizer(
        texts.tolist(),
        truncation=True,
        padding=False,     # dynamic padding handled by DataCollator
        max_length=192
    )

train_enc = encode_data(df[df.data_type=='train'].text)
val_enc   = encode_data(df[df.data_type=='val'].text)

# Custom Dataset for dynamic padding
class SentimentDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(int(self.labels[idx]))
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = SentimentDataset(train_enc, df[df.data_type=='train'].label.values)
val_dataset   = SentimentDataset(val_enc, df[df.data_type=='val'].label.values)

# --------------------
# 4Ô∏è‚É£ Model Setup
# --------------------
model = AutoModelForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(label_dict),
    hidden_dropout_prob=0.2,
    attention_probs_dropout_prob=0.2
).to(device)

# Class weights (optional)
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(df.label.values),
    y=df.label.values
)
class_weights = torch.tensor(class_weights, dtype=torch.float32).to(device)

# --------------------
# 5Ô∏è‚É£ Training Configuration
# --------------------
epochs = 10
batch_size = 16
gradient_accumulation_steps = 2
lr = 2e-5
warmup_ratio = 0.1
weight_decay = 0.01

train_dataloader = DataLoader(
    train_dataset,
    sampler=RandomSampler(train_dataset),
    batch_size=batch_size,
    collate_fn=data_collator
)
val_dataloader = DataLoader(
    val_dataset,
    sampler=SequentialSampler(val_dataset),
    batch_size=batch_size,
    collate_fn=data_collator
)

optimizer = AdamW(model.parameters(), lr=lr, eps=1e-8, weight_decay=weight_decay)
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(total_steps * warmup_ratio),
    num_training_steps=total_steps
)
scaler = GradScaler()

# --------------------
# 6Ô∏è‚É£ Training Loop
# --------------------
best_f1 = 0
patience, no_improve = 3, 0

for epoch in range(1, epochs+1):
    model.train()
    total_loss, total_correct, total_samples = 0, 0, 0
    progress = tqdm(train_dataloader, desc=f"Epoch {epoch}")

    for step, batch in enumerate(progress):
        batch = {k: v.to(device) for k, v in batch.items()}
        optimizer.zero_grad()

        with autocast():
            outputs = model(**batch)
            loss = outputs.loss
            logits = outputs.logits

        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()

        total_loss += loss.item()
        preds = torch.argmax(logits, dim=1)
        total_correct += (preds == batch['labels']).sum().item()
        total_samples += len(batch['labels'])

        progress.set_postfix(loss=f"{loss.item():.3f}", acc=f"{total_correct/total_samples:.3f}")

        if (step + 1) % gradient_accumulation_steps == 0:
            optimizer.zero_grad()

    avg_train_loss = total_loss / len(train_dataloader)
    avg_train_acc = total_correct / total_samples

    # --- Validation ---
    model.eval()
    preds_all, true_all = [], []
    with torch.no_grad():
        for batch in val_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            preds_all.extend(torch.argmax(outputs.logits, dim=1).cpu().numpy())
            true_all.extend(batch['labels'].cpu().numpy())

    val_f1 = f1_score(true_all, preds_all, average='weighted')
    val_acc = (np.array(preds_all) == np.array(true_all)).mean()

    print(f"\nEpoch {epoch} Summary:")
    print(f"Train Loss={avg_train_loss:.4f}, Train Acc={avg_train_acc:.4f}")
    print(f"Val F1={val_f1:.4f}, Val Acc={val_acc:.4f}")

    if val_f1 > best_f1:
        best_f1 = val_f1
        no_improve = 0
        model.save_pretrained("best_bert_sentiment")
        tokenizer.save_pretrained("best_bert_sentiment")
        print("‚úÖ New best model saved!")
    else:
        no_improve += 1
        print(f"No improvement ({no_improve}/{patience})")
        if no_improve >= patience:
            print("Early stopping!")
            break

# --------------------
# 7Ô∏è‚É£ Evaluation Helpers
# --------------------
def evaluate_simple(dataloader):
    """Runs model evaluation and returns predictions + true labels"""
    model.eval()
    preds_all, true_all = [], []
    with torch.no_grad():
        for batch in dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            preds_all.extend(torch.argmax(outputs.logits, dim=1).cpu().numpy())
            true_all.extend(batch['labels'].cpu().numpy())
    return np.array(preds_all), np.array(true_all)


def accuracy_per_class(preds, true, label_dict=label_dict):
    """Prints accuracy per class with readable labels"""
    for label, idx in label_dict.items():
        idxs = np.where(true == idx)
        acc = (preds[idxs] == true[idxs]).mean() * 100
        print(f"Label: {label:<20} | Accuracy: {acc:.2f}% ({len(idxs[0])} samples)")


# --------------------
# 8Ô∏è‚É£ Final Evaluation
# --------------------
print("\nüî• Final Evaluation on Validation Set")
model = AutoModelForSequenceClassification.from_pretrained("best_bert_sentiment").to(device)
predictions, true_vals = evaluate_simple(val_dataloader)

print("\nClassification Report:")
print(classification_report(true_vals, predictions, target_names=label_dict.keys()))

print("\nConfusion Matrix:")
print(confusion_matrix(true_vals, predictions))

print("\nPer-Class Accuracy:")
accuracy_per_class(predictions, true_vals)


Using: cuda


  df['label'] = df.category.replace(label_dict)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler = GradScaler()


Epoch 1:   0%|          | 0/78 [00:00<?, ?it/s]

  with autocast():



Epoch 1 Summary:
Train Loss=1.1969, Train Acc=0.5860
Val F1=0.6785, Val Acc=0.7763
‚úÖ New best model saved!


Epoch 2:   0%|          | 0/78 [00:00<?, ?it/s]

  with autocast():



Epoch 2 Summary:
Train Loss=0.5732, Train Acc=0.8200
Val F1=0.7841, Val Acc=0.8265
‚úÖ New best model saved!


Epoch 3:   0%|          | 0/78 [00:00<?, ?it/s]

  with autocast():



Epoch 3 Summary:
Train Loss=0.4051, Train Acc=0.8789
Val F1=0.8696, Val Acc=0.8858
‚úÖ New best model saved!


Epoch 4:   0%|          | 0/78 [00:00<?, ?it/s]

  with autocast():



Epoch 4 Summary:
Train Loss=0.2990, Train Acc=0.9144
Val F1=0.8514, Val Acc=0.8721
No improvement (1/3)


Epoch 5:   0%|          | 0/78 [00:00<?, ?it/s]

  with autocast():



Epoch 5 Summary:
Train Loss=0.2250, Train Acc=0.9322
Val F1=0.8888, Val Acc=0.9041
‚úÖ New best model saved!


Epoch 6:   0%|          | 0/78 [00:00<?, ?it/s]

  with autocast():



Epoch 6 Summary:
Train Loss=0.1619, Train Acc=0.9532
Val F1=0.8971, Val Acc=0.9087
‚úÖ New best model saved!


Epoch 7:   0%|          | 0/78 [00:00<?, ?it/s]

  with autocast():



Epoch 7 Summary:
Train Loss=0.1303, Train Acc=0.9645
Val F1=0.8964, Val Acc=0.9087
No improvement (1/3)


Epoch 8:   0%|          | 0/78 [00:00<?, ?it/s]

  with autocast():



Epoch 8 Summary:
Train Loss=0.0924, Train Acc=0.9750
Val F1=0.8956, Val Acc=0.9087
No improvement (2/3)


Epoch 9:   0%|          | 0/78 [00:00<?, ?it/s]

  with autocast():



Epoch 9 Summary:
Train Loss=0.0734, Train Acc=0.9806
Val F1=0.8859, Val Acc=0.8995
No improvement (3/3)
Early stopping!

üî• Final Evaluation on Validation Set

Classification Report:
              precision    recall  f1-score   support

       happy       0.95      0.96      0.96       170
not-relevant       0.76      0.76      0.76        29
       angry       0.82      1.00      0.90         9
     disgust       0.00      0.00      0.00         1
         sad       0.00      0.00      0.00         5
    surprise       0.62      1.00      0.77         5

    accuracy                           0.91       219
   macro avg       0.53      0.62      0.56       219
weighted avg       0.89      0.91      0.90       219


Confusion Matrix:
[[163   5   0   0   0   2]
 [  5  22   1   0   0   1]
 [  0   0   9   0   0   0]
 [  0   1   0   0   0   0]
 [  3   1   1   0   0   0]
 [  0   0   0   0   0   5]]

Per-Class Accuracy:
Label: happy                | Accuracy: 95.88% (170 samples)
Label: 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
