# Evaluation Notebook
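This notebook evaluates a fine-tuned sequence-classification model on the test set: it computes accuracy and macro-averaged precision, recall, and F1, plots the row-normalized confusion matrix, and prints misclassified examples from the class with the lowest F1 score.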

In [None]:
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix, classification_report
)

# Notebook-wide plotting defaults: square 6x6 figures and an 11pt base font.
plt.rcParams.update({
    'figure.figsize': (6, 6),
    'font.size': 11,
})


In [None]:
# Load the test split. Both columns are read as plain strings and pandas'
# default NA sentinels are disabled, so a text such as "NA", "null" or "None"
# stays a string instead of becoming float NaN (which the tokenizer rejects).
# Empty cells are kept as empty strings.
test_df = pd.read_csv(
    "data/test.csv",  # update path
    dtype={"text": str, "label": str},
    keep_default_na=False,
)

texts = test_df["text"].tolist()
labels = test_df["label"].tolist()

# Mapping between label names as they appear in the CSV and integer class ids.
label2id = {
    "negative": 0,
    "neutral": 1,
    "positive": 2
}

id2label = {v: k for k, v in label2id.items()}

# Fail early with a readable message instead of a bare KeyError from the
# comprehension below when the CSV holds a label outside the mapping.
unknown_labels = sorted(set(labels) - set(label2id))
if unknown_labels:
    raise ValueError(
        f"Unexpected label(s) in test set: {unknown_labels}; "
        f"expected one of {list(label2id)}"
    )

y_true = [label2id[label] for label in labels]
num_classes = len(label2id)


In [None]:
model_path = "YOUR_MODEL_PATH"  # update this

# Use the GPU when torch can see one; otherwise fall back to the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# Tokenizer and classifier are both restored from the same checkpoint path.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path).to(device)

# Put the model in evaluation mode before running inference.
model.eval()


In [None]:
def predict_batch(texts, batch_size=16, max_length=256):
    """Run the classifier over ``texts`` in mini-batches.

    Relies on the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        texts: Sequence of input strings.
        batch_size: Number of texts tokenized and scored per forward pass.
        max_length: Token limit per text; longer inputs are truncated.

    Returns:
        A ``(preds, logits)`` tuple. ``logits`` is the array of raw model
        scores with one row per input text, and ``preds`` is the argmax of
        each row.

    Raises:
        ValueError: If ``batch_size`` is not a positive integer.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    # np.concatenate cannot handle an empty list, so answer empty input
    # directly with correctly shaped empty arrays.
    if len(texts) == 0:
        empty_logits = np.empty((0, model.config.num_labels), dtype=np.float32)
        return np.empty(0, dtype=np.int64), empty_logits

    all_logits = []

    # Gradients are never needed here; disable them once for the whole loop.
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch = list(texts[start:start + batch_size])
            enc = tokenizer(
                batch, padding=True, truncation=True,
                return_tensors="pt", max_length=max_length
            )
            enc = {k: v.to(device) for k, v in enc.items()}
            # Cast to float32 first: half/bfloat16 logits cannot always be
            # converted to NumPy directly.
            out = model(**enc).logits.float().cpu().numpy()
            all_logits.append(out)

    logits = np.concatenate(all_logits)
    preds = np.argmax(logits, axis=1)
    return preds, logits

# Score the whole test set: y_pred holds the argmax class id per text,
# logits the raw model outputs they were derived from.
y_pred, logits = predict_batch(texts)


In [None]:
# Precision, recall and F1 are macro-averaged; a class with no predictions or
# no samples contributes 0 rather than triggering a warning.
macro_opts = dict(average="macro", zero_division=0)

accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, **macro_opts)
recall = recall_score(y_true, y_pred, **macro_opts)
f1 = f1_score(y_true, y_pred, **macro_opts)

# One (metric name, value) pair per row; the trailing expression is the cell's
# displayed output.
summary_rows = [
    ("Accuracy", accuracy),
    ("Precision (Macro)", precision),
    ("Recall (Macro)", recall),
    ("F1 (Macro)", f1),
]
pd.DataFrame(summary_rows, columns=["Metric", "Value"])


In [None]:
# Class names ordered by their integer id, so tick labels line up with the
# matrix rows/columns regardless of the dict's insertion order.
class_names = sorted(label2id, key=label2id.get)

# Pin the label set: without it the matrix shrinks when a class appears in
# neither y_true nor y_pred, and the annotation loop below would index out of
# range.
cm = confusion_matrix(y_true, y_pred, labels=list(range(num_classes)))

# Row-normalise (each row = distribution of predictions for one true class).
# Rows with no true samples are divided by 1 so they stay all-zero, not NaN.
row_totals = cm.sum(axis=1, keepdims=True)
row_totals[row_totals == 0] = 1
cm_norm = cm / row_totals

fig, ax = plt.subplots()
ax.imshow(cm_norm, cmap="Blues")

ax.set_xticks(range(num_classes))
ax.set_yticks(range(num_classes))
ax.set_xticklabels(class_names, rotation=45)
ax.set_yticklabels(class_names)

# Use white text on the darker half of the colour range so the percentages
# remain legible on dark-blue cells.
text_threshold = cm_norm.max() / 2.0
for i in range(num_classes):
    for j in range(num_classes):
        ax.text(
            j, i, f"{cm_norm[i, j] * 100:.1f}%",
            ha="center", va="center",
            color="white" if cm_norm[i, j] > text_threshold else "black",
        )

ax.set_xlabel("Predicted")
ax.set_ylabel("True")
ax.set_title("Normalized Confusion Matrix")

plt.tight_layout()
plt.savefig("confusion_matrix_normalized.png", dpi=300)
plt.show()


In [None]:
# Class names and ids in matching, id-sorted order. Passing `labels`
# explicitly keeps the report aligned with `target_names` even when a class
# is missing from both y_true and y_pred (otherwise scikit-learn raises).
ordered_names = sorted(label2id, key=label2id.get)
ordered_ids = [label2id[name] for name in ordered_names]

report = classification_report(
    y_true, y_pred,
    labels=ordered_ids,
    target_names=ordered_names,
    output_dict=True,
    zero_division=0
)

class_f1 = {cls: report[cls]["f1-score"] for cls in ordered_names}

# A class with no true samples scores F1 = 0 but has nothing to inspect, so
# pick the worst class among those present in the test set when possible.
candidates = {
    cls: score for cls, score in class_f1.items()
    if report[cls]["support"] > 0
} or class_f1
worst_class = min(candidates, key=candidates.get)
worst_id = label2id[worst_class]

print("Worst class:", worst_class)

# Indices of samples whose true label is the worst class but were predicted
# as something else.
mis_idx = [i for i, (t, p) in enumerate(zip(y_true, y_pred)) if t == worst_id and p != worst_id]

print("Misclassified examples:", len(mis_idx))

# Show the first couple of failures for manual inspection.
for i in mis_idx[:2]:
    print("\n===================================")
    print("Index:", i)
    print("True label:", id2label[y_true[i]])
    print("Pred label:", id2label[int(y_pred[i])])
    print("Text:", texts[i])
