In [None]:
import os
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel
from torch.optim import AdamW
from sklearn.metrics import f1_score, accuracy_score
from torch.cuda.amp import autocast, GradScaler

# Reproducibility: push one shared seed into every RNG this notebook touches
# (Python's `random`, NumPy's legacy global RNG, and torch on CPU and all GPUs).
SEED = 42
for _seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed_all,
):
    _seed_fn(SEED)
del _seed_fn  # keep the notebook namespace identical to the hand-unrolled version

# Ask cuDNN for deterministic kernels and switch off its autotuner.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# --- Experiment configuration -------------------------------------------------
# MODEL_NAME is the checkpoint id handed to both the tokenizer below and the
# backbone loaded inside DecoderClassifier.
MODEL_NAME = "gpt2-xl"
# Width of the classification head; PedagogyDataset keeps only labels 0, 1 and 2.
NUM_CLASSES = 3
BATCH_SIZE = 32
EPOCHS = 5
# Learning rate passed to AdamW.
LR = 2e-5

# --- Tokenizer ------------------------------------------------------------------
# The EOS token is reused as the padding token and padding is applied on the
# left, so real tokens always sit at the end of each fixed-length sequence.
from transformers import GPT2Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

# --- Data -----------------------------------------------------------------------
# Both CSVs are read relative to the working directory. PedagogyDataset reads the
# "response" (text) and "mistake_identification" (label) columns from them.
train_df = pd.read_csv("mistake_identification_train.csv")
val_df = pd.read_csv("mistake_identification_val.csv")
class PedagogyDataset(Dataset):
    """Text-classification dataset over the "mistake identification" CSVs.

    Rows whose ``mistake_identification`` value is not one of 0, 1 or 2 are
    dropped. Each item is tokenized on access and padded/truncated to a fixed
    length so the default collate function can stack a batch.

    Args:
        df: DataFrame with a ``response`` column (text) and a
            ``mistake_identification`` column (label).
        tokenizer: Callable tokenizer accepting ``truncation``, ``padding``,
            ``max_length`` and ``return_tensors`` keyword arguments and
            returning a mapping with ``input_ids`` and ``attention_mask``
            tensors of shape ``(1, max_length)``.
        max_length: Fixed sequence length every example is padded or
            truncated to. Defaults to 64, the value previously hard-coded.
    """

    def __init__(self, df, tokenizer, max_length=64):
        # Boolean-mask indexing already yields a new frame, and it is only
        # read from here on, so no explicit copy is required.
        valid_rows = df[df['mistake_identification'].isin([0, 1, 2])]
        self.texts = valid_rows["response"].tolist()
        self.labels = valid_rows["mistake_identification"].tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows that survived label filtering."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return 1-D tensors plus its label.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (the tokenizer's
            ``(1, max_length)`` outputs with the leading dimension squeezed
            away) and ``labels`` (a scalar ``torch.long`` tensor).
        """
        encodings = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors="pt"
        )
        return {
            # return_tensors="pt" adds a batch dimension of 1; drop it so the
            # DataLoader stacks items into (batch, max_length).
            'input_ids': encodings["input_ids"].squeeze(0),
            'attention_mask': encodings["attention_mask"].squeeze(0),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

# Wrap each split in a PedagogyDataset that shares the module-level tokenizer.
train_dataset, val_dataset = (
    PedagogyDataset(split_df, tokenizer) for split_df in (train_df, val_df)
)

# Only the training loader shuffles; validation keeps file order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

class DecoderClassifier(nn.Module):
    """Sequence classifier: pretrained transformer + mean pooling + linear head.

    Args:
        model_name: Checkpoint id passed to ``AutoModel.from_pretrained``.
        num_classes: Number of output logits.
        dropout: Dropout probability applied to the pooled representation.
    """

    def __init__(self, model_name=MODEL_NAME, num_classes=3, dropout=0.1):
        super().__init__()
        self.transformer = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout)
        hidden_size = self.transformer.config.hidden_size
        self.classifier = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask=None):
        """Return class logits of shape ``(batch, num_classes)``.

        Args:
            input_ids: Token ids, shape ``(batch, seq_len)``.
            attention_mask: Optional mask of the same shape, 1 for real tokens
                and 0 for padding. When omitted, every position is treated as
                a real token.

        Inputs are moved to the device holding the classifier weights, so
        callers may pass tensors living on any device.
        """
        device = self.classifier.weight.device
        input_ids = input_ids.to(device)
        if attention_mask is None:
            # The signature advertises None as valid; previously this path
            # crashed on `None.to(device)`.
            attention_mask = torch.ones_like(input_ids)
        else:
            attention_mask = attention_mask.to(device)

        # Only the final layer is pooled, so per-layer hidden states are not
        # requested; `last_hidden_state` is returned regardless.
        outputs = self.transformer(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=True
        )
        hidden_states = outputs.last_hidden_state  # shape (B, T, H)

        # Masked mean over the sequence axis: padding contributes nothing to
        # the sum and is excluded from the token count.
        mask = attention_mask.unsqueeze(-1).to(hidden_states.dtype)  # (B, T, 1)
        summed = (hidden_states * mask).sum(dim=1)                   # (B, H)
        # Clamp so a fully masked row yields zeros instead of NaN.
        token_counts = mask.sum(dim=1).clamp(min=1.0)                # (B, 1)
        pooled_output = summed / token_counts

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits


# Put the model on the GPU when one is available. The loops below call
# `.cuda()` on every batch and wrap the forward pass in CUDA autocast /
# GradScaler, but without this move the weights stay on the CPU and
# `DecoderClassifier.forward` silently copies each batch back to the CPU,
# so mixed precision has no effect and training runs on the CPU.
# NOTE(review): full fine-tuning of gpt2-xl with AdamW needs a large-memory
# GPU; lower BATCH_SIZE or use a smaller checkpoint if this runs out of memory.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DecoderClassifier(model_name=MODEL_NAME, num_classes=NUM_CLASSES).to(device)

loss_fn = nn.CrossEntropyLoss()
# The optimizer is built after the move so it references the on-device parameters.
optimizer = AdamW(model.parameters(), lr=LR)
# Gradient scaler paired with the autocast regions in the training loop.
scaler = GradScaler()

# Loss histories filled by the training loop and by evaluate(); read by the
# loss-curve cell further down.
train_losses = []
val_losses = []

def evaluate(model, dataloader):
    """Run one evaluation pass over ``dataloader``.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` and
            returning logits of shape ``(batch, num_classes)``.
        dataloader: Iterable of batches with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.

    Returns:
        Tuple ``(accuracy, macro_f1, mean_loss)`` where ``mean_loss`` is the
        per-batch loss averaged over the batches of ``dataloader``.

    Side effects:
        Switches ``model`` to eval mode and appends ``mean_loss`` to the
        module-level ``val_losses`` list used by the loss-curve plot.
    """
    model.eval()
    y_true, y_pred = [], []
    val_running_loss = 0.0
    with torch.no_grad():
        for batch in dataloader:
            with autocast():
                # The model moves its inputs to its own device, so labels only
                # need to follow the logits.
                logits = model(batch['input_ids'], batch['attention_mask'])
                labels = batch['labels'].to(logits.device)
                loss = loss_fn(logits, labels)
            val_running_loss += loss.item()
            preds = torch.argmax(logits, dim=1)
            y_true.extend(labels.cpu().tolist())
            y_pred.extend(preds.cpu().tolist())

    # Average over the loader that was actually iterated. The previous code
    # divided by the global `val_loader`, which is wrong for any other loader.
    num_batches = max(len(dataloader), 1)
    val_loss = val_running_loss / num_batches
    val_losses.append(val_loss)
    acc = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average='macro')
    return acc, f1, val_loss

# Fine-tuning loop: one mixed-precision pass over train_loader per epoch,
# followed by an evaluation pass on val_loader.
for epoch in range(EPOCHS):
    model.train()
    running_loss = 0.0
    for batch in train_loader:
        optimizer.zero_grad()

        with autocast():
            # The model moves its inputs to its own device; labels follow the logits.
            logits = model(batch['input_ids'], batch['attention_mask'])
            labels = batch['labels'].to(logits.device)
            loss = loss_fn(logits, labels)

        # Scale the loss before backward, then let the scaler unscale and
        # apply (or skip) the optimizer step.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        running_loss += loss.item()
        torch.cuda.empty_cache()

    # Record ONE value per epoch. Previously this ran inside the batch loop,
    # so train_losses received a partial sum per batch and did not line up
    # with the per-epoch val_losses on the loss plot.
    train_loss = running_loss / max(len(train_loader), 1)
    train_losses.append(train_loss)

    acc, f1, val_loss = evaluate(model, val_loader)
    print(f"Epoch {epoch+1}/{EPOCHS} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | Acc: {acc:.4f} | F1: {f1:.4f}")

  scaler = GradScaler()
  with autocast():
  with autocast():


Epoch 1/5 | Train Loss: 0.6845 | Val Loss: 0.5731 | Acc: 0.8499 | F1: 0.5017


  with autocast():
  with autocast():


Epoch 2/5 | Train Loss: 0.5657 | Val Loss: 0.5291 | Acc: 0.8357 | F1: 0.5106


  with autocast():
  with autocast():


Epoch 3/5 | Train Loss: 0.5155 | Val Loss: 0.4853 | Acc: 0.8479 | F1: 0.5222


  with autocast():
  with autocast():


Epoch 4/5 | Train Loss: 0.4520 | Val Loss: 0.4460 | Acc: 0.8621 | F1: 0.6251


  with autocast():
  with autocast():


Epoch 5/5 | Train Loss: 0.4000 | Val Loss: 0.4816 | Acc: 0.8580 | F1: 0.6341


In [None]:
import matplotlib.pyplot as plt

# Draw both recorded loss histories on a single set of axes.
fig, ax = plt.subplots()
ax.plot(train_losses, label='Train Loss')
ax.plot(val_losses, label='Validation Loss')
ax.set(xlabel="Epoch", ylabel="Loss", title="Train vs Validation Loss")
ax.legend()
ax.grid(True)
plt.show()


In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

def plot_conf_matrix(model, dataloader, class_names=("No", "To some extent", "Yes")):
    """Predict over ``dataloader`` and draw the confusion matrix.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` and
            returning logits of shape ``(batch, num_classes)``.
        dataloader: Iterable of batches with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.
        class_names: Display name for each class id, in id order
            (index 0 names class 0, and so on).

    Side effects:
        Switches ``model`` to eval mode and shows a matplotlib figure.
    """
    model.eval()
    y_true, y_pred = [], []
    with torch.no_grad():
        for batch in dataloader:
            # The model moves its inputs to its own device.
            logits = model(batch['input_ids'], batch['attention_mask'])
            preds = torch.argmax(logits, dim=1)
            y_true.extend(batch['labels'].cpu().tolist())
            y_pred.extend(preds.cpu().tolist())

    # Pin the label set so the matrix always has one row/column per class.
    # Without `labels=`, a class absent from both truth and predictions is
    # dropped and the matrix no longer matches the display labels.
    class_ids = list(range(len(class_names)))
    cm = confusion_matrix(y_true, y_pred, labels=class_ids)
    disp = ConfusionMatrixDisplay(cm, display_labels=list(class_names))
    disp.plot(cmap="Blues", values_format="d")
    plt.title("Confusion Matrix (Validation Set)")
    plt.show()

# Draw the confusion matrix for `model` on the validation loader.
plot_conf_matrix(model, val_loader)