In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.

In [None]:
import json
import os
import zipfile
import random
import logging
from typing import List, Dict, Any

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    get_linear_schedule_with_warmup
)
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from tqdm.auto import tqdm
import numpy as np

# --- Model and file locations ---
# Hugging Face checkpoint used for both the tokenizer and the classifier.
MODEL_NAME = "microsoft/deberta-v3-base"
# Labelled development set; the main section splits it into train/validation.
DEV_FILE_PATH = "/content/mrbench_v3_devset.json"
# Test set; it is preprocessed with is_test_set=True, so no labels are read.
TEST_FILE_PATH = "/content/mrbench_v3_testset.json"
# Directory receiving the best checkpoint, the predictions JSON and its zip.
OUTPUT_DIR = "bea2025_track1_output"
PREDICTIONS_FILENAME = "predictions.json"
ZIP_FILENAME = "predictions.json.zip"

# --- Training hyperparameters ---
BATCH_SIZE = 8  # train/validation batch size; test inference uses twice this
LEARNING_RATE = 2e-5
EPOCHS = 8
MAX_SEQ_LENGTH = 512  # each example is padded/truncated to this many tokens
GRADIENT_ACCUMULATION_STEPS = 1  # batches accumulated per optimizer step
WARMUP_PROPORTION = 0.1  # fraction of total optimizer steps used for LR warmup
SEED = 42  # passed to set_seed() and to train_test_split(random_state=...)

# Class ids for the "Mistake_Identification" annotation, and the inverse map
# used to turn predicted ids back into label strings for the submission.
LABEL_MAP = {"Yes": 0, "To some extent": 1, "No": 2}
ID_TO_LABEL_MAP = {v: k for k, v in LABEL_MAP.items()}
NUM_LABELS = len(LABEL_MAP)

# Logging is configured here, but the code visible in this notebook reports
# progress with print(); `logger` is available for callers that prefer it.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

def set_seed(seed_value):
    """Seed Python, NumPy and PyTorch random number generators.

    When CUDA is available, all CUDA devices are seeded as well and cuDNN is
    switched to deterministic mode with benchmarking disabled.

    Args:
        seed_value: Integer seed applied to every generator.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed_value)

    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed_all(seed_value)
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

# Seed the RNGs at module level, before any data splitting, shuffling or
# model construction further down runs.
set_seed(SEED)

def load_data(file_path: str) -> List[Dict[str, Any]]:
    """Read a UTF-8 JSON file and return its parsed content.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON payload (callers treat it as a list of conversations).

    Raises:
        FileNotFoundError: The file does not exist (reported, then re-raised).
        json.JSONDecodeError: The file is not valid JSON (reported, then
            re-raised).
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            conversations = json.loads(handle.read())
    except json.JSONDecodeError:
        print(f"Error: Could not decode JSON from {file_path}")
        raise
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
        raise
    else:
        print(f"Loaded {len(conversations)} conversations from {file_path}")
        return conversations

def preprocess_data(raw_data: List[Dict[str, Any]], is_test_set: bool = False) -> List[Dict[str, Any]]:
    """Flatten conversations into one example per tutor response.

    Each example carries the conversation id, the tutor id and a single text
    made of the stripped conversation history followed by the stripped tutor
    response. For labelled data the "Mistake_Identification" annotation is
    mapped through LABEL_MAP and stored under "label"; responses whose label
    cannot be resolved are reported and left out.

    Args:
        raw_data: Conversations with "conversation_id",
            "conversation_history" and "tutor_responses" entries.
        is_test_set: When True, no annotation is read and no "label" is set.

    Returns:
        The list of flattened example dictionaries.
    """
    flattened = []
    wants_label = not is_test_set

    for dialogue in raw_data:
        dialogue_id = dialogue["conversation_id"]
        context = dialogue["conversation_history"].strip()

        for tutor_id, payload in dialogue["tutor_responses"].items():
            reply = payload["response"].strip()
            record = {
                "conversation_id": dialogue_id,
                "tutor_id": tutor_id,
                "text": f"Conversation History:\n{context}\n\nTutor Response:\n{reply}",
            }

            if wants_label:
                try:
                    verdict = payload["annotation"]["Mistake_Identification"]
                    record["label"] = LABEL_MAP[verdict]
                except KeyError:
                    print(f"Missing 'Mistake_Identification' annotation for {dialogue_id}/{tutor_id}. Skipping.")
                    continue
                except Exception as err:
                    print(f"Error processing annotation for {dialogue_id}/{tutor_id}: {err}. Skipping.")
                    continue

            flattened.append(record)

    print(f"Preprocessed into {len(flattened)} individual examples.")
    return flattened

class PedagogicalAbilityDataset(Dataset):
    """Torch dataset that tokenizes preprocessed examples on access.

    Every item is a dictionary with flattened "input_ids" and
    "attention_mask" tensors, "token_type_ids" when the tokenizer returns
    them, a long "labels" tensor unless the dataset is in test mode, and a
    "metadata" dictionary holding the conversation and tutor ids.
    """

    def __init__(self, data: List[Dict[str, Any]], tokenizer, max_length: int, is_test: bool = False):
        """Store the examples, the tokenizer and the tokenization settings.

        Args:
            data: Examples with "text", "conversation_id", "tutor_id" and,
                outside test mode, "label".
            tokenizer: Callable tokenizer invoked once per item.
            max_length: Length every sequence is padded/truncated to.
            is_test: When True, items carry no "labels" entry.
        """
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.is_test = is_test

    def __len__(self):
        """Return the number of examples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return its model inputs and metadata."""
        record = self.data[idx]
        encoded = self.tokenizer(
            record["text"],
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        # The tokenizer returns (1, max_length) tensors; drop the batch axis.
        sample = {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask')
        }
        if 'token_type_ids' in encoded:
            sample['token_type_ids'] = encoded['token_type_ids'].flatten()

        if not self.is_test:
            sample['labels'] = torch.tensor(record['label'], dtype=torch.long)

        sample['metadata'] = {
            key: record[key] for key in ('conversation_id', 'tutor_id')
        }
        return sample

def calculate_metrics(preds: np.ndarray, labels: np.ndarray) -> Dict[str, float]:
    """Compute accuracy plus strict and lenient macro-F1.

    The strict score is macro-F1 over the three class ids 0, 1 and 2. The
    lenient score first collapses both arrays to two classes (id 2 becomes 1,
    everything else becomes 0) and then takes macro-F1 over those.

    Args:
        preds: Predicted class ids.
        labels: Reference class ids.

    Returns:
        Dictionary with "accuracy", "f1_macro_strict" and "f1_macro_lenient".
    """
    def _collapse(class_ids):
        # Binary view: class id 2 versus the other two classes merged.
        return np.where(class_ids == 2, 1, 0)

    scores = {}
    scores["accuracy"] = accuracy_score(labels, preds)
    scores["f1_macro_strict"] = f1_score(
        labels, preds, average='macro', labels=[0, 1, 2], zero_division=0
    )
    scores["f1_macro_lenient"] = f1_score(
        _collapse(labels), _collapse(preds), average='macro', labels=[0, 1], zero_division=0
    )
    return scores

def train_epoch(model, data_loader, optimizer, scheduler, device, grad_accum_steps):
    """Run one training pass over ``data_loader`` and return the mean loss.

    Each batch loss is divided by ``grad_accum_steps`` before backpropagation.
    Gradients are clipped to norm 1.0 and the optimizer and scheduler are
    stepped every ``grad_accum_steps`` batches and on the final batch.

    Args:
        model: Model called with input_ids, attention_mask and labels; its
            output must expose ``.loss``.
        data_loader: Iterable of batches with "input_ids", "attention_mask"
            and "labels" tensors.
        optimizer: Optimizer updating the model parameters.
        scheduler: Learning-rate scheduler stepped after each optimizer step.
        device: Device the batch tensors are moved to.
        grad_accum_steps: Number of batches accumulated per optimizer step.

    Returns:
        The average (unscaled) batch loss over the epoch.
    """
    model.train()
    optimizer.zero_grad()

    batch_count = len(data_loader)
    running_loss = 0
    bar = tqdm(data_loader, desc="Training Epoch", leave=True)

    for step, batch in enumerate(bar, start=1):
        features = {
            name: batch[name].to(device)
            for name in ('input_ids', 'attention_mask', 'labels')
        }
        scaled_loss = model(**features).loss / grad_accum_steps
        scaled_loss.backward()

        # Update on every full accumulation window and on the last batch.
        if step % grad_accum_steps == 0 or step == batch_count:
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

        # Undo the accumulation scaling so the reported loss is per batch.
        batch_loss = scaled_loss.item() * grad_accum_steps
        running_loss += batch_loss
        bar.set_postfix({'loss': batch_loss})

    avg_loss = running_loss / batch_count
    print(f"Average Training Loss: {avg_loss:.4f}")
    return avg_loss

def evaluate(model, data_loader, device) -> Dict[str, float]:
    """Score the model on a labelled loader without tracking gradients.

    Predictions are the argmax over the logits. The mean batch loss is added
    to the metrics from ``calculate_metrics`` under "eval_loss", and a
    one-line summary is printed.

    Args:
        model: Model called with input_ids, attention_mask and labels; its
            output must expose ``.loss`` and ``.logits``.
        data_loader: Iterable of batches with "input_ids", "attention_mask"
            and "labels" tensors.
        device: Device the batch tensors are moved to.

    Returns:
        Dictionary with "accuracy", "f1_macro_strict", "f1_macro_lenient" and
        "eval_loss".
    """
    model.eval()
    predicted, expected = [], []
    loss_sum = 0
    bar = tqdm(data_loader, desc="Evaluating", leave=False)

    with torch.no_grad():
        for batch in bar:
            features = {
                name: batch[name].to(device)
                for name in ('input_ids', 'attention_mask', 'labels')
            }
            outputs = model(**features)
            loss_sum += outputs.loss.item()
            predicted.extend(torch.argmax(outputs.logits, dim=-1).cpu().numpy())
            expected.extend(features['labels'].cpu().numpy())

    avg_loss = loss_sum / len(data_loader)
    metrics = calculate_metrics(np.array(predicted), np.array(expected))
    metrics["eval_loss"] = avg_loss

    summary = (
        f"Evaluation Results: Loss={metrics['eval_loss']:.4f}, "
        f"Acc={metrics['accuracy']:.4f}, "
        f"F1_Strict={metrics['f1_macro_strict']:.4f}, "
        f"F1_Lenient={metrics['f1_macro_lenient']:.4f}"
    )
    print(summary)
    return metrics

if __name__ == "__main__":
    # End-to-end run: fine-tune on the development set, keep the checkpoint
    # with the best validation strict macro-F1, then label the test set and
    # package the predictions as a zipped JSON submission.
    print("Starting BEA 2025 Shared Task Track 1 Solution")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # ---- Load and split the labelled data ----
    print("Loading and preprocessing data...")
    raw_dev_data = load_data(DEV_FILE_PATH)
    processed_dev_data = preprocess_data(raw_dev_data, is_test_set=False)
    if not processed_dev_data:
        print("No valid development examples found after preprocessing. Exiting.")
        # Raise rather than call exit(): that helper is meant for interactive
        # shells and is not a reliable way to stop a running cell or script.
        raise SystemExit(1)

    # NOTE(review): the split is per tutor response, so responses that share a
    # conversation history can land in both train and validation; validation
    # scores may therefore be optimistic. Consider a conversation-grouped
    # split -- confirm before changing.
    try:
        train_data, val_data = train_test_split(
            processed_dev_data,
            test_size=0.1,
            random_state=SEED,
            stratify=[d['label'] for d in processed_dev_data]
        )
        print(f"Split dev data: {len(train_data)} train, {len(val_data)} validation examples.")
    except ValueError as e:
        # Stratification fails when a class has too few members; fall back to
        # a plain random split with the same seed.
        print(f"Could not stratify split (maybe too few samples per class?): {e}. Using random split.")
        train_data, val_data = train_test_split(
            processed_dev_data,
            test_size=0.1,
            random_state=SEED
        )
        print(f"Split dev data (random): {len(train_data)} train, {len(val_data)} validation examples.")

    # ---- Tokenizer, datasets and loaders ----
    print(f"Loading tokenizer: {MODEL_NAME}")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    print("Creating datasets and dataloaders...")
    train_dataset = PedagogicalAbilityDataset(train_data, tokenizer, MAX_SEQ_LENGTH)
    val_dataset = PedagogicalAbilityDataset(val_data, tokenizer, MAX_SEQ_LENGTH)
    train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

    # ---- Model, optimizer and LR schedule ----
    print(f"Loading pre-trained model: {MODEL_NAME}")
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=NUM_LABELS,
        id2label=ID_TO_LABEL_MAP,
        label2id=LABEL_MAP
    )
    model.to(device)
    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
    # train_epoch() also steps the optimizer on a trailing partial
    # accumulation window, so round up; flooring would under-count the steps
    # and let the linear schedule reach a zero learning rate too early.
    steps_per_epoch = (
        len(train_dataloader) + GRADIENT_ACCUMULATION_STEPS - 1
    ) // GRADIENT_ACCUMULATION_STEPS
    num_training_steps = steps_per_epoch * EPOCHS
    num_warmup_steps = int(num_training_steps * WARMUP_PROPORTION)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps
    )
    print(f"Optimizer and Scheduler configured. Total steps: {num_training_steps}, Warmup steps: {num_warmup_steps}")

    # ---- Training with best-checkpoint selection ----
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    best_f1_strict = -1.0  # below any achievable F1, so epoch 1 always saves
    best_epoch = -1
    best_model_path = os.path.join(OUTPUT_DIR, "best_model")
    print("Starting training...")
    for epoch in range(EPOCHS):
        print(f"--- Epoch {epoch + 1}/{EPOCHS} ---")
        train_epoch(model, train_dataloader, optimizer, scheduler, device, GRADIENT_ACCUMULATION_STEPS)
        eval_metrics = evaluate(model, val_dataloader, device)
        current_f1_strict = eval_metrics["f1_macro_strict"]
        if current_f1_strict > best_f1_strict:
            best_f1_strict = current_f1_strict
            best_epoch = epoch + 1
            print(f"New best model found! F1 Strict: {best_f1_strict:.4f}. Saving model to {best_model_path}")
            model.save_pretrained(best_model_path)
            tokenizer.save_pretrained(best_model_path)
        else:
            print(f"F1 Strict ({current_f1_strict:.4f}) did not improve from best ({best_f1_strict:.4f}).")
    print(f"Training finished. Best model from epoch {best_epoch} with F1 Strict: {best_f1_strict:.4f}")

    # ---- Inference on the test set with the best checkpoint ----
    print("--- Predicting on Test Set ---")
    print(f"Loading best model from {best_model_path}")
    model = AutoModelForSequenceClassification.from_pretrained(best_model_path)
    tokenizer = AutoTokenizer.from_pretrained(best_model_path)
    model.to(device)
    model.eval()
    raw_test_data = load_data(TEST_FILE_PATH)
    processed_test_data = preprocess_data(raw_test_data, is_test_set=True)
    if not processed_test_data:
        print("No test examples found after preprocessing. Cannot generate predictions.")
        raise SystemExit(1)
    test_dataset = PedagogicalAbilityDataset(processed_test_data, tokenizer, MAX_SEQ_LENGTH, is_test=True)
    # No gradients are kept at inference time, so a larger batch is used.
    test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE * 2, shuffle=False)
    all_predictions = []
    print("Running inference on the test set...")
    progress_bar_test = tqdm(test_dataloader, desc="Predicting", leave=False)
    with torch.no_grad():
        for batch in progress_bar_test:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=-1).cpu().numpy()
            # The collated metadata holds one sequence per field, aligned
            # with the batch order, so it can be walked alongside the preds.
            metadata = batch['metadata']
            for pred_label_id, conv_id, tutor_id in zip(
                preds, metadata['conversation_id'], metadata['tutor_id']
            ):
                all_predictions.append({
                    "conversation_id": conv_id,
                    "tutor_id": tutor_id,
                    "predicted_label": ID_TO_LABEL_MAP[int(pred_label_id)]
                })
    print(f"Generated {len(all_predictions)} predictions for the test set.")

    # ---- Rebuild the conversation structure for the submission file ----
    print("Formatting predictions for submission...")
    submission_data = {}
    test_conv_map = {conv["conversation_id"]: conv for conv in raw_test_data}
    for pred in all_predictions:
        conv_id = pred["conversation_id"]
        tutor_id = pred["tutor_id"]
        predicted_label = pred["predicted_label"]
        if conv_id not in submission_data:
            original_conv = test_conv_map.get(conv_id)
            if not original_conv:
                print(f"Original conversation {conv_id} not found in raw test data map. Skipping.")
                continue
            submission_data[conv_id] = {
                "conversation_id": conv_id,
                "conversation_history": original_conv["conversation_history"],
                "tutor_responses": {}
            }
        # Copy the untouched response text back from the raw test data.
        original_response_text = test_conv_map[conv_id]["tutor_responses"].get(tutor_id, {}).get("response", "RESPONSE_NOT_FOUND")
        if original_response_text == "RESPONSE_NOT_FOUND":
            print(f"Original response for {conv_id}/{tutor_id} not found. Using placeholder.")
        submission_data[conv_id]["tutor_responses"][tutor_id] = {
            "response": original_response_text,
            "annotation": {
                "Mistake_Identification": predicted_label
            }
        }
    final_submission_list = list(submission_data.values())

    # ---- Write the JSON file and the zip archive ----
    output_json_path = os.path.join(OUTPUT_DIR, PREDICTIONS_FILENAME)
    output_zip_path = os.path.join(OUTPUT_DIR, ZIP_FILENAME)
    print(f"Saving predictions to {output_json_path}")
    with open(output_json_path, 'w', encoding='utf-8') as f:
        json.dump(final_submission_list, f, indent=2, ensure_ascii=False)
    print(f"Zipping predictions to {output_zip_path}")
    with zipfile.ZipFile(output_zip_path, 'w', zipfile.ZIP_DEFLATED) as zf:
        # arcname keeps the JSON at the archive root instead of under OUTPUT_DIR.
        zf.write(output_json_path, arcname=PREDICTIONS_FILENAME)
    print("--- Task Completed ---")
    print(f"Best model saved in: {best_model_path}")
    print(f"Predictions saved to: {output_json_path}")
    print(f"Submission zip file saved to: {output_zip_path}")

Starting BEA 2025 Shared Task Track 1 Solution
Using device: cuda
Loading and preprocessing data...
Loaded 300 conversations from /content/mrbench_v3_devset.json
Preprocessed into 2476 individual examples.
Split dev data: 2228 train, 248 validation examples.
Loading tokenizer: microsoft/deberta-v3-base


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



Creating datasets and dataloaders...
Loading pre-trained model: microsoft/deberta-v3-base


pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Optimizer and Scheduler configured. Total steps: 2232, Warmup steps: 223
Starting training...
--- Epoch 1/8 ---


Training Epoch:   0%|          | 0/279 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/371M [00:00<?, ?B/s]

Average Training Loss: 0.7610


Evaluating:   0%|          | 0/31 [00:00<?, ?it/s]

Evaluation Results: Loss=0.6572, Acc=0.7823, F1_Strict=0.2926, F1_Lenient=0.4597
New best model found! F1 Strict: 0.2926. Saving model to bea2025_track1_output/best_model
--- Epoch 2/8 ---


Training Epoch:   0%|          | 0/279 [00:00<?, ?it/s]

Average Training Loss: 0.6030


Evaluating:   0%|          | 0/31 [00:00<?, ?it/s]

Evaluation Results: Loss=0.4650, Acc=0.8548, F1_Strict=0.5232, F1_Lenient=0.8057
New best model found! F1 Strict: 0.5232. Saving model to bea2025_track1_output/best_model
--- Epoch 3/8 ---


Training Epoch:   0%|          | 0/279 [00:00<?, ?it/s]

Average Training Loss: 0.4935


Evaluating:   0%|          | 0/31 [00:00<?, ?it/s]

Evaluation Results: Loss=0.4723, Acc=0.8589, F1_Strict=0.5500, F1_Lenient=0.8466
New best model found! F1 Strict: 0.5500. Saving model to bea2025_track1_output/best_model
--- Epoch 4/8 ---


Training Epoch:   0%|          | 0/279 [00:00<?, ?it/s]

Average Training Loss: 0.4256


Evaluating:   0%|          | 0/31 [00:00<?, ?it/s]

Evaluation Results: Loss=0.5775, Acc=0.8629, F1_Strict=0.6443, F1_Lenient=0.8117
New best model found! F1 Strict: 0.6443. Saving model to bea2025_track1_output/best_model
--- Epoch 5/8 ---


Training Epoch:   0%|          | 0/279 [00:00<?, ?it/s]

Average Training Loss: 0.3936


Evaluating:   0%|          | 0/31 [00:00<?, ?it/s]

Evaluation Results: Loss=0.5316, Acc=0.8750, F1_Strict=0.6227, F1_Lenient=0.8533
F1 Strict (0.6227) did not improve from best (0.6443).
--- Epoch 6/8 ---


Training Epoch:   0%|          | 0/279 [00:00<?, ?it/s]

Average Training Loss: 0.3354


Evaluating:   0%|          | 0/31 [00:00<?, ?it/s]

Evaluation Results: Loss=0.5922, Acc=0.8629, F1_Strict=0.6786, F1_Lenient=0.8495
New best model found! F1 Strict: 0.6786. Saving model to bea2025_track1_output/best_model
--- Epoch 7/8 ---


Training Epoch:   0%|          | 0/279 [00:00<?, ?it/s]

Average Training Loss: 0.3080


Evaluating:   0%|          | 0/31 [00:00<?, ?it/s]

Evaluation Results: Loss=0.6199, Acc=0.8548, F1_Strict=0.6718, F1_Lenient=0.8466
F1 Strict (0.6718) did not improve from best (0.6786).
--- Epoch 8/8 ---


Training Epoch:   0%|          | 0/279 [00:00<?, ?it/s]

Average Training Loss: 0.2750


Evaluating:   0%|          | 0/31 [00:00<?, ?it/s]

Evaluation Results: Loss=0.6070, Acc=0.8750, F1_Strict=0.6934, F1_Lenient=0.8565
New best model found! F1 Strict: 0.6934. Saving model to bea2025_track1_output/best_model
Training finished. Best model from epoch 8 with F1 Strict: 0.6934
--- Predicting on Test Set ---
Loading best model from bea2025_track1_output/best_model
Loaded 191 conversations from /content/mrbench_v3_testset.json
Preprocessed into 1547 individual examples.
Running inference on the test set...


Predicting:   0%|          | 0/97 [00:00<?, ?it/s]

Generated 1547 predictions for the test set.
Formatting predictions for submission...
Saving predictions to bea2025_track1_output/predictions.json
Zipping predictions to bea2025_track1_output/predictions.json.zip
--- Task Completed ---
Best model saved in: bea2025_track1_output/best_model
Predictions saved to: bea2025_track1_output/predictions.json
Submission zip file saved to: bea2025_track1_output/predictions.json.zip
