In [None]:
import json
import os
import zipfile
import random
import logging
from typing import List, Dict, Any, Optional
import math
import re 
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    get_linear_schedule_with_warmup,
)
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from tqdm.auto import tqdm
import numpy as np
from collections import Counter
import pandas as pd
import sklearn

# Hugging Face checkpoint name used to load both the tokenizer and the classifier.
MODEL_NAME = "microsoft/deberta-v3-large"
# Input JSON files: the labelled dev set (split into train/val below) and the test set.
DEV_FILE_PATH = "/kaggle/input/bae-acl-dataset/mrbench_v3_devset.json"
TEST_FILE_PATH = "/kaggle/input/bae-acl-dataset/mrbench_v3_testset.json"
# Directory receiving the best checkpoint, the predictions JSON and the zip archive.
OUTPUT_DIR = "bea2025_track5_output_manual"
PREDICTIONS_FILENAME = "predictions.json"
ZIP_FILENAME = "predictions.json.zip"

# Display name used in banners and log messages.
TASK_TRACK_NAME = "Track 5 - Tutor Identification (Manual Loop)"
# Key under which the predicted label is written inside each response's "annotation" dict.
ANNOTATION_KEY = "Tutor_Identification"

# Class names; the position in this list is the integer class id.
OFFICIAL_LABELS = [
    "Expert", "Gemini", "GPT4", "Llama31405B", "Llama318B",
    "Mistral", "Novice", "Phi3", "Sonnet"
]
# Name -> id and id -> name lookups derived from OFFICIAL_LABELS.
LABEL_MAP = {label: i for i, label in enumerate(OFFICIAL_LABELS)}
ID_TO_LABEL_MAP = {v: k for k, v in LABEL_MAP.items()}
NUM_LABELS = len(LABEL_MAP)

# Maps the tutor keys accepted in the dev set (including spelling variants)
# to an official label name; keys not listed here are skipped by preprocess_data.
DEV_SET_KEY_TO_LABEL = {
    "Expert": "Expert", "Novice": "Novice", "Human_Expert": "Expert",
    "Human_Novice": "Novice", "Gemini": "Gemini", "GPT4": "GPT4",
    "GPT-4": "GPT4", "Llama31405B": "Llama31405B",
    "Llama-3.1-405B": "Llama31405B", "Llama-3-1-405B": "Llama31405B",
    "Llama318B": "Llama318B", "Llama-3.1-8B": "Llama318B",
    "Llama-3-1-8B": "Llama318B", "Mistral": "Mistral", "Phi3": "Phi3",
    "Phi-3": "Phi3", "Sonnet": "Sonnet",
}

# Training hyperparameters.
# The optimizer steps once every GRADIENT_ACCUMULATION_STEPS batches of BATCH_SIZE examples.
BATCH_SIZE = 2
LEARNING_RATE = 1.8e-5
WEIGHT_DECAY = 0.01
EPOCHS = 10
# Every example is padded/truncated to this many tokens.
MAX_SEQ_LENGTH = 512
GRADIENT_ACCUMULATION_STEPS = 2
# Fraction of the total optimizer steps spent in linear warmup.
WARMUP_PROPORTION = 0.1
# Seed for the RNGs and for the train/validation split.
SEED = 42

# Configure root logging once; messages are additionally echoed with print() throughout.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

def set_seed(seed_value):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed_value: Seed passed unchanged to every generator. The CUDA
            generators are seeded as well when CUDA is available.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed_value)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed_value)

# Seed every RNG once, before any data splitting or model initialization runs.
set_seed(SEED)

def load_data(file_path: str) -> List[Dict[str, Any]]:
    """Read a UTF-8 JSON file and return its parsed content.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The object produced by ``json.load``; its length is reported as the
        number of conversations.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist (logged, then re-raised).
        json.JSONDecodeError: If the file is not valid JSON (logged, then re-raised).
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            conversations = json.load(handle)
    except (FileNotFoundError, json.JSONDecodeError) as failure:
        # Report the problem on both channels, then let the caller see the original error.
        if isinstance(failure, FileNotFoundError):
            error_message = f"Error: File not found at {file_path}"
        else:
            error_message = f"Error: Could not decode JSON from {file_path}"
        logger.error(error_message)
        print(error_message)
        raise

    summary = f"Loaded {len(conversations)} conversations from {file_path}"
    logger.info(summary)
    print(summary)
    return conversations

def preprocess_data(raw_data: List[Dict[str, Any]], is_test_set: bool = False) -> List[Dict[str, Any]]:
    """Flatten conversations into one classification example per tutor response.

    Each example's ``text`` is the conversation history followed by a single
    tutor response. A response is skipped when it is empty after stripping;
    for the dev set it is also skipped when its tutor key has no entry in
    ``DEV_SET_KEY_TO_LABEL`` (or maps to a name absent from ``LABEL_MAP``).

    Args:
        raw_data: Parsed conversations. Each must carry ``conversation_id``;
            ``conversation_history`` and ``tutor_responses`` are optional.
        is_test_set: When True no ``label`` is attached and tutor keys are
            not filtered.

    Returns:
        A list of dicts with ``conversation_id``, ``tutor_id`` and ``text``,
        plus an integer ``label`` for dev examples.

    Raises:
        KeyError: If a conversation has no ``conversation_id``.
    """
    processed_examples = []
    skipped_count = 0
    unmapped_tutors = Counter()
    for conversation in raw_data:
        conv_id = conversation["conversation_id"]
        # `or ""` / `or {}` make an explicit JSON null behave like a missing
        # key; `.get(key, default)` alone only covers the missing-key case.
        history = (conversation.get("conversation_history") or "").strip()
        tutor_responses = conversation.get("tutor_responses") or {}
        for tutor_key, response_data in tutor_responses.items():
            tutor_response = ((response_data or {}).get("response") or "").strip()
            if not tutor_response:
                skipped_count += 1
                continue
            combined_text = f"Conversation History:\n{history}\n\nTutor Response:\n{tutor_response}"
            example = {"conversation_id": conv_id, "tutor_id": tutor_key, "text": combined_text}
            if not is_test_set:
                official_label = DEV_SET_KEY_TO_LABEL.get(tutor_key)
                if not official_label or official_label not in LABEL_MAP:
                    # Unknown tutor key: count it for the summary and drop the example.
                    unmapped_tutors[tutor_key] += 1
                    skipped_count += 1
                    continue
                example["label"] = LABEL_MAP[official_label]
            processed_examples.append(example)

    log_message = f"Preprocessed into {len(processed_examples)} examples."
    if skipped_count > 0:
        log_message += f" Skipped {skipped_count}."
    if unmapped_tutors:
        log_message += f" Unmapped keys: {dict(unmapped_tutors)}"
    logger.info(log_message)
    print(log_message)
    return processed_examples

class TutorIdentificationDataset(Dataset):
    """Map-style dataset that tokenizes one preprocessed example per index.

    Each item is a dict holding flattened ``input_ids`` and
    ``attention_mask`` tensors, ``token_type_ids`` when the tokenizer returns
    them, a ``labels`` tensor unless ``is_test`` is set, and a ``metadata``
    dict carrying the example's ``conversation_id`` and ``tutor_id``.
    """

    def __init__(self, data: List[Dict[str, Any]], tokenizer, max_length: int, is_test: bool = False):
        """Store the examples, the tokenizer callable and the tokenization settings."""
        self.data, self.tokenizer = data, tokenizer
        self.max_length, self.is_test = max_length, is_test

    def __len__(self):
        """Return the number of examples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize example ``idx``.

        Raises:
            ValueError: If the dataset is not a test set and the example has
                no ``label`` entry.
        """
        record = self.data[idx]
        encoded = self.tokenizer(
            record["text"],
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # The tokenizer is asked for 'pt' tensors; flatten() drops the leading
        # dimension so each item is a flat tensor.
        sample = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}

        if 'token_type_ids' in encoded:
            segment_ids = encoded['token_type_ids']
            if segment_ids is not None:
                sample['token_type_ids'] = segment_ids.flatten()

        if not self.is_test:
            if 'label' not in record:
                raise ValueError(f"Missing label for dev item at index {idx}: {record}")
            sample['labels'] = torch.tensor(record['label'], dtype=torch.long)

        sample['metadata'] = {
            'conversation_id': record['conversation_id'],
            'tutor_id': record['tutor_id'],
        }
        return sample

def calculate_metrics(preds: np.ndarray, labels: np.ndarray) -> Dict[str, float]:
    """Compute accuracy and macro-averaged F1 for predicted class ids.

    Args:
        preds: Predicted class ids.
        labels: Reference class ids.

    Returns:
        Dict with ``accuracy`` and ``f1_macro``. The F1 average is taken over
        all ``NUM_LABELS`` class ids, with ``zero_division=0``.
    """
    every_class_id = list(range(NUM_LABELS))
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1_macro": f1_score(
            labels,
            preds,
            labels=every_class_id,
            average='macro',
            zero_division=0,
        ),
    }

def train_epoch_manual(model, data_loader, loss_fct, optimizer, scheduler, device, grad_accum_steps, epoch_num, total_epochs):
    """Train the model for one epoch with gradient accumulation.

    Args:
        model: Classifier whose forward output exposes ``.logits``.
        data_loader: Sized iterable of batches with ``input_ids``,
            ``attention_mask``, ``labels`` and optionally ``token_type_ids``.
        loss_fct: Callable ``(logits, labels) -> loss`` tensor.
        optimizer: Optimizer stepped once per accumulation window.
        scheduler: Learning-rate scheduler stepped together with the optimizer.
        device: Device the batch tensors are moved to.
        grad_accum_steps: Number of batches accumulated per optimizer step.
        epoch_num: Current epoch number, used only in printed/logged text.
        total_epochs: Total epoch count, used only in printed text.

    Returns:
        The mean of the per-batch (unscaled) losses over the epoch.
    """
    model.train()
    running_loss = 0
    optimizer.zero_grad()

    print(f"\n--- Training Epoch {epoch_num}/{total_epochs} ---")
    batches = tqdm(data_loader, desc=f"Epoch {epoch_num} Training", leave=False)

    for step, batch in enumerate(batches, start=1):
        segment_ids = batch.get('token_type_ids')
        forward_kwargs = {
            'input_ids': batch['input_ids'].to(device),
            'attention_mask': batch['attention_mask'].to(device),
            'token_type_ids': segment_ids.to(device) if segment_ids is not None else None,
        }
        targets = batch['labels'].to(device)

        # Scale the loss so the gradients summed over a window match one larger batch.
        scaled_loss = loss_fct(model(**forward_kwargs).logits, targets) / grad_accum_steps
        scaled_loss.backward()

        # Step at the end of each window, and on the last batch so a trailing
        # partial window is not dropped.
        window_complete = step % grad_accum_steps == 0
        if window_complete or step == len(data_loader):
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

        # Undo the scaling for reporting.
        unscaled_loss = scaled_loss.item() * grad_accum_steps
        running_loss += unscaled_loss
        batches.set_postfix({'loss': f"{unscaled_loss:.4f}"})

    mean_loss = running_loss / len(data_loader)
    epoch_summary = f"Epoch {epoch_num} Average Training Loss (per batch): {mean_loss:.4f}"
    print(epoch_summary)
    logger.info(epoch_summary)
    return mean_loss

def evaluate_manual(model, data_loader, device, loss_fct, epoch_num=None):
    """Run one evaluation pass and report loss, accuracy and macro F1.

    Args:
        model: Classifier whose forward output exposes ``.logits``.
        data_loader: Iterable of batches with ``input_ids``,
            ``attention_mask``, ``labels`` and optionally ``token_type_ids``.
        device: Device the batch tensors are moved to.
        loss_fct: Callable ``(logits, labels) -> loss`` tensor.
        epoch_num: Optional epoch number, used only in printed/logged text.

    Returns:
        Dict with ``accuracy`` and ``f1_macro`` (from ``calculate_metrics``)
        and ``eval_loss``, the mean of the per-batch losses. When the loader
        yields no batches, accuracy and F1 are 0.0 and ``eval_loss`` is NaN.
    """
    model.eval()
    all_preds = []
    all_labels = []
    total_eval_loss = 0
    num_batches = 0

    epoch_str = f"Epoch {epoch_num} " if epoch_num is not None else ""
    print(f"\n--- {epoch_str}Evaluation ---")
    progress_bar = tqdm(data_loader, desc=f"{epoch_str}Evaluating", leave=False)

    with torch.no_grad():
        for batch in progress_bar:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            token_type_ids = batch.get('token_type_ids')
            if token_type_ids is not None:
                token_type_ids = token_type_ids.to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
            logits = outputs.logits

            batch_loss = loss_fct(logits, labels).item()
            total_eval_loss += batch_loss
            num_batches += 1

            preds = torch.argmax(logits, dim=-1).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())
            progress_bar.set_postfix({'eval_loss': f"{batch_loss:.4f}"})

    if num_batches:
        metrics = calculate_metrics(np.array(all_preds), np.array(all_labels))
        metrics["eval_loss"] = total_eval_loss / num_batches
    else:
        # An empty loader used to raise ZeroDivisionError here, which made the
        # "No predictions." report below unreachable.
        logger.warning("Evaluation data loader yielded no batches.")
        metrics = {"accuracy": 0.0, "f1_macro": 0.0, "eval_loss": float("nan")}

    result_message = (
        f"{epoch_str}Evaluation Results:"
        f"\n - Eval Loss (Weighted): {metrics['eval_loss']:.4f}"
        f"\n - Accuracy: {metrics['accuracy']:.4f}"
        f"\n - F1 Macro: {metrics['f1_macro']:.4f} (Primary Metric)"
    )
    logger.info(result_message)
    print(result_message)

    print("\nValidation Prediction distribution:")
    total_p = len(all_preds)
    if total_p > 0:
        # minlength keeps classes that were never predicted in the report.
        label_counts = np.bincount(np.array(all_preds), minlength=NUM_LABELS)
        for label_id, count in enumerate(label_counts):
            perc = count / total_p * 100
            print(f"  {ID_TO_LABEL_MAP.get(label_id, '?')}: {count} ({perc:.1f}%)")
    else:
        print("  No predictions.")

    return metrics

# Script entry point: fine-tune on the dev set, keep the checkpoint with the
# best validation macro F1, predict the test set, then write, verify and zip
# the submission file.
if __name__ == "__main__":
    print("\n" + "="*80)
    print(f" BEA 2025 SHARED TASK {TASK_TRACK_NAME} SOLUTION ".center(80, "="))
    print("="*80 + "\n")

    logger.info(f"Starting BEA 2025 Shared Task {TASK_TRACK_NAME} Solution")

    # Use the GPU when one is available, otherwise fall back to the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    logger.info(f"Using device: {device}")
    print(f"Using device: {device}")
    if torch.cuda.is_available(): print(f"GPU: {torch.cuda.get_device_name(0)}")

    # ---- Data preparation: load the dev set and flatten it to examples ----
    print("\n" + "-"*80 + "\n DATA PREPARATION \n" + "-"*80)
    logger.info("Loading and preprocessing data...")
    print("Loading and preprocessing data...")
    raw_dev_data = load_data(DEV_FILE_PATH)
    processed_dev_data = preprocess_data(raw_dev_data, is_test_set=False)
    if not processed_dev_data:
        logger.error("No valid dev examples found. Exiting."); print("ERROR: No dev data."); exit()

    # Hold out 15% for validation, stratified by label; if stratification is
    # rejected with a ValueError, fall back to a plain random split.
    try:
        labels_for_stratify = [d['label'] for d in processed_dev_data]
        train_data, val_data = train_test_split(processed_dev_data, test_size=0.15,
                                                random_state=SEED, stratify=labels_for_stratify)
    except ValueError as e:
        logger.warning(f"Stratify failed: {e}. Using random split."); print(f"WARN: Stratify failed: {e}. Random split.")
        train_data, val_data = train_test_split(processed_dev_data, test_size=0.15, random_state=SEED)
    logger.info(f"Split: {len(train_data)} train, {len(val_data)} val.")
    print(f"Split: {len(train_data)} train, {len(val_data)} val.")

    # Inverse-frequency class weights: total / (NUM_LABELS * count).
    print("\nCalculating class weights...")
    train_labels = [d['label'] for d in train_data]
    if not train_labels: logger.error("No labels in train split."); print("ERROR: No train labels."); exit()
    class_counts = Counter(train_labels)
    total_samples = len(train_labels)
    class_weights = []
    print("Training set label distribution:")
    for i in range(NUM_LABELS):
        count = class_counts.get(i, 0)
        label_name = ID_TO_LABEL_MAP.get(i, f"Class_{i}")
        percentage = (count / total_samples * 100) if total_samples > 0 else 0
        print(f"  {label_name}: {count} ({percentage:.1f}%)")
        # The 1e-6 term avoids a division by zero for a class absent from the
        # train split (such a class then receives a very large weight).
        weight = total_samples / (NUM_LABELS * (count + 1e-6))
        class_weights.append(weight)
    class_weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(device)
    print(f"\nClass weights (on device {device}): {class_weights_tensor.cpu().numpy().round(3)}")
    logger.info(f"Class weights: {class_weights_tensor.cpu().numpy().round(3).tolist()}")

    # ---- Tokenizer, datasets and data loaders ----
    print("\n" + "-"*80 + "\n TOKENIZER AND DATALOADERS \n" + "-"*80)
    logger.info(f"Loading tokenizer: {MODEL_NAME}")
    print(f"Loading tokenizer: {MODEL_NAME}")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    logger.info("Creating datasets and dataloaders...")
    print("Creating datasets and dataloaders...")
    train_dataset = TutorIdentificationDataset(train_data, tokenizer, MAX_SEQ_LENGTH)
    val_dataset = TutorIdentificationDataset(val_data, tokenizer, MAX_SEQ_LENGTH)
    # Only the training loader shuffles; validation uses twice the batch size.
    train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2, pin_memory=True)
    val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE * 2, shuffle=False, num_workers=2, pin_memory=True)

    # ---- Model ----
    print("\n" + "-"*80 + "\n MODEL INITIALIZATION \n" + "-"*80)
    logger.info(f"Loading model: {MODEL_NAME} for {NUM_LABELS} labels.")
    print(f"Loading model: {MODEL_NAME} for {NUM_LABELS} labels.")
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME, num_labels=NUM_LABELS, id2label=ID_TO_LABEL_MAP,
        label2id=LABEL_MAP, ignore_mismatched_sizes=True
    )
    model.to(device)
    print(f"Model moved to {device}")

    # ---- Loss, optimizer and learning-rate schedule ----
    print("\n" + "-"*80 + "\n OPTIMIZER SETUP \n" + "-"*80)
    loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights_tensor)
    print(f"Using weighted CrossEntropyLoss.")
    logger.info("Using weighted CrossEntropyLoss.")

    # Parameters whose name contains one of these substrings get no weight decay.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': WEIGHT_DECAY},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=LEARNING_RATE)
    print(f"Optimizer: AdamW, LR: {LEARNING_RATE}, Weight Decay: {WEIGHT_DECAY}")

    # One optimizer step per accumulation window; ceil() counts the trailing
    # partial window, which train_epoch_manual also steps on.
    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / GRADIENT_ACCUMULATION_STEPS)
    num_training_steps = num_update_steps_per_epoch * EPOCHS
    num_warmup_steps = int(num_training_steps * WARMUP_PROPORTION)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)
    print(f"Scheduler: Linear warmup ({num_warmup_steps} steps) then decay. Total steps: {num_training_steps}")

    # ---- Training loop with best-checkpoint selection on validation macro F1 ----
    print("\n" + "="*80 + "\n TRAINING (Manual Loop) \n" + "="*80)
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    # Starts below any reachable F1 so the first epoch always produces a checkpoint.
    best_f1_macro = -1.0
    best_epoch = -1
    best_model_path = os.path.join(OUTPUT_DIR, "best_model_manual")

    logger.info("Starting manual training loop...")
    print("Starting manual training loop...")

    for epoch in range(EPOCHS):
        epoch_num = epoch + 1
        print(f"\n===== EPOCH {epoch_num}/{EPOCHS} =====")

        train_loss = train_epoch_manual(model, train_dataloader, loss_fct, optimizer, scheduler, device, GRADIENT_ACCUMULATION_STEPS, epoch_num, EPOCHS)

        eval_metrics = evaluate_manual(model, val_dataloader, device, loss_fct, epoch_num)
        current_f1_macro = eval_metrics["f1_macro"]

        # Strict ">" keeps the earliest epoch on ties; the checkpoint directory
        # is overwritten on every improvement.
        if current_f1_macro > best_f1_macro:
            best_f1_macro = current_f1_macro
            best_epoch = epoch_num
            logger.info(f"*** New best model found! Epoch: {best_epoch}, F1 Macro: {best_f1_macro:.4f} ***")
            print(f"*** New best model found! Saving to {best_model_path} ***")
            model.save_pretrained(best_model_path)
            tokenizer.save_pretrained(best_model_path)
        else:
             logger.info(f"F1 Macro ({current_f1_macro:.4f}) did not improve from best ({best_f1_macro:.4f} from epoch {best_epoch}).")
             print(f"F1 Macro did not improve from best ({best_f1_macro:.4f}).")

    logger.info(f"Training finished. Best model from epoch {best_epoch} with F1 Macro: {best_f1_macro:.4f}")
    print(f"\nTraining finished. Best model from epoch {best_epoch} saved to {best_model_path}")

    # ---- Test-set prediction with the best checkpoint ----
    print("\n" + "="*80 + "\n TEST SET PREDICTION \n" + "="*80)

    logger.info("Loading best model for test set prediction...")
    print(f"Loading best model from {best_model_path}...")

    # Reload from disk so inference uses the best epoch, not the last one.
    try:
        model = AutoModelForSequenceClassification.from_pretrained(best_model_path)
        tokenizer = AutoTokenizer.from_pretrained(best_model_path)
        model.to(device)
        model.eval()
        print("Best model loaded successfully.")
    except Exception as e:
        logger.error(f"Failed to load best model from {best_model_path}: {e}. Exiting.")
        print(f"ERROR: Failed to load best model: {e}. Cannot predict.")
        exit()

    logger.info("Loading and preprocessing test data...")
    print("Loading and preprocessing test data...")
    raw_test_data = load_data(TEST_FILE_PATH)
    processed_test_data = preprocess_data(raw_test_data, is_test_set=True)

    # Flat list of {"conversation_id", "tutor_id", "predicted_label"} dicts.
    all_predictions_formatted = []
    if not processed_test_data:
        logger.warning("No test examples found after preprocessing.")
        print("WARNING: No test data to predict on.")
    else:
        test_dataset = TutorIdentificationDataset(processed_test_data, tokenizer, MAX_SEQ_LENGTH, is_test=True)
        test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE * 2, shuffle=False, num_workers=2, pin_memory=True)

        logger.info("Running inference on test set...")
        print("Running inference on test set...")
        progress_bar_test = tqdm(test_dataloader, desc="Predicting", leave=False)
        all_pred_ids = []
        all_metadata = []

        with torch.no_grad():
            for batch in progress_bar_test:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                token_type_ids = batch.get('token_type_ids')
                if token_type_ids is not None: token_type_ids = token_type_ids.to(device)

                outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
                logits = outputs.logits
                preds = torch.argmax(logits, dim=-1).cpu().numpy()
                all_pred_ids.extend(preds)

                # Collect the per-example metadata back out of the collated batch.
                # NOTE(review): this presumably relies on the ids being strings;
                # numeric ids would likely be collated into tensors and then not
                # match the keys of test_conv_map below - confirm against the data.
                batch_size = input_ids.size(0)
                for i in range(batch_size):
                     all_metadata.append({
                         'conversation_id': batch['metadata']['conversation_id'][i],
                         'tutor_id': batch['metadata']['tutor_id'][i]
                     })

        if len(all_pred_ids) != len(all_metadata):
             warning_message = f"Mismatch: {len(all_pred_ids)} predictions vs {len(all_metadata)} metadata items."
             logger.warning(warning_message); print(f"WARNING: {warning_message}")

        # Pair each predicted class id with its metadata and convert the id to a label name.
        for i, pred_id in enumerate(all_pred_ids):
            if i < len(all_metadata):
                meta = all_metadata[i]
                pred_label_str = ID_TO_LABEL_MAP[pred_id]
                all_predictions_formatted.append({
                    "conversation_id": meta['conversation_id'],
                    "tutor_id": meta['tutor_id'],
                    "predicted_label": pred_label_str
                })
            else:
                 logger.warning(f"Skipping prediction index {i} due to metadata length mismatch.")

        predictions_message = f"Generated {len(all_predictions_formatted)} predictions."
        logger.info(predictions_message); print(f"\n{predictions_message}")

        if all_predictions_formatted:
            pred_labels_list = [p["predicted_label"] for p in all_predictions_formatted]
            label_counts = Counter(pred_labels_list)
            print("\nTest prediction distribution:")
            total_p = len(pred_labels_list)
            for label_name in OFFICIAL_LABELS:
                 count = label_counts.get(label_name, 0)
                 perc = (count / total_p * 100) if total_p > 0 else 0
                 print(f"  {label_name}: {count} ({perc:.1f}%)")
        else: print("No predictions generated.")

    # ---- Regroup the flat predictions into one record per conversation ----
    print("\n" + "-"*80 + "\n FORMATTING PREDICTIONS \n" + "-"*80)
    logger.info("Formatting predictions for submission...")
    print("Formatting predictions for submission...")
    submission_data_dict = {}
    test_conv_map = {conv["conversation_id"]: conv for conv in raw_test_data}

    for pred in all_predictions_formatted:
        conv_id = pred["conversation_id"]
        tutor_id = pred["tutor_id"]
        predicted_label = pred["predicted_label"]
        if conv_id not in submission_data_dict:
            original_conv = test_conv_map.get(conv_id)
            if not original_conv: logger.warning(f"Conv {conv_id} not in test map."); print(f"W: Conv {conv_id} missing."); continue
            submission_data_dict[conv_id] = {
                "conversation_id": conv_id,
                "conversation_history": original_conv.get("conversation_history", "NF"),
                "tutor_responses": {}
            }
        # "NF" is the placeholder written when the original text cannot be found.
        original_response_text = "NF"
        if conv_id in test_conv_map and tutor_id in test_conv_map[conv_id].get("tutor_responses", {}):
             original_response_text = test_conv_map[conv_id]["tutor_responses"][tutor_id].get("response", "NF")
        submission_data_dict[conv_id]["tutor_responses"][tutor_id] = {
            "response": original_response_text,
            "annotation": {ANNOTATION_KEY: predicted_label}
        }

    final_submission_list = list(submission_data_dict.values())
    logger.info(f"Formatted {len(final_submission_list)} conversations.")
    print(f"Formatted {len(final_submission_list)} conversations.")

    # ---- Write the JSON file, verify completeness, then zip ----
    print("\n" + "-"*80 + "\n SAVING & CHECKING \n" + "-"*80)
    output_json_path = os.path.join(OUTPUT_DIR, PREDICTIONS_FILENAME)
    output_zip_path = os.path.join(OUTPUT_DIR, ZIP_FILENAME)

    logger.info(f"Saving predictions to {output_json_path}")
    print(f"Saving predictions to {output_json_path}")
    # A failed write is only reported here; the zip step below checks that the file exists.
    try:
        with open(output_json_path, 'w', encoding='utf-8') as f:
            json.dump(final_submission_list, f, indent=2, ensure_ascii=False)
        print(f"Successfully saved {output_json_path}")
    except Exception as e:
        logger.error(f"Failed to save predictions JSON: {e}"); print(f"ERROR saving JSON: {e}")

    # Collect every (conversation, tutor) pair that actually received a prediction.
    print("\n--- Checking Completeness ---")
    predicted_pairs = set()
    for item in final_submission_list:
        conv_id = item['conversation_id']
        tutor_responses = item.get('tutor_responses', {})
        for tutor_id in tutor_responses:
            if ANNOTATION_KEY in tutor_responses[tutor_id].get("annotation", {}):
                 predicted_pairs.add((conv_id, tutor_id))
            else: logger.warning(f"Key {ANNOTATION_KEY} missing for {conv_id}/{tutor_id}"); print(f"W: Key miss {conv_id}/{tutor_id}")

    # Every test response with a truthy "response" value is expected to have a prediction.
    # NOTE(review): preprocess_data skips responses that are empty after
    # strip(), so a whitespace-only response is counted as missing here.
    missing_predictions = []
    total_expected_responses = 0
    expected_pairs = set()
    for conv in raw_test_data:
        conv_id = conv['conversation_id']
        tutor_responses_original = conv.get('tutor_responses', {})
        for tutor_id, resp_data in tutor_responses_original.items():
            if resp_data.get("response"):
                 expected_pair = (conv_id, tutor_id)
                 if expected_pair not in expected_pairs:
                     expected_pairs.add(expected_pair)
                     total_expected_responses += 1
                     if expected_pair not in predicted_pairs:
                         missing_predictions.append(expected_pair)

    # The zip is created only when nothing is missing (or nothing was expected).
    create_zip = False
    if not missing_predictions and total_expected_responses > 0:
        logger.info(f"SUCCESS: All {total_expected_responses} expected responses predicted.")
        print(f"SUCCESS: All {total_expected_responses} predictions present.")
        create_zip = True
    elif total_expected_responses == 0:
        logger.info("INFO: Test set appears empty or has no responses.")
        print("INFO: Test set empty or no responses found.")
        create_zip = True
    else:
        logger.error(f"ERROR: {len(missing_predictions)} missing predictions out of {total_expected_responses}.")
        print(f"ERROR: {len(missing_predictions)} missing out of {total_expected_responses}.")
        for i, (mcid, mtid) in enumerate(missing_predictions[:10]): print(f"  - Missing: ('{mcid}', '{mtid}')")
        if len(missing_predictions) > 10: print("  ...")

    final_zip_path = None
    if create_zip and os.path.exists(output_json_path):
        print("\n--- Zipping Predictions ---")
        logger.info(f"Zipping {output_json_path} to {output_zip_path}")
        print(f"Zipping predictions to {output_zip_path}")
        try:
            with zipfile.ZipFile(output_zip_path, 'w', zipfile.ZIP_DEFLATED) as zf:
                # arcname stores the JSON at the archive root, without the output directory prefix.
                zf.write(output_json_path, arcname=PREDICTIONS_FILENAME)
            logger.info(f"Successfully created zip file: {output_zip_path}")
            print(f"Successfully created zip file: {output_zip_path}")
            final_zip_path = output_zip_path
        except Exception as e:
            logger.error(f"Failed creating zip file: {e}"); print(f"ERROR zipping: {e}")
    elif not os.path.exists(output_json_path):
         logger.error(f"JSON file {output_json_path} not found. Cannot create zip.")
         print(f"ERROR: JSON not found at {output_json_path}. Zip not created.")
    else:
        logger.error("Zip file NOT created due to missing predictions.")
        print("\nZip file NOT created due to missing predictions.")

    # ---- Final summary of the produced artifacts ----
    print("\n" + "="*80)
    print(f" {TASK_TRACK_NAME} TASK COMPLETED ".center(80, "="))
    print("="*80)
    logger.info(f"{TASK_TRACK_NAME} Task Completed")
    print(f"Best model saved in: {best_model_path}")
    logger.info(f"Best model saved in: {best_model_path}")
    print(f"Predictions saved to: {output_json_path if os.path.exists(output_json_path) else 'File not created'}")
    logger.info(f"Predictions saved to: {output_json_path if os.path.exists(output_json_path) else 'File not created'}")
    if final_zip_path:
        print(f"Submission zip file saved to: {final_zip_path}")
        logger.info(f"Submission zip file saved to: {final_zip_path}")
    else:
        print(f"Submission zip file was NOT created.")
        logger.warning(f"Submission zip file was NOT created.")


== BEA 2025 SHARED TASK Track 5 - Tutor Identification (Manual Loop) SOLUTION ==

Using device: cuda
GPU: Tesla P100-PCIE-16GB

--------------------------------------------------------------------------------
 DATA PREPARATION 
--------------------------------------------------------------------------------
Loading and preprocessing data...
Loaded 300 conversations from /kaggle/input/bae-acl-dataset/mrbench_v3_devset.json
Preprocessed into 2476 examples.
Split: 2104 train, 372 val.

Calculating class weights...
Training set label distribution:
  Expert: 255 (12.1%)
  Gemini: 255 (12.1%)
  GPT4: 255 (12.1%)
  Llama31405B: 255 (12.1%)
  Llama318B: 255 (12.1%)
  Mistral: 255 (12.1%)
  Novice: 64 (3.0%)
  Phi3: 255 (12.1%)
  Sonnet: 255 (12.1%)

Class weights (on device cuda): [0.917 0.917 0.917 0.917 0.917 0.917 3.653 0.917 0.917]

--------------------------------------------------------------------------------
 TOKENIZER AND DATALOADERS 
-------------------------------------------------

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



Creating datasets and dataloaders...

--------------------------------------------------------------------------------
 MODEL INITIALIZATION 
--------------------------------------------------------------------------------
Loading model: microsoft/deberta-v3-large for 9 labels.


2025-04-22 12:52:36.620600: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745326356.846080      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745326356.922465      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/874M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/874M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model moved to cuda

--------------------------------------------------------------------------------
 OPTIMIZER SETUP 
--------------------------------------------------------------------------------
Using weighted CrossEntropyLoss.
Optimizer: AdamW, LR: 1.8e-05, Weight Decay: 0.01
Scheduler: Linear warmup (526 steps) then decay. Total steps: 5260

 TRAINING (Manual Loop) 
Starting manual training loop...

===== EPOCH 1/10 =====

--- Training Epoch 1/10 ---


Epoch 1 Training:   0%|          | 0/1052 [00:00<?, ?it/s]

Epoch 1 Average Training Loss (per batch): 2.1282

--- Epoch 1 Evaluation ---


Epoch 1 Evaluating:   0%|          | 0/93 [00:00<?, ?it/s]

Epoch 1 Evaluation Results:
 - Eval Loss (Weighted): 1.7058
 - Accuracy: 0.3656
 - F1 Macro: 0.3491 (Primary Metric)

Validation Prediction distribution:
  Expert: 28 (7.5%)
  Gemini: 92 (24.7%)
  GPT4: 11 (3.0%)
  Llama31405B: 1 (0.3%)
  Llama318B: 70 (18.8%)
  Mistral: 0 (0.0%)
  Novice: 16 (4.3%)
  Phi3: 22 (5.9%)
  Sonnet: 132 (35.5%)
*** New best model found! Saving to bea2025_track5_output_manual/best_model_manual ***

===== EPOCH 2/10 =====

--- Training Epoch 2/10 ---


Epoch 2 Training:   0%|          | 0/1052 [00:00<?, ?it/s]

Epoch 2 Average Training Loss (per batch): 0.9946

--- Epoch 2 Evaluation ---


Epoch 2 Evaluating:   0%|          | 0/93 [00:00<?, ?it/s]

Epoch 2 Evaluation Results:
 - Eval Loss (Weighted): 0.4576
 - Accuracy: 0.8333
 - F1 Macro: 0.8459 (Primary Metric)

Validation Prediction distribution:
  Expert: 48 (12.9%)
  Gemini: 40 (10.8%)
  GPT4: 45 (12.1%)
  Llama31405B: 58 (15.6%)
  Llama318B: 39 (10.5%)
  Mistral: 38 (10.2%)
  Novice: 12 (3.2%)
  Phi3: 46 (12.4%)
  Sonnet: 46 (12.4%)
*** New best model found! Saving to bea2025_track5_output_manual/best_model_manual ***

===== EPOCH 3/10 =====

--- Training Epoch 3/10 ---


Epoch 3 Training:   0%|          | 0/1052 [00:00<?, ?it/s]

Epoch 3 Average Training Loss (per batch): 0.5282

--- Epoch 3 Evaluation ---


Epoch 3 Evaluating:   0%|          | 0/93 [00:00<?, ?it/s]

Epoch 3 Evaluation Results:
 - Eval Loss (Weighted): 0.5463
 - Accuracy: 0.8548
 - F1 Macro: 0.8590 (Primary Metric)

Validation Prediction distribution:
  Expert: 41 (11.0%)
  Gemini: 48 (12.9%)
  GPT4: 50 (13.4%)
  Llama31405B: 60 (16.1%)
  Llama318B: 20 (5.4%)
  Mistral: 38 (10.2%)
  Novice: 12 (3.2%)
  Phi3: 49 (13.2%)
  Sonnet: 54 (14.5%)
*** New best model found! Saving to bea2025_track5_output_manual/best_model_manual ***

===== EPOCH 4/10 =====

--- Training Epoch 4/10 ---


Epoch 4 Training:   0%|          | 0/1052 [00:00<?, ?it/s]

Epoch 4 Average Training Loss (per batch): 0.3012

--- Epoch 4 Evaluation ---


Epoch 4 Evaluating:   0%|          | 0/93 [00:00<?, ?it/s]

Epoch 4 Evaluation Results:
 - Eval Loss (Weighted): 0.6390
 - Accuracy: 0.8683
 - F1 Macro: 0.8771 (Primary Metric)

Validation Prediction distribution:
  Expert: 42 (11.3%)
  Gemini: 50 (13.4%)
  GPT4: 47 (12.6%)
  Llama31405B: 50 (13.4%)
  Llama318B: 35 (9.4%)
  Mistral: 38 (10.2%)
  Novice: 12 (3.2%)
  Phi3: 54 (14.5%)
  Sonnet: 44 (11.8%)
*** New best model found! Saving to bea2025_track5_output_manual/best_model_manual ***

===== EPOCH 5/10 =====

--- Training Epoch 5/10 ---


Epoch 5 Training:   0%|          | 0/1052 [00:00<?, ?it/s]

Epoch 5 Average Training Loss (per batch): 0.1723

--- Epoch 5 Evaluation ---


Epoch 5 Evaluating:   0%|          | 0/93 [00:00<?, ?it/s]

Epoch 5 Evaluation Results:
 - Eval Loss (Weighted): 0.5922
 - Accuracy: 0.8952
 - F1 Macro: 0.9020 (Primary Metric)

Validation Prediction distribution:
  Expert: 43 (11.6%)
  Gemini: 50 (13.4%)
  GPT4: 47 (12.6%)
  Llama31405B: 46 (12.4%)
  Llama318B: 36 (9.7%)
  Mistral: 41 (11.0%)
  Novice: 12 (3.2%)
  Phi3: 48 (12.9%)
  Sonnet: 49 (13.2%)
*** New best model found! Saving to bea2025_track5_output_manual/best_model_manual ***

===== EPOCH 6/10 =====

--- Training Epoch 6/10 ---


Epoch 6 Training:   0%|          | 0/1052 [00:00<?, ?it/s]

Epoch 6 Average Training Loss (per batch): 0.0810

--- Epoch 6 Evaluation ---


Epoch 6 Evaluating:   0%|          | 0/93 [00:00<?, ?it/s]

Epoch 6 Evaluation Results:
 - Eval Loss (Weighted): 0.6729
 - Accuracy: 0.8817
 - F1 Macro: 0.8901 (Primary Metric)

Validation Prediction distribution:
  Expert: 42 (11.3%)
  Gemini: 46 (12.4%)
  GPT4: 46 (12.4%)
  Llama31405B: 57 (15.3%)
  Llama318B: 32 (8.6%)
  Mistral: 46 (12.4%)
  Novice: 12 (3.2%)
  Phi3: 44 (11.8%)
  Sonnet: 47 (12.6%)
F1 Macro did not improve from best (0.9020).

===== EPOCH 7/10 =====

--- Training Epoch 7/10 ---


Epoch 7 Training:   0%|          | 0/1052 [00:00<?, ?it/s]

Epoch 7 Average Training Loss (per batch): 0.0430

--- Epoch 7 Evaluation ---


Epoch 7 Evaluating:   0%|          | 0/93 [00:00<?, ?it/s]

Epoch 7 Evaluation Results:
 - Eval Loss (Weighted): 0.6945
 - Accuracy: 0.8817
 - F1 Macro: 0.8875 (Primary Metric)

Validation Prediction distribution:
  Expert: 41 (11.0%)
  Gemini: 47 (12.6%)
  GPT4: 51 (13.7%)
  Llama31405B: 56 (15.1%)
  Llama318B: 28 (7.5%)
  Mistral: 39 (10.5%)
  Novice: 12 (3.2%)
  Phi3: 48 (12.9%)
  Sonnet: 50 (13.4%)
F1 Macro did not improve from best (0.9020).

===== EPOCH 8/10 =====

--- Training Epoch 8/10 ---


Epoch 8 Training:   0%|          | 0/1052 [00:00<?, ?it/s]

Epoch 8 Average Training Loss (per batch): 0.0143

--- Epoch 8 Evaluation ---


Epoch 8 Evaluating:   0%|          | 0/93 [00:00<?, ?it/s]

Epoch 8 Evaluation Results:
 - Eval Loss (Weighted): 0.7091
 - Accuracy: 0.8817
 - F1 Macro: 0.8893 (Primary Metric)

Validation Prediction distribution:
  Expert: 41 (11.0%)
  Gemini: 48 (12.9%)
  GPT4: 49 (13.2%)
  Llama31405B: 51 (13.7%)
  Llama318B: 34 (9.1%)
  Mistral: 43 (11.6%)
  Novice: 12 (3.2%)
  Phi3: 46 (12.4%)
  Sonnet: 48 (12.9%)
F1 Macro did not improve from best (0.9020).

===== EPOCH 9/10 =====

--- Training Epoch 9/10 ---


Epoch 9 Training:   0%|          | 0/1052 [00:00<?, ?it/s]

Epoch 9 Average Training Loss (per batch): 0.0086

--- Epoch 9 Evaluation ---


Epoch 9 Evaluating:   0%|          | 0/93 [00:00<?, ?it/s]

Epoch 9 Evaluation Results:
 - Eval Loss (Weighted): 0.7371
 - Accuracy: 0.8925
 - F1 Macro: 0.9003 (Primary Metric)

Validation Prediction distribution:
  Expert: 42 (11.3%)
  Gemini: 48 (12.9%)
  GPT4: 46 (12.4%)
  Llama31405B: 52 (14.0%)
  Llama318B: 36 (9.7%)
  Mistral: 42 (11.3%)
  Novice: 12 (3.2%)
  Phi3: 43 (11.6%)
  Sonnet: 51 (13.7%)
F1 Macro did not improve from best (0.9020).

===== EPOCH 10/10 =====

--- Training Epoch 10/10 ---


Epoch 10 Training:   0%|          | 0/1052 [00:00<?, ?it/s]

Epoch 10 Average Training Loss (per batch): 0.0064

--- Epoch 10 Evaluation ---


Epoch 10 Evaluating:   0%|          | 0/93 [00:00<?, ?it/s]

Epoch 10 Evaluation Results:
 - Eval Loss (Weighted): 0.7337
 - Accuracy: 0.8898
 - F1 Macro: 0.8975 (Primary Metric)

Validation Prediction distribution:
  Expert: 41 (11.0%)
  Gemini: 47 (12.6%)
  GPT4: 48 (12.9%)
  Llama31405B: 52 (14.0%)
  Llama318B: 36 (9.7%)
  Mistral: 43 (11.6%)
  Novice: 12 (3.2%)
  Phi3: 44 (11.8%)
  Sonnet: 49 (13.2%)
F1 Macro did not improve from best (0.9020).

Training finished. Best model from epoch 5 saved to bea2025_track5_output_manual/best_model_manual

 TEST SET PREDICTION 
Loading best model from bea2025_track5_output_manual/best_model_manual...
Best model loaded successfully.
Loading and preprocessing test data...
Loaded 191 conversations from /kaggle/input/bae-acl-dataset/mrbench_v3_testset.json
Preprocessed into 1547 examples.
Running inference on test set...


Predicting:   0%|          | 0/387 [00:00<?, ?it/s]


Generated 1547 predictions.

Test prediction distribution:
  Expert: 220 (14.2%)
  Gemini: 212 (13.7%)
  GPT4: 182 (11.8%)
  Llama31405B: 188 (12.2%)
  Llama318B: 171 (11.1%)
  Mistral: 177 (11.4%)
  Novice: 15 (1.0%)
  Phi3: 197 (12.7%)
  Sonnet: 185 (12.0%)

--------------------------------------------------------------------------------
 FORMATTING PREDICTIONS 
--------------------------------------------------------------------------------
Formatting predictions for submission...
Formatted 191 conversations.

--------------------------------------------------------------------------------
 SAVING & CHECKING 
--------------------------------------------------------------------------------
Saving predictions to bea2025_track5_output_manual/predictions.json
Successfully saved bea2025_track5_output_manual/predictions.json

--- Checking Completeness ---
SUCCESS: All 1547 predictions present.

--- Zipping Predictions ---
Zipping predictions to bea2025_track5_output_manual/predictions.js