# NLP Final Project: Dataset Cartography for Artifact Mitigation
## Fast GPU Training in Google Colab

This notebook runs the complete training pipeline using GPU acceleration for fast results.

## 1. Setup Environment

In [20]:
# Report whether a CUDA device is visible and, if so, its name and total memory
import torch

cuda_ready = torch.cuda.is_available()
print(f"CUDA available: {cuda_ready}")
if cuda_ready:
    # Device index 0 is the only GPU queried here.
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {gpu_props.total_memory / 1e9:.1f} GB")

CUDA available: True
GPU: Tesla T4
Memory: 15.8 GB


In [21]:
# Install required packages
!pip install datasets transformers torch evaluate matplotlib seaborn scipy



In [22]:
# Clone repository
!git clone https://github.com/agsilver108/nlp-fa25-final-project.git
%cd nlp-fa25-final-project

Cloning into 'nlp-fa25-final-project'...
remote: Enumerating objects: 48, done.[K
remote: Counting objects: 100% (48/48), done.[K
remote: Compressing objects: 100% (40/40), done.[K
remote: Total 48 (delta 7), reused 47 (delta 6), pack-reused 0 (from 0)[K
Receiving objects: 100% (48/48), 78.74 KiB | 3.94 MiB/s, done.
Resolving deltas: 100% (7/7), done.
/content/nlp-fa25-final-project/nlp-fa25-final-project


## 2. Run Fast Training

In [29]:
# Run the complete training pipeline.
# The script is read through a context manager so the file handle is closed, decoded
# explicitly as UTF-8 (it contains non-ASCII text), and compiled with its real file
# name so tracebacks point at colab_training.py instead of "<string>".
with open('colab_training.py', encoding='utf-8') as training_script:
    exec(compile(training_script.read(), 'colab_training.py', 'exec'))

🚀 Starting Colab GPU Training...
Device: cuda
GPU: Tesla T4
Memory: 15.8 GB
📦 Loading model and tokenizer...
📊 Loading SQuAD dataset...
Training samples: 10000
Evaluation samples: 1000
🔄 Preprocessing datasets...
Preprocessing completed in 0.0s


TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'

In [None]:
#!/usr/bin/env python3
"""
Colab Training Script for NLP Final Project
Optimized for GPU training with baseline and cartography mitigation.
"""

import os
import json
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForQuestionAnswering,
    TrainingArguments,
    set_seed,
    Trainer # Import base Trainer
)
from datasets import load_dataset
import time
import collections
from tqdm.auto import tqdm

# Import our custom modules (will remove custom trainers)
from helpers import prepare_train_dataset_qa, prepare_validation_dataset_qa, postprocess_qa_predictions
from train_with_cartography import CartographyWeightedTrainer # Keep for mitigated training if needed
# from train_with_cartography import load_cartography_weights # Move load_cartography_weights import inside the if block

def run_colab_training():
    """Train and compare a baseline and a cartography-weighted QA model on a SQuAD subset.

    Steps, in order:
      1. Report the available device and seed everything with 42.
      2. Load the tokenizer and SQuAD, keep the first 10,000 training and 1,000
         validation examples, and tokenize them with the project helpers.
      3. Fine-tune a baseline model with the stock ``Trainer`` and evaluate it.
      4. If the cartography weights file exists, fine-tune a second model with
         ``CartographyWeightedTrainer``, evaluate it and print a comparison.
      5. Write the collected metrics to /content/colab_training_results.json.
         The baseline section is always written; the cartography and improvement
         sections are only present when step 4 ran.

    Returns:
        None. Metrics are printed and saved as JSON; checkpoints are written
        under /content/baseline_model and /content/cartography_model.
    """

    print("üöÄ Starting Colab GPU Training...")

    # Check GPU
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    print(f"Device: {device}")
    if use_cuda:
        print(f"GPU: {torch.cuda.get_device_name(0)}")
        print(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

    # Set seed for reproducibility
    set_seed(42)

    # Configuration
    model_name = "google/electra-small-discriminator"

    # Load model and tokenizer
    print("üì¶ Loading model and tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Load dataset
    print("üìä Loading SQuAD dataset...")
    dataset = load_dataset("squad")

    # Prepare datasets with reasonable subset for fast training
    train_dataset = dataset['train'].select(range(10000))  # 10K for speed
    eval_dataset = dataset['validation'].select(range(1000))   # 1K for eval

    print(f"Training samples: {len(train_dataset)}")
    print(f"Evaluation samples: {len(eval_dataset)}")

    # Preprocessing functions (bind the tokenizer so they fit Dataset.map's signature)
    def prepare_train_dataset(examples):
        return prepare_train_dataset_qa(examples, tokenizer)

    def prepare_eval_dataset(examples):
        return prepare_validation_dataset_qa(examples, tokenizer)

    # Preprocess datasets
    print("üîÑ Preprocessing datasets...")
    start_time = time.time()

    train_dataset_processed = train_dataset.map(
        prepare_train_dataset,
        batched=True,
        remove_columns=train_dataset.column_names
    )

    eval_dataset_processed = eval_dataset.map(
        prepare_eval_dataset,
        batched=True,
        remove_columns=eval_dataset.column_names
    )

    print(f"Preprocessing completed in {time.time() - start_time:.1f}s")

    def build_training_args(output_dir):
        """Return the hyperparameters shared by both runs; only the checkpoint directory differs."""
        return TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=2,
            per_device_train_batch_size=32,
            per_device_eval_batch_size=64,
            learning_rate=3e-5,
            warmup_steps=200,
            logging_steps=50,
            eval_strategy="epoch",
            save_strategy="epoch",
            load_best_model_at_end=True,
            metric_for_best_model="eval_f1",
            # Mixed precision needs a CUDA device; fall back to full precision on CPU
            # so the CPU path selected above does not fail at argument validation.
            fp16=use_cuda,
            dataloader_pin_memory=True,
            dataloader_num_workers=2,
            save_total_limit=2,
            report_to=[],
            seed=42,
        )

    # Build the metric and the gold references once; neither changes between evaluations.
    from evaluate import load
    squad_metric = load("squad")
    references = [{"id": ex["id"], "answers": ex["answers"]} for ex in eval_dataset]

    def compute_metrics(p):
        """Turn raw start/end logits into SQuAD exact-match and F1, keyed with an 'eval_' prefix."""
        n_best_size = 20
        # max_answer_length is handled within postprocess_qa_predictions using QA_MAX_ANSWER_LENGTH
        start_logits, end_logits = p.predictions

        predictions = postprocess_qa_predictions(
            examples=eval_dataset,  # raw examples from the enclosing scope
            features=eval_dataset_processed,  # tokenized features from the enclosing scope
            predictions=(start_logits, end_logits),
            n_best_size=n_best_size,
        )

        # Format predictions for the metric
        formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()]

        # Compute metrics and return with 'eval_' prefix
        qa_metrics = squad_metric.compute(predictions=formatted_predictions, references=references)
        return {f"eval_{k}": v for k, v in qa_metrics.items()}

    def summarize(eval_results, training_time):
        """Pick the reported metrics out of a Trainer.evaluate() result, defaulting to 0 when absent."""
        return {
            "exact_match": eval_results.get('eval_exact_match', 0),
            "f1": eval_results.get('eval_f1', 0),
            "training_time": training_time
        }

    # 1. Train baseline model
    print("\nüéØ Training Baseline Model...")
    baseline_model = AutoModelForQuestionAnswering.from_pretrained(model_name)

    # NOTE(review): presumably the stock Trainer only invokes compute_metrics when the
    # evaluation batches carry labels. If prepare_validation_dataset_qa drops the answer
    # positions, 'eval_f1' would never be produced and load_best_model_at_end could fail.
    # TODO confirm against helpers.py.
    baseline_trainer = Trainer(
        model=baseline_model,
        args=build_training_args("/content/baseline_model"),
        train_dataset=train_dataset_processed,
        eval_dataset=eval_dataset_processed,
        compute_metrics=compute_metrics
    )

    baseline_start = time.time()
    baseline_trainer.train()
    baseline_time = time.time() - baseline_start

    # Evaluate baseline
    baseline = summarize(baseline_trainer.evaluate(), baseline_time)
    print(f"‚úÖ Baseline training completed in {baseline_time:.1f}s")
    print(f"Baseline Results: EM={baseline['exact_match']:.3f}, F1={baseline['f1']:.3f}")

    # Collected up front so the baseline numbers survive even if the mitigated run is skipped.
    results = {"baseline": baseline}

    # 2. Train cartography-mitigated model
    print("\nüó∫Ô∏è Training Cartography-Mitigated Model...")

    # Load cartography weights
    weights_path = "/content/nlp-fa25-final-project/results/cartography/training_weights_upweight_hard.json"
    if os.path.exists(weights_path):
        # Imported lazily: only needed when the weights file is present.
        from train_with_cartography import load_cartography_weights
        cartography_weights = load_cartography_weights(weights_path)

        cartography_model = AutoModelForQuestionAnswering.from_pretrained(model_name)

        # Same data and hyperparameters as the baseline; the custom trainer receives
        # the per-example weights through its cartography_weights argument.
        cartography_trainer = CartographyWeightedTrainer(
            model=cartography_model,
            args=build_training_args("/content/cartography_model"),
            train_dataset=train_dataset_processed,
            eval_dataset=eval_dataset_processed,
            compute_metrics=compute_metrics,
            cartography_weights=cartography_weights
        )

        cartography_start = time.time()
        cartography_trainer.train()
        cartography_time = time.time() - cartography_start

        # Evaluate cartography model
        cartography = summarize(cartography_trainer.evaluate(), cartography_time)
        print(f"‚úÖ Cartography training completed in {cartography_time:.1f}s")
        print(f"Cartography Results: EM={cartography['exact_match']:.3f}, F1={cartography['f1']:.3f}")

        # Compare results
        print("\nüìä Comparison:")
        print(f"Baseline:    EM={baseline['exact_match']:.3f}, F1={baseline['f1']:.3f}")
        print(f"Cartography: EM={cartography['exact_match']:.3f}, F1={cartography['f1']:.3f}")

        results["cartography"] = cartography
        results["improvement"] = {
            "em_diff": cartography['exact_match'] - baseline['exact_match'],
            "f1_diff": cartography['f1'] - baseline['f1']
        }
    else:
        print("‚ö†Ô∏è  Cartography weights not found, skipping mitigated training")

    # Save results (always, so a skipped mitigated run does not discard the baseline metrics)
    with open("/content/colab_training_results.json", "w") as f:
        json.dump(results, f, indent=2)

    print("\n‚úÖ Training complete! Results saved to /content/colab_training_results.json")

if __name__ == "__main__":
    run_colab_training()

🚀 Starting Colab GPU Training...
Device: cuda
GPU: Tesla T4
Memory: 15.8 GB
📦 Loading model and tokenizer...
📊 Loading SQuAD dataset...
Training samples: 10000
Evaluation samples: 1000
🔄 Preprocessing datasets...


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Preprocessing completed in 10.2s

🎯 Training Baseline Model...


Some weights of ElectraForQuestionAnswering were not initialized from the model checkpoint at google/electra-small-discriminator and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss


In [14]:
# Load the source text of helpers.py into `helpers_code` and echo it below
with open('helpers.py') as helpers_file:  # default mode is read-only text
    helpers_code = helpers_file.read()

print(helpers_code)

import numpy as np
import collections
from collections import defaultdict, OrderedDict
from transformers import Trainer, EvalPrediction
from transformers.trainer_utils import PredictionOutput
from typing import Tuple
from tqdm.auto import tqdm

QA_MAX_ANSWER_LENGTH = 30


# This function preprocesses an NLI dataset, tokenizing premises and hypotheses.
def prepare_dataset_nli(examples, tokenizer, max_seq_length=None):
    max_seq_length = tokenizer.model_max_length if max_seq_length is None else max_seq_length

    tokenized_examples = tokenizer(
        examples['premise'],
        examples['hypothesis'],
        truncation=True,
        max_length=max_seq_length,
        padding='max_length'
    )

    tokenized_examples['label'] = examples['label']
    return tokenized_examples


# This function computes sentence-classification accuracy.
# Functions with signatures like this one work as the "compute_metrics" argument of transformers.Trainer.
def compute_accuracy(eval_preds: EvalPredi

In [6]:
# Rename the `evaluation_strategy` keyword argument to `eval_strategy` in colab_training.py
import os
import re

SCRIPT_PATH = 'colab_training.py'


def rename_evaluation_strategy(code):
    """Rewrite every ``evaluation_strategy=`` keyword in ``code`` to ``eval_strategy=``.

    The match is independent of the quoting style and of the value that follows,
    and ignores names that merely end in ``eval_strategy``.

    Args:
        code: Source text to patch.

    Returns:
        A ``(patched_code, replacement_count)`` tuple. The count is 0 and the text
        is unchanged when there is nothing to rename.
    """
    return re.subn(r'\bevaluation_strategy(\s*=)', r'eval_strategy\1', code)


if os.path.exists(SCRIPT_PATH):
    # Read the content of the file
    with open(SCRIPT_PATH, 'r') as f:
        colab_training_code = f.read()

    # Replace the incorrect argument
    colab_training_code, replaced_count = rename_evaluation_strategy(colab_training_code)

    if replaced_count:
        # Write the modified content back to the file
        with open(SCRIPT_PATH, 'w') as f:
            f.write(colab_training_code)
        print("Updated colab_training.py with 'eval_strategy' instead of 'evaluation_strategy'.")
    else:
        # Report honestly instead of claiming an update that did not happen.
        print("No 'evaluation_strategy' argument found in colab_training.py; nothing to update.")
else:
    print(f"{SCRIPT_PATH} not found in {os.getcwd()}; run the clone cell first.")

Updated colab_training.py with 'eval_strategy' instead of 'evaluation_strategy'.


## 3. View Results

In [None]:
# Load and display results
import json
import os

RESULTS_PATH = '/content/colab_training_results.json'


def format_results_summary(results):
    """Build the printable summary lines for a training-results dictionary.

    Args:
        results: Mapping that may contain 'baseline' and 'cartography' sections
            (each with 'exact_match', 'f1', 'training_time') and an 'improvement'
            section (with 'em_diff', 'f1_diff'). Missing sections are skipped.

    Returns:
        List of strings; joined with newlines they reproduce the report.
    """
    lines = ["üéØ Training Results Summary:"]

    for section_key, title in (("baseline", "Baseline Model"), ("cartography", "Cartography Model")):
        section = results.get(section_key)
        if section is None:
            continue
        lines.append(f"\n{title}:")
        lines.append(f"  Exact Match: {section['exact_match']:.3f}")
        lines.append(f"  F1 Score: {section['f1']:.3f}")
        lines.append(f"  Training Time: {section['training_time']:.1f}s")

    improvement = results.get("improvement")
    if improvement is not None:
        lines.append("\nImprovement:")
        lines.append(f"  EM Diff: {improvement['em_diff']:+.3f}")
        lines.append(f"  F1 Diff: {improvement['f1_diff']:+.3f}")

    return lines


if os.path.exists(RESULTS_PATH):
    with open(RESULTS_PATH, 'r') as f:
        results = json.load(f)
    print("\n".join(format_results_summary(results)))
else:
    # A clear message is more useful here than a FileNotFoundError traceback.
    print(f"No results file at {RESULTS_PATH}; run the training cell first.")

## 4. Download Results

In [None]:
# Download trained models and results.
# Colab-only cell: it relies on the google.colab package.
from google.colab import files

# Zip results for download. The archive name is relative, so it is written to the
# current working directory; the two model directories are added recursively (-r)
# together with the results JSON.
!zip -r colab_results.zip /content/baseline_model /content/cartography_model /content/colab_training_results.json
# Hand the archive created above to Colab's download helper.
files.download('colab_results.zip')