In [1]:
import accelerate
print("Accelerate version:", accelerate.__version__)

Accelerate version: 0.26.0


In [2]:
import torch
print("Torch version:", torch.__version__)
print("MPS available:", torch.backends.mps.is_available())

Torch version: 2.5.1
MPS available: True


In [6]:
# Standard library imports
import re
import json
from pathlib import Path

# Third-party imports
import torch
import numpy as np
from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Datasets and ML frameworks
from datasets import Dataset, load_dataset
from torch.utils.data import DataLoader
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR

# Transformers
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)


In [3]:
# Add this import at the top of the cell
import json

# Load the combined training set
with open("combined_train_2nd_last.json", "r") as train_file:
    combined_train = json.load(train_file)

# Load the combined test set
with open("combined_test_2nd_last.json", "r") as test_file:
    combined_test = json.load(test_file)

# Print some information about the loaded data
print(f"Loaded training set size: {len(combined_train)}")
print(f"Loaded test set size: {len(combined_test)}")
print("Sample training example:", combined_train[0])
print("Sample test example:", combined_test[0])

Loaded training set size: 14946
Loaded test set size: 2638
Sample training example: {'question': 'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?', 'solution': 'Natalia sold 48/2 = <<48/2=24>>24 clips in May.\nNatalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.\n#### 72', 'label': 'Correct'}
Sample test example: {'question': "Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?", 'solution': 'Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.\nShe makes 9 * 2 = $<<9*2=18>>18 every day at the farmer’s market.\n#### 18', 'label': 'Correct'}


In [4]:
# 📏 ANALYZE RAW TEXT LENGTHS BEFORE TOKENIZATION
print("=== RAW TEXT LENGTH ANALYSIS ===")

def analyze_text_lengths(dataset, name):
    print(f"\n{name} Dataset Analysis:")
    
    # Separate by label
    correct_examples = [ex for ex in dataset if ex['label'] == 'Correct']
    flawed_examples = [ex for ex in dataset if ex['label'] == 'Flawed']
    
    def get_text_stats(examples, label_name):
        question_lengths = [len(ex['question']) for ex in examples]
        solution_lengths = [len(ex['solution']) for ex in examples]
        # 🔥 FIX: Create combined text outside f-string
        combined_lengths = []
        for ex in examples:
            combined_text = f"Question:\n{ex['question']}\n\nSolution:\n{ex['solution']}"
            combined_lengths.append(len(combined_text))
        
        question_words = [len(ex['question'].split()) for ex in examples]
        solution_words = [len(ex['solution'].split()) for ex in examples]
        # 🔥 FIX: Create combined text outside f-string
        combined_words = []
        for ex in examples:
            combined_text = f"Question:\n{ex['question']}\n\nSolution:\n{ex['solution']}"
            combined_words.append(len(combined_text.split()))
        
        print(f"  {label_name} Examples ({len(examples)} total):")
        print(f"    Question chars: min={min(question_lengths)}, max={max(question_lengths)}, avg={sum(question_lengths)/len(question_lengths):.1f}")
        print(f"    Solution chars: min={min(solution_lengths)}, max={max(solution_lengths)}, avg={sum(solution_lengths)/len(solution_lengths):.1f}")
        print(f"    Combined chars: min={min(combined_lengths)}, max={max(combined_lengths)}, avg={sum(combined_lengths)/len(combined_lengths):.1f}")
        print(f"    Question words: min={min(question_words)}, max={max(question_words)}, avg={sum(question_words)/len(question_words):.1f}")
        print(f"    Solution words: min={min(solution_words)}, max={max(solution_words)}, avg={sum(solution_words)/len(solution_words):.1f}")
        print(f"    Combined words: min={min(combined_words)}, max={max(combined_words)}, avg={sum(combined_words)/len(combined_words):.1f}")
        
        return combined_lengths, combined_words
    
    correct_char_lengths, correct_word_lengths = get_text_stats(correct_examples, "CORRECT")
    flawed_char_lengths, flawed_word_lengths = get_text_stats(flawed_examples, "FLAWED")
    
    # Compare lengths between correct and flawed
    print(f"  COMPARISON:")
    print(f"    Avg chars - Correct: {sum(correct_char_lengths)/len(correct_char_lengths):.1f}, Flawed: {sum(flawed_char_lengths)/len(flawed_char_lengths):.1f}")
    print(f"    Avg words - Correct: {sum(correct_word_lengths)/len(correct_word_lengths):.1f}, Flawed: {sum(flawed_word_lengths)/len(flawed_word_lengths):.1f}")
    
    return correct_char_lengths + flawed_char_lengths, correct_word_lengths + flawed_word_lengths

# Analyze training data
train_char_lengths, train_word_lengths = analyze_text_lengths(combined_train, "TRAINING")

# Analyze test data  
test_char_lengths, test_word_lengths = analyze_text_lengths(combined_test, "TEST")

# Overall statistics
print(f"\n=== OVERALL STATISTICS ===")
print(f"Training set:")
print(f"  Character lengths: min={min(train_char_lengths)}, max={max(train_char_lengths)}, avg={sum(train_char_lengths)/len(train_char_lengths):.1f}")
print(f"  Word lengths: min={min(train_word_lengths)}, max={max(train_word_lengths)}, avg={sum(train_word_lengths)/len(train_word_lengths):.1f}")

print(f"Test set:")
print(f"  Character lengths: min={min(test_char_lengths)}, max={max(test_char_lengths)}, avg={sum(test_char_lengths)/len(test_char_lengths):.1f}")
print(f"  Word lengths: min={min(test_word_lengths)}, max={max(test_word_lengths)}, avg={sum(test_word_lengths)/len(test_word_lengths):.1f}")

# Rough token estimation (1 word ≈ 1.3 tokens for English)
print(f"\n=== ROUGH TOKEN ESTIMATION (words × 1.3) ===")
train_estimated_tokens = [w * 1.3 for w in train_word_lengths]
test_estimated_tokens = [w * 1.3 for w in test_word_lengths]

print(f"Training estimated tokens: min={min(train_estimated_tokens):.0f}, max={max(train_estimated_tokens):.0f}, avg={sum(train_estimated_tokens)/len(train_estimated_tokens):.0f}")
print(f"Test estimated tokens: min={min(test_estimated_tokens):.0f}, max={max(test_estimated_tokens):.0f}, avg={sum(test_estimated_tokens)/len(test_estimated_tokens):.0f}")

# Check how many would exceed 512 tokens
train_over_512 = sum(1 for t in train_estimated_tokens if t > 512)
test_over_512 = sum(1 for t in test_estimated_tokens if t > 512)
print(f"Estimated examples over 512 tokens: Train={train_over_512}/{len(train_estimated_tokens)} ({100*train_over_512/len(train_estimated_tokens):.1f}%), Test={test_over_512}/{len(test_estimated_tokens)} ({100*test_over_512/len(test_estimated_tokens):.1f}%)")

# Show a few examples
print(f"\n=== SAMPLE EXAMPLES ===")
print("Shortest example:")
shortest_idx = train_char_lengths.index(min(train_char_lengths))
shortest_example = combined_train[shortest_idx]
# 🔥 FIX: Create combined text outside f-string
shortest_text = f"Question:\n{shortest_example['question']}\n\nSolution:\n{shortest_example['solution']}"
print(f"  Length: {len(shortest_text)} chars")
print(f"  Question: {shortest_example['question'][:100]}...")
print(f"  Solution: {shortest_example['solution'][:100]}...")

print("\nLongest example:")
longest_idx = train_char_lengths.index(max(train_char_lengths))
longest_example = combined_train[longest_idx]
# 🔥 FIX: Create combined text outside f-string
longest_text = f"Question:\n{longest_example['question']}\n\nSolution:\n{longest_example['solution']}"
print(f"  Length: {len(longest_text)} chars")
print(f"  Question: {longest_example['question'][:100]}...")
print(f"  Solution: {longest_example['solution'][:100]}...")

print("=" * 50)

=== RAW TEXT LENGTH ANALYSIS ===

TRAINING Dataset Analysis:
  CORRECT Examples (7473 total):
    Question chars: min=42, max=985, avg=234.5
    Solution chars: min=50, max=1228, avg=287.5
    Combined chars: min=146, max=1711, avg=544.0
    Question words: min=9, max=183, avg=45.1
    Solution words: min=4, max=216, avg=51.7
    Combined words: min=23, max=336, avg=98.8
  FLAWED Examples (7473 total):
    Question chars: min=42, max=985, avg=234.5
    Solution chars: min=50, max=1228, avg=287.5
    Combined chars: min=146, max=1711, avg=544.1
    Question words: min=9, max=183, avg=45.1
    Solution words: min=4, max=216, avg=51.7
    Combined words: min=23, max=336, avg=98.8
  COMPARISON:
    Avg chars - Correct: 544.0, Flawed: 544.1
    Avg words - Correct: 98.8, Flawed: 98.8

TEST Dataset Analysis:
  CORRECT Examples (1319 total):
    Question chars: min=73, max=848, avg=239.9
    Solution chars: min=48, max=1070, avg=292.9
    Combined chars: min=182, max=1640, avg=554.8
    Quest

In [7]:
from datasets import Dataset
train_dataset = Dataset.from_list(combined_train)
test_dataset = Dataset.from_list(combined_test)

In [8]:
# Preprocessing with tokenizer
from transformers import AutoTokenizer

model_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# 🔥 FIX: Add padding token for GPT-2 style models
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def tokenize_fn(example):
    # Combine the question and solution into a single input string
    full_input = f"Question:\n{example['question']}\n\nSolution:\n{example['solution']}"
    
    # Tokenize the combined input with truncation, padding, and a max length of 512 tokens
    return tokenizer(full_input, truncation=True, padding="max_length", max_length=512)

train_dataset = train_dataset.map(tokenize_fn)
test_dataset = test_dataset.map(tokenize_fn)

# Label encoding
label_map = {"Correct": 0, "Flawed": 1}
train_dataset = train_dataset.map(lambda e: {"labels": label_map[e["label"]]})
test_dataset = test_dataset.map(lambda e: {"labels": label_map[e["label"]]})

Map:   0%|          | 0/14946 [00:00<?, ? examples/s]

Map:   0%|          | 0/2638 [00:00<?, ? examples/s]

Map:   0%|          | 0/14946 [00:00<?, ? examples/s]

Map:   0%|          | 0/2638 [00:00<?, ? examples/s]

In [9]:
# 🔍 DEBUG: Check what's in your datasets
print("=== DEBUGGING DATASET ===")
print("Train dataset columns:", train_dataset.column_names)
print("Test dataset columns:", test_dataset.column_names)

n_train = len(train_dataset)
n_test = len(test_dataset)

# Check a sample
sample = train_dataset[0]
print("Sample keys:", sample.keys())
print("Input IDs shape:", sample['input_ids'].shape if hasattr(sample['input_ids'], 'shape') else len(sample['input_ids']))
print("Labels:", sample['labels'])

# Decode the input to see what text is actually fed to model
decoded_input = tokenizer.decode(sample['input_ids'], skip_special_tokens=True)
print("Decoded input text:")
print(decoded_input)
print("=" * 50)

=== DEBUGGING DATASET ===
Train dataset columns: ['question', 'solution', 'label', 'input_ids', 'attention_mask', 'labels']
Test dataset columns: ['question', 'solution', 'label', 'input_ids', 'attention_mask', 'labels']
Sample keys: dict_keys(['question', 'solution', 'label', 'input_ids', 'attention_mask', 'labels'])
Input IDs shape: 512
Labels: 0
Decoded input text:
Question:
Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?

Solution:
Natalia sold 48/2 = <<48/2=24>>24 clips in May.
Natalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.
#### 72


In [10]:
# Set the format for PyTorch
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# AutoTokenizer

Converts text into numbers that neural networks can understand.

### Special Tokens
- `[CLS]` (101): Start of sequence
- `[SEP]` (102): Separator between segments  
- `[PAD]` (0): Padding token
- `[UNK]`: Unknown/out-of-vocabulary words

| Parameter | Purpose |
|-----------|---------|
| `truncation=True` | Cut text if > 512 tokens |
| `padding="max_length"` | Add padding to reach exactly 512 tokens |
| `max_length=512` | Set maximum sequence length |

In [11]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
model.config.pad_token_id = tokenizer.pad_token_id
model.to(device)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at distilgpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=2, bias=False)
)

In [14]:
# 🔥 FIXED AND OPTIMIZED LEARNING RATE FINDER
def find_learning_rate(model, train_dataset, start_lr=1e-7, end_lr=1e-1, num_iter=100):
    """
    Fast learning rate finder - optimized for speed
    """
    print(f"🚀 Starting LR finder with {num_iter} iterations...")
    
    model.train()
    
    # 🔥 FIX 1: Create proper DataLoader with smaller batch size
    temp_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)  # Smaller batch
    
    # 🔥 FIX 2: Use fresh optimizer
    optimizer = AdamW(model.parameters(), lr=start_lr)
    
    lr_mult = (end_lr / start_lr) ** (1 / num_iter)
    lrs, losses = [], []
    best_loss = float("inf")
    
    # 🔥 FIX 3: Proper iteration with early stopping
    batch_count = 0
    for batch in tqdm(temp_dataloader, desc="Finding optimal LR", total=num_iter):
        if batch_count >= num_iter:
            break
            
        # 🔥 FIX 4: Ensure proper device handling
        batch = {k: v.to(device) for k, v in batch.items()}
        
        # Forward pass
        outputs = model(**batch)
        loss = outputs.loss
        
        # Track learning rate and loss
        current_lr = optimizer.param_groups[0]['lr']
        lrs.append(current_lr)
        losses.append(loss.item())
        
        # 🔥 FIX 5: Early stopping if loss explodes
        if loss.item() > best_loss * 4:
            print(f"⚠️ Loss exploded at LR {current_lr:.2e}, stopping early")
            break
        best_loss = min(best_loss, loss.item())
        
        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        # Update learning rate
        for param_group in optimizer.param_groups:
            param_group['lr'] *= lr_mult
        
        batch_count += 1
    
    print(f"✅ LR finder completed with {len(lrs)} iterations")
    return lrs, losses

# 🔥 OPTIMIZED CALL - No more list conversion!
print("🔍 Running optimized learning rate finder...")
lrs, losses = find_learning_rate(model, train_dataset, num_iter=50)  # Reduced iterations

# Plot results
plt.figure(figsize=(10, 6))
plt.plot(lrs, losses, 'b-', linewidth=2)
plt.xscale('log')
plt.xlabel('Learning Rate')
plt.ylabel('Loss')
plt.title('Learning Rate Finder (Optimized)')
plt.grid(True, alpha=0.3)
plt.show()

# Find optimal learning rate
if losses:
    min_loss_idx = losses.index(min(losses))
    optimal_lr = lrs[min_loss_idx]
    print(f"📊 Suggested Learning Rate: {optimal_lr:.2e}")
    print(f"💡 Recommended range: {optimal_lr/10:.2e} to {optimal_lr:.2e}")
else:
    print("⚠️ No valid learning rates found")
    optimal_lr = 2e-5  # Default fallback

🔍 Running optimized learning rate finder...
🚀 Starting LR finder with 50 iterations...


Finding optimal LR:   8%|▊         | 4/50 [01:25<16:23, 21.39s/it]


KeyboardInterrupt: 

In [15]:
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16)

In [16]:
from transformers import get_scheduler
from torch.optim import AdamW  # ✅ correct in latest versions

learning_rate = 3e-5  # From your LR finder
optimizer = AdamW(model.parameters(), lr=learning_rate)

num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)

lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)


In [17]:
from tqdm import tqdm

model.train()

for epoch in range(num_epochs):
    total_loss = 0
    progress = tqdm(train_dataloader, desc=f"Epoch {epoch+1}")
    
    for batch in progress:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        total_loss += loss.item()
        progress.set_postfix(loss=loss.item())

    avg_loss = total_loss / len(train_dataloader)
    print(f"✅ Epoch {epoch+1} finished — Avg loss: {avg_loss:.4f}")


Epoch 1:   0%|          | 2/935 [01:30<11:41:02, 45.08s/it, loss=1.19]


KeyboardInterrupt: 

In [None]:
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for batch in test_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        preds = torch.argmax(outputs.logits, dim=-1)
        correct += (preds == batch["labels"]).sum().item()
        total += preds.size(0)

print(f"🔍 Test Accuracy: {correct / total:.2%}")


In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

all_preds = []
all_labels = []

model.eval()
with torch.no_grad():
    for batch in test_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        preds = torch.argmax(outputs.logits, dim=-1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(batch["labels"].cpu().numpy())

cm = confusion_matrix(all_labels, all_preds)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Class 0", "Class 1"])
disp.plot(cmap='Blues')
plt.title("Confusion Matrix")
plt.grid(False)
plt.show()


In [None]:
misclassified_indices = []
for i, (pred, label) in enumerate(zip(all_preds, all_labels)):
    if pred != label:
        ex = test_dataset[i]
        misclassified_indices.append(i)
        print(f"❌ Misclassified Example {i}")
        print(f"True: {label}, Pred: {pred}")
        print(tokenizer.decode(ex["input_ids"], skip_special_tokens=True))
        print("-" * 80)
print(misclassified_indices)

In [None]:
def show_test_example(idx):
    ex = test_dataset[idx]
    text = tokenizer.decode(ex["input_ids"], skip_special_tokens=True)
    print(f"🔎 Index: {idx}")
    print(f"True Label: {ex['labels']}")
    print("Decoded Text:")
    print(text)

n_test_original = int(len(test_dataset)/2)

for idx in misclassified_indices:
    if idx < n_test_original :
        print("Original:")
        show_test_example(idx)
        print("\nFlawed:")
        show_test_example(idx + n_test_original)
        print("=" * 80)
    else: 
        print("Original:")
        show_test_example(idx - n_test_original)
        print("\nFlawed:")
        show_test_example(idx)
        print("=" * 80)

## Fine-tuning

#### Epoch

An epoch refers to one complete pass through the entire training dataset. During an epoch, the model processes all the training samples once, updating its weights based on the computed loss. Training for multiple epochs allows the model to learn and refine its parameters iteratively.

Increasing the number of epochs allows the model to learn more but risks overfitting if too high. A balance between batch size and epochs is crucial for optimal performance.

#### Batch size

Batch size determines the number of samples processed before the model updates its weights. For example, a batch size of 8 means 8 samples (e.g., 8 question-answer pairs) are processed together in one forward and backward pass during training.

A smaller batch size uses less memory but may take longer to converge, while a larger batch size can speed up training but requires more memory. 

#### Optimizer - AdamW

AdamW is an optimizer that implements the Adam algorithm with weight decay regularization. It helps prevent overfitting by penalizing large weights, which is particularly useful in deep learning models. AdamW is widely used because it combines the benefits of Adam (adaptive learning rates) with weight decay for better generalization.

Other optimizer choices include:
- **SGD (Stochastic Gradient Descent)**: A simple optimizer with momentum and learning rate decay options.
- **RMSprop**: Designed for non-stationary objectives, often used in RNNs.
- **Adagrad**: Adapts learning rates based on parameter updates, suitable for sparse data.
- **Adadelta**: An extension of Adagrad that reduces aggressive learning rate decay.
- **Adam**: Similar to AdamW but without weight decay.
- **Nadam**: Adam with Nesterov momentum.

#### tqdm

`tqdm` is a Python library used to display progress bars for loops. It provides a visual representation of the progress of an iterable, such as a training loop or data processing, making it easier to monitor the execution time and completion percentage. It is especially useful in long-running tasks.

For example:
```python
from tqdm import tqdm
for i in tqdm(range(100)):
    # Simulate some work
    pass
```

This will display a progress bar in the console, showing the percentage completed, elapsed time, and estimated time remaining.

In [None]:
# ⚡ ULTRA-FAST TRAINING - Just 1 epoch with tiny dataset
print("⚡ Running ULTRA-FAST training for testing...")

# Use only 10% of data for quick test
quick_train_size = len(train_dataset) // 10
quick_train_subset = train_dataset.select(range(quick_train_size))

quick_dataloader = DataLoader(quick_train_subset, batch_size=16, shuffle=True)

model.train()
epoch = 1
total_loss = 0
progress = tqdm(quick_dataloader, desc=f"Quick Test Epoch {epoch}")

for batch in progress:
    batch = {k: v.to(device) for k, v in batch.items()}
    outputs = model(**batch)
    loss = outputs.loss

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    total_loss += loss.item()
    progress.set_postfix(loss=loss.item())

avg_loss = total_loss / len(quick_dataloader)
print(f"✅ Quick test completed — Avg loss: {avg_loss:.4f}")

⚡ Running ULTRA-FAST training for testing...


Quick Test Epoch 1: 100%|██████████| 94/94 [1:00:36<00:00, 38.69s/it, loss=0.00266] 

✅ Quick test completed — Avg loss: 0.0043
⏱️  This should take ~2-3 minutes on M2





In [22]:
# 📊 ENHANCED PERFORMANCE EVALUATION
print("\n" + "="*60)
print("📊 COMPREHENSIVE PERFORMANCE ANALYSIS")
print("="*60)

import time
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, confusion_matrix
import numpy as np

# 1. DETAILED METRICS BREAKDOWN
print("\n🎯 DETAILED METRICS BREAKDOWN")
print("-" * 40)

# Calculate per-class metrics
precision, recall, f1, support = precision_recall_fscore_support(all_labels, all_preds, average=None)
accuracy = accuracy_score(all_labels, all_preds)

class_names = ["Correct", "Flawed"]
for i, class_name in enumerate(class_names):
    print(f"{class_name:>8}: Precision={precision[i]:.3f}, Recall={recall[i]:.3f}, F1={f1[i]:.3f}, Support={support[i]}")

print(f"\n📈 Overall Accuracy: {accuracy:.3f} ({accuracy*100:.1f}%)")
print(f"📊 Macro Avg F1: {np.mean(f1):.3f}")
print(f"📊 Weighted Avg F1: {np.average(f1, weights=support):.3f}")

# 2. CONFUSION MATRIX ANALYSIS
print("\n🔍 CONFUSION MATRIX ANALYSIS")
print("-" * 40)
cm = confusion_matrix(all_labels, all_preds)
tn, fp, fn, tp = cm.ravel()

print(f"True Negatives (Correct→Correct):  {tn}")
print(f"False Positives (Correct→Flawed):  {fp}")
print(f"False Negatives (Flawed→Correct): {fn}")
print(f"True Positives (Flawed→Flawed):   {tp}")

# Calculate additional metrics
specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
precision_pos = tp / (tp + fp) if (tp + fp) > 0 else 0

print(f"\n📊 Additional Metrics:")
print(f"   Sensitivity (Recall): {sensitivity:.3f}")
print(f"   Specificity:          {specificity:.3f}")
print(f"   Precision (Positive): {precision_pos:.3f}")

# 3. CLASS DISTRIBUTION ANALYSIS
print("\n📊 CLASS DISTRIBUTION ANALYSIS")
print("-" * 40)
unique_labels, label_counts = np.unique(all_labels, return_counts=True)
unique_preds, pred_counts = np.unique(all_preds, return_counts=True)

print("True Label Distribution:")
for label, count in zip(unique_labels, label_counts):
    class_name = class_names[label]
    percentage = (count / len(all_labels)) * 100
    print(f"   {class_name}: {count} samples ({percentage:.1f}%)")

print("\nPredicted Label Distribution:")
for pred, count in zip(unique_preds, pred_counts):
    class_name = class_names[pred]
    percentage = (count / len(all_preds)) * 100
    print(f"   {class_name}: {count} samples ({percentage:.1f}%)")

# 4. MODEL CONFIDENCE ANALYSIS
print("\n🎯 MODEL CONFIDENCE ANALYSIS")
print("-" * 40)

# Get prediction probabilities
model.eval()
all_probs = []
all_confidence = []

with torch.no_grad():
    for batch in test_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        probs = torch.softmax(outputs.logits, dim=-1)
        confidence = torch.max(probs, dim=-1)[0]  # Max probability as confidence
        all_probs.extend(probs.cpu().numpy())
        all_confidence.extend(confidence.cpu().numpy())

all_probs = np.array(all_probs)
all_confidence = np.array(all_confidence)

print(f"Average Confidence: {np.mean(all_confidence):.3f}")
print(f"Confidence Std Dev: {np.std(all_confidence):.3f}")
print(f"Min Confidence: {np.min(all_confidence):.3f}")
print(f"Max Confidence: {np.max(all_confidence):.3f}")

# Confidence by correctness
correct_mask = np.array(all_preds) == np.array(all_labels)
correct_confidence = all_confidence[correct_mask]
incorrect_confidence = all_confidence[~correct_mask]

print(f"\nCorrect Predictions Confidence: {np.mean(correct_confidence):.3f} ± {np.std(correct_confidence):.3f}")
print(f"Incorrect Predictions Confidence: {np.mean(incorrect_confidence):.3f} ± {np.std(incorrect_confidence):.3f}")

# 5. INFERENCE SPEED ANALYSIS
print("\n⚡ INFERENCE SPEED ANALYSIS")
print("-" * 40)

# Time a few batches for speed measurement
model.eval()
batch_times = []
sample_batches = 5

with torch.no_grad():
    for i, batch in enumerate(test_dataloader):
        if i >= sample_batches:
            break
        
        batch = {k: v.to(device) for k, v in batch.items()}
        
        start_time = time.time()
        outputs = model(**batch)
        preds = torch.argmax(outputs.logits, dim=-1)
        end_time = time.time()
        
        batch_time = end_time - start_time
        batch_times.append(batch_time)

avg_batch_time = np.mean(batch_times)
samples_per_batch = test_dataloader.batch_size
samples_per_second = samples_per_batch / avg_batch_time

print(f"Average batch time: {avg_batch_time:.4f} seconds")
print(f"Samples per second: {samples_per_second:.1f}")
print(f"Time per sample: {avg_batch_time/samples_per_batch*1000:.2f} ms")

# 6. ERROR ANALYSIS SUMMARY
print("\n❌ ERROR ANALYSIS SUMMARY")
print("-" * 40)

# Count errors by type
total_errors = len(all_labels) - np.sum(np.array(all_preds) == np.array(all_labels))
correct_to_flawed = fp  # False positives
flawed_to_correct = fn  # False negatives

print(f"Total Errors: {total_errors} / {len(all_labels)} ({total_errors/len(all_labels)*100:.1f}%)")
print(f"Correct→Flawed errors: {correct_to_flawed} ({correct_to_flawed/total_errors*100:.1f}% of errors)")
print(f"Flawed→Correct errors: {flawed_to_correct} ({flawed_to_correct/total_errors*100:.1f}% of errors)")

# Low confidence errors
low_confidence_threshold = 0.6
low_confidence_errors = np.sum((all_confidence < low_confidence_threshold) & (~correct_mask))
print(f"Low confidence errors (<{low_confidence_threshold}): {low_confidence_errors}")

# 7. MODEL PERFORMANCE SUMMARY
print("\n🏆 MODEL PERFORMANCE SUMMARY")
print("-" * 40)
print(f"✅ Overall Accuracy: {accuracy*100:.1f}%")
print(f"📊 F1-Score (Macro): {np.mean(f1):.3f}")
print(f"🎯 Precision (Flawed Detection): {precision[1]:.3f}")
print(f"🔍 Recall (Flawed Detection): {recall[1]:.3f}")
print(f"⚡ Inference Speed: {samples_per_second:.1f} samples/sec")
print(f"🤖 Model Confidence: {np.mean(all_confidence):.3f}")

# Performance rating
if accuracy >= 0.9:
    rating = "🌟 EXCELLENT"
elif accuracy >= 0.8:
    rating = "🔥 GOOD"
elif accuracy >= 0.7:
    rating = "⚡ DECENT"
else:
    rating = "⚠️ NEEDS IMPROVEMENT"

print(f"\n🏅 Performance Rating: {rating}")
print("="*60)


📊 COMPREHENSIVE PERFORMANCE ANALYSIS

🎯 DETAILED METRICS BREAKDOWN
----------------------------------------
 Correct: Precision=1.000, Recall=1.000, F1=1.000, Support=96


IndexError: index 1 is out of bounds for axis 0 with size 1

### Notes

Learning rate scheduler

torch.optim.lr_scheduler

https://www.datacamp.com/tutorial/fine-tuning-large-language-models

Increase batch_size 

total_norm_util = clip_grad_norm_(model.parameters(), max_norm=float('inf')) 

weight-decay (decided based on if it is overfitting)

cross validation