In [1]:
import torch
# Report the PyTorch build and, when a GPU is visible, which one it is.
cuda_ok = torch.cuda.is_available()
print(f"PyTorch: {torch.__version__}")
print(f"CUDA available: {cuda_ok}")
if cuda_ok:
    print(f"GPU: {torch.cuda.get_device_name(0)}")

PyTorch: 2.9.0+cu126
CUDA available: True
GPU: Tesla T4


In [None]:
!pip install -q python-chess pandas pillow tqdm opencv-python matplotlib seaborn scikit-learn

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/6.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/6.1 MB[0m [31m39.2 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━[0m [32m5.2/6.1 MB[0m [31m76.0 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m6.1/6.1 MB[0m [31m77.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.1/6.1 MB[0m [31m52.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for chess (setup.py) ... [?25l[?25hdone


In [3]:
# //code.zip

from google.colab import files
import zipfile
import os
import shutil

print("Upload code.zip...")
uploaded = files.upload()

with zipfile.ZipFile('code.zip', 'r') as z:
    for member in z.namelist():
        z.extract(member, 'temp')

if os.path.exists('temp'):
    for root, dirs, filelist in os.walk('temp'):
        for f in filelist:
            old_path = os.path.join(root, f)
            new_path = os.path.relpath(old_path, 'temp').replace('\\', '/')
            directory = os.path.dirname(new_path)
            if directory:
                os.makedirs(directory, exist_ok=True)
            shutil.copy(old_path, new_path)
    shutil.rmtree('temp')

print("\nCode uploaded!")
!ls src/ dataset_tools/

Upload code.zip...


Saving code.zip to code.zip

Code uploaded!
dataset_tools/:
board_detect_and_warp.py  extract_squares.py  make_dataset.py
debug_grid.py		  fen_utils.py	      __pycache__
eval.py			  __init__.py	      show_crops.py

src/:
dataset.py  __init__.py  predict.py   train.py
eval.py     model.py	 __pycache__  visualize.py


In [6]:
import glob

print("Upload all_games_data.zip (games 2,4,5,6,7)...")
uploaded = files.upload()
if not uploaded:
    # A cancelled upload returns an empty mapping; fail with a clear message
    # instead of an IndexError further down.
    raise RuntimeError("No file was uploaded; expected all_games_data.zip")

archive_name = next(iter(uploaded))

# Stage the archive in a scratch directory so member paths can be normalised
# before the files are copied into the working directory.
staging_dir = 'temp'
with zipfile.ZipFile(archive_name, 'r') as z:
    z.extractall(staging_dir)

if os.path.exists(staging_dir):
    for root, _dirs, filelist in os.walk(staging_dir):
        for f in filelist:
            old_path = os.path.join(root, f)
            # Any backslashes that survived in extracted names are treated as
            # directory separators.
            new_path = os.path.relpath(old_path, staging_dir).replace('\\', '/')
            directory = os.path.dirname(new_path)
            if directory:
                os.makedirs(directory, exist_ok=True)
            shutil.copy(old_path, new_path)
    shutil.rmtree(staging_dir)

# Per-game inventory of the tagged frames that were just unpacked.
print("\n" + "="*60)
print("GAMES UPLOADED:")
print("="*60)
total_frames = 0
for game in sorted(glob.glob('Data/game*_per_frame')):
    game_name = os.path.basename(game)
    frames = len(glob.glob(f'{game}/tagged_images/*.jpg'))
    total_frames += frames
    print(f"  {game_name}: {frames} frames")
print(f"\nTotal: {total_frames} frames")
print("="*60)

Upload all_games_data.zip (games 2,4,5,6,7)...


Saving all_games_data.zip to all_games_data.zip

GAMES UPLOADED:
  game2_per_frame: 77 frames
  game4_per_frame: 184 frames
  game5_per_frame: 109 frames
  game6_per_frame: 92 frames
  game7_per_frame: 55 frames

Total: 517 frames


## 7-Fold Cross-Validation Strategy

### Overview
This notebook implements **7-fold cross-validation** to evaluate cross-game generalization. Each fold:
- **Trains on 6 games** (the frames of each training game are shuffled and split 80/20 into train/val)
- **Tests on the 7th game** (100% of that game used for testing)
- **Starts with fresh pretrained ResNet50 weights**
- **Trains for 8 epochs** and keeps the checkpoint from the epoch with the best validation accuracy (all 8 epochs always run; there is no early stopping)

### Key Features
1. **Cross-Game Evaluation**: Tests if the model can learn from some games and generalize to completely unseen games
2. **Fair Comparison**: Each fold uses the same architecture, hyperparameters, and training procedure
3. **Comprehensive Metrics**: 
   - Per-fold test accuracy (on held-out game)
   - Per-fold validation accuracy (on 20% of training games)
   - Generalization gap (val vs test performance)
   - Per-class accuracy and confusion matrices
4. **Statistical Analysis**: Mean, std, min, max across all 7 folds to understand model stability

### Training Configuration
- **Model**: ResNet50 (pretrained on ImageNet)
- **Epochs**: 8 per fold
- **Batch Size**: 128
- **Learning Rate**: 0.001
- **Optimizer**: Adam
- **Data Augmentation**: Random horizontal flip, color jitter (training only)
- **Input Size**: 224x224 pixels

### Expected Outputs
- `cross_validation_results.csv` - Detailed results for all 7 folds
- `cross_validation_results.png` - Visualizations of performance
- `confusion_matrix_fold_X.png` - Confusion matrices for best/worst folds
- `dataset_out/best_model_fold_X.pth` - Best model checkpoint for each fold

### Interpretation Guide
- **Low test accuracy variance** (< 5%) → Good cross-game generalization
- **Small generalization gap** (< 3%) → Model doesn't overfit to specific games
- **High test accuracy** (> 90%) → Strong performance on unseen games
- **Large accuracy range** (> 20%) → Model is sensitive to specific game characteristics


In [None]:

import pandas as pd
import numpy as np
import glob
import json
import os
from dataset_tools.fen_utils import PIECE_TO_ID, fen_board_to_64_labels, idx_to_square_name

# The number of folds is decided by how many games are actually found on
# disk, so the banner does not promise a specific fold count.
print("="*60)
print("PREPARING LEAVE-ONE-GAME-OUT CROSS-VALIDATION")
print("Each fold: Train on all other games, Test on 1 game")
print("="*60)

os.makedirs('dataset_out', exist_ok=True)

# Persist the id -> piece mapping (inverse of PIECE_TO_ID) for later cells.
with open('dataset_out/classes.json', 'w') as f:
    json.dump({str(v): k for k, v in PIECE_TO_ID.items()}, f, indent=2)

# Load all games: one DataFrame per game with one row per (frame, square).
game_dirs = sorted(glob.glob('Data/*_per_frame'))
game_data = {}

for game_dir in game_dirs:
    game_id = os.path.basename(game_dir)
    # Sorted so the chosen CSV is deterministic if a game has several.
    csv_file = sorted(glob.glob(f'{game_dir}/*.csv'))

    if not csv_file:
        print(f"{game_id}: no CSV found, skipping")
        continue

    df = pd.read_csv(csv_file[0])
    frame_col = 'from_frame' if 'from_frame' in df.columns else 'frame_id'

    game_rows = []

    for _, r in df.iterrows():
        frame_id = int(r[frame_col])
        fen = r['fen']
        labels = fen_board_to_64_labels(fen)

        # Only frames whose image exists on disk are kept.
        frame_path = f'{game_dir}/tagged_images/frame_{frame_id:06d}.jpg'
        if not os.path.exists(frame_path):
            continue

        for sq in range(64):
            game_rows.append({
                'frame_path': frame_path,
                'game_id': game_id,
                'frame_id': frame_id,
                'square_idx': sq,
                'row': sq // 8,
                'col': sq % 8,
                'square_name': idx_to_square_name(sq),
                'label_id': labels[sq],
            })

    if not game_rows:
        # An empty list would build a column-less DataFrame and make the
        # 'frame_id' lookups below fail with a KeyError.
        print(f"{game_id}: no labelled frames found on disk, skipping")
        continue

    game_df = pd.DataFrame(game_rows)
    game_data[game_id] = game_df

    n_frames = game_df['frame_id'].nunique()
    n_squares = len(game_df)
    print(f"{game_id}: {n_frames} frames, {n_squares:,} squares")

print(f"\nTotal games loaded: {len(game_data)}")
print("="*60)

# Create leave-one-game-out cross-validation splits (one fold per loaded game)
if len(game_data) < 2:
    raise ValueError(
        f"Need at least 2 games to build cross-validation folds, found {len(game_data)}"
    )

fold_manifests = []

for fold_idx, test_game in enumerate(game_data.keys()):
    print(f"\nFold {fold_idx + 1}: Test on {test_game}")

    # Every game except the held-out one feeds the train/val pool.
    train_val_df = pd.concat(
        [game_df for game_id, game_df in game_data.items() if game_id != test_game],
        ignore_index=True,
    )
    test_df = game_data[test_game].copy()

    # Split each training game's frames 80% train / 20% val. The RNG is
    # re-seeded per game so a game's split does not depend on the others.
    unique_frames = train_val_df.groupby('game_id')['frame_id'].unique()

    split_parts = []
    for game_id, frames in unique_frames.items():
        frames = np.array(list(frames))
        n_frames = len(frames)

        rng = np.random.RandomState(42)
        rng.shuffle(frames)

        n_train = int(0.8 * n_frames)
        split_parts.append(pd.DataFrame({
            'game_id': game_id,
            'frame_id': frames,
            'split': ['train'] * n_train + ['val'] * (n_frames - n_train),
        }))
    split_lookup = pd.concat(split_parts, ignore_index=True)

    # One vectorised join instead of a Python call per row. A left merge keeps
    # the row order of train_val_df, so the shuffled manifest is unchanged.
    train_val_df = train_val_df.merge(split_lookup, on=['game_id', 'frame_id'], how='left')
    test_df['split'] = 'test'

    # Combine and save manifest for this fold
    fold_df = pd.concat([train_val_df, test_df], ignore_index=True)
    fold_df = fold_df.sample(frac=1.0, random_state=42).reset_index(drop=True)

    manifest_path = f'dataset_out/fold_{fold_idx + 1}_manifest.csv'
    fold_df.to_csv(manifest_path, index=False)

    # frame_id is only unique within a game, so frames are counted per
    # (split, game) and then summed per split.
    square_counts = fold_df['split'].value_counts()
    frame_counts = (
        fold_df.groupby(['split', 'game_id'])['frame_id'].nunique()
        .groupby(level='split').sum()
    )

    fold_manifests.append({
        'fold': fold_idx + 1,
        'test_game': test_game,
        'manifest_path': manifest_path,
        'train_squares': int(square_counts.get('train', 0)),
        'val_squares': int(square_counts.get('val', 0)),
        'test_squares': int(square_counts.get('test', 0)),
        'train_frames': int(frame_counts.get('train', 0)),
        'val_frames': int(frame_counts.get('val', 0)),
        'test_frames': int(frame_counts.get('test', 0)),
    })

    print(f"  Train: {fold_manifests[-1]['train_frames']} frames ({fold_manifests[-1]['train_squares']:,} sq)")
    print(f"  Val:   {fold_manifests[-1]['val_frames']} frames ({fold_manifests[-1]['val_squares']:,} sq)")
    print(f"  Test:  {fold_manifests[-1]['test_frames']} frames ({fold_manifests[-1]['test_squares']:,} sq) [from {test_game}]")

# Save fold summary
fold_summary_df = pd.DataFrame(fold_manifests)
fold_summary_df.to_csv('dataset_out/fold_summary.csv', index=False)

print("\n" + "="*60)
print("ALL FOLDS PREPARED")
print("="*60)
print(fold_summary_df.to_string(index=False))
print("="*60)


PER-GAME SPLIT (Each game -> 70/15/15)

game2_per_frame: 77 frames
  Train: 53 frames (3,392 sq)
  Val:   11 frames (704 sq)
  Test:  13 frames (832 sq)

game4_per_frame: 184 frames
  Train: 128 frames (8,192 sq)
  Val:   27 frames (1,728 sq)
  Test:  29 frames (1,856 sq)

game5_per_frame: 109 frames
  Train: 76 frames (4,928 sq)
  Val:   16 frames (1,024 sq)
  Test:  17 frames (1,088 sq)

game6_per_frame: 92 frames
  Train: 64 frames (4,096 sq)
  Val:   13 frames (832 sq)
  Test:  15 frames (960 sq)

game7_per_frame: 55 frames
  Train: 38 frames (2,496 sq)
  Val:   8 frames (512 sq)
  Test:  9 frames (576 sq)

COMBINED DATASET
TRAIN: 23,104 squares, 350 frames
VAL: 4,800 squares, 75 frames
TEST: 5,312 squares, 82 frames

Total: 33,216 squares
All games contribute to all three splits


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import models, transforms
from PIL import Image
import pandas as pd
import json
import os
from tqdm import tqdm
import time
import datetime
import random
import numpy as np

# Set seeds for reproducibility
def set_seed(seed=42):
    """Seed Python, NumPy and PyTorch (CPU and all CUDA devices) with ``seed``.

    Also switches cuDNN to deterministic mode and turns off its benchmark
    auto-tuner.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

set_seed(42)

_rule = "=" * 60
print(_rule)
print("7-FOLD CROSS-VALIDATION TRAINING")
print(_rule)
_started_at = datetime.datetime.now()
print(f"Start Time: {_started_at.strftime('%Y-%m-%d %H:%M:%S')}")
print(f"Random Seed: 42 (for reproducibility)")
print(_rule)

# Class-id -> piece-name mapping read back from dataset_out/classes.json
with open('dataset_out/classes.json', 'r') as f:
    classes = json.load(f)
num_classes = len(classes)
print(f"Number of classes: {num_classes}")
print(f"Classes: {list(classes.values())}")

# Dataset class
class ChessSquareDataset(Dataset):
    """One sample per manifest row: a single board square cut out of a frame.

    Each manifest row must provide ``frame_path``, ``row``, ``col`` and
    ``label_id``. The frame is divided into an 8x8 grid of equal cells (integer
    division of width and height by 8) and the cell at (``row``, ``col``) is
    returned together with its integer label.
    """

    def __init__(self, manifest_df, transform=None):
        # Positional access via iloc needs a clean 0..n-1 index.
        self.data = manifest_df.reset_index(drop=True)
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        record = self.data.iloc[idx]
        frame_file = record['frame_path']
        target = int(record['label_id'])

        frame = Image.open(frame_file).convert('RGB')

        # Cell size of the 8x8 grid laid over the frame.
        cell_w, cell_h = (dim // 8 for dim in frame.size)

        x0 = record['col'] * cell_w
        y0 = record['row'] * cell_h
        square = frame.crop((x0, y0, x0 + cell_w, y0 + cell_h))

        if self.transform:
            square = self.transform(square)

        return square, target

# Data transforms
def _tensor_steps():
    """Final steps shared by both pipelines: to tensor, then channel normalisation."""
    return [
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]


# Training pipeline: resize, light augmentation, then tensor + normalise.
train_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(p=0.3),
    transforms.ColorJitter(brightness=0.2, contrast=0.2),
    *_tensor_steps(),
])

# Evaluation pipeline: same as training but without augmentation.
val_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    *_tensor_steps(),
])

# Training function
def train_one_epoch(model, dataloader, criterion, optimizer, device):
    """Run one optimisation pass over ``dataloader``.

    Gradients are clipped to a global norm of 1.0 before every optimiser step.

    Returns:
        (mean per-sample loss, accuracy in percent) over the whole epoch.
    """
    model.train()
    loss_sum, n_correct, n_seen = 0.0, 0, 0

    for batch_x, batch_y in tqdm(dataloader, desc="Training", leave=False):
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)

        optimizer.zero_grad()
        logits = model(batch_x)
        batch_loss = criterion(logits, batch_y)
        batch_loss.backward()

        # Clip before stepping to keep updates stable.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()

        # The criterion value is scaled by the batch size so the epoch figure
        # is a per-sample average.
        loss_sum += batch_loss.item() * batch_x.size(0)
        predictions = logits.max(1)[1]
        n_seen += batch_y.size(0)
        n_correct += (predictions == batch_y).sum().item()

    return loss_sum / n_seen, 100. * n_correct / n_seen

# Validation function
def validate(model, dataloader, criterion, device):
    """Evaluate ``model`` on ``dataloader`` without updating any weights.

    Returns:
        (mean per-sample loss, accuracy in percent) over the whole loader.
    """
    model.eval()
    loss_sum, n_correct, n_seen = 0.0, 0, 0

    with torch.no_grad():
        for batch_x, batch_y in tqdm(dataloader, desc="Validation", leave=False):
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)

            logits = model(batch_x)
            batch_loss = criterion(logits, batch_y)

            # Scale by batch size so the final figure is a per-sample average.
            loss_sum += batch_loss.item() * batch_x.size(0)
            predictions = logits.max(1)[1]
            n_seen += batch_y.size(0)
            n_correct += (predictions == batch_y).sum().item()

    return loss_sum / n_seen, 100. * n_correct / n_seen

# Device
_has_cuda = torch.cuda.is_available()
device = torch.device('cuda' if _has_cuda else 'cpu')
print(f"Using device: {device}")
if _has_cuda:
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    _gpu_bytes = torch.cuda.get_device_properties(0).total_memory
    print(f"GPU Memory: {_gpu_bytes / 1e9:.2f} GB")

# Training configuration
EPOCHS, BATCH_SIZE, LR, NUM_WORKERS = 8, 128, 0.001, 2

for _line in (
    f"\nHyperparameters:",
    f"  Epochs: {EPOCHS}",
    f"  Batch Size: {BATCH_SIZE}",
    f"  Learning Rate: {LR}",
    f"  Optimizer: Adam",
    f"  LR Scheduler: ReduceLROnPlateau (factor=0.5, patience=2)",
    f"  Gradient Clipping: max_norm=1.0",
):
    print(_line)

# Results storage
all_results = []

# Load fold summary; the number of folds comes from the data, not a constant.
fold_summary = pd.read_csv('dataset_out/fold_summary.csv')
n_folds = len(fold_summary)

# Train each fold
for fold_idx in range(n_folds):
    fold_info = fold_summary.iloc[fold_idx]
    # Cast to a plain int: a numpy scalar stored in the checkpoint would make
    # torch.load fail under its weights_only=True default (PyTorch >= 2.6).
    fold_num = int(fold_info['fold'])
    test_game = fold_info['test_game']
    manifest_path = fold_info['manifest_path']
    checkpoint_path = f'dataset_out/best_model_fold_{fold_num}.pth'

    print("\n" + "="*60)
    print(f"FOLD {fold_num}/{n_folds} - Testing on {test_game}")
    print("="*60)

    # Load manifest
    manifest_df = pd.read_csv(manifest_path)
    train_df = manifest_df[manifest_df['split'] == 'train']
    val_df = manifest_df[manifest_df['split'] == 'val']
    test_df = manifest_df[manifest_df['split'] == 'test']

    # frame_id is only unique within a game, so count frames per game and sum.
    train_frames = int(train_df.groupby('game_id')['frame_id'].nunique().sum())
    val_frames = int(val_df.groupby('game_id')['frame_id'].nunique().sum())
    test_frames = int(test_df.groupby('game_id')['frame_id'].nunique().sum())

    print(f"Train: {len(train_df):,} squares ({train_frames} frames)")
    print(f"Val:   {len(val_df):,} squares ({val_frames} frames)")
    print(f"Test:  {len(test_df):,} squares ({test_frames} frames)")

    # Create datasets (augmentation only on the training split)
    train_dataset = ChessSquareDataset(train_df, transform=train_transform)
    val_dataset = ChessSquareDataset(val_df, transform=val_transform)
    test_dataset = ChessSquareDataset(test_df, transform=val_transform)

    # Create dataloaders
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS)

    # Every fold starts from the same ImageNet weights with a new classifier head.
    model = models.resnet50(weights='IMAGENET1K_V1')
    model.fc = nn.Linear(model.fc.in_features, num_classes)
    model = model.to(device)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=LR)
    # mode='max' because the scheduler is stepped with validation accuracy.
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=2)

    # Training loop
    fold_start_time = time.time()
    # Start below any reachable accuracy so the first epoch always writes a
    # checkpoint, even if validation accuracy is 0%.
    best_val_acc = float('-inf')
    best_epoch = 0
    epoch_history = []

    for epoch in range(EPOCHS):
        epoch_start_time = time.time()
        print(f"\nEpoch {epoch+1}/{EPOCHS}")

        train_loss, train_acc = train_one_epoch(model, train_loader, criterion, optimizer, device)
        val_loss, val_acc = validate(model, val_loader, criterion, device)

        epoch_time = time.time() - epoch_start_time

        # Store epoch history
        epoch_history.append({
            'epoch': epoch + 1,
            'train_loss': train_loss,
            'train_acc': train_acc,
            'val_loss': val_loss,
            'val_acc': val_acc,
            'lr': optimizer.param_groups[0]['lr'],
            'time_sec': epoch_time
        })

        print(f"  Train: Loss={train_loss:.4f}, Acc={train_acc:.2f}%")
        print(f"  Val:   Loss={val_loss:.4f}, Acc={val_acc:.2f}%")
        print(f"  Time:  {epoch_time:.1f}s")
        print(f"  LR:    {optimizer.param_groups[0]['lr']:.6f}")

        # Update learning rate scheduler
        scheduler.step(val_acc)

        # Save best model
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            best_epoch = epoch + 1
            torch.save({
                'epoch': epoch + 1,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'val_acc': val_acc,
                'fold': fold_num
            }, checkpoint_path)
            print(f"  ✓ Best model saved (Val Acc: {val_acc:.2f}%)")

    # Test on best model
    print(f"\nLoading best model from epoch {best_epoch}...")
    checkpoint = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])

    test_loss, test_acc = validate(model, test_loader, criterion, device)

    fold_time = time.time() - fold_start_time

    print(f"\nFold {fold_num} Results:")
    print(f"  Best Val Acc:  {best_val_acc:.2f}% (epoch {best_epoch})")
    print(f"  Test Acc:      {test_acc:.2f}%")
    print(f"  Total Time:    {fold_time/60:.1f} min")

    # Store results, including what the later analysis cells read from
    # cross_validation_results.csv.
    all_results.append({
        'fold': fold_num,
        'test_game': test_game,
        'best_val_acc': best_val_acc,
        'best_epoch': best_epoch,
        'test_acc': test_acc,
        'test_loss': test_loss,
        'fold_time_min': fold_time / 60,
        'train_samples': len(train_df),
        'val_samples': len(val_df),
        'test_samples': len(test_df),
        'epoch_history': epoch_history
    })

    # Save fold results
    pd.DataFrame(epoch_history).to_csv(f'dataset_out/fold_{fold_num}_history.csv', index=False)

# Final summary
print("\n" + "="*60)
print(f"{n_folds}-FOLD CROSS-VALIDATION COMPLETE")
print("="*60)

summary_df = pd.DataFrame([{
    'Fold': r['fold'],
    'Test Game': r['test_game'],
    'Best Val Acc': f"{r['best_val_acc']:.2f}%",
    'Test Acc': f"{r['test_acc']:.2f}%",
    'Best Epoch': r['best_epoch']
} for r in all_results])

print(summary_df.to_string(index=False))

mean_test_acc = np.mean([r['test_acc'] for r in all_results])
std_test_acc = np.std([r['test_acc'] for r in all_results])

print("\n" + "="*60)
print(f"MEAN TEST ACCURACY: {mean_test_acc:.2f}% ± {std_test_acc:.2f}%")
print("="*60)

# Flat per-fold table (without the nested epoch history) for the later
# analysis cells, which read this CSV.
results_table = pd.DataFrame(
    [{key: value for key, value in r.items() if key != 'epoch_history'} for r in all_results]
)
results_table.to_csv('cross_validation_results.csv', index=False)

# Save all results (including per-epoch history)
import pickle
with open('dataset_out/cross_validation_results.pkl', 'wb') as f:
    pickle.dump(all_results, f)

print(f"\nResults saved to cross_validation_results.csv")
print(f"Results saved to dataset_out/cross_validation_results.pkl")
print(f"End Time: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")


Using device: cuda

Creating dataloaders...
Caching squares from 359 unique images...
  Cached 20/359 images...
  Cached 40/359 images...
  Cached 60/359 images...
  Cached 80/359 images...
  Cached 100/359 images...
  Cached 120/359 images...
  Cached 140/359 images...
  Cached 160/359 images...
  Cached 180/359 images...
  Cached 200/359 images...
  Cached 220/359 images...
  Cached 240/359 images...
  Cached 260/359 images...
  Cached 280/359 images...
  Cached 300/359 images...
  Cached 320/359 images...
  Cached 340/359 images...
✓ Cached 359 images with 22976 squares
Dataset initialized: 23104 samples, 13 classes
Created train loader: 23104 samples, 180 batches
Dataset initialized: 4800 samples, 13 classes
Created val loader: 4800 samples, 38 batches
Dataset initialized: 5312 samples, 13 classes
Created test loader: 5312 samples, 42 batches

Creating model...
Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-06

In [None]:

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Load results
results_df = pd.read_csv('cross_validation_results.csv')

# Derive the fold layout from the results so the plots work for any number
# of folds, not only seven.
n_folds = len(results_df)
n_train_games = n_folds - 1
fold_ticks = np.arange(1, n_folds + 1)   # 1-based positions for single bars
fold_offsets = np.arange(n_folds)        # 0-based positions for grouped bars
fold_labels = [f"F{int(fold)}" for fold in results_df['fold']]
game_labels = [
    str(game).replace('game', 'G').replace('_per_frame', '')
    for game in results_df['test_game']
]

# Create visualization
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Plot 1: Test Accuracy per Fold
ax1 = axes[0, 0]
bars = ax1.bar(fold_ticks, results_df['test_acc'], color='steelblue', alpha=0.7, edgecolor='black')
ax1.axhline(results_df['test_acc'].mean(), color='red', linestyle='--', linewidth=2, label=f'Mean: {results_df["test_acc"].mean():.2f}%')
ax1.set_xlabel('Fold (Test Game)', fontsize=12, fontweight='bold')
ax1.set_ylabel('Test Accuracy (%)', fontsize=12, fontweight='bold')
ax1.set_title('Test Accuracy per Fold\n(Cross-Game Generalization)', fontsize=14, fontweight='bold')
ax1.set_xticks(fold_ticks)
ax1.set_xticklabels([f"{fold}\n{game}" for fold, game in zip(fold_labels, game_labels)], fontsize=9)
ax1.legend()
ax1.grid(axis='y', alpha=0.3)
for bar in bars:
    height = bar.get_height()
    ax1.text(bar.get_x() + bar.get_width()/2., height + 0.5, f'{height:.1f}%', ha='center', va='bottom', fontsize=9)

# Plot 2: Val vs Test Accuracy
ax2 = axes[0, 1]
width = 0.35
ax2.bar(fold_offsets - width/2, results_df['best_val_acc'], width, label='Validation', color='green', alpha=0.7, edgecolor='black')
ax2.bar(fold_offsets + width/2, results_df['test_acc'], width, label='Test', color='orange', alpha=0.7, edgecolor='black')
ax2.set_xlabel('Fold', fontsize=12, fontweight='bold')
ax2.set_ylabel('Accuracy (%)', fontsize=12, fontweight='bold')
ax2.set_title('Validation vs Test Accuracy\n(Generalization Gap)', fontsize=14, fontweight='bold')
ax2.set_xticks(fold_offsets)
ax2.set_xticklabels(fold_labels)
ax2.legend()
ax2.grid(axis='y', alpha=0.3)

# Plot 3: Training Time per Fold
ax3 = axes[1, 0]
bars = ax3.bar(fold_ticks, results_df['fold_time_min'], color='coral', alpha=0.7, edgecolor='black')
ax3.set_xlabel('Fold', fontsize=12, fontweight='bold')
ax3.set_ylabel('Time (minutes)', fontsize=12, fontweight='bold')
ax3.set_title('Training Time per Fold\n(8 Epochs)', fontsize=14, fontweight='bold')
ax3.set_xticks(fold_ticks)
ax3.grid(axis='y', alpha=0.3)
for bar in bars:
    height = bar.get_height()
    ax3.text(bar.get_x() + bar.get_width()/2., height + 0.2, f'{height:.1f}m', ha='center', va='bottom', fontsize=9)

# Plot 4: Data Distribution
ax4 = axes[1, 1]
width = 0.25
ax4.bar(fold_offsets - width, results_df['train_samples']/1000, width, label='Train', color='blue', alpha=0.7, edgecolor='black')
ax4.bar(fold_offsets, results_df['val_samples']/1000, width, label='Val', color='green', alpha=0.7, edgecolor='black')
ax4.bar(fold_offsets + width, results_df['test_samples']/1000, width, label='Test', color='red', alpha=0.7, edgecolor='black')
ax4.set_xlabel('Fold', fontsize=12, fontweight='bold')
ax4.set_ylabel('Samples (thousands)', fontsize=12, fontweight='bold')
ax4.set_title('Data Distribution per Fold\n(Train/Val/Test Split)', fontsize=14, fontweight='bold')
ax4.set_xticks(fold_offsets)
ax4.set_xticklabels(fold_labels)
ax4.legend()
ax4.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.savefig('cross_validation_results.png', dpi=300, bbox_inches='tight')
plt.show()

print("\n" + "="*60)
print("CROSS-GAME GENERALIZATION ANALYSIS")
print("="*60)

# Calculate generalization gap (val - test)
results_df['gen_gap'] = results_df['best_val_acc'] - results_df['test_acc']

print("\nGeneralization Gap (Val Acc - Test Acc):")
for _, row in results_df.iterrows():
    gap_str = f"+{row['gen_gap']:.2f}%" if row['gen_gap'] > 0 else f"{row['gen_gap']:.2f}%"
    status = "Good" if abs(row['gen_gap']) < 5 else "Fair" if abs(row['gen_gap']) < 10 else "Poor"
    print(f"  Fold {int(row['fold'])} ({row['test_game']}): {gap_str} [{status}]")

# Aggregate statistics reused by the insight and recommendation sections.
test_mean = results_df['test_acc'].mean()
test_std = results_df['test_acc'].std()
gap_mean = results_df['gen_gap'].mean()

print(f"\nMean Gap: {gap_mean:.2f}%")
print(f"Std Gap:  {results_df['gen_gap'].std():.2f}%")

print("\n" + "="*60)
print("KEY INSIGHTS:")
print("="*60)
if test_std < 5:
    variance_level, consistency = 'Low', 'consistent'
elif test_std < 10:
    variance_level, consistency = 'Moderate', 'moderate'
else:
    variance_level, consistency = 'High', 'inconsistent'
print(f"1. Cross-Game Performance: {test_mean:.2f}% ± {test_std:.2f}%")
print(f"   → {variance_level} variance suggests {consistency} generalization")

transfer_quality = 'Excellent' if abs(gap_mean) < 3 else 'Good' if abs(gap_mean) < 5 else 'Fair'
print(f"\n2. Generalization Gap: {gap_mean:.2f}%")
print(f"   → {transfer_quality} transfer from validation to test")

print(f"\n3. Best Performing Game: {results_df.loc[results_df['test_acc'].idxmax(), 'test_game']}")
print(f"   Test Acc: {results_df['test_acc'].max():.2f}%")

print(f"\n4. Worst Performing Game: {results_df.loc[results_df['test_acc'].idxmin(), 'test_game']}")
print(f"   Test Acc: {results_df['test_acc'].min():.2f}%")

accuracy_range = results_df['test_acc'].max() - results_df['test_acc'].min()
print(f"\n5. Accuracy Range: {accuracy_range:.2f}%")
print(f"   → {'Stable' if accuracy_range < 10 else 'Moderate' if accuracy_range < 20 else 'Unstable'} across different games")

print("\n" + "="*60)
print("RECOMMENDATIONS:")
print("="*60)
# Collect every triggered warning; the positive verdict is only printed when
# none of them fired.
recommendations = []
if test_std > 10:
    recommendations.append("• High variance: Consider data augmentation or more training epochs")
if gap_mean > 5:
    recommendations.append("• Large gap: Model may be overfitting to validation games")
if test_mean < 80:
    recommendations.append("• Low accuracy: Try different architectures or hyperparameters")
if not recommendations:
    recommendations.append("• Model shows good cross-game generalization!")
for recommendation in recommendations:
    print(recommendation)
print("="*60)

# Per-game detailed analysis
print("\n" + "="*60)
print("PER-GAME DETAILED RESULTS:")
print("="*60)
for _, row in results_df.iterrows():
    print(f"\nFold {int(row['fold'])} - Test Game: {row['test_game']}")
    print(f"  Training:   {int(row['train_samples']):,} samples from {n_train_games} games")
    print(f"  Validation: {int(row['val_samples']):,} samples (20% of {n_train_games} games)")
    print(f"  Test:       {int(row['test_samples']):,} samples (100% of {row['test_game']})")
    print(f"  Best Val Acc: {row['best_val_acc']:.2f}%")
    print(f"  Test Acc:     {row['test_acc']:.2f}%")
    print(f"  Test Loss:    {row['test_loss']:.4f}")
    print(f"  Training Time: {row['fold_time_min']:.1f} minutes")
    print(f"  Generalization: {'✓ Good' if abs(row['gen_gap']) < 5 else '⚠ Fair' if abs(row['gen_gap']) < 10 else '✗ Poor'}")

print("\n" + "="*60)


In [None]:

# Optional: Per-class accuracy analysis for best and worst folds
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import models, transforms
from PIL import Image
import pandas as pd
import json
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt

print("="*60)
print("DETAILED PER-CLASS ANALYSIS")
print("="*60)

# Load classes (JSON keys are the class ids as strings)
with open('dataset_out/classes.json', 'r') as f:
    classes = json.load(f)
num_classes = len(classes)
class_names = [classes[str(i)] for i in range(num_classes)]

# Load results
results_df = pd.read_csv('cross_validation_results.csv')

# Pick the best and worst fold by test accuracy. The rows are looked up by
# label, so nothing depends on fold k sitting at row k-1.
best_row = results_df.loc[results_df['test_acc'].idxmax()]
worst_row = results_df.loc[results_df['test_acc'].idxmin()]
best_fold = int(best_row['fold'])
worst_fold = int(worst_row['fold'])

print(f"\nAnalyzing:")
print(f"  Best Fold:  {best_fold} (Test Acc: {best_row['test_acc']:.2f}%)")
print(f"  Worst Fold: {worst_fold} (Test Acc: {worst_row['test_acc']:.2f}%)")

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def analyze_fold(fold_num):
    """Print and plot per-class test results for one cross-validation fold.

    Loads the fold's manifest and best checkpoint, predicts every test square,
    then prints a classification report, per-class accuracy and the ten most
    frequent confusions, and saves the confusion matrix as
    ``confusion_matrix_fold_{fold_num}.png``.

    Args:
        fold_num: Fold number as used in the manifest and checkpoint file names.
    """
    print(f"\n{'='*60}")
    print(f"FOLD {fold_num} ANALYSIS")
    print(f"{'='*60}")

    # Load manifest
    manifest_path = f'dataset_out/fold_{fold_num}_manifest.csv'
    manifest_df = pd.read_csv(manifest_path)
    test_df = manifest_df[manifest_df['split'] == 'test']

    # Load model. The training cell writes a dict with the weights under
    # 'model_state_dict'; a bare state dict is accepted as well.
    # weights_only=False because the file comes from this notebook (trusted)
    # and may carry non-tensor metadata next to the weights.
    checkpoint_path = f'dataset_out/best_model_fold_{fold_num}.pth'
    checkpoint = torch.load(checkpoint_path, map_location=device, weights_only=False)
    state_dict = checkpoint.get('model_state_dict', checkpoint)

    model = models.resnet50(weights=None)
    model.fc = nn.Linear(model.fc.in_features, num_classes)
    model.load_state_dict(state_dict)
    model = model.to(device)
    model.eval()

    # Evaluation-time preprocessing (no augmentation)
    val_transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    # Defined locally so this cell does not need the training cell's class.
    class ChessSquareDataset(Dataset):
        """Yields (square crop, label) for every manifest row."""

        def __init__(self, manifest_df, transform=None):
            self.data = manifest_df.reset_index(drop=True)
            self.transform = transform

        def __len__(self):
            return len(self.data)

        def __getitem__(self, idx):
            row = self.data.iloc[idx]
            img_path = row['frame_path']
            label = int(row['label_id'])

            img = Image.open(img_path).convert('RGB')
            # The frame is treated as an 8x8 grid of equal cells.
            W, H = img.size
            sq_w, sq_h = W // 8, H // 8
            col, row_sq = row['col'], row['row']

            left = col * sq_w
            top = row_sq * sq_h
            right = left + sq_w
            bottom = top + sq_h

            img_crop = img.crop((left, top, right, bottom))

            if self.transform:
                img_crop = self.transform(img_crop)

            return img_crop, label

    test_dataset = ChessSquareDataset(test_df, transform=val_transform)
    test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False, num_workers=2)

    # Get predictions
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs = inputs.to(device)
            outputs = model(inputs)
            _, predicted = outputs.max(1)

            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(labels.numpy())

    all_preds = np.array(all_preds)
    all_labels = np.array(all_labels)

    # Pass the full label set explicitly so the report and the matrix always
    # cover every class, even one absent from this fold's test game.
    label_ids = list(range(num_classes))

    # Classification report
    print("\nClassification Report:")
    report = classification_report(
        all_labels, all_preds, labels=label_ids, target_names=class_names, zero_division=0
    )
    print(report)

    # Per-class accuracy (classes with no test samples are skipped)
    print("\nPer-Class Accuracy:")
    for i, class_name in enumerate(class_names):
        mask = all_labels == i
        if mask.sum() > 0:
            acc = (all_preds[mask] == all_labels[mask]).sum() / mask.sum() * 100
            count = mask.sum()
            print(f"  {class_name:15s}: {acc:5.2f}% ({count:,} samples)")

    # Confusion matrix
    cm = confusion_matrix(all_labels, all_preds, labels=label_ids)

    # Look the test game up by fold number rather than by row position.
    test_game = results_df.loc[results_df['fold'] == fold_num, 'test_game'].iloc[0]

    plt.figure(figsize=(12, 10))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
    plt.title(f'Confusion Matrix - Fold {fold_num}\n(Test on {test_game})', fontsize=14, fontweight='bold')
    plt.ylabel('True Label', fontsize=12, fontweight='bold')
    plt.xlabel('Predicted Label', fontsize=12, fontweight='bold')
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.savefig(f'confusion_matrix_fold_{fold_num}.png', dpi=300, bbox_inches='tight')
    plt.show()

    # Most confused pairs: off-diagonal cells, largest first
    print("\nMost Confused Class Pairs:")
    confused_pairs = []
    for i in range(num_classes):
        for j in range(num_classes):
            if i != j and cm[i, j] > 0:
                confused_pairs.append((class_names[i], class_names[j], cm[i, j]))

    confused_pairs.sort(key=lambda x: x[2], reverse=True)
    for true_class, pred_class, count in confused_pairs[:10]:
        print(f"  {true_class:15s} → {pred_class:15s}: {count:4d} times")

# Analyze the best and worst fold; if they are the same fold, analyze it once.
folds_to_analyze = list(dict.fromkeys([best_fold, worst_fold]))
for fold_to_analyze in folds_to_analyze:
    analyze_fold(fold_to_analyze)

print("\n" + "="*60)
print("ANALYSIS COMPLETE")
print("="*60)
print("\nGenerated files:")
print("  - cross_validation_results.csv")
print("  - cross_validation_results.png")
for fold_to_analyze in folds_to_analyze:
    print(f"  - confusion_matrix_fold_{fold_to_analyze}.png")
# Path as written by the training cell's torch.save call.
print("  - dataset_out/best_model_fold_X.pth (for each fold)")


In [None]:
# python dataset_tools/eval.py --manifest dataset_out/dataset_manifest.csv --preds path/to/preds.csv

In [None]:
# python -m dataset_tools.make_dataset --data_root Data --out_root dataset_out
# Adjust splits/seed: --train_ratio 0.75 --val_ratio 0.05 --seed 1234