In [1]:
import torch
# Report the PyTorch build and, when a CUDA device is visible, its name.
print(f"PyTorch: {torch.__version__}")
cuda_ready = torch.cuda.is_available()
print(f"CUDA available: {cuda_ready}")
if cuda_ready:
    print(f"GPU: {torch.cuda.get_device_name(0)}")

PyTorch: 2.9.0+cu126
CUDA available: True
GPU: Tesla T4


In [None]:
!pip install -q python-chess pandas pillow tqdm opencv-python matplotlib seaborn scikit-learn

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/6.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/6.1 MB[0m [31m39.2 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━[0m [32m5.2/6.1 MB[0m [31m76.0 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m6.1/6.1 MB[0m [31m77.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.1/6.1 MB[0m [31m52.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for chess (setup.py) ... [?25l[?25hdone


In [3]:
# //code.zip

from google.colab import files
import zipfile
import os
import shutil

print("Upload code.zip...")
uploaded = files.upload()

with zipfile.ZipFile('code.zip', 'r') as z:
    for member in z.namelist():
        z.extract(member, 'temp')

if os.path.exists('temp'):
    for root, dirs, filelist in os.walk('temp'):
        for f in filelist:
            old_path = os.path.join(root, f)
            new_path = os.path.relpath(old_path, 'temp').replace('\\', '/')
            directory = os.path.dirname(new_path)
            if directory:
                os.makedirs(directory, exist_ok=True)
            shutil.copy(old_path, new_path)
    shutil.rmtree('temp')

print("\nCode uploaded!")
!ls src/ dataset_tools/

Upload code.zip...


Saving code.zip to code.zip

Code uploaded!
dataset_tools/:
board_detect_and_warp.py  extract_squares.py  make_dataset.py
debug_grid.py		  fen_utils.py	      __pycache__
eval.py			  __init__.py	      show_crops.py

src/:
dataset.py  __init__.py  predict.py   train.py
eval.py     model.py	 __pycache__  visualize.py


In [6]:
import glob

print("Upload all_games_data.zip (games 2,4,5,6,7)...")
uploaded = files.upload()

# Unpack the first uploaded archive into a scratch directory.
first_upload = list(uploaded)[0]
with zipfile.ZipFile(first_upload, 'r') as z:
    z.extractall('temp')

# Copy every extracted file to the working directory, rewriting "\" in the
# relative path to "/" and creating parent folders as needed.
if os.path.exists('temp'):
    for folder, _subdirs, names in os.walk('temp'):
        for name in names:
            src = os.path.join(folder, name)
            dst = os.path.relpath(src, 'temp').replace('\\', '/')
            parent = os.path.dirname(dst)
            if parent:
                os.makedirs(parent, exist_ok=True)
            shutil.copy(src, dst)
    shutil.rmtree('temp')

# Summarise how many tagged .jpg frames each game folder contains.
rule = "="*60
print("\n" + rule)
print("GAMES UPLOADED:")
print(rule)
total_frames = 0
for game in sorted(glob.glob('Data/game*_per_frame')):
    frames = len(glob.glob(f'{game}/tagged_images/*.jpg'))
    total_frames += frames
    print(f"  {os.path.basename(game)}: {frames} frames")
print(f"\nTotal: {total_frames} frames")
print(rule)

Upload all_games_data.zip (games 2,4,5,6,7)...


Saving all_games_data.zip to all_games_data.zip

GAMES UPLOADED:
  game2_per_frame: 77 frames
  game4_per_frame: 184 frames
  game5_per_frame: 109 frames
  game6_per_frame: 92 frames
  game7_per_frame: 55 frames

Total: 517 frames


## 2-Fold Cross-Validation + Final Training Strategy

### Overview
This notebook implements a **2-fold cross-validation** to prove learning capability, followed by **training on all 5 games** for the final production model.

### Phase 1: 2-Fold Cross-Validation (Proof of Learning)
Each fold:
- **Trains on 4 games** (with 80/20 train/val split within those 4)
- **Tests on the 5th game** (100% of that game used for testing)
- **Starts with fresh pretrained ResNet50 weights**
- **Trains for 8 epochs**, keeping the checkpoint with the best validation accuracy (all 8 epochs always run; there is no early stopping)

**Fold 1**: Test on game7_per_frame  
**Fold 2**: Test on game5_per_frame

### Phase 2: Final Training (All Games)
- **Trains on ALL 5 games** with 80/20 train/val split
- **8 epochs** with same hyperparameters
- **Produces final production model** for deployment

### Key Features
1. **Cross-Game Evaluation**: Proves the model can generalize to completely unseen games
2. **Fair Comparison**: Each fold uses the same architecture and hyperparameters
3. **Production Model**: Final training on all data for best real-world performance

### Training Configuration
- **Model**: ResNet50 (pretrained on ImageNet)
- **Epochs**: 8 per fold
- **Batch Size**: 128
- **Learning Rate**: 0.001
- **Optimizer**: Adam
- **Data Augmentation**: Random horizontal flip, rotation, color jitter (training only)
- **Input Size**: 224x224 pixels

In [None]:
import pandas as pd
import numpy as np
import glob
import json
import os
from dataset_tools.fen_utils import PIECE_TO_ID, fen_board_to_64_labels, idx_to_square_name

print("="*60)
print("PREPARING 2-FOLD CROSS-VALIDATION + ALL GAMES TRAINING")
print("="*60)

os.makedirs('dataset_out', exist_ok=True)

# Persist the inverse of PIECE_TO_ID (stringified id -> key) so later cells
# can read the class list back from disk.
with open('dataset_out/classes.json', 'w') as f:
    json.dump({str(v): k for k, v in PIECE_TO_ID.items()}, f, indent=2)

# Load all games: one DataFrame per game, holding one row per board square
# (64 per frame) for every CSV row whose frame image exists on disk.
game_dirs = sorted(glob.glob('Data/*_per_frame'))
game_data = {}

for game_dir in game_dirs:
    game_id = os.path.basename(game_dir)
    # Sorted so the CSV that gets used is deterministic if several are present.
    csv_file = sorted(glob.glob(f'{game_dir}/*.csv'))

    if not csv_file:
        continue

    df = pd.read_csv(csv_file[0])
    frame_col = 'from_frame' if 'from_frame' in df.columns else 'frame_id'

    game_rows = []

    for raw_frame_id, fen in zip(df[frame_col], df['fen']):
        frame_id = int(raw_frame_id)

        frame_path = f'{game_dir}/tagged_images/frame_{frame_id:06d}.jpg'
        if not os.path.exists(frame_path):
            continue

        # Only parse the FEN for frames that are actually kept.
        labels = fen_board_to_64_labels(fen)

        for sq in range(64):
            game_rows.append({
                'frame_path': frame_path,
                'game_id': game_id,
                'frame_id': frame_id,
                'square_idx': sq,
                'row': sq // 8,
                'col': sq % 8,
                'square_name': idx_to_square_name(sq),
                'label_id': labels[sq],
            })

    if not game_rows:
        # An empty DataFrame has no columns, so the summary below would raise
        # KeyError; skip games whose CSV matched no image on disk.
        print(f"{game_id}: no labelled frames found on disk - skipped")
        continue

    game_df = pd.DataFrame(game_rows)
    game_data[game_id] = game_df

    n_frames = game_df['frame_id'].nunique()
    n_squares = len(game_df)
    print(f"{game_id}: {n_frames} frames, {n_squares:,} squares")

print(f"\nTotal games loaded: {len(game_data)}")
print("="*60)

# Phase 1: Create 2-fold cross-validation splits
# Fold 1: Test on game7, Fold 2: Test on game5
test_games_for_folds = [
    sorted(game_data.keys())[-1],  # Last game alphabetically (game7)
    sorted(game_data.keys())[2]    # Third game alphabetically (game5)
]


def count_unique_frames(df):
    """Return the number of distinct (game_id, frame_id) pairs in ``df``.

    A frame is identified by the pair, not by frame_id alone: counting
    ``frame_id.nunique()`` over rows from several games merges frames that
    share a number and under-reports the total.
    """
    return len(df[['game_id', 'frame_id']].drop_duplicates())


def assign_split_train_val(row):
    """Return 'train' or 'val' for a manifest row, or None if in neither set.

    Reads the module-level ``train_frame_set`` / ``val_frame_set``, which the
    fold loop below rebuilds before each use.
    """
    key = (row['game_id'], row['frame_id'])
    if key in train_frame_set:
        return 'train'
    elif key in val_frame_set:
        return 'val'
    else:
        return None


fold_manifests = []

for fold_idx, test_game in enumerate(test_games_for_folds):
    print(f"\nFold {fold_idx + 1}: Test on {test_game}")

    # Combine every game except the held-out one for training/validation
    train_val_dfs = []
    for game_id, game_df in game_data.items():
        if game_id != test_game:
            train_val_dfs.append(game_df.copy())

    train_val_df = pd.concat(train_val_dfs, ignore_index=True)
    test_df = game_data[test_game].copy()

    # Split training data into 80% train, 20% val, per game and by whole
    # frame, so every square of a frame ends up in the same split.
    unique_frames = train_val_df.groupby('game_id')['frame_id'].unique()

    train_frames_list = []
    val_frames_list = []

    for game_id, frames in unique_frames.items():
        frames = np.array(list(frames))
        n_frames = len(frames)

        # Fixed seed: the shuffle (and therefore the split) is repeatable.
        rng = np.random.RandomState(42)
        rng.shuffle(frames)

        n_train = int(0.8 * n_frames)
        train_frames_list.extend([(game_id, f) for f in frames[:n_train]])
        val_frames_list.extend([(game_id, f) for f in frames[n_train:]])

    train_frame_set = set(train_frames_list)
    val_frame_set = set(val_frames_list)

    train_val_df['split'] = train_val_df.apply(assign_split_train_val, axis=1)
    test_df['split'] = 'test'

    # Combine and save manifest for this fold
    fold_df = pd.concat([train_val_df, test_df], ignore_index=True)
    fold_df = fold_df.sample(frac=1.0, random_state=42).reset_index(drop=True)

    manifest_path = f'dataset_out/fold_{fold_idx + 1}_manifest.csv'
    fold_df.to_csv(manifest_path, index=False)

    fold_manifests.append({
        'fold': fold_idx + 1,
        'test_game': test_game,
        'manifest_path': manifest_path,
        'train_squares': (fold_df['split'] == 'train').sum(),
        'val_squares': (fold_df['split'] == 'val').sum(),
        'test_squares': (fold_df['split'] == 'test').sum(),
        'train_frames': count_unique_frames(fold_df[fold_df['split'] == 'train']),
        'val_frames': count_unique_frames(fold_df[fold_df['split'] == 'val']),
        'test_frames': count_unique_frames(fold_df[fold_df['split'] == 'test']),
    })

    print(f"  Train: {fold_manifests[-1]['train_frames']} frames ({fold_manifests[-1]['train_squares']:,} sq)")
    print(f"  Val:   {fold_manifests[-1]['val_frames']} frames ({fold_manifests[-1]['val_squares']:,} sq)")
    print(f"  Test:  {fold_manifests[-1]['test_frames']} frames ({fold_manifests[-1]['test_squares']:,} sq) [from {test_game}]")

# Save fold summary
fold_summary_df = pd.DataFrame(fold_manifests)
fold_summary_df.to_csv('dataset_out/fold_summary.csv', index=False)

print("\n" + "="*60)
print("2 FOLDS PREPARED (PROOF OF LEARNING)")
print("="*60)
print(fold_summary_df.to_string(index=False))
print("="*60)

# Phase 2: Create manifest for training on ALL 5 games
phase2_rule = "="*60
print("\n" + phase2_rule)
print("PREPARING ALL GAMES MANIFEST (FINAL TRAINING)")
print(phase2_rule)

# Stack independent copies of every game's rows into one frame.
all_games_dfs = [game_df.copy() for game_df in game_data.values()]
all_games_df = pd.concat(all_games_dfs, ignore_index=True)

# 80/20 train/val split (no test set for the final run): within each game the
# unique frame ids are shuffled with a fixed seed and cut at the 80% mark.
unique_frames = all_games_df.groupby('game_id')['frame_id'].unique()

train_frames_list, val_frames_list = [], []

for game_id, game_frames in unique_frames.items():
    shuffled = np.array(list(game_frames))
    np.random.RandomState(42).shuffle(shuffled)

    cut = int(0.8 * len(shuffled))
    train_frames_list += [(game_id, f) for f in shuffled[:cut]]
    val_frames_list += [(game_id, f) for f in shuffled[cut:]]

# Sets of (game_id, frame_id) keys used for the per-row split lookup.
train_frame_set = set(train_frames_list)
val_frame_set = set(val_frames_list)

def assign_split_all_games(row):
    """Map a manifest row to its split for the all-games run.

    Looks up the row's (game_id, frame_id) pair in the module-level
    ``train_frame_set`` and ``val_frame_set`` (checked in that order) and
    returns 'train', 'val', or None when the pair is in neither set.
    """
    frame_key = (row['game_id'], row['frame_id'])
    for split_name, members in (('train', train_frame_set), ('val', val_frame_set)):
        if frame_key in members:
            return split_name
    return None

# Label every square row with its frame's split, then shuffle rows with a
# fixed seed before writing the manifest.
all_games_df['split'] = all_games_df.apply(assign_split_all_games, axis=1)
all_games_df = all_games_df.sample(frac=1.0, random_state=42).reset_index(drop=True)

# Save manifest
all_games_manifest_path = 'dataset_out/all_games_manifest.csv'
all_games_df.to_csv(all_games_manifest_path, index=False)

train_rows = all_games_df[all_games_df['split'] == 'train']
val_rows = all_games_df[all_games_df['split'] == 'val']

n_train = len(train_rows)
n_val = len(val_rows)
# Count frames as (game_id, frame_id) pairs: frame_id alone is not unique
# across games, so frame_id.nunique() under-reports the totals.
n_train_frames = len(train_rows[['game_id', 'frame_id']].drop_duplicates())
n_val_frames = len(val_rows[['game_id', 'frame_id']].drop_duplicates())

print(f"Train: {n_train_frames} frames ({n_train:,} squares) - 80%")
print(f"Val:   {n_val_frames} frames ({n_val:,} squares) - 20%")
print(f"Total: {n_train + n_val:,} squares from all 5 games")
print("="*60)

PER-GAME SPLIT (Each game -> 70/15/15)

game2_per_frame: 77 frames
  Train: 53 frames (3,392 sq)
  Val:   11 frames (704 sq)
  Test:  13 frames (832 sq)

game4_per_frame: 184 frames
  Train: 128 frames (8,192 sq)
  Val:   27 frames (1,728 sq)
  Test:  29 frames (1,856 sq)

game5_per_frame: 109 frames
  Train: 76 frames (4,928 sq)
  Val:   16 frames (1,024 sq)
  Test:  17 frames (1,088 sq)

game6_per_frame: 92 frames
  Train: 64 frames (4,096 sq)
  Val:   13 frames (832 sq)
  Test:  15 frames (960 sq)

game7_per_frame: 55 frames
  Train: 38 frames (2,496 sq)
  Val:   8 frames (512 sq)
  Test:  9 frames (576 sq)

COMBINED DATASET
TRAIN: 23,104 squares, 350 frames
VAL: 4,800 squares, 75 frames
TEST: 5,312 squares, 82 frames

Total: 33,216 squares
All games contribute to all three splits


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import models, transforms
from PIL import Image
import pandas as pd
import json
import os
from tqdm import tqdm
import time
import datetime
import random
import numpy as np

# Set seeds for reproducibility
def set_seed(seed=42):
    """Seed Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices).

    Also sets cuDNN's ``deterministic`` flag and clears its ``benchmark`` flag.

    Args:
        seed: Integer seed passed to every generator (default 42).
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(42)

rule = "="*60
print(rule)
print("PHASE 1: 2-FOLD CROSS-VALIDATION (PROOF OF LEARNING)")
print(rule)
started_at = datetime.datetime.now()
print(f"Start Time: {started_at.strftime('%Y-%m-%d %H:%M:%S')}")
print(f"Random Seed: 42 (for reproducibility)")
print(rule)

# Load class mapping (written by the data-preparation cell)
with open('dataset_out/classes.json', 'r') as classes_file:
    classes = json.load(classes_file)
num_classes = len(classes)
print(f"Number of classes: {num_classes}")

# Load fold summary: one row per fold
fold_summary = pd.read_csv('dataset_out/fold_summary.csv')
print(f"Number of folds: {len(fold_summary)}")

# Dataset class
class ChessSquareDataset(Dataset):
    """Dataset of single board squares described by a manifest DataFrame.

    Each manifest row supplies the frame image ('frame_path'), the square's
    grid position ('row', 'col') and its class id ('label_id'). An item is
    produced by dividing the frame into an 8x8 grid and cropping that cell.
    """

    def __init__(self, manifest_df, transform=None):
        """Store the optional transform and a 0-based re-indexed manifest."""
        self.transform = transform
        self.data = manifest_df.reset_index(drop=True)

    def __len__(self):
        """Return the number of manifest rows (one per square)."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(square_image, label_id)`` for manifest row ``idx``.

        The frame is opened and converted to RGB on every call; the crop is
        passed through ``self.transform`` when one was given.
        """
        record = self.data.iloc[idx]
        frame = Image.open(record['frame_path']).convert('RGB')

        # Fractional cell size locates the corner; the crop extent uses the
        # truncated cell size.
        cell_w = frame.width / 8
        cell_h = frame.height / 8
        left = int(record['col'] * cell_w)
        top = int(record['row'] * cell_h)
        crop_box = (left, top, left + int(cell_w), top + int(cell_h))

        patch = frame.crop(crop_box)
        if self.transform:
            patch = self.transform(patch)

        return patch, record['label_id']

# Transforms
def _to_normalized_tensor():
    """Return fresh ToTensor + Normalize steps used at the end of both pipelines."""
    return [
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ]


# Training pipeline: resize, then random flip / rotation / colour jitter.
train_transform = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.RandomHorizontalFlip(),
        transforms.RandomRotation(10),
        transforms.ColorJitter(brightness=0.2, contrast=0.2),
    ]
    + _to_normalized_tensor()
)

# Evaluation pipeline: resize only, no random augmentation.
val_transform = transforms.Compose(
    [transforms.Resize((224, 224))] + _to_normalized_tensor()
)

# Training functions
def train_one_epoch(model, dataloader, criterion, optimizer, device):
    """Run one optimisation pass over ``dataloader``.

    Puts ``model`` in train mode and, for every batch, moves the tensors to
    ``device``, computes the loss, back-propagates and steps ``optimizer``.

    Returns:
        tuple: ``(epoch_loss, epoch_acc)`` - the loss averaged over all
        samples and the top-1 accuracy in percent.
    """
    model.train()
    loss_sum, n_correct, n_seen = 0.0, 0, 0

    for batch_inputs, batch_labels in tqdm(dataloader, desc="Training", leave=False):
        batch_inputs = batch_inputs.to(device)
        batch_labels = batch_labels.to(device)

        optimizer.zero_grad()
        logits = model(batch_inputs)
        batch_loss = criterion(logits, batch_labels)
        batch_loss.backward()
        optimizer.step()

        # Scale the batch loss by batch size so the epoch figure is a
        # per-sample average (assumes criterion returns a batch mean).
        loss_sum += batch_loss.item() * batch_inputs.size(0)
        predicted = logits.max(1)[1]
        n_seen += batch_labels.size(0)
        n_correct += predicted.eq(batch_labels).sum().item()

    return loss_sum / n_seen, 100. * n_correct / n_seen

def validate(model, dataloader, criterion, device):
    """Evaluate ``model`` on ``dataloader`` without updating weights.

    Puts ``model`` in eval mode and runs every batch under ``torch.no_grad()``.

    Returns:
        tuple: ``(epoch_loss, epoch_acc)`` - the loss averaged over all
        samples and the top-1 accuracy in percent.
    """
    model.eval()
    loss_sum, n_correct, n_seen = 0.0, 0, 0

    with torch.no_grad():
        for batch_inputs, batch_labels in tqdm(dataloader, desc="Validating", leave=False):
            batch_inputs = batch_inputs.to(device)
            batch_labels = batch_labels.to(device)

            logits = model(batch_inputs)
            batch_loss = criterion(logits, batch_labels)

            # Scale by batch size so the final figure is a per-sample average
            # (assumes criterion returns a batch mean).
            loss_sum += batch_loss.item() * batch_inputs.size(0)
            predicted = logits.max(1)[1]
            n_seen += batch_labels.size(0)
            n_correct += predicted.eq(batch_labels).sum().item()

    return loss_sum / n_seen, 100. * n_correct / n_seen

# Device setup: use the GPU when one is visible, otherwise the CPU
cuda_available = torch.cuda.is_available()
device = torch.device('cuda' if cuda_available else 'cpu')
print(f"\nDevice: {device}")
if cuda_available:
    print(f"GPU: {torch.cuda.get_device_name(0)}")

# Hyperparameters (shared by the fold runs and the final all-games run)
BATCH_SIZE, LEARNING_RATE, NUM_EPOCHS, NUM_WORKERS = 128, 0.001, 8, 2

print(f"\nHyperparameters:")
for hp_label, hp_value in (
    ("Batch Size", BATCH_SIZE),
    ("Learning Rate", LEARNING_RATE),
    ("Epochs", NUM_EPOCHS),
    ("Workers", NUM_WORKERS),
):
    print(f"  {hp_label}: {hp_value}")
print("="*60)

# Results storage: one dict per fold (accuracies, best epoch, duration)
all_results = []

# Train only 2 folds for proof of learning
for fold_idx in range(len(fold_summary)):
    fold_row = fold_summary.iloc[fold_idx]
    fold_num = fold_row['fold']
    test_game = fold_row['test_game']
    manifest_path = fold_row['manifest_path']
    # Single source of truth for where this fold's best weights are stored.
    checkpoint_path = f'dataset_out/best_model_fold_{fold_num}.pth'

    # Re-seed per fold so a fold's head initialisation and shuffling do not
    # depend on how much randomness earlier folds consumed.
    set_seed(42)

    print(f"\n{'='*60}")
    print(f"FOLD {fold_num}/{len(fold_summary)}")
    print(f"Test Game: {test_game}")
    print(f"{'='*60}")

    # Load manifest
    manifest_df = pd.read_csv(manifest_path)

    train_df = manifest_df[manifest_df['split'] == 'train']
    val_df = manifest_df[manifest_df['split'] == 'val']
    test_df = manifest_df[manifest_df['split'] == 'test']

    print(f"Train: {len(train_df):,} squares")
    print(f"Val:   {len(val_df):,} squares")
    print(f"Test:  {len(test_df):,} squares")

    # Create datasets and dataloaders (augmentation on the training set only)
    train_dataset = ChessSquareDataset(train_df, transform=train_transform)
    val_dataset = ChessSquareDataset(val_df, transform=val_transform)
    test_dataset = ChessSquareDataset(test_df, transform=val_transform)

    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS)

    # Initialize model (fresh ResNet50 for each fold) with a new classifier head
    model = models.resnet50(weights='IMAGENET1K_V1')
    model.fc = nn.Linear(model.fc.in_features, num_classes)
    model = model.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
    # Halve the learning rate when validation accuracy stops improving.
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=2)

    # Training loop
    best_val_acc = 0.0
    best_epoch = 0
    fold_start_time = time.time()

    for epoch in range(NUM_EPOCHS):
        epoch_start = time.time()
        print(f"\nEpoch {epoch+1}/{NUM_EPOCHS}")

        train_loss, train_acc = train_one_epoch(model, train_loader, criterion, optimizer, device)
        val_loss, val_acc = validate(model, val_loader, criterion, device)

        epoch_time = time.time() - epoch_start

        print(f"  Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%")
        print(f"  Val Loss:   {val_loss:.4f}, Val Acc:   {val_acc:.2f}%")
        print(f"  Epoch Time: {epoch_time:.1f}s")

        # Learning rate scheduling
        scheduler.step(val_acc)

        # Save best model. The first epoch always checkpoints (best_epoch == 0)
        # so the file loaded for testing is guaranteed to come from this run,
        # even if validation accuracy never rises above 0.
        if val_acc > best_val_acc or best_epoch == 0:
            best_val_acc = val_acc
            best_epoch = epoch + 1

            torch.save({
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'epoch': epoch + 1,
                'val_acc': val_acc,
                'fold': fold_num,
                'test_game': test_game
            }, checkpoint_path, _use_new_zipfile_serialization=False)
            print(f"  ✓ Best model saved! (Val Acc: {val_acc:.2f}%)")

    # Load best model and evaluate on test set (the held-out game)
    print(f"\n--- Testing Fold {fold_num} ---")
    checkpoint = torch.load(checkpoint_path, map_location=device, weights_only=False)
    model.load_state_dict(checkpoint['model_state_dict'])

    test_loss, test_acc = validate(model, test_loader, criterion, device)

    fold_time = time.time() - fold_start_time

    print(f"Best Val Acc:  {best_val_acc:.2f}% (Epoch {best_epoch})")
    print(f"Test Acc:      {test_acc:.2f}%")
    print(f"Fold Duration: {fold_time/60:.1f} minutes")

    all_results.append({
        'fold': fold_num,
        'test_game': test_game,
        'best_val_acc': best_val_acc,
        'best_epoch': best_epoch,
        'test_acc': test_acc,
        'time_minutes': fold_time / 60
    })

# Summary
print("\n" + "="*60)
print("2-FOLD CROSS-VALIDATION RESULTS")
print("="*60)
results_df = pd.DataFrame(all_results)
print(results_df.to_string(index=False))

print(f"\nAverage Validation Accuracy: {results_df['best_val_acc'].mean():.2f}%")
print(f"Average Test Accuracy:       {results_df['test_acc'].mean():.2f}%")
print(f"Total Training Time:         {results_df['time_minutes'].sum():.1f} minutes")
print("="*60)

# Save results
results_df.to_csv('dataset_out/fold_results.csv', index=False)
print("\n✓ 2-fold validation complete! Results saved to dataset_out/fold_results.csv")

Using device: cuda

Creating dataloaders...
Caching squares from 359 unique images...
  Cached 20/359 images...
  Cached 40/359 images...
  Cached 60/359 images...
  Cached 80/359 images...
  Cached 100/359 images...
  Cached 120/359 images...
  Cached 140/359 images...
  Cached 160/359 images...
  Cached 180/359 images...
  Cached 200/359 images...
  Cached 220/359 images...
  Cached 240/359 images...
  Cached 260/359 images...
  Cached 280/359 images...
  Cached 300/359 images...
  Cached 320/359 images...
  Cached 340/359 images...
✓ Cached 359 images with 22976 squares
Dataset initialized: 23104 samples, 13 classes
Created train loader: 23104 samples, 180 batches
Dataset initialized: 4800 samples, 13 classes
Created val loader: 4800 samples, 38 batches
Dataset initialized: 5312 samples, 13 classes
Created test loader: 5312 samples, 42 batches

Creating model...
Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-06

In [None]:
set_seed(42)

print("\n" + "="*60)
print("PHASE 2: TRAINING ON ALL 5 GAMES (FINAL MODEL)")
print("="*60)
print(f"Start Time: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

# Load all games manifest (train/val only; this phase has no test split)
all_manifest = pd.read_csv('dataset_out/all_games_manifest.csv')
train_df = all_manifest[all_manifest['split'] == 'train']
val_df = all_manifest[all_manifest['split'] == 'val']

print(f"\nTrain: {len(train_df):,} squares (80% of all 5 games)")
print(f"Val:   {len(val_df):,} squares (20% of all 5 games)")

# Create datasets (augmentation on the training set only)
train_dataset = ChessSquareDataset(train_df, transform=train_transform)
val_dataset = ChessSquareDataset(val_df, transform=val_transform)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS)

# Initialize fresh model for all games with a new classifier head
model = models.resnet50(weights='IMAGENET1K_V1')
model.fc = nn.Linear(model.fc.in_features, num_classes)
model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
# Halve the learning rate when validation accuracy stops improving.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=2)

# Training loop
best_val_acc = 0.0
best_epoch = 0
history = []
train_start = time.time()

# Report the configured epoch count instead of a hard-coded number.
print(f"\nTraining for {NUM_EPOCHS} epochs...")

for epoch in range(NUM_EPOCHS):
    epoch_start = time.time()
    print(f"\nEpoch {epoch+1}/{NUM_EPOCHS}")

    train_loss, train_acc = train_one_epoch(model, train_loader, criterion, optimizer, device)
    val_loss, val_acc = validate(model, val_loader, criterion, device)

    epoch_time = time.time() - epoch_start

    # The logged 'lr' is the rate used during this epoch (read before the
    # scheduler step below).
    history.append({
        'epoch': epoch + 1,
        'train_loss': train_loss,
        'train_acc': train_acc,
        'val_loss': val_loss,
        'val_acc': val_acc,
        'lr': optimizer.param_groups[0]['lr'],
        'time_sec': epoch_time
    })

    print(f"  Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%")
    print(f"  Val Loss:   {val_loss:.4f}, Val Acc:   {val_acc:.2f}%")
    print(f"  Epoch Time: {epoch_time:.1f}s")
    print(f"  LR:         {optimizer.param_groups[0]['lr']:.6f}")

    scheduler.step(val_acc)

    # The first epoch always checkpoints (best_epoch == 0) so the saved model
    # is guaranteed to come from this run, even if validation accuracy never
    # rises above 0.
    if val_acc > best_val_acc or best_epoch == 0:
        best_val_acc = val_acc
        best_epoch = epoch + 1
        torch.save({
            'epoch': epoch + 1,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'val_acc': val_acc,
            'train_acc': train_acc,
            'games': 'all_5_games'
        }, 'dataset_out/best_model_all_games.pth', _use_new_zipfile_serialization=False)
        print(f"  ✓ Best model saved! (Val Acc: {val_acc:.2f}%)")

train_time = time.time() - train_start

# Save training history
pd.DataFrame(history).to_csv('dataset_out/all_games_history.csv', index=False)

print("\n" + "="*60)
print("TRAINING ON ALL 5 GAMES COMPLETE")
print("="*60)
print(f"Best Val Acc:    {best_val_acc:.2f}% (epoch {best_epoch})")
print(f"Training Time:   {train_time/60:.1f} minutes")
print(f"Model Saved:     dataset_out/best_model_all_games.pth")
print("\n✓ Final production model ready!")
print("="*60)

In [None]:
# Download the final trained model
from google.colab import files

rule = "="*60
print(rule)
print("DOWNLOADING FINAL MODEL")
print(rule)

# Send the all-games checkpoint to the browser as a download.
model_path = 'dataset_out/best_model_all_games.pth'
print(f"\nDownloading: {model_path}")
files.download(model_path)

# Follow-up instructions, emitted as one newline-joined message.
print("\n".join([
    "\n✓ Model downloaded!",
    "\nTo use in your web app:",
    "1. Rename to: best_model_fold_1.pth",
    "2. Place in: checkpoints/ folder",
    "3. Run: python app.py",
    rule,
]))