In [1]:

#importing standard python libraries for file system operations, randomness, path handling, data structure utilities and type hinting 
import os
import random
from pathlib import Path
from collections import Counter
from typing import Tuple, List

#import numpy for numerical operations and PIL images for loading and manipulating images
import numpy as np
from PIL import Image

#import core torch modules for building the neural network
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import OneCycleLR
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import autocast, GradScaler

#import the computer vision libraries for augmentations(transforms), models(pretrained backbone), etc
from torchvision import transforms, models
from sklearn.model_selection import StratifiedShuffleSplit
from tqdm import tqdm





In [2]:
# ============================================================
#                  1. Label Smoothing Loss
# ============================================================

class LabelSmoothingCrossEntropy(nn.Module):
    """
    Cross-entropy against label-smoothed targets, with optional per-class weights.

    The target distribution puts ``1 - epsilon`` on the true class and spreads
    ``epsilon`` evenly over the remaining classes. When ``weight`` is given,
    every column of the target distribution is scaled by that class's weight
    before the cross-entropy is taken. The result is the plain mean over the
    batch (it is NOT divided by the sum of the weights).

    Args:
        epsilon: Smoothing mass taken away from the true class.
        weight: Optional 1-D sequence/tensor with one weight per class.
    """

    def __init__(self, epsilon: float = 0.1, weight=None):
        super().__init__()
        self.epsilon = epsilon
        if weight is not None and not torch.is_tensor(weight):
            weight = torch.as_tensor(weight, dtype=torch.float32)
        # Registered as a buffer so that `criterion.to(device)` moves the weights
        # together with the module; non-persistent so state_dict() is unchanged.
        self.register_buffer("weight", weight, persistent=False)

    def forward(self, outputs, targets):
        """
        Args:
            outputs: Raw logits; the last dimension indexes the classes.
            targets: Integer class indices, one per row of ``outputs``.

        Returns:
            Scalar tensor holding the mean loss over the batch.
        """
        n_classes = outputs.size(-1)
        log_preds = F.log_softmax(outputs, dim=-1)

        # Guard the single-class case, where there is no "other" class to smooth
        # onto and `n_classes - 1` would be zero.
        off_value = self.epsilon / max(n_classes - 1, 1)

        # The smoothed targets are constants, so no gradient is tracked for them.
        with torch.no_grad():
            true_dist = torch.full_like(log_preds, off_value)
            true_dist.scatter_(-1, targets.unsqueeze(-1), 1.0 - self.epsilon)

            if self.weight is not None:
                # `.to` is a no-op when the weights already live on the right device.
                class_weight = self.weight.to(true_dist.device)
                true_dist = true_dist * class_weight.unsqueeze(0)

        # Cross-entropy between the (weighted) soft targets and the predictions.
        loss = torch.sum(-true_dist * log_preds, dim=-1)
        return loss.mean()




In [3]:
# ============================================================
#                  2. Advanced Mixup with CutMix
# ============================================================

def mixup_data(x, y, alpha=0.4, device='cuda'):
    """
    Mixup: blend every sample with a randomly chosen partner from the same batch.

    Args:
        x: Input batch; the first dimension is the batch dimension.
        y: Labels aligned with ``x``.
        alpha: Beta(alpha, alpha) concentration. ``alpha <= 0`` disables mixing
            (``lam == 1``).
        device: Kept for backward compatibility only. The permutation is placed
            on ``x.device`` so the function also works when CUDA is unavailable.

    Returns:
        ``(mixed_x, y_a, y_b, lam)`` where ``y_a`` are the original labels,
        ``y_b`` the partner labels and ``lam`` the weight of the original sample.
    """
    if alpha > 0:
        lam = float(np.random.beta(alpha, alpha))
        # Keep the original sample dominant (lam >= 0.5).
        lam = max(lam, 1.0 - lam)
    else:
        lam = 1.0

    batch_size = x.size(0)
    # Draw on the CPU generator (same random stream as before), then follow x.
    index = torch.randperm(batch_size).to(x.device)

    mixed_x = lam * x + (1 - lam) * x[index]
    y_a, y_b = y, y[index.to(y.device)]
    return mixed_x, y_a, y_b, lam


def cutmix_data(x, y, alpha=1.0, device='cuda'):
    """
    CutMix: paste a random rectangular patch from a partner sample into each sample.

    Args:
        x: 4-D input batch; dimensions 2 and 3 are the two spatial axes.
        y: Labels aligned with ``x``.
        alpha: Beta(alpha, alpha) concentration. ``alpha <= 0`` yields an empty
            patch (``lam == 1``).
        device: Kept for backward compatibility only. The permutation is placed
            on ``x.device`` so the function also works when CUDA is unavailable.

    Returns:
        ``(x_cutmix, y_a, y_b, lam)`` where ``lam`` is a Python float equal to
        the fraction of each image that still comes from the original sample.
        ``x`` itself is not modified.
    """
    if alpha > 0:
        lam = float(np.random.beta(alpha, alpha))
    else:
        lam = 1.0

    batch_size = x.size(0)
    # Draw on the CPU generator (same random stream as before), then follow x.
    index = torch.randperm(batch_size).to(x.device)

    # Sizes of the two spatial axes (dim 2 and dim 3 respectively).
    size_d2, size_d3 = x.size(2), x.size(3)
    cut_ratio = np.sqrt(1.0 - lam)
    cut_d2 = int(size_d2 * cut_ratio)
    cut_d3 = int(size_d3 * cut_ratio)

    # Uniformly sampled patch centre; the patch is clipped to the image.
    centre_d2 = np.random.randint(size_d2)
    centre_d3 = np.random.randint(size_d3)

    lo_d2 = int(np.clip(centre_d2 - cut_d2 // 2, 0, size_d2))
    lo_d3 = int(np.clip(centre_d3 - cut_d3 // 2, 0, size_d3))
    hi_d2 = int(np.clip(centre_d2 + cut_d2 // 2, 0, size_d2))
    hi_d3 = int(np.clip(centre_d3 + cut_d3 // 2, 0, size_d3))

    # Work on a copy so the caller's batch stays intact.
    x_cutmix = x.clone()
    x_cutmix[:, :, lo_d2:hi_d2, lo_d3:hi_d3] = x[index, :, lo_d2:hi_d2, lo_d3:hi_d3]

    # Recompute lam from the clipped patch so it matches the real pixel ratio.
    patch_area = (hi_d2 - lo_d2) * (hi_d3 - lo_d3)
    lam = 1.0 - patch_area / float(size_d2 * size_d3)

    y_a, y_b = y, y[index.to(y.device)]
    return x_cutmix, y_a, y_b, lam

In [4]:
# ============================================================
#                  3. Data Collection
# ============================================================

def gather_image_paths_and_labels(root_dir: str) -> Tuple[List[str], List[int], List[str]]:
    """
    Recursively collect image files and derive their class from the folder name.

    The class of an image is the name of its parent directory; when that
    directory is called ``train`` or ``test`` the grandparent's name is used
    instead. Results are sorted by path so the output (and any seeded split
    built on top of it) does not depend on filesystem enumeration order.

    Args:
        root_dir: Directory to search.

    Returns:
        ``(image_paths, labels, class_names)`` where ``labels[i]`` is the index
        of the class of ``image_paths[i]`` inside the sorted ``class_names``.

    Raises:
        FileNotFoundError: ``root_dir`` does not exist.
        ValueError: No file with a supported image extension was found.
    """
    root = Path(root_dir)
    if not root.exists():
        raise FileNotFoundError(f"Dataset root not found: {root_dir}")

    exts = {".jpg", ".jpeg", ".png", ".bmp"}
    split_dirs = {"train", "test"}

    image_data = []
    for img_path in root.rglob("*"):
        # Skip directories that merely look like images (e.g. "foo.jpg/").
        if img_path.suffix.lower() not in exts or not img_path.is_file():
            continue

        emotion_class = img_path.parent.name
        if emotion_class in split_dirs:
            emotion_class = img_path.parent.parent.name

        image_data.append((str(img_path), emotion_class))

    if not image_data:
        raise ValueError(f"No images found under {root_dir}")

    # rglob() yields entries in an OS-dependent order; sort for reproducibility.
    image_data.sort()

    unique_classes = sorted({emotion for _, emotion in image_data})
    class_to_idx = {name: idx for idx, name in enumerate(unique_classes)}

    image_paths = [path for path, _ in image_data]
    labels = [class_to_idx[emotion] for _, emotion in image_data]

    return image_paths, labels, unique_classes

In [5]:
# ============================================================
#                  4. Enhanced Pre-cached Dataset
# ============================================================

class PreCachedImageDataset(Dataset):
    """
    Image dataset that can hold every sample in RAM as a ready-made tensor.

    Every image is converted to RGB, resized to ``img_size x img_size`` and
    turned into a float tensor. Evaluation samples (``is_train=False``) are
    additionally normalised with the ImageNet statistics at load time; training
    samples stay un-normalised so that ``transform`` (applied per access) can
    run its own augmentation/normalisation. ``transform`` is only applied when
    ``is_train`` is True.

    Both modes return the same kind of tensor: with ``cache_images=False`` the
    image is decoded on every access instead of once up front.

    NOTE: the cache stores float32 tensors, i.e. ``3 * img_size * img_size * 4``
    bytes per image (about 0.6 MB at ``img_size=224``).

    Args:
        image_paths: Paths of the image files.
        labels: Integer class index for every path.
        transform: Callable applied to the tensor of a training sample.
        cache_images: Decode everything in ``__init__`` (True) or lazily (False).
        img_size: Side length of the square output image.
        is_train: Selects training (augment) or evaluation (normalise) behaviour.
    """

    # ImageNet channel statistics used to normalise evaluation samples.
    _IMAGENET_MEAN = (0.485, 0.456, 0.406)
    _IMAGENET_STD = (0.229, 0.224, 0.225)

    def __init__(self, image_paths: List[str], labels: List[int],
                 transform=None, cache_images: bool = True, img_size: int = 224,
                 is_train: bool = True):
        self.labels = torch.tensor(labels, dtype=torch.long)
        self.transform = transform
        self.is_train = is_train
        self.img_size = img_size
        self.image_paths = list(image_paths)
        # Paths that could not be decoded and were replaced by a blank image.
        self.failed_paths = []

        self._mean = torch.tensor(self._IMAGENET_MEAN).view(3, 1, 1)
        self._std = torch.tensor(self._IMAGENET_STD).view(3, 1, 1)

        # An explicit (h, w) forces a square output. Resize(int) would only fix
        # the shorter edge, giving non-square images a shape that cannot be
        # batched with the others or with the blank fallback.
        self._to_tensor = transforms.Compose([
            transforms.Resize((img_size, img_size)),
            transforms.ToTensor(),
        ])

        if not cache_images:
            self.cached_tensors = None
            return

        print(f"Pre-loading {len(self.image_paths)} images...")
        self.cached_tensors = []
        for path in tqdm(self.image_paths, desc="Caching"):
            try:
                tensor = self._load_tensor(path)
            except Exception as exc:
                # Best effort: keep dataset length and label alignment intact by
                # substituting a blank image, but remember and report the failure.
                self.failed_paths.append(path)
                print(f"  WARNING: could not load {path}: {exc}")
                tensor = self._normalize_if_eval(torch.zeros(3, img_size, img_size))
            self.cached_tensors.append(tensor)

        if self.failed_paths:
            print(f"  WARNING: {len(self.failed_paths)} image(s) replaced by blank tensors")

    def _normalize_if_eval(self, tensor):
        """Apply ImageNet normalisation to evaluation samples only."""
        if not self.is_train:
            tensor = (tensor - self._mean) / self._std
        return tensor

    def _load_tensor(self, path):
        """Decode one file into a resized (and, for eval, normalised) tensor."""
        # The context manager closes the file handle once the pixels are read.
        with Image.open(path) as img:
            tensor = self._to_tensor(img.convert("RGB"))
        return self._normalize_if_eval(tensor)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        if self.cached_tensors is not None:
            img = self.cached_tensors[idx]
        else:
            img = self._load_tensor(self.image_paths[idx])

        # Augmentations run on the fly and only for training samples.
        if self.is_train and self.transform:
            img = self.transform(img)

        return img, self.labels[idx]

In [6]:
# ============================================================
#                  5. Advanced Transforms
# ============================================================

def get_augmentation_transforms() -> Tuple[transforms.Compose, transforms.Compose]:
    """
    Build the training and validation transform pipelines.

    Neither pipeline contains a ``ToTensor`` step, so both are meant to be fed
    tensors rather than PIL images.

    Returns:
        ``(train_transform, val_transform)``: the training pipeline applies
        random flip, rotation, colour jitter, affine and perspective
        distortions, then ImageNet normalisation followed by random erasing;
        the validation pipeline only normalises.
    """
    channel_mean = [0.485, 0.456, 0.406]
    channel_std = [0.229, 0.224, 0.225]

    def imagenet_normalize():
        # A fresh Normalize instance for each pipeline.
        return transforms.Normalize(channel_mean, channel_std)

    # Random augmentations that run before normalisation.
    train_steps = [
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.RandomRotation(20),
        transforms.ColorJitter(brightness=0.3, contrast=0.3, saturation=0.2, hue=0.1),
        transforms.RandomAffine(degrees=0, translate=(0.15, 0.15), scale=(0.85, 1.15)),
        transforms.RandomPerspective(distortion_scale=0.2, p=0.3),
    ]
    train_steps.append(imagenet_normalize())
    # Erasing comes last, i.e. it acts on the already-normalised tensor.
    train_steps.append(transforms.RandomErasing(p=0.3, scale=(0.02, 0.2)))

    eval_steps = [imagenet_normalize()]

    return transforms.Compose(train_steps), transforms.Compose(eval_steps)


In [7]:
# ============================================================
#                  6. Enhanced Model (ResNet backbone + MLP classifier head)
# ============================================================

def build_model(model_name: str = 'resnet34', num_classes: int = 7,
                dropout_rate: float = 0.5) -> nn.Module:
    """
    Create an ImageNet-pretrained ResNet whose ``fc`` layer is replaced by a
    three-layer MLP classifier.

    The backbone itself is left untouched; only ``model.fc`` is swapped for
    BatchNorm -> Dropout -> Linear(1024) -> BatchNorm -> ReLU -> Dropout ->
    Linear(512) -> BatchNorm -> ReLU -> Dropout -> Linear(num_classes).

    Args:
        model_name: One of ``'resnet18'``, ``'resnet34'``, ``'resnet50'``.
        num_classes: Size of the final output layer.
        dropout_rate: Dropout probability of the two hidden blocks; the input
            block uses half of it.

    Raises:
        ValueError: ``model_name`` is not one of the supported names.
    """
    # Constructor name -> name of the matching torchvision weights enum.
    weight_enums = {
        'resnet18': 'ResNet18_Weights',
        'resnet34': 'ResNet34_Weights',
        'resnet50': 'ResNet50_Weights',
    }
    if model_name not in weight_enums:
        raise ValueError(f"Unsupported model: {model_name}")

    pretrained = getattr(models, weight_enums[model_name]).IMAGENET1K_V1
    model = getattr(models, model_name)(weights=pretrained)

    feat_dim = model.fc.in_features
    hidden_a, hidden_b = 1024, 512

    head_layers = [
        # Input block: normalise backbone features, light dropout.
        nn.BatchNorm1d(feat_dim),
        nn.Dropout(dropout_rate * 0.5),
        # Hidden block 1.
        nn.Linear(feat_dim, hidden_a),
        nn.BatchNorm1d(hidden_a),
        nn.ReLU(inplace=True),
        nn.Dropout(dropout_rate),
        # Hidden block 2.
        nn.Linear(hidden_a, hidden_b),
        nn.BatchNorm1d(hidden_b),
        nn.ReLU(inplace=True),
        nn.Dropout(dropout_rate),
        # Output logits.
        nn.Linear(hidden_b, num_classes),
    ]
    model.fc = nn.Sequential(*head_layers)

    return model


In [8]:
# ============================================================
#                  7. Training Functions with Advanced Augmentation
# ============================================================

def train_one_epoch(model: nn.Module, loader: DataLoader, criterion: nn.Module,
                   optimizer: optim.Optimizer, device: torch.device, epoch: int,
                   scaler: GradScaler = None, use_mixup: bool = True,
                   mixup_alpha: float = 0.4, use_amp: bool = False,
                   use_cutmix: bool = True) -> Tuple[float, float]:
    """
    Run one training epoch with optional mixup/cutmix and mixed precision.

    For every batch a coin flip decides whether a mixing augmentation is used
    at all. It is applied only when ``use_mixup`` is True, so ``use_cutmix`` has
    no effect when ``use_mixup`` is False. A second coin flip picks cutmix over
    mixup when ``use_cutmix`` is True.

    Args:
        model: Network to train (switched to train mode here).
        loader: Yields ``(images, labels)`` batches and supports ``len()``.
        criterion: Loss taking ``(outputs, labels)``.
        optimizer: Optimizer stepped once per batch.
        device: Device the batches are moved to.
        epoch: Epoch number, used for the progress-bar label only.
        scaler: Gradient scaler; mixed precision is used only when this is
            given, ``use_amp`` is True and ``device`` is a CUDA device.
        use_mixup: Enable the mixing augmentations.
        mixup_alpha: Beta parameter passed to ``mixup_data``.
        use_amp: Request mixed precision.
        use_cutmix: Allow cutmix as an alternative to mixup.

    Returns:
        ``(mean loss per batch, accuracy in percent)``. For mixed batches the
        accuracy is the lam-weighted agreement with both label sets.
    """
    from contextlib import nullcontext

    model.train()
    amp_enabled = bool(use_amp) and scaler is not None and torch.device(device).type == 'cuda'

    running_loss = 0.0
    correct = 0.0
    total = 0
    num_batches = len(loader)
    log_every = max(1, num_batches // 20)

    pbar = tqdm(loader, desc=f"Epoch {epoch} [Train]", leave=False)

    # Count batches explicitly: tqdm refreshes `pbar.n` lazily, so it is not a
    # reliable per-iteration counter.
    for step, (images, labels) in enumerate(pbar, start=1):
        images = images.to(device, non_blocking=True)
        labels = labels.to(device, non_blocking=True)

        # Randomly choose between plain training, mixup and cutmix.
        use_aug = random.random() < 0.5
        use_cutmix_now = use_cutmix and random.random() < 0.5
        mixed = use_aug and use_mixup

        if mixed:
            if use_cutmix_now:
                inputs, labels_a, labels_b, lam = cutmix_data(images, labels, 1.0, device)
            else:
                inputs, labels_a, labels_b, lam = mixup_data(images, labels, mixup_alpha, device)
        else:
            inputs = images

        # torch.autocast takes `device_type`; torch.cuda.amp.autocast does not
        # and raises TypeError when it is passed.
        amp_context = torch.autocast(device_type='cuda') if amp_enabled else nullcontext()
        with amp_context:
            outputs = model(inputs)
            if mixed:
                loss = lam * criterion(outputs, labels_a) + (1 - lam) * criterion(outputs, labels_b)
            else:
                loss = criterion(outputs, labels)

        optimizer.zero_grad(set_to_none=True)
        if amp_enabled:
            scaler.scale(loss).backward()
            # Gradients must be unscaled before clipping, otherwise the norm
            # threshold is compared against loss-scaled gradients.
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            scaler.step(optimizer)
            scaler.update()
        else:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

        preds = outputs.argmax(dim=1)
        if mixed:
            correct += (lam * (preds == labels_a).sum().item()
                        + (1 - lam) * (preds == labels_b).sum().item())
        else:
            correct += (preds == labels).sum().item()

        running_loss += loss.item()
        total += images.size(0)

        if step % log_every == 0 or step == num_batches:
            pbar.set_postfix({"loss": f"{running_loss / step:.4f}",
                              "acc": f"{100.0 * correct / total:.2f}%"})

    avg_loss = running_loss / max(num_batches, 1)
    avg_acc = 100.0 * correct / max(total, 1)
    return avg_loss, avg_acc

In [9]:
def evaluate(model: nn.Module, loader: DataLoader, criterion: nn.Module,
            device: torch.device, epoch: int, use_amp: bool = False,
            use_tta: bool = False) -> Tuple[float, float]:
    """
    Evaluate the model on ``loader`` without tracking gradients.

    Args:
        model: Network to evaluate (switched to eval mode here).
        loader: Yields ``(images, labels)`` batches and supports ``len()``.
        criterion: Loss taking ``(outputs, labels)``.
        device: Device the batches are moved to.
        epoch: Epoch number, used for the progress-bar label only.
        use_amp: Request mixed precision; honoured only on a CUDA device.
        use_tta: Average the logits of each image and its flip along the last
            dimension (horizontal flip for NCHW batches).

    Returns:
        ``(mean loss per batch, accuracy in percent)``.
    """
    from contextlib import nullcontext

    model.eval()
    amp_enabled = bool(use_amp) and torch.device(device).type == 'cuda'

    running_loss = 0.0
    correct = 0
    total = 0
    num_batches = len(loader)
    log_every = max(1, num_batches // 10)

    pbar = tqdm(loader, desc=f"Epoch {epoch} [Val]", leave=False)

    with torch.no_grad():
        # Count batches explicitly: tqdm refreshes `pbar.n` lazily, so it is
        # not a reliable per-iteration counter.
        for step, (images, labels) in enumerate(pbar, start=1):
            images = images.to(device, non_blocking=True)
            labels = labels.to(device, non_blocking=True)

            # torch.autocast takes `device_type`; torch.cuda.amp.autocast does
            # not and raises TypeError when it is passed.
            amp_context = torch.autocast(device_type='cuda') if amp_enabled else nullcontext()
            with amp_context:
                outputs = model(images)
                if use_tta:
                    # Test-time augmentation: average with the flipped view.
                    flipped_outputs = model(torch.flip(images, dims=[3]))
                    outputs = (outputs + flipped_outputs) / 2
                loss = criterion(outputs, labels)

            running_loss += loss.item()
            preds = outputs.argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += images.size(0)

            if step % log_every == 0 or step == num_batches:
                pbar.set_postfix({"loss": f"{running_loss / step:.4f}",
                                  "acc": f"{100.0 * correct / total:.2f}%"})

    # Guard against an empty loader instead of dividing by zero.
    avg_loss = running_loss / max(num_batches, 1)
    avg_acc = 100.0 * correct / max(total, 1)
    return avg_loss, avg_acc


In [10]:
# ============================================================
#                  8. Main Training with OneCycleLR
# ============================================================

class _StepSchedulerPerBatch:
    """Loader wrapper that advances an LR scheduler once after every batch."""

    def __init__(self, loader, scheduler):
        self.loader = loader
        self.scheduler = scheduler

    def __len__(self):
        return len(self.loader)

    def __iter__(self):
        for batch in self.loader:
            yield batch
            # Execution resumes here when the consumer asks for the next batch,
            # i.e. after it has finished its optimizer step for this one.
            self.scheduler.step()


def train_advanced(model, train_loader, val_loader, criterion,
                   device, CONFIG, class_names):
    """
    Train with AdamW + OneCycleLR, validate every epoch, checkpoint the best
    model and stop early when validation accuracy stops improving.

    Args:
        model: Network to train.
        train_loader: Training batches; must support ``len()``.
        val_loader: Validation batches.
        criterion: Loss function used for both training and validation.
        device: Device to run on; mixed precision is enabled only for CUDA.
        CONFIG: Dict read for ``lr``, ``weight_decay``, ``epochs``, ``use_mixup``,
            ``mixup_alpha``, ``use_cutmix``, ``use_tta`` and ``patience``; it is
            also forwarded to ``save_checkpoint``.
        class_names: Stored in the checkpoint.

    Returns:
        ``(best validation accuracy in percent, epoch at which it occurred)``.
    """
    optimizer = optim.AdamW(
        model.parameters(),
        lr=CONFIG['lr'],
        weight_decay=CONFIG['weight_decay']
    )

    # OneCycleLR is defined over individual optimizer steps.
    total_steps = len(train_loader) * CONFIG['epochs']
    scheduler = OneCycleLR(
        optimizer,
        max_lr=CONFIG['lr'] * 10,  # Peak LR
        total_steps=total_steps,
        pct_start=0.3,  # Warm up for 30% of training
        anneal_strategy='cos',
        div_factor=25.0,
        final_div_factor=10000.0
    )
    # The schedule only takes effect if scheduler.step() is called after every
    # batch. train_one_epoch() does not do that itself, so the training loader
    # is wrapped to step the scheduler as each batch is completed.
    stepping_train_loader = _StepSchedulerPerBatch(train_loader, scheduler)

    # Key mixed precision on the device actually used, not on CUDA merely
    # being installed.
    use_amp = torch.device(device).type == 'cuda'
    scaler = GradScaler() if use_amp else None

    best_val_acc = 0.0
    best_epoch = 0
    patience_counter = 0

    print("\n" + "="*60)
    print("Starting high-accuracy training...")
    if use_amp:
        print("Mixed Precision: ENABLED")
    else:
        print("Mixed Precision: DISABLED (CPU mode)")
    print("Target: 75-80% validation accuracy")
    print("="*60)

    for epoch in range(1, CONFIG['epochs'] + 1):
        train_loss, train_acc = train_one_epoch(
            model, stepping_train_loader, criterion, optimizer, device, epoch,
            scaler, use_mixup=CONFIG['use_mixup'],
            mixup_alpha=CONFIG['mixup_alpha'], use_amp=use_amp,
            use_cutmix=CONFIG['use_cutmix']
        )

        # TTA is switched on after epoch 20.
        # NOTE(review): validation accuracies before and after that point are
        # measured differently, yet both feed the same best-model comparison.
        use_tta = CONFIG['use_tta'] and epoch > 20
        val_loss, val_acc = evaluate(
            model, val_loader, criterion, device, epoch,
            use_amp=use_amp, use_tta=use_tta
        )

        # Train/validation accuracy gap as a rough overfitting indicator.
        gap = train_acc - val_acc

        print(f"\n[Epoch {epoch}/{CONFIG['epochs']}]")
        print(f"  Train -> Loss: {train_loss:.4f}  Acc: {train_acc:.2f}%")
        print(f"  Val   -> Loss: {val_loss:.4f}  Acc: {val_acc:.2f}%")
        print(f"  Gap: {gap:.2f}% | LR: {optimizer.param_groups[0]['lr']:.2e}")
        if use_tta:
            print("  TTA: Enabled")

        if val_acc > best_val_acc:
            best_val_acc = val_acc
            best_epoch = epoch
            patience_counter = 0
            save_checkpoint(model, optimizer, epoch, val_acc, val_loss,
                          class_names, CONFIG)
            print(f"  ✓ New best! Val Acc: {val_acc:.2f}% (Gap: {gap:.2f}%)")
        else:
            patience_counter += 1
            print(f"  No improvement ({patience_counter}/{CONFIG['patience']})")

            if patience_counter >= CONFIG['patience']:
                print(f"\n{'='*60}")
                print("Early stopping triggered")
                print("="*60)
                break

    return best_val_acc, best_epoch

In [11]:
def save_checkpoint(model, optimizer, epoch, val_acc, val_loss, class_names, CONFIG):
    """
    Write the current training state to ``<CONFIG['save_dir']>/best_model.pth``.

    The target directory is created when missing, so the function does not
    depend on the caller having prepared it. An existing file is overwritten.

    Args:
        model: Module whose ``state_dict()`` is stored.
        optimizer: Optimizer whose ``state_dict()`` is stored.
        epoch: Epoch number recorded in the checkpoint.
        val_acc: Validation accuracy recorded in the checkpoint.
        val_loss: Validation loss recorded in the checkpoint.
        class_names: Class names recorded in the checkpoint.
        CONFIG: Configuration dict; ``save_dir`` is read from it and the whole
            dict is stored under the ``config`` key.
    """
    save_dir = CONFIG['save_dir']
    os.makedirs(save_dir, exist_ok=True)
    best_path = os.path.join(save_dir, "best_model.pth")

    checkpoint = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'val_acc': val_acc,
        'val_loss': val_loss,
        'class_names': class_names,
        'config': CONFIG
    }
    torch.save(checkpoint, best_path)

In [12]:
# ============================================================
#                  9. Main Pipeline
# ============================================================

def main():
    """
    End-to-end pipeline: configure, load and split the data, build the datasets,
    loaders, model and loss, train, then print the final summary.

    The dataset location defaults to the path below and can be overridden with
    the ``EMOTION_DATA_ROOT`` environment variable.
    """
    # ============= High-Accuracy Configuration =============
    CONFIG = {
        # Overridable so the notebook can run on another machine unchanged.
        'data_root': os.environ.get(
            'EMOTION_DATA_ROOT', 'C:/adam/AMIT_Diploma/grad_project/archive_v1'
        ),
        'model_name': 'resnet34',
        'img_size': 224,
        'batch_size': 64,
        'epochs': 60,
        'lr': 3e-4,  # Base LR; the OneCycle peak is derived from it
        'weight_decay': 2e-4,
        'dropout': 0.5,
        'val_size': 0.15,
        'random_seed': 42,
        'num_workers': 0,
        'patience': 12,
        'use_class_weights': True,
        'use_label_smoothing': True,
        'label_smooth_eps': 0.1,
        'use_mixup': True,
        'mixup_alpha': 0.4,
        'use_cutmix': True,
        'use_tta': True,
        'save_dir': './checkpoints',
        'cache_images': True,
    }

    # ============= Setup =============
    random.seed(CONFIG['random_seed'])
    np.random.seed(CONFIG['random_seed'])
    torch.manual_seed(CONFIG['random_seed'])
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(CONFIG['random_seed'])
        torch.backends.cudnn.benchmark = True

    os.makedirs(CONFIG['save_dir'], exist_ok=True)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")
    if torch.cuda.is_available():
        print(f"GPU: {torch.cuda.get_device_name(0)}")
    else:
        print("WARNING: CPU training - will take 6-10 hours")

    # ============= Load Data =============
    print("\n" + "="*60)
    print("Loading dataset...")
    print("="*60)

    image_paths, labels, class_names = gather_image_paths_and_labels(CONFIG['data_root'])
    num_classes = len(class_names)

    print("\nDataset Statistics:")
    print(f"  Total images: {len(image_paths)}")
    print(f"  Classes ({num_classes}): {class_names}")

    label_counts = Counter(labels)
    print("\n  Class distribution:")
    for cls_idx, cls_name in enumerate(class_names):
        print(f"    {cls_name}: {label_counts[cls_idx]}")

    # ============= Stratified Split =============
    print(f"\nCreating stratified split (val_size={CONFIG['val_size']})...")

    sss = StratifiedShuffleSplit(n_splits=1, test_size=CONFIG['val_size'],
                                random_state=CONFIG['random_seed'])
    indices = np.arange(len(labels))
    train_idx, val_idx = next(sss.split(indices, labels))

    train_paths = [image_paths[i] for i in train_idx]
    train_labels_list = [labels[i] for i in train_idx]
    val_paths = [image_paths[i] for i in val_idx]
    val_labels_list = [labels[i] for i in val_idx]

    print(f"  Train samples: {len(train_paths)}")
    print(f"  Val samples: {len(val_paths)}")

    # ============= Create Datasets =============
    # The validation dataset is built with transform=None, so the validation
    # transform returned here is not needed.
    train_tf, _ = get_augmentation_transforms()

    train_dataset = PreCachedImageDataset(
        train_paths, train_labels_list,
        transform=train_tf,
        cache_images=CONFIG['cache_images'],
        img_size=CONFIG['img_size'],
        is_train=True
    )
    val_dataset = PreCachedImageDataset(
        val_paths, val_labels_list,
        transform=None,
        cache_images=CONFIG['cache_images'],
        img_size=CONFIG['img_size'],
        is_train=False
    )

    pin_memory = torch.cuda.is_available()
    train_loader = DataLoader(
        train_dataset,
        batch_size=CONFIG['batch_size'],
        shuffle=True,
        num_workers=CONFIG['num_workers'],  # honour the configured value
        pin_memory=pin_memory,
    )

    val_loader = DataLoader(
        val_dataset,
        batch_size=CONFIG['batch_size'],
        shuffle=False,
        num_workers=CONFIG['num_workers'],
        pin_memory=pin_memory,
    )

    # ============= Build Enhanced Model =============
    print(f"\n{'='*60}")
    print("Building enhanced model...")
    print("="*60)

    model = build_model(
        model_name=CONFIG['model_name'],
        num_classes=num_classes,
        dropout_rate=CONFIG['dropout']
    ).to(device)

    print(f"  Model: {CONFIG['model_name']}")
    print(f"  Architecture: {CONFIG['model_name'].upper()} + 3-layer classifier")
    print(f"  Trainable params: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")

    # ============= Loss with Label Smoothing =============
    if CONFIG['use_class_weights']:
        train_counts = np.bincount(train_labels_list, minlength=num_classes)
        inverse_freq = 1.0 / (train_counts + 1e-6)
        # Normalise to a mean weight of 1 so the loss keeps its usual scale.
        # Normalising to the number of samples instead multiplies the loss (and
        # its gradients) by roughly len(train) / num_classes.
        class_weights = inverse_freq * (num_classes / inverse_freq.sum())
        class_weights = torch.tensor(class_weights, dtype=torch.float32).to(device)
        print(f"\n  Class weights: {class_weights.cpu().numpy()}")
    else:
        class_weights = None

    if CONFIG['use_label_smoothing']:
        criterion = LabelSmoothingCrossEntropy(
            epsilon=CONFIG['label_smooth_eps'],
            weight=class_weights
        )
        print(f"  Loss: Label Smoothing CE (eps={CONFIG['label_smooth_eps']})")
    else:
        criterion = nn.CrossEntropyLoss(weight=class_weights)
        print("  Loss: Standard Cross Entropy")

    # ============= Training =============
    best_val_acc, best_epoch = train_advanced(
        model, train_loader, val_loader, criterion, device, CONFIG, class_names
    )

    # ============= Final Results =============
    print(f"\n{'='*60}")
    print("Training Complete!")
    print("="*60)
    print(f"  Best Val Acc: {best_val_acc:.2f}% (Epoch {best_epoch})")
    print("  Target Range: 75-80%")
    if best_val_acc >= 75:
        print("  ✓ TARGET ACHIEVED!")
    else:
        print(f"  Gap to target: {75 - best_val_acc:.2f}%")
    print(f"  Model saved to: {CONFIG['save_dir']}/best_model.pth")

In [None]:
# Entry point: runs the whole pipeline (data loading, training, final report)
# when this cell/script is executed directly.
if __name__ == "__main__":
    main()

Using device: cuda
GPU: NVIDIA GeForce RTX 4060 Laptop GPU

Loading dataset...

Dataset Statistics:
  Total images: 35887
  Classes (7): ['angry', 'disgusted', 'fearful', 'happy', 'neutral', 'sad', 'surprised']

  Class distribution:
    angry: 4953
    disgusted: 547
    fearful: 5121
    happy: 8989
    neutral: 6198
    sad: 6077
    surprised: 4002

Creating stratified split (val_size=0.15)...
  Train samples: 30503
  Val samples: 5384
Pre-loading 30503 images...


Caching:   8%|▊         | 2321/30503 [00:26<03:08, 149.13it/s]

In [None]:
# Diagnostic: whether this kernel's PyTorch build can use CUDA. When this is
# False, main() selects the CPU device and mixed precision stays disabled.
torch.cuda.is_available()

False

In [None]:
!nvidia-smi

Sun Nov 30 17:03:01 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 581.29                 Driver Version: 581.29         CUDA Version: 13.0     |
+-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4060 ...  WDDM  |   00000000:01:00.0 Off |                  N/A |
| N/A   36C    P0              9W /   80W |       0MiB /   8188MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+

+----------------------------------------------

In [None]:
# Diagnostic: show the installed PyTorch build. A "+cpu" suffix in the version
# string denotes a CPU-only wheel, for which torch.cuda.is_available() is False
# even when an NVIDIA GPU and driver are present.
import torch
print(torch.__version__)


2.9.1+cpu
