In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import random
from PIL import Image
import torch
from torch.utils.data import Dataset
import torchvision.transforms as T

In [9]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Let's load the different bird species from the `class_names.npy` file and then the attributes from `attributes.npy` which has for every class 312 features that are explained by the file `attributes.txt`.

In [10]:
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed_all(42)

In [11]:
glob_path = "drive/MyDrive/"

In [12]:
bird_classes = np.load(glob_path + "data/class_names.npy", allow_pickle=True).item()

In [13]:
attributes = np.load(glob_path + 'data/attributes.npy')
attributes.shape

(200, 312)

In [14]:
with open(glob_path + "data/attributes.txt", "r") as f:
    attribute_names = [line.strip() for line in f.readlines()]

attribute_names[:5]

['1 has_bill_shape::curved_(up_or_down)',
 '2 has_bill_shape::dagger',
 '3 has_bill_shape::hooked',
 '4 has_bill_shape::needle',
 '5 has_bill_shape::hooked_seabird']

Unify the attributes files to map for every bird species they're features

In [15]:
class_attributes = {}

for class_id in range(attributes.shape[0]):
    class_attributes[class_id + 1] = {
        attribute_names[i]: attributes[class_id, i] for i in range(len(attribute_names))
    }

Create a data frame `birds_df` with the class_id and the 312 attrbiutes of each bird class. Then merge it with the class name of each bird.

In [16]:
birds_df = pd.DataFrame.from_dict(class_attributes, orient="index")
birds_df.index.name = "class_id"
birds_df.reset_index(inplace=True)
birds_df.head()

Unnamed: 0,class_id,1 has_bill_shape::curved_(up_or_down),2 has_bill_shape::dagger,3 has_bill_shape::hooked,4 has_bill_shape::needle,5 has_bill_shape::hooked_seabird,6 has_bill_shape::spatulate,7 has_bill_shape::all-purpose,8 has_bill_shape::cone,9 has_bill_shape::specialized,...,303 has_crown_color::pink,304 has_crown_color::orange,305 has_crown_color::black,306 has_crown_color::white,307 has_crown_color::red,308 has_crown_color::buff,309 has_wing_pattern::solid,310 has_wing_pattern::spotted,311 has_wing_pattern::striped,312 has_wing_pattern::multi-colored
0,1,0.010638,0.010638,0.007092,0.003546,0.138299,0.065603,0.0,0.005319,0.0,...,0.0,0.005439,0.005439,0.228446,0.0,0.0,0.18602,0.009186,0.025262,0.020669
1,2,0.0,0.011332,0.009444,0.0,0.202095,0.041552,0.01511,0.005666,0.0,...,0.006291,0.0,0.111144,0.008388,0.0,0.046135,0.202572,0.002665,0.021323,0.058639
2,3,0.0,0.0,0.007425,0.0,0.002475,0.0,0.0,0.074247,0.14602,...,0.0,0.0,0.190411,0.012555,0.0,0.010462,0.203609,0.0,0.008853,0.017705
3,4,0.0,0.0,0.003861,0.0,0.003861,0.013514,0.005792,0.07336,0.138998,...,0.004885,0.0,0.190531,0.0,0.0,0.0,0.15275,0.00684,0.036478,0.043317
4,5,0.0,0.035088,0.0,0.0,0.0,0.0,0.102458,0.070177,0.0,...,0.0,0.0,0.204036,0.002458,0.002458,0.0,0.03164,0.002751,0.015132,0.1582


In [17]:
classes = pd.DataFrame.from_dict(bird_classes, orient="index").reset_index()
classes.columns = ["class", "id"]
classes.head()

Unnamed: 0,class,id
0,001.Black_footed_Albatross,1
1,002.Laysan_Albatross,2
2,003.Sooty_Albatross,3
3,004.Groove_billed_Ani,4
4,005.Crested_Auklet,5


In [18]:
birds_df = birds_df.merge(classes, left_on="class_id", right_on="id")
birds_df = birds_df.drop(columns=["id"])

# Reorder columns to have class_id and class first
cols = ["class_id", "class"] + [c for c in birds_df.columns if c not in ["class_id", "class"]]
birds_df = birds_df[cols]
birds_df.head()

Unnamed: 0,class_id,class,1 has_bill_shape::curved_(up_or_down),2 has_bill_shape::dagger,3 has_bill_shape::hooked,4 has_bill_shape::needle,5 has_bill_shape::hooked_seabird,6 has_bill_shape::spatulate,7 has_bill_shape::all-purpose,8 has_bill_shape::cone,...,303 has_crown_color::pink,304 has_crown_color::orange,305 has_crown_color::black,306 has_crown_color::white,307 has_crown_color::red,308 has_crown_color::buff,309 has_wing_pattern::solid,310 has_wing_pattern::spotted,311 has_wing_pattern::striped,312 has_wing_pattern::multi-colored
0,1,001.Black_footed_Albatross,0.010638,0.010638,0.007092,0.003546,0.138299,0.065603,0.0,0.005319,...,0.0,0.005439,0.005439,0.228446,0.0,0.0,0.18602,0.009186,0.025262,0.020669
1,2,002.Laysan_Albatross,0.0,0.011332,0.009444,0.0,0.202095,0.041552,0.01511,0.005666,...,0.006291,0.0,0.111144,0.008388,0.0,0.046135,0.202572,0.002665,0.021323,0.058639
2,3,003.Sooty_Albatross,0.0,0.0,0.007425,0.0,0.002475,0.0,0.0,0.074247,...,0.0,0.0,0.190411,0.012555,0.0,0.010462,0.203609,0.0,0.008853,0.017705
3,4,004.Groove_billed_Ani,0.0,0.0,0.003861,0.0,0.003861,0.013514,0.005792,0.07336,...,0.004885,0.0,0.190531,0.0,0.0,0.0,0.15275,0.00684,0.036478,0.043317
4,5,005.Crested_Auklet,0.0,0.035088,0.0,0.0,0.0,0.0,0.102458,0.070177,...,0.0,0.0,0.204036,0.002458,0.002458,0.0,0.03164,0.002751,0.015132,0.1582


In [19]:
images_df = pd.read_csv(glob_path + "data/train_images.csv")
images_df['image_path'] = glob_path + 'data' + images_df['image_path']
images_df.head()

Unnamed: 0,image_path,label
0,drive/MyDrive/data/train_images/1.jpg,1
1,drive/MyDrive/data/train_images/2.jpg,1
2,drive/MyDrive/data/train_images/3.jpg,1
3,drive/MyDrive/data/train_images/4.jpg,1
4,drive/MyDrive/data/train_images/5.jpg,1


### Load training metadata and create train/validation split

In this step, we load the `train_images.csv` file that contains the image paths and labels.  
Then we create a stratified train/validation split so that all 200 classes are represented proportionally in both sets.  
This split will be used to train the CNN on `train_images` and evaluate it on `val_images`.


In [20]:
train_images, val_images = train_test_split(
    images_df,
    test_size=0.2,
    stratify=images_df["label"],
    random_state=42
)

len(train_images), len(val_images)

(3140, 786)

In [21]:
train_images.head()

Unnamed: 0,image_path,label
1249,drive/MyDrive/data/train_images/1250.jpg,42
3882,drive/MyDrive/data/train_images/3883.jpg,193
686,drive/MyDrive/data/train_images/687.jpg,23
1452,drive/MyDrive/data/train_images/1453.jpg,49
2357,drive/MyDrive/data/train_images/2358.jpg,85


### Define Image Transformations for Training and Validation

Before training a CNN, all images need to be preprocessed in a consistent way.  
Here, we define two sets of transformations:

**Training transforms**
- **Resize to 224×224:** ResNet models expect fixed-size input.
- **Random horizontal flip:** A simple data augmentation step to help the model generalize.
- **Convert to tensor:** Converts the image to a PyTorch tensor with values in `[0,1]`.
- **Normalize with ImageNet statistics:** Since ResNet18 was pretrained on ImageNet, the same normalization must be applied for best performance.

**Validation transforms**
- Same as above but **without augmentation**, to ensure a stable and deterministic evaluation.

These transforms prepare raw images so they can be passed into the CNN during training and validation.


In [22]:


# image transforms (basic baseline)
train_transform = T.Compose([
    T.Resize((224, 224)),
    T.RandomResizedCrop(224, scale=(0.7, 1.0)),
    T.RandomHorizontalFlip(p=0.5),
    T.RandomRotation(20),  # Increased from 15
    T.RandomAffine(
        degrees=0,
        translate=(0.15, 0.15),
        scale=(0.85, 1.15)
    ),
    T.ColorJitter(
        brightness=0.4,
        contrast=0.4,
        saturation=0.4,
        hue=0.15
    ),
    T.RandomGrayscale(p=0.1),
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225]),

])

val_transform = T.Compose([
    T.Resize((224, 224)),
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225]),
])

### Create a custom PyTorch Dataset for bird images

Here we define a `BirdsDataset` class that:
- Reads the image path and label from the DataFrame rows.
- Loads each image with PIL.
- Applies the appropriate transform (train or validation).
- Converts labels from 1–200 to 0–199 so they work with `nn.CrossEntropyLoss`.

This Dataset will be used together with a DataLoader to efficiently feed batches to the CNN.

In [23]:
class BirdsDataset(Dataset):
    def __init__(self, df, attributes_array, transform=None, use_attributes=False):
        self.df = df.reset_index(drop=True)
        self.transform = transform
        #self.processor = processor
        self.use_attributes = use_attributes
        self.attributes = torch.FloatTensor(attributes_array)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        img_path = row["image_path"]
        label = int(row["label"]) - 1

        img = Image.open(img_path).convert("RGB")
        #inputs = self.processor(images=img, return_tensors="pt")
        #pixel_values = inputs["pixel_values"].squeeze(0)
        pixel_values = self.transform(img)
        attr_vector = self.attributes[label]
        #if self.transform:
        #    img = self.transform(img)

        return pixel_values, label, attr_vector

### Wrap Datasets in DataLoaders

Now we create `DataLoader` objects for the training and validation sets.  
DataLoaders handle:
- Shuffling (for training),
- Batching,
- Parallel loading of images (with `num_workers`).

These will be used directly in the training and evaluation loops.


In [24]:
from torch.utils.data import DataLoader


train_dataset = BirdsDataset(train_images, transform=train_transform, attributes_array=attributes)
val_dataset   = BirdsDataset(val_images,   transform=val_transform, attributes_array=attributes)

batch_size = 64

train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=4,
    pin_memory=True
)

val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=4,
    pin_memory=True
)

### Define Device (GPU or CPU)

In [25]:
import torch.nn as nn

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


### Building a simple CNN from scratch (baseline CNN)
Before comparing with pretrained models like ResNet18, it's useful to build a classic convolutional neural network from scratch.  
This gives a "true baseline" — a model that only learns from the bird training images, without any prior ImageNet knowledge.

### CNN Architecture

A custom CNN with ~7M parameters featuring four convolutional blocks (64→128→256→512 channels, 2 convs each) followed by global average pooling. The backbone extracts 512-dimensional features which feed into two heads: (1) a cosine similarity classifier that compares normalized features against learned class prototypes for species prediction, and (2) an attribute prediction head that outputs 312 semantic attributes. This multi-task design forces the network to learn interpretable visual features while the cosine head provides better metric learning for fine-grained classification.


In [26]:
import torch.nn.functional as F
class ImprovedCNNWithCosineAndAttributes(nn.Module):
    def __init__(self, num_classes=200, num_attributes=312,
                 feat_dim=512, scale=10.0):
        super().__init__()

        self.backbone = nn.Sequential(
            # Block 1: 3 → 64
            nn.Conv2d(3, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            nn.Conv2d(64, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2),  # 224 → 112

            # Block 2: 64 → 128
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(inplace=True),
            nn.Conv2d(128, 128, kernel_size=3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2),  # 112 → 56

            # Block 3: 128 → 256
            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2),  # 56 → 28

            # Block 4: 256 → 512
            nn.Conv2d(256, 512, kernel_size=3, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2),  # 28 → 14

            # Global pooling
            nn.AdaptiveAvgPool2d((4, 4))
        )

        # Feature projection
        self.fc_features = nn.Sequential(
            nn.Dropout(0.5),  # Standard dropout, nothing fancy
            nn.Linear(512 * 4 * 4, feat_dim),
            nn.ReLU(inplace=True)
        )

        # Cosine similarity classification head
        self.cosine_head = CosineClassifier(
            num_classes=num_classes,
            feat_dim=feat_dim,
            scale=scale
        )

        # Attribute prediction head
        self.attr_head = nn.Sequential(
            nn.Dropout(0.5),  # Standard dropout
            nn.Linear(feat_dim, 512),
            nn.ReLU(inplace=True),
            nn.Linear(512, num_attributes)
        )

    def forward(self, x):
        x = self.backbone(x)
        x = x.flatten(1)
        feats = self.fc_features(x)

        class_logits = self.cosine_head(feats)
        attr_logits = self.attr_head(feats)

        return class_logits, attr_logits


class CosineClassifier(nn.Module):
    def __init__(self, num_classes, feat_dim, scale=10.0):
        super().__init__()
        self.weight = nn.Parameter(torch.randn(num_classes, feat_dim))
        nn.init.xavier_normal_(self.weight)
        self.scale = scale

    def forward(self, feats):
        # Normalize features and weights
        feats_n = F.normalize(feats, dim=1)
        weight_n = F.normalize(self.weight, dim=1)

        # Cosine similarity scaled
        logits = self.scale * feats_n @ weight_n.t()
        return logits

train_one_epoch_multitask() <br>
Trains the model for one epoch. For each batch, performs forward pass to get classification and attribute predictions, computes two losses (classification + attribute), combines them with weighted sum (α=0.8), and updates weights via backpropagation. Returns average loss and accuracy.

In [27]:
from tqdm import tqdm

def train_one_epoch_multitask(model, loader, optimizer, criterion_class,
                              criterion_attr, full_attributes, device, alpha=0.7):
    """
    Train multi-task model for one epoch

    Args:
        model: Your CNNMultiTask or ImprovedCNNMultiTask model
        loader: Training DataLoader
        optimizer: Optimizer (e.g., Adam)
        criterion_class: Loss for classification (e.g., CrossEntropyLoss)
        criterion_attr: Loss for attributes (e.g., MSELoss)
        full_attributes: numpy array of shape (200, 312) - full attributes matrix
        device: torch.device (cuda or cpu)
        alpha: weight for classification loss (1-alpha for attribute loss)

    Returns:
        metrics: dict with 'total_loss', 'class_loss', 'attr_loss', 'accuracy'
    """
    model.train()
    attributes_gpu = torch.FloatTensor(full_attributes).to(device)

    # Initialize metrics tracking
    total_loss = 0.0
    total_class_loss = 0.0
    total_attr_loss = 0.0
    class_correct = 0
    total = 0

    # Create progress bar
    progress_bar = tqdm(loader, desc="Training", leave=False)

    for batch in progress_bar:
        # Handle both 2-value and 3-value dataset returns
        if len(batch) == 2:
            images, labels = batch
        else:  # len(batch) == 3
            images, labels, _ = batch  # Ignore third value if present
        # Check device BEFORE movin

        images = images.to(device, non_blocking=True)
        labels = labels.to(device, non_blocking=True)




        # Get ground truth attributes for this batch
        # labels is (batch_size,) with values [0, 199]
        # full_attributes[labels.cpu().numpy()] returns (batch_size, 312)
        batch_attrs = attributes_gpu[labels]

        # Zero gradients
        optimizer.zero_grad()

        # Forward pass
        class_logits, attr_logits = model(images)

        # Calculate two losses
        loss_class = criterion_class(class_logits, labels)
        loss_attr = criterion_attr(attr_logits, batch_attrs)

        # Combined loss with weighting
        loss = alpha * loss_class + (1 - alpha) * loss_attr

        # Backward pass
        loss.backward()
        optimizer.step()

        # Track metrics
        batch_size = images.size(0)
        total_loss += loss.item() * batch_size
        total_class_loss += loss_class.item() * batch_size
        total_attr_loss += loss_attr.item() * batch_size

        # Calculate accuracy
        _, preds = class_logits.max(1)
        class_correct += (preds == labels).sum().item()
        total += batch_size

        # Update progress bar with current metrics
        current_acc = class_correct / total
        progress_bar.set_postfix({
            'loss': f'{loss.item():.3f}',
            'acc': f'{current_acc:.3f}'
        })

    # Calculate average metrics over entire epoch
    metrics = {
        'total_loss': total_loss / total,
        'class_loss': total_class_loss / total,
        'attr_loss': total_attr_loss / total,
        'accuracy': class_correct / total
    }

    return metrics

evaluate_multitask() <br>
Evaluates the model on validation data without updating weights. Computes the same metrics as training (classification loss, attribute loss, accuracy) to monitor generalization performance.

In [28]:
def evaluate_multitask(model, loader, criterion_class, criterion_attr,
                       full_attributes, device, alpha=0.7):
    """
    Evaluate multi-task model on validation/test set

    Args:
        Same as train_one_epoch_multitask

    Returns:
        metrics: dict with losses, accuracy, and predictions for analysis
    """
    model.eval()
    attributes_gpu = torch.FloatTensor(full_attributes).to(device)
    # Initialize metrics tracking
    total_loss = 0.0
    total_class_loss = 0.0
    total_attr_loss = 0.0
    class_correct = 0
    total = 0

    # Store predictions for confusion matrix later
    all_preds = []
    all_labels = []

    with torch.no_grad():  # No gradients needed for evaluation
        for batch in tqdm(loader, desc="Validation", leave=False):
            # Handle both 2-value and 3-value dataset returns
            if len(batch) == 2:
                images, labels = batch
            else:
                images, labels, _ = batch



            images = images.to(device)
            labels = labels.to(device)

            # Check device AFTER moving
            # Get ground truth attributes
            batch_attrs = attributes_gpu[labels]

            # Forward pass
            class_logits, attr_logits = model(images)

            # Calculate losses
            loss_class = criterion_class(class_logits, labels)
            loss_attr = criterion_attr(attr_logits, batch_attrs)
            loss = alpha * loss_class + (1 - alpha) * loss_attr

            # Track metrics
            batch_size = images.size(0)
            total_loss += loss.item() * batch_size
            total_class_loss += loss_class.item() * batch_size
            total_attr_loss += loss_attr.item() * batch_size

            # Calculate accuracy
            _, preds = class_logits.max(1)
            class_correct += (preds == labels).sum().item()
            total += batch_size

            # Store predictions and labels for later analysis
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Calculate average metrics
    metrics = {
        'total_loss': total_loss / total,
        'class_loss': total_class_loss / total,
        'attr_loss': total_attr_loss / total,
        'accuracy': class_correct / total,
        'predictions': all_preds,  # For confusion matrix
        'labels': all_labels        # For confusion matrix
    }

    return metrics

Runs for 80 epochs. Each epoch trains on all batches, evaluates on validation set, updates learning rate scheduler, and saves the best model checkpoint when validation accuracy improves.

In [30]:
num_epochs = 80
alpha = 0.8
best_val_acc = 0.0
attributes_full = np.load(glob_path + 'data/attributes.npy')

model = ImprovedCNNWithCosineAndAttributes(
    num_classes=200,
    num_attributes=312,
    feat_dim=512,
    scale=10.0
).to(device)

num_params = sum(p.numel() for p in model.parameters())
print(f"Model parameters: {num_params:,}")
print(f"Expected: 7-10M parameters")

# Loss functions
criterion_class = nn.CrossEntropyLoss(label_smoothing=0.1)
criterion_attr = nn.MSELoss()

# Optimizer with weight decay
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=1e-3,  # Higher learning rate
    weight_decay=1e-4
)

# Better scheduler
scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
    optimizer,
    T_0=10,  # Restart every 10 epochs
    T_mult=2,  # Double the period after each restart
    eta_min=1e-6
)
# Training config
CONFIG = {
    'num_epochs': 80,  # You trained for 80
    'alpha': 0.8,      # 80% classification, 20% attributes
    'batch_size': 64,  # Assuming this
    'patience': 15,    # For early stopping
}

print("\n" + "="*70)
print("TRAINING CONFIGURATION (That got 18%)")
print("="*70)
print(f"Model: LargerCNNWithCosineAndAttributes")
print(f"Parameters: {num_params:,}")
print(f"Learning Rate: 1e-3")
print(f"Epochs: {CONFIG['num_epochs']}")
print(f"Alpha: {CONFIG['alpha']}")
print(f"Label Smoothing: 0.1")
print(f"Weight Decay: 1e-4")
print("="*70 + "\n")

# ============================================================================
# TRAINING LOOP WITH EARLY STOPPING
# ============================================================================

history = {
    'train_loss': [], 'train_acc': [], 'val_loss': [], 'val_acc': [],
    'train_class_loss': [], 'train_attr_loss': [],
    'val_class_loss': [], 'val_attr_loss': []
}

best_val_acc = 0.0
best_epoch = 0
no_improve_count = 0

for epoch in range(1, CONFIG['num_epochs'] + 1):
    print(f"\n{'='*70}")
    print(f"Epoch {epoch}/{CONFIG['num_epochs']}")
    print(f"Learning Rate: {optimizer.param_groups[0]['lr']:.6f}")
    print(f"{'='*70}")

    # Train one epoch
    train_metrics = train_one_epoch_multitask(
        model, train_loader, optimizer, criterion_class,
        criterion_attr, attributes_full, device, CONFIG['alpha']
    )

    # Validate
    val_metrics = evaluate_multitask(
        model, val_loader, criterion_class, criterion_attr,
        attributes_full, device, CONFIG['alpha']
    )

    # Update scheduler
    scheduler.step()

    # Store history
    history['train_loss'].append(train_metrics['total_loss'])
    history['train_acc'].append(train_metrics['accuracy'])
    history['val_loss'].append(val_metrics['total_loss'])
    history['val_acc'].append(val_metrics['accuracy'])
    history['train_class_loss'].append(train_metrics['class_loss'])
    history['train_attr_loss'].append(train_metrics['attr_loss'])
    history['val_class_loss'].append(val_metrics['class_loss'])
    history['val_attr_loss'].append(val_metrics['attr_loss'])

    # Print metrics
    print(f"\nTrain - Loss: {train_metrics['total_loss']:.4f} | "
          f"Acc: {train_metrics['accuracy']:.2f}% | "
          f"Class Loss: {train_metrics['class_loss']:.4f} | "
          f"Attr Loss: {train_metrics['attr_loss']:.4f}")

    print(f"Val   - Loss: {val_metrics['total_loss']:.4f} | "
          f"Acc: {val_metrics['accuracy']:.2f}% | "
          f"Class Loss: {val_metrics['class_loss']:.4f} | "
          f"Attr Loss: {val_metrics['attr_loss']:.4f}")

    # Save best model
    if val_metrics['accuracy'] > best_val_acc:
        best_val_acc = val_metrics['accuracy']
        best_epoch = epoch
        no_improve_count = 0

        save_path = 'checkpoints/best_model_18percent.pt'
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
            'val_acc': best_val_acc,
            'config': CONFIG,
            'history': history
        }, save_path)

        print(f"✓ New best model saved! Val Acc: {best_val_acc:.2f}%")
    else:
        no_improve_count += 1

    print(f"Best Val Acc: {best_val_acc:.2f}% (Epoch {best_epoch})")
    print(f"No improvement: {no_improve_count}/{CONFIG['patience']} epochs")

    # Early stopping
    if no_improve_count >= CONFIG['patience']:
        print(f"\n{'='*70}")
        print(f"EARLY STOPPING at epoch {epoch}")
        print(f"Best validation accuracy: {best_val_acc:.2f}% at epoch {best_epoch}")
        print(f"{'='*70}")
        break

print("\n" + "="*70)
print("TRAINING COMPLETE!")
print("="*70)
print(f"Best Val Acc: {best_val_acc:.2f}% at epoch {best_epoch}")
print(f"Final Train Acc: {history['train_acc'][-1]:.2f}%")
print(f"Final Val Acc: {history['val_acc'][-1]:.2f}%")
print("="*70)

Model parameters: 9,409,144
Expected: 7-10M parameters

TRAINING CONFIGURATION (That got 18%)
Model: LargerCNNWithCosineAndAttributes
Parameters: 9,409,144
Learning Rate: 1e-3
Epochs: 80
Alpha: 0.8
Label Smoothing: 0.1
Weight Decay: 1e-4


Epoch 1/80
Learning Rate: 0.001000





Train - Loss: 4.2716 | Acc: 0.01% | Class Loss: 5.3310 | Attr Loss: 0.0342
Val   - Loss: 4.1576 | Acc: 0.01% | Class Loss: 5.1963 | Attr Loss: 0.0025
✓ New best model saved! Val Acc: 0.01%
Best Val Acc: 0.01% (Epoch 1)
No improvement: 0/15 epochs

Epoch 2/80
Learning Rate: 0.000976





Train - Loss: 4.1706 | Acc: 0.01% | Class Loss: 5.2127 | Attr Loss: 0.0023
Val   - Loss: 4.1023 | Acc: 0.03% | Class Loss: 5.1274 | Attr Loss: 0.0017
✓ New best model saved! Val Acc: 0.03%
Best Val Acc: 0.03% (Epoch 2)
No improvement: 0/15 epochs

Epoch 3/80
Learning Rate: 0.000905





Train - Loss: 4.1421 | Acc: 0.01% | Class Loss: 5.1771 | Attr Loss: 0.0018
Val   - Loss: 4.0642 | Acc: 0.02% | Class Loss: 5.0799 | Attr Loss: 0.0015
Best Val Acc: 0.03% (Epoch 2)
No improvement: 1/15 epochs

Epoch 4/80
Learning Rate: 0.000794





Train - Loss: 4.0969 | Acc: 0.02% | Class Loss: 5.1206 | Attr Loss: 0.0017
Val   - Loss: 4.0394 | Acc: 0.02% | Class Loss: 5.0489 | Attr Loss: 0.0014
Best Val Acc: 0.03% (Epoch 2)
No improvement: 2/15 epochs

Epoch 5/80
Learning Rate: 0.000655





Train - Loss: 4.0554 | Acc: 0.02% | Class Loss: 5.0688 | Attr Loss: 0.0016
Val   - Loss: 3.9810 | Acc: 0.03% | Class Loss: 4.9759 | Attr Loss: 0.0014
✓ New best model saved! Val Acc: 0.03%
Best Val Acc: 0.03% (Epoch 5)
No improvement: 0/15 epochs

Epoch 6/80
Learning Rate: 0.000501





Train - Loss: 4.0164 | Acc: 0.02% | Class Loss: 5.0201 | Attr Loss: 0.0015
Val   - Loss: 3.9413 | Acc: 0.04% | Class Loss: 4.9263 | Attr Loss: 0.0014
✓ New best model saved! Val Acc: 0.04%
Best Val Acc: 0.04% (Epoch 6)
No improvement: 0/15 epochs

Epoch 7/80
Learning Rate: 0.000346





Train - Loss: 3.9937 | Acc: 0.03% | Class Loss: 4.9917 | Attr Loss: 0.0015
Val   - Loss: 3.9196 | Acc: 0.04% | Class Loss: 4.8992 | Attr Loss: 0.0014
✓ New best model saved! Val Acc: 0.04%
Best Val Acc: 0.04% (Epoch 7)
No improvement: 0/15 epochs

Epoch 8/80
Learning Rate: 0.000207





Train - Loss: 3.9539 | Acc: 0.04% | Class Loss: 4.9420 | Attr Loss: 0.0014
Val   - Loss: 3.8918 | Acc: 0.05% | Class Loss: 4.8644 | Attr Loss: 0.0013
✓ New best model saved! Val Acc: 0.05%
Best Val Acc: 0.05% (Epoch 8)
No improvement: 0/15 epochs

Epoch 9/80
Learning Rate: 0.000096





Train - Loss: 3.9324 | Acc: 0.04% | Class Loss: 4.9151 | Attr Loss: 0.0015
Val   - Loss: 3.8828 | Acc: 0.05% | Class Loss: 4.8532 | Attr Loss: 0.0013
Best Val Acc: 0.05% (Epoch 8)
No improvement: 1/15 epochs

Epoch 10/80
Learning Rate: 0.000025





Train - Loss: 3.9222 | Acc: 0.04% | Class Loss: 4.9024 | Attr Loss: 0.0014
Val   - Loss: 3.8831 | Acc: 0.05% | Class Loss: 4.8536 | Attr Loss: 0.0013
Best Val Acc: 0.05% (Epoch 8)
No improvement: 2/15 epochs

Epoch 11/80
Learning Rate: 0.001000





Train - Loss: 3.9924 | Acc: 0.03% | Class Loss: 4.9902 | Attr Loss: 0.0015
Val   - Loss: 3.9701 | Acc: 0.05% | Class Loss: 4.9623 | Attr Loss: 0.0013
Best Val Acc: 0.05% (Epoch 8)
No improvement: 3/15 epochs

Epoch 12/80
Learning Rate: 0.000994





Train - Loss: 3.9899 | Acc: 0.02% | Class Loss: 4.9870 | Attr Loss: 0.0015
Val   - Loss: 3.8623 | Acc: 0.04% | Class Loss: 4.8275 | Attr Loss: 0.0013
Best Val Acc: 0.05% (Epoch 8)
No improvement: 4/15 epochs

Epoch 13/80
Learning Rate: 0.000976





Train - Loss: 3.9105 | Acc: 0.03% | Class Loss: 4.8878 | Attr Loss: 0.0015
Val   - Loss: 3.8700 | Acc: 0.06% | Class Loss: 4.8372 | Attr Loss: 0.0013
✓ New best model saved! Val Acc: 0.06%
Best Val Acc: 0.06% (Epoch 13)
No improvement: 0/15 epochs

Epoch 14/80
Learning Rate: 0.000946





Train - Loss: 3.8853 | Acc: 0.04% | Class Loss: 4.8562 | Attr Loss: 0.0015
Val   - Loss: 3.8676 | Acc: 0.06% | Class Loss: 4.8341 | Attr Loss: 0.0013
Best Val Acc: 0.06% (Epoch 13)
No improvement: 1/15 epochs

Epoch 15/80
Learning Rate: 0.000905





Train - Loss: 3.8715 | Acc: 0.04% | Class Loss: 4.8390 | Attr Loss: 0.0014
Val   - Loss: 3.7725 | Acc: 0.06% | Class Loss: 4.7152 | Attr Loss: 0.0013
✓ New best model saved! Val Acc: 0.06%
Best Val Acc: 0.06% (Epoch 15)
No improvement: 0/15 epochs

Epoch 16/80
Learning Rate: 0.000854





Train - Loss: 3.8224 | Acc: 0.04% | Class Loss: 4.7776 | Attr Loss: 0.0014
Val   - Loss: 3.7768 | Acc: 0.06% | Class Loss: 4.7207 | Attr Loss: 0.0013
Best Val Acc: 0.06% (Epoch 15)
No improvement: 1/15 epochs

Epoch 17/80
Learning Rate: 0.000794





Train - Loss: 3.7978 | Acc: 0.05% | Class Loss: 4.7469 | Attr Loss: 0.0014
Val   - Loss: 3.7513 | Acc: 0.07% | Class Loss: 4.6887 | Attr Loss: 0.0013
✓ New best model saved! Val Acc: 0.07%
Best Val Acc: 0.07% (Epoch 17)
No improvement: 0/15 epochs

Epoch 18/80
Learning Rate: 0.000727





Train - Loss: 3.7619 | Acc: 0.06% | Class Loss: 4.7021 | Attr Loss: 0.0014
Val   - Loss: 3.7303 | Acc: 0.08% | Class Loss: 4.6626 | Attr Loss: 0.0013
✓ New best model saved! Val Acc: 0.08%
Best Val Acc: 0.08% (Epoch 18)
No improvement: 0/15 epochs

Epoch 19/80
Learning Rate: 0.000655





Train - Loss: 3.7297 | Acc: 0.06% | Class Loss: 4.6618 | Attr Loss: 0.0014
Val   - Loss: 3.7550 | Acc: 0.08% | Class Loss: 4.6935 | Attr Loss: 0.0013
Best Val Acc: 0.08% (Epoch 18)
No improvement: 1/15 epochs

Epoch 20/80
Learning Rate: 0.000579





Train - Loss: 3.6943 | Acc: 0.07% | Class Loss: 4.6175 | Attr Loss: 0.0014
Val   - Loss: 3.6985 | Acc: 0.08% | Class Loss: 4.6228 | Attr Loss: 0.0013
✓ New best model saved! Val Acc: 0.08%
Best Val Acc: 0.08% (Epoch 20)
No improvement: 0/15 epochs

Epoch 21/80
Learning Rate: 0.000501





Train - Loss: 3.6669 | Acc: 0.06% | Class Loss: 4.5833 | Attr Loss: 0.0014
Val   - Loss: 3.6804 | Acc: 0.09% | Class Loss: 4.6001 | Attr Loss: 0.0013
✓ New best model saved! Val Acc: 0.09%
Best Val Acc: 0.09% (Epoch 21)
No improvement: 0/15 epochs

Epoch 22/80
Learning Rate: 0.000422





Train - Loss: 3.6414 | Acc: 0.08% | Class Loss: 4.5514 | Attr Loss: 0.0014
Val   - Loss: 3.6929 | Acc: 0.09% | Class Loss: 4.6158 | Attr Loss: 0.0013
Best Val Acc: 0.09% (Epoch 21)
No improvement: 1/15 epochs

Epoch 23/80
Learning Rate: 0.000346





Train - Loss: 3.6125 | Acc: 0.08% | Class Loss: 4.5152 | Attr Loss: 0.0014
Val   - Loss: 3.6265 | Acc: 0.11% | Class Loss: 4.5328 | Attr Loss: 0.0013
✓ New best model saved! Val Acc: 0.11%
Best Val Acc: 0.11% (Epoch 23)
No improvement: 0/15 epochs

Epoch 24/80
Learning Rate: 0.000274





Train - Loss: 3.5927 | Acc: 0.09% | Class Loss: 4.4906 | Attr Loss: 0.0014
Val   - Loss: 3.6120 | Acc: 0.10% | Class Loss: 4.5147 | Attr Loss: 0.0013
Best Val Acc: 0.11% (Epoch 23)
No improvement: 1/15 epochs

Epoch 25/80
Learning Rate: 0.000207





Train - Loss: 3.5394 | Acc: 0.10% | Class Loss: 4.4239 | Attr Loss: 0.0014
Val   - Loss: 3.6116 | Acc: 0.11% | Class Loss: 4.5141 | Attr Loss: 0.0013
✓ New best model saved! Val Acc: 0.11%
Best Val Acc: 0.11% (Epoch 25)
No improvement: 0/15 epochs

Epoch 26/80
Learning Rate: 0.000147





Train - Loss: 3.5524 | Acc: 0.09% | Class Loss: 4.4402 | Attr Loss: 0.0014
Val   - Loss: 3.6024 | Acc: 0.10% | Class Loss: 4.5027 | Attr Loss: 0.0013
Best Val Acc: 0.11% (Epoch 25)
No improvement: 1/15 epochs

Epoch 27/80
Learning Rate: 0.000096





Train - Loss: 3.5100 | Acc: 0.11% | Class Loss: 4.3872 | Attr Loss: 0.0014
Val   - Loss: 3.5937 | Acc: 0.11% | Class Loss: 4.4918 | Attr Loss: 0.0013
Best Val Acc: 0.11% (Epoch 25)
No improvement: 2/15 epochs

Epoch 28/80
Learning Rate: 0.000055





Train - Loss: 3.5014 | Acc: 0.11% | Class Loss: 4.3765 | Attr Loss: 0.0014
Val   - Loss: 3.6079 | Acc: 0.10% | Class Loss: 4.5095 | Attr Loss: 0.0013
Best Val Acc: 0.11% (Epoch 25)
No improvement: 3/15 epochs

Epoch 29/80
Learning Rate: 0.000025





Train - Loss: 3.5011 | Acc: 0.11% | Class Loss: 4.3760 | Attr Loss: 0.0014
Val   - Loss: 3.5893 | Acc: 0.11% | Class Loss: 4.4863 | Attr Loss: 0.0013
Best Val Acc: 0.11% (Epoch 25)
No improvement: 4/15 epochs

Epoch 30/80
Learning Rate: 0.000007





Train - Loss: 3.4988 | Acc: 0.10% | Class Loss: 4.3731 | Attr Loss: 0.0014
Val   - Loss: 3.5868 | Acc: 0.11% | Class Loss: 4.4831 | Attr Loss: 0.0013
✓ New best model saved! Val Acc: 0.11%
Best Val Acc: 0.11% (Epoch 30)
No improvement: 0/15 epochs

Epoch 31/80
Learning Rate: 0.001000





Train - Loss: 3.6720 | Acc: 0.07% | Class Loss: 4.5897 | Attr Loss: 0.0014
Val   - Loss: 3.7371 | Acc: 0.09% | Class Loss: 4.6710 | Attr Loss: 0.0013
Best Val Acc: 0.11% (Epoch 30)
No improvement: 1/15 epochs

Epoch 32/80
Learning Rate: 0.000998





Train - Loss: 3.6455 | Acc: 0.08% | Class Loss: 4.5565 | Attr Loss: 0.0014
Val   - Loss: 3.6284 | Acc: 0.10% | Class Loss: 4.5352 | Attr Loss: 0.0013
Best Val Acc: 0.11% (Epoch 30)
No improvement: 2/15 epochs

Epoch 33/80
Learning Rate: 0.000994





Train - Loss: 3.5924 | Acc: 0.08% | Class Loss: 4.4901 | Attr Loss: 0.0014
Val   - Loss: 3.6315 | Acc: 0.10% | Class Loss: 4.5391 | Attr Loss: 0.0013
Best Val Acc: 0.11% (Epoch 30)
No improvement: 3/15 epochs

Epoch 34/80
Learning Rate: 0.000986





Train - Loss: 3.5686 | Acc: 0.08% | Class Loss: 4.4604 | Attr Loss: 0.0014
Val   - Loss: 3.6114 | Acc: 0.09% | Class Loss: 4.5140 | Attr Loss: 0.0013
Best Val Acc: 0.11% (Epoch 30)
No improvement: 4/15 epochs

Epoch 35/80
Learning Rate: 0.000976





Train - Loss: 3.5296 | Acc: 0.08% | Class Loss: 4.4117 | Attr Loss: 0.0014
Val   - Loss: 3.6054 | Acc: 0.10% | Class Loss: 4.5065 | Attr Loss: 0.0013
Best Val Acc: 0.11% (Epoch 30)
No improvement: 5/15 epochs

Epoch 36/80
Learning Rate: 0.000962





Train - Loss: 3.5137 | Acc: 0.10% | Class Loss: 4.3918 | Attr Loss: 0.0014
Val   - Loss: 3.6177 | Acc: 0.10% | Class Loss: 4.5218 | Attr Loss: 0.0013
Best Val Acc: 0.11% (Epoch 30)
No improvement: 6/15 epochs

Epoch 37/80
Learning Rate: 0.000946





Train - Loss: 3.5001 | Acc: 0.10% | Class Loss: 4.3747 | Attr Loss: 0.0014
Val   - Loss: 3.6030 | Acc: 0.10% | Class Loss: 4.5035 | Attr Loss: 0.0013
Best Val Acc: 0.11% (Epoch 30)
No improvement: 7/15 epochs

Epoch 38/80
Learning Rate: 0.000926





Train - Loss: 3.4639 | Acc: 0.10% | Class Loss: 4.3296 | Attr Loss: 0.0014
Val   - Loss: 3.5696 | Acc: 0.10% | Class Loss: 4.4617 | Attr Loss: 0.0013
Best Val Acc: 0.11% (Epoch 30)
No improvement: 8/15 epochs

Epoch 39/80
Learning Rate: 0.000905





Train - Loss: 3.4571 | Acc: 0.11% | Class Loss: 4.3210 | Attr Loss: 0.0014
Val   - Loss: 3.5430 | Acc: 0.13% | Class Loss: 4.4284 | Attr Loss: 0.0013
✓ New best model saved! Val Acc: 0.13%
Best Val Acc: 0.13% (Epoch 39)
No improvement: 0/15 epochs

Epoch 40/80
Learning Rate: 0.000880





Train - Loss: 3.4087 | Acc: 0.11% | Class Loss: 4.2606 | Attr Loss: 0.0014
Val   - Loss: 3.6717 | Acc: 0.08% | Class Loss: 4.5892 | Attr Loss: 0.0013
Best Val Acc: 0.13% (Epoch 39)
No improvement: 1/15 epochs

Epoch 41/80
Learning Rate: 0.000854





Train - Loss: 3.4008 | Acc: 0.11% | Class Loss: 4.2507 | Attr Loss: 0.0014
Val   - Loss: 3.4741 | Acc: 0.11% | Class Loss: 4.3423 | Attr Loss: 0.0013
Best Val Acc: 0.13% (Epoch 39)
No improvement: 2/15 epochs

Epoch 42/80
Learning Rate: 0.000825





Train - Loss: 3.3793 | Acc: 0.13% | Class Loss: 4.2238 | Attr Loss: 0.0014
Val   - Loss: 3.5051 | Acc: 0.12% | Class Loss: 4.3811 | Attr Loss: 0.0013
Best Val Acc: 0.13% (Epoch 39)
No improvement: 3/15 epochs

Epoch 43/80
Learning Rate: 0.000794





Train - Loss: 3.3402 | Acc: 0.12% | Class Loss: 4.1750 | Attr Loss: 0.0013
Val   - Loss: 3.5127 | Acc: 0.12% | Class Loss: 4.3905 | Attr Loss: 0.0013
Best Val Acc: 0.13% (Epoch 39)
No improvement: 4/15 epochs

Epoch 44/80
Learning Rate: 0.000761





Train - Loss: 3.3423 | Acc: 0.13% | Class Loss: 4.1776 | Attr Loss: 0.0014
Val   - Loss: 3.5280 | Acc: 0.11% | Class Loss: 4.4097 | Attr Loss: 0.0013
Best Val Acc: 0.13% (Epoch 39)
No improvement: 5/15 epochs

Epoch 45/80
Learning Rate: 0.000727





Train - Loss: 3.3042 | Acc: 0.14% | Class Loss: 4.1299 | Attr Loss: 0.0014
Val   - Loss: 3.4871 | Acc: 0.11% | Class Loss: 4.3586 | Attr Loss: 0.0013
Best Val Acc: 0.13% (Epoch 39)
No improvement: 6/15 epochs

Epoch 46/80
Learning Rate: 0.000692





Train - Loss: 3.2857 | Acc: 0.14% | Class Loss: 4.1068 | Attr Loss: 0.0013
Val   - Loss: 3.4511 | Acc: 0.13% | Class Loss: 4.3136 | Attr Loss: 0.0013
Best Val Acc: 0.13% (Epoch 39)
No improvement: 7/15 epochs

Epoch 47/80
Learning Rate: 0.000655





Train - Loss: 3.2736 | Acc: 0.15% | Class Loss: 4.0916 | Attr Loss: 0.0014
Val   - Loss: 3.4739 | Acc: 0.13% | Class Loss: 4.3420 | Attr Loss: 0.0013
✓ New best model saved! Val Acc: 0.13%
Best Val Acc: 0.13% (Epoch 47)
No improvement: 0/15 epochs

Epoch 48/80
Learning Rate: 0.000617





Train - Loss: 3.2370 | Acc: 0.16% | Class Loss: 4.0460 | Attr Loss: 0.0014
Val   - Loss: 3.4012 | Acc: 0.14% | Class Loss: 4.2512 | Attr Loss: 0.0013
✓ New best model saved! Val Acc: 0.14%
Best Val Acc: 0.14% (Epoch 48)
No improvement: 0/15 epochs

Epoch 49/80
Learning Rate: 0.000579





Train - Loss: 3.2069 | Acc: 0.16% | Class Loss: 4.0083 | Attr Loss: 0.0014
Val   - Loss: 3.4583 | Acc: 0.13% | Class Loss: 4.3226 | Attr Loss: 0.0013
Best Val Acc: 0.14% (Epoch 48)
No improvement: 1/15 epochs

Epoch 50/80
Learning Rate: 0.000540





Train - Loss: 3.1654 | Acc: 0.18% | Class Loss: 3.9564 | Attr Loss: 0.0014
Val   - Loss: 3.4069 | Acc: 0.14% | Class Loss: 4.2583 | Attr Loss: 0.0013
✓ New best model saved! Val Acc: 0.14%
Best Val Acc: 0.14% (Epoch 50)
No improvement: 0/15 epochs

Epoch 51/80
Learning Rate: 0.000501





Train - Loss: 3.1627 | Acc: 0.17% | Class Loss: 3.9531 | Attr Loss: 0.0014
Val   - Loss: 3.3633 | Acc: 0.15% | Class Loss: 4.2038 | Attr Loss: 0.0013
✓ New best model saved! Val Acc: 0.15%
Best Val Acc: 0.15% (Epoch 51)
No improvement: 0/15 epochs

Epoch 52/80
Learning Rate: 0.000461





Train - Loss: 3.1496 | Acc: 0.17% | Class Loss: 3.9366 | Attr Loss: 0.0014
Val   - Loss: 3.3598 | Acc: 0.16% | Class Loss: 4.1995 | Attr Loss: 0.0013
✓ New best model saved! Val Acc: 0.16%
Best Val Acc: 0.16% (Epoch 52)
No improvement: 0/15 epochs

Epoch 53/80
Learning Rate: 0.000422





Train - Loss: 3.1225 | Acc: 0.18% | Class Loss: 3.9028 | Attr Loss: 0.0014
Val   - Loss: 3.3429 | Acc: 0.14% | Class Loss: 4.1783 | Attr Loss: 0.0013
Best Val Acc: 0.16% (Epoch 52)
No improvement: 1/15 epochs

Epoch 54/80
Learning Rate: 0.000384





Train - Loss: 3.1059 | Acc: 0.20% | Class Loss: 3.8820 | Attr Loss: 0.0013
Val   - Loss: 3.3252 | Acc: 0.15% | Class Loss: 4.1562 | Attr Loss: 0.0013
Best Val Acc: 0.16% (Epoch 52)
No improvement: 2/15 epochs

Epoch 55/80
Learning Rate: 0.000346





Train - Loss: 3.0866 | Acc: 0.19% | Class Loss: 3.8579 | Attr Loss: 0.0013
Val   - Loss: 3.3509 | Acc: 0.15% | Class Loss: 4.1883 | Attr Loss: 0.0013
Best Val Acc: 0.16% (Epoch 52)
No improvement: 3/15 epochs

Epoch 56/80
Learning Rate: 0.000309





Train - Loss: 3.0561 | Acc: 0.21% | Class Loss: 3.8198 | Attr Loss: 0.0013
Val   - Loss: 3.3387 | Acc: 0.15% | Class Loss: 4.1730 | Attr Loss: 0.0013
Best Val Acc: 0.16% (Epoch 52)
No improvement: 4/15 epochs

Epoch 57/80
Learning Rate: 0.000274





Train - Loss: 3.0300 | Acc: 0.22% | Class Loss: 3.7871 | Attr Loss: 0.0014
Val   - Loss: 3.2997 | Acc: 0.16% | Class Loss: 4.1243 | Attr Loss: 0.0013
✓ New best model saved! Val Acc: 0.16%
Best Val Acc: 0.16% (Epoch 57)
No improvement: 0/15 epochs

Epoch 58/80
Learning Rate: 0.000240





Train - Loss: 3.0398 | Acc: 0.21% | Class Loss: 3.7994 | Attr Loss: 0.0013
Val   - Loss: 3.3172 | Acc: 0.15% | Class Loss: 4.1462 | Attr Loss: 0.0013
Best Val Acc: 0.16% (Epoch 57)
No improvement: 1/15 epochs

Epoch 59/80
Learning Rate: 0.000207





Train - Loss: 3.0337 | Acc: 0.21% | Class Loss: 3.7917 | Attr Loss: 0.0013
Val   - Loss: 3.2960 | Acc: 0.16% | Class Loss: 4.1196 | Attr Loss: 0.0013
✓ New best model saved! Val Acc: 0.16%
Best Val Acc: 0.16% (Epoch 59)
No improvement: 0/15 epochs

Epoch 60/80
Learning Rate: 0.000176





Train - Loss: 3.0089 | Acc: 0.22% | Class Loss: 3.7608 | Attr Loss: 0.0013
Val   - Loss: 3.3154 | Acc: 0.16% | Class Loss: 4.1439 | Attr Loss: 0.0013
Best Val Acc: 0.16% (Epoch 59)
No improvement: 1/15 epochs

Epoch 61/80
Learning Rate: 0.000147





Train - Loss: 3.0014 | Acc: 0.22% | Class Loss: 3.7514 | Attr Loss: 0.0013
Val   - Loss: 3.3198 | Acc: 0.15% | Class Loss: 4.1494 | Attr Loss: 0.0013
Best Val Acc: 0.16% (Epoch 59)
No improvement: 2/15 epochs

Epoch 62/80
Learning Rate: 0.000121





Train - Loss: 2.9757 | Acc: 0.23% | Class Loss: 3.7193 | Attr Loss: 0.0013
Val   - Loss: 3.2810 | Acc: 0.17% | Class Loss: 4.1009 | Attr Loss: 0.0013
✓ New best model saved! Val Acc: 0.17%
Best Val Acc: 0.17% (Epoch 62)
No improvement: 0/15 epochs

Epoch 63/80
Learning Rate: 0.000096





Train - Loss: 2.9733 | Acc: 0.23% | Class Loss: 3.7162 | Attr Loss: 0.0013
Val   - Loss: 3.2813 | Acc: 0.17% | Class Loss: 4.1013 | Attr Loss: 0.0013
Best Val Acc: 0.17% (Epoch 62)
No improvement: 1/15 epochs

Epoch 64/80
Learning Rate: 0.000075





Train - Loss: 2.9548 | Acc: 0.23% | Class Loss: 3.6932 | Attr Loss: 0.0013
Val   - Loss: 3.2726 | Acc: 0.17% | Class Loss: 4.0904 | Attr Loss: 0.0013
✓ New best model saved! Val Acc: 0.17%
Best Val Acc: 0.17% (Epoch 64)
No improvement: 0/15 epochs

Epoch 65/80
Learning Rate: 0.000055





Train - Loss: 2.9639 | Acc: 0.24% | Class Loss: 3.7046 | Attr Loss: 0.0013
Val   - Loss: 3.2658 | Acc: 0.17% | Class Loss: 4.0819 | Attr Loss: 0.0013
Best Val Acc: 0.17% (Epoch 64)
No improvement: 1/15 epochs

Epoch 66/80
Learning Rate: 0.000039





Train - Loss: 2.9465 | Acc: 0.24% | Class Loss: 3.6828 | Attr Loss: 0.0013
Val   - Loss: 3.2754 | Acc: 0.17% | Class Loss: 4.0939 | Attr Loss: 0.0013
Best Val Acc: 0.17% (Epoch 64)
No improvement: 2/15 epochs

Epoch 67/80
Learning Rate: 0.000025





Train - Loss: 2.9670 | Acc: 0.23% | Class Loss: 3.7085 | Attr Loss: 0.0013
Val   - Loss: 3.2569 | Acc: 0.16% | Class Loss: 4.0708 | Attr Loss: 0.0013
Best Val Acc: 0.17% (Epoch 64)
No improvement: 3/15 epochs

Epoch 68/80
Learning Rate: 0.000015





Train - Loss: 2.9550 | Acc: 0.24% | Class Loss: 3.6934 | Attr Loss: 0.0013
Val   - Loss: 3.2851 | Acc: 0.17% | Class Loss: 4.1060 | Attr Loss: 0.0013
✓ New best model saved! Val Acc: 0.17%
Best Val Acc: 0.17% (Epoch 68)
No improvement: 0/15 epochs

Epoch 69/80
Learning Rate: 0.000007





Train - Loss: 2.9382 | Acc: 0.24% | Class Loss: 3.6724 | Attr Loss: 0.0013
Val   - Loss: 3.2815 | Acc: 0.17% | Class Loss: 4.1015 | Attr Loss: 0.0013
Best Val Acc: 0.17% (Epoch 68)
No improvement: 1/15 epochs

Epoch 70/80
Learning Rate: 0.000003





Train - Loss: 2.9570 | Acc: 0.24% | Class Loss: 3.6959 | Attr Loss: 0.0013
Val   - Loss: 3.2633 | Acc: 0.17% | Class Loss: 4.0788 | Attr Loss: 0.0013
Best Val Acc: 0.17% (Epoch 68)
No improvement: 2/15 epochs

Epoch 71/80
Learning Rate: 0.001000





Train - Loss: 3.0963 | Acc: 0.20% | Class Loss: 3.8700 | Attr Loss: 0.0013
Val   - Loss: 3.3405 | Acc: 0.15% | Class Loss: 4.1753 | Attr Loss: 0.0013
Best Val Acc: 0.17% (Epoch 68)
No improvement: 3/15 epochs

Epoch 72/80
Learning Rate: 0.001000





Train - Loss: 3.1039 | Acc: 0.19% | Class Loss: 3.8795 | Attr Loss: 0.0013
Val   - Loss: 3.3879 | Acc: 0.13% | Class Loss: 4.2345 | Attr Loss: 0.0013
Best Val Acc: 0.17% (Epoch 68)
No improvement: 4/15 epochs

Epoch 73/80
Learning Rate: 0.000998





Train - Loss: 3.1326 | Acc: 0.18% | Class Loss: 3.9154 | Attr Loss: 0.0013
Val   - Loss: 3.3332 | Acc: 0.15% | Class Loss: 4.1662 | Attr Loss: 0.0013
Best Val Acc: 0.17% (Epoch 68)
No improvement: 5/15 epochs

Epoch 74/80
Learning Rate: 0.000997





Train - Loss: 3.0485 | Acc: 0.21% | Class Loss: 3.8103 | Attr Loss: 0.0013
Val   - Loss: 3.3114 | Acc: 0.16% | Class Loss: 4.1389 | Attr Loss: 0.0013
Best Val Acc: 0.17% (Epoch 68)
No improvement: 6/15 epochs

Epoch 75/80
Learning Rate: 0.000994





Train - Loss: 3.0698 | Acc: 0.18% | Class Loss: 3.8369 | Attr Loss: 0.0013
Val   - Loss: 3.3689 | Acc: 0.13% | Class Loss: 4.2107 | Attr Loss: 0.0013
Best Val Acc: 0.17% (Epoch 68)
No improvement: 7/15 epochs

Epoch 76/80
Learning Rate: 0.000990





Train - Loss: 3.0319 | Acc: 0.20% | Class Loss: 3.7895 | Attr Loss: 0.0013
Val   - Loss: 3.3065 | Acc: 0.17% | Class Loss: 4.1328 | Attr Loss: 0.0013
Best Val Acc: 0.17% (Epoch 68)
No improvement: 8/15 epochs

Epoch 77/80
Learning Rate: 0.000986





Train - Loss: 3.0148 | Acc: 0.21% | Class Loss: 3.7681 | Attr Loss: 0.0013
Val   - Loss: 3.2683 | Acc: 0.16% | Class Loss: 4.0850 | Attr Loss: 0.0013
Best Val Acc: 0.17% (Epoch 68)
No improvement: 9/15 epochs

Epoch 78/80
Learning Rate: 0.000981





Train - Loss: 2.9938 | Acc: 0.21% | Class Loss: 3.7419 | Attr Loss: 0.0013
Val   - Loss: 3.2751 | Acc: 0.17% | Class Loss: 4.0936 | Attr Loss: 0.0013
Best Val Acc: 0.17% (Epoch 68)
No improvement: 10/15 epochs

Epoch 79/80
Learning Rate: 0.000976





Train - Loss: 2.9916 | Acc: 0.23% | Class Loss: 3.7392 | Attr Loss: 0.0013
Val   - Loss: 3.2419 | Acc: 0.17% | Class Loss: 4.0520 | Attr Loss: 0.0013
Best Val Acc: 0.17% (Epoch 68)
No improvement: 11/15 epochs

Epoch 80/80
Learning Rate: 0.000969


                                                           


Train - Loss: 2.9652 | Acc: 0.22% | Class Loss: 3.7062 | Attr Loss: 0.0013
Val   - Loss: 3.2640 | Acc: 0.17% | Class Loss: 4.0797 | Attr Loss: 0.0013
Best Val Acc: 0.17% (Epoch 68)
No improvement: 12/15 epochs

TRAINING COMPLETE!
Best Val Acc: 0.17% at epoch 68
Final Train Acc: 0.22%
Final Val Acc: 0.17%




In [37]:
checkpoint = torch.load('checkpoints/best_model_18percent.pt')
model.load_state_dict(checkpoint['model_state_dict'])

print(f"Using model from epoch {checkpoint['epoch']}")
print(f"Val accuracy: {checkpoint['val_acc']:.2f}%")

Using model from epoch 68
Val accuracy: 0.17%


In [34]:
class TestDataset(Dataset):
    def __init__(self, test_df, transform):
        self.test_df = test_df
        self.transform = transform

    def __len__(self):
        return len(self.test_df)

    def __getitem__(self, idx):
        img_path = self.test_df.iloc[idx]['image_path']
        img_id = self.test_df.iloc[idx]['id']

        image = Image.open(img_path).convert('RGB')
        if self.transform:
            image = self.transform(image)

        return image, img_id

test_df = pd.read_csv(glob_path + 'data/test_images_path.csv')
test_df['image_path'] = glob_path + 'data' + test_df['image_path']
test_dataset = TestDataset(test_df, transform=val_transform)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)


In [35]:
test_df

Unnamed: 0,id,image_path,label
0,1,drive/MyDrive/data/test_images/999.jpg,1
1,2,drive/MyDrive/data/test_images/998.jpg,1
2,3,drive/MyDrive/data/test_images/997.jpg,1
3,4,drive/MyDrive/data/test_images/996.jpg,1
4,5,drive/MyDrive/data/test_images/995.jpg,1
...,...,...,...
3995,3996,drive/MyDrive/data/test_images/1001.jpg,1
3996,3997,drive/MyDrive/data/test_images/1000.jpg,1
3997,3998,drive/MyDrive/data/test_images/100.jpg,1
3998,3999,drive/MyDrive/data/test_images/10.jpg,1


In [36]:
def generate_submission(model, test_loader, device, output_file='submission.csv'):
    """
    Generate Kaggle submission file

    IMPORTANT: Attributes are NOT needed at test time!
    """
    model.eval()

    all_ids = []
    all_predictions = []

    print("Generating predictions...")
    with torch.no_grad():
        for images, img_ids in tqdm(test_loader):
            images = images.to(device)

            # Forward pass - only need classification logits
            class_logits, _ = model(images)  # Ignore attribute predictions

            # Get predicted classes
            predictions = torch.argmax(class_logits, dim=1)

            # Convert from 0-indexed to 1-indexed
            predictions = predictions.cpu().numpy() + 1

            all_ids.extend(img_ids.numpy())
            all_predictions.extend(predictions)

    # Create submission DataFrame
    submission_df = pd.DataFrame({
        'id': all_ids,
        'label': all_predictions
    })

    submission_df.to_csv(output_file, index=False)
    print(f"✅ Saved {len(submission_df)} predictions to {output_file}")

    return submission_df

# Generate submission
submission = generate_submission(
    model=model,
    test_loader=test_loader,
    device=device,
    output_file='submission_cosine_attributes.csv'
)

Generating predictions...


100%|██████████| 63/63 [46:29<00:00, 44.28s/it]

✅ Saved 4000 predictions to submission_cosine_attributes.csv





In [None]:
final = pd.read_csv('submission_cosine_attributes.csv')

In [None]:
final

Unnamed: 0,id,label
0,1,68
1,2,115
2,3,7
3,4,28
4,5,19
...,...,...
3995,3996,68
3996,3997,22
3997,3998,37
3998,3999,89


The function below generates a fully fledges summary plot, keeping track of the loss functions and classification accuracy

In [2]:
import matplotlib.pyplot as plt
def plot_comprehensive_training_history(history, save_path='training_analysis.png'):
    """
    Comprehensive training visualization with diagnostics

    Args:
        history: dict with keys 'train_loss', 'val_loss', 'train_acc', 'val_acc',
                'train_class_loss', 'val_class_loss', 'train_attr_loss', 'val_attr_loss'
        save_path: where to save the plot
    """

    # Create figure with subplots
    fig = plt.figure(figsize=(20, 12))

    epochs = range(1, len(history['train_loss']) + 1)

    # ========================================================================
    # 1. TOTAL LOSS
    # ========================================================================
    ax1 = plt.subplot(3, 3, 1)
    ax1.plot(epochs, history['train_loss'], 'b-', label='Train Loss', linewidth=2)
    ax1.plot(epochs, history['val_loss'], 'r-', label='Val Loss', linewidth=2)
    ax1.set_title('Total Loss', fontsize=14, fontweight='bold')
    ax1.set_xlabel('Epoch')
    ax1.set_ylabel('Loss')
    ax1.legend()
    ax1.grid(True, alpha=0.3)

    # Add best epoch marker
    best_epoch = np.argmin(history['val_loss']) + 1
    best_val_loss = min(history['val_loss'])
    ax1.axvline(x=best_epoch, color='g', linestyle='--', alpha=0.5, label=f'Best: Epoch {best_epoch}')
    ax1.plot(best_epoch, best_val_loss, 'g*', markersize=15)

    # ========================================================================
    # 2. ACCURACY
    # ========================================================================
    ax2 = plt.subplot(3, 3, 2)
    ax2.plot(epochs, history['train_acc'], 'b-', label='Train Acc', linewidth=2)
    ax2.plot(epochs, history['val_acc'], 'r-', label='Val Acc', linewidth=2)
    ax2.set_title('Classification Accuracy', fontsize=14, fontweight='bold')
    ax2.set_xlabel('Epoch')
    ax2.set_ylabel('Accuracy (%)')
    ax2.legend()
    ax2.grid(True, alpha=0.3)

    # Add best epoch marker
    best_epoch_acc = np.argmax(history['val_acc']) + 1
    best_val_acc = max(history['val_acc'])
    ax2.axvline(x=best_epoch_acc, color='g', linestyle='--', alpha=0.5, label=f'Best: Epoch {best_epoch_acc}')
    ax2.plot(best_epoch_acc, best_val_acc, 'g*', markersize=15)

    # ========================================================================
    # 3. CLASSIFICATION LOSS
    # ========================================================================
    ax3 = plt.subplot(3, 3, 3)
    ax3.plot(epochs, history['train_class_loss'], 'b-', label='Train', linewidth=2)
    ax3.plot(epochs, history['val_class_loss'], 'r-', label='Val', linewidth=2)
    ax3.set_title('Classification Loss (CrossEntropy)', fontsize=14, fontweight='bold')
    ax3.set_xlabel('Epoch')
    ax3.set_ylabel('Loss')
    ax3.legend()
    ax3.grid(True, alpha=0.3)

    # ========================================================================
    # 4. ATTRIBUTE LOSS
    # ========================================================================
    ax4 = plt.subplot(3, 3, 4)
    ax4.plot(epochs, history['train_attr_loss'], 'b-', label='Train', linewidth=2)
    ax4.plot(epochs, history['val_attr_loss'], 'r-', label='Val', linewidth=2)
    ax4.set_title('Attribute Loss (MSE)', fontsize=14, fontweight='bold')
    ax4.set_xlabel('Epoch')
    ax4.set_ylabel('Loss')
    ax4.legend()
    ax4.grid(True, alpha=0.3)

    # ========================================================================
    # 5. OVERFITTING ANALYSIS (Train-Val Gap)
    # ========================================================================
    ax5 = plt.subplot(3, 3, 5)
    acc_gap = [train - val for train, val in zip(history['train_acc'], history['val_acc'])]
    ax5.plot(epochs, acc_gap, 'purple', linewidth=2)
    ax5.axhline(y=0, color='black', linestyle='-', linewidth=0.5)
    ax5.axhline(y=10, color='orange', linestyle='--', alpha=0.5, label='Moderate Overfit Threshold')
    ax5.axhline(y=20, color='red', linestyle='--', alpha=0.5, label='Severe Overfit Threshold')
    ax5.set_title('Overfitting Analysis (Train - Val Acc)', fontsize=14, fontweight='bold')
    ax5.set_xlabel('Epoch')
    ax5.set_ylabel('Accuracy Gap (%)')
    ax5.legend(fontsize=8)
    ax5.grid(True, alpha=0.3)
    ax5.fill_between(epochs, 0, acc_gap, where=[g > 0 for g in acc_gap],
                     color='red', alpha=0.2, label='Overfitting')

    # ========================================================================
    # 6. LEARNING RATE (if available)
    # ========================================================================
    ax6 = plt.subplot(3, 3, 6)
    if 'lr' in history:
        ax6.plot(epochs, history['lr'], 'g-', linewidth=2)
        ax6.set_title('Learning Rate Schedule', fontsize=14, fontweight='bold')
        ax6.set_xlabel('Epoch')
        ax6.set_ylabel('Learning Rate')
        ax6.set_yscale('log')
        ax6.grid(True, alpha=0.3)
    else:
        ax6.text(0.5, 0.5, 'Learning Rate\nNot Tracked',
                ha='center', va='center', fontsize=12, transform=ax6.transAxes)
        ax6.set_xticks([])
        ax6.set_yticks([])

    # ========================================================================
    # 7. LOSS COMPARISON (Log Scale)
    # ========================================================================
    ax7 = plt.subplot(3, 3, 7)
    ax7.plot(epochs, history['train_loss'], 'b-', label='Train Total', linewidth=2, alpha=0.7)
    ax7.plot(epochs, history['val_loss'], 'r-', label='Val Total', linewidth=2, alpha=0.7)
    ax7.plot(epochs, history['train_class_loss'], 'b--', label='Train Class', linewidth=1.5, alpha=0.5)
    ax7.plot(epochs, history['train_attr_loss'], 'b:', label='Train Attr', linewidth=1.5, alpha=0.5)
    ax7.set_title('Loss Comparison (Log Scale)', fontsize=14, fontweight='bold')
    ax7.set_xlabel('Epoch')
    ax7.set_ylabel('Loss (log scale)')
    ax7.set_yscale('log')
    ax7.legend(fontsize=8)
    ax7.grid(True, alpha=0.3)

    # ========================================================================
    # 8. ACCURACY IMPROVEMENT RATE
    # ========================================================================
    ax8 = plt.subplot(3, 3, 8)

    # Calculate moving average improvement (smoothed)
    window = 5
    val_acc_smooth = np.convolve(history['val_acc'], np.ones(window)/window, mode='valid')
    improvement = np.diff(val_acc_smooth)

    ax8.plot(range(window, len(history['val_acc'])), improvement, 'b-', linewidth=2)
    ax8.axhline(y=0, color='red', linestyle='--', linewidth=1)
    ax8.set_title('Validation Accuracy Improvement Rate', fontsize=14, fontweight='bold')
    ax8.set_xlabel('Epoch')
    ax8.set_ylabel('Δ Acc (% per epoch)')
    ax8.grid(True, alpha=0.3)
    ax8.fill_between(range(window, len(history['val_acc'])), 0, improvement,
                     where=[i > 0 for i in improvement], color='green', alpha=0.3, label='Improving')
    ax8.fill_between(range(window, len(history['val_acc'])), 0, improvement,
                     where=[i < 0 for i in improvement], color='red', alpha=0.3, label='Declining')
    ax8.legend()

    # ========================================================================
    # 9. SUMMARY STATISTICS (Text)
    # ========================================================================
    ax9 = plt.subplot(3, 3, 9)
    ax9.axis('off')

    # Calculate statistics
    final_train_acc = history['train_acc'][-1]
    final_val_acc = history['val_acc'][-1]
    best_val_acc = max(history['val_acc'])
    best_epoch = np.argmax(history['val_acc']) + 1
    train_val_gap = final_train_acc - final_val_acc

    initial_val_acc = history['val_acc'][0]
    total_improvement = final_val_acc - initial_val_acc

    # Determine status
    if train_val_gap > 20:
        status = "SEVERE OVERFITTING"
        color = 'red'
    elif train_val_gap > 10:
        status = "MODERATE OVERFITTING"
        color = 'orange'
    elif final_train_acc < 25:
        status = "UNDERFITTING"
        color = 'orange'
    else:
        status = "HEALTHY TRAINING"
        color = 'green'

    summary_text = f"""
    TRAINING SUMMARY
    {'='*40}

    Final Performance:
    • Train Accuracy: {final_train_acc:.2f}%
    • Val Accuracy:   {final_val_acc:.2f}%
    • Train-Val Gap:  {train_val_gap:.2f}%

    Best Performance:
    • Best Val Acc:   {best_val_acc:.2f}%
    • Best Epoch:     {best_epoch}

    Learning Progress:
    • Initial Val Acc: {initial_val_acc:.2f}%
    • Final Val Acc:   {final_val_acc:.2f}%
    • Improvement:     +{total_improvement:.2f}%

    Diagnosis:
    • Status: {status}

    Recommendations:
    """

    if train_val_gap > 20:
        recommendations = """  → Add more regularization
  → Use early stopping
  → Increase dropout
  → More data augmentation"""
    elif train_val_gap > 10:
        recommendations = """  → Consider early stopping
  → Moderate regularization"""
    elif final_train_acc < 25:
        recommendations = """  → Increase model capacity
  → Raise learning rate
  → Train longer
  → Check data loading"""
    else:
        recommendations = """  → Continue training
  → Fine-tune hyperparameters
  → Try ensemble methods"""

    summary_text += recommendations

    ax9.text(0.05, 0.95, summary_text, transform=ax9.transAxes,
            fontsize=10, verticalalignment='top', fontfamily='monospace',
            bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.3))

    fig.suptitle('Comprehensive Training Analysis',
                fontsize=18, fontweight='bold', y=0.995)

    plt.tight_layout(rect=[0, 0, 1, 0.99])
    plt.savefig(save_path, dpi=300, bbox_inches='tight')
    print(f"✅ Saved comprehensive plot to {save_path}")
    plt.show()

    return fig


plot_comprehensive_training_history(history, save_path='training_analysis.png')

NameError: name 'history' is not defined