In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [5]:
# Mount Google Drive at /content/drive; the data files loaded below are read
# from a path underneath this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Let's load the different bird species from the `class_names.npy` file and then the attributes from `attributes.npy` which has for every class 312 features that are explained by the file `attributes.txt`.

In [6]:
# Base folder prepended to every data path below (relative to the current
# working directory — presumably /content on Colab; confirm).
glob_path = "drive/MyDrive/"

In [7]:
# The file stores a pickled Python object inside a 0-d array; .item() unwraps it.
# It is consumed further down as a mapping of class name -> class id.
# NOTE: allow_pickle=True can execute code embedded in the file; only load trusted data.
bird_classes = np.load(glob_path + "data/class_names.npy", allow_pickle=True).item()

In [8]:
# Per-class attribute matrix: one row per class, one column per attribute.
attributes = np.load(glob_path + 'data/attributes.npy')
attributes.shape

(200, 312)

In [9]:
# One attribute name per line; surrounding whitespace (including the newline)
# is stripped from each entry.
with open(glob_path + "data/attributes.txt", "r") as attr_file:
    attribute_names = [raw_line.strip() for raw_line in attr_file]

attribute_names[:5]

['1 has_bill_shape::curved_(up_or_down)',
 '2 has_bill_shape::dagger',
 '3 has_bill_shape::hooked',
 '4 has_bill_shape::needle',
 '5 has_bill_shape::hooked_seabird']

Combine the attribute files into a single mapping from each bird species (class id) to its features.

In [10]:
# Map every 1-based class id to a {attribute name: value} dict taken from the
# matching row of `attributes`.
class_attributes = {}

for class_id, feature_row in enumerate(attributes):
    class_attributes[class_id + 1] = {
        name: feature_row[col] for col, name in enumerate(attribute_names)
    }

Create a data frame `birds_df` with the class_id and the 312 attributes of each bird class. Then merge it with the class name of each bird.

In [11]:
# One row per class: the dict keys become the index, which is then named
# "class_id" and turned into a regular column.
birds_df = (
    pd.DataFrame.from_dict(class_attributes, orient="index")
    .rename_axis("class_id")
    .reset_index()
)
birds_df.head()

Unnamed: 0,class_id,1 has_bill_shape::curved_(up_or_down),2 has_bill_shape::dagger,3 has_bill_shape::hooked,4 has_bill_shape::needle,5 has_bill_shape::hooked_seabird,6 has_bill_shape::spatulate,7 has_bill_shape::all-purpose,8 has_bill_shape::cone,9 has_bill_shape::specialized,...,303 has_crown_color::pink,304 has_crown_color::orange,305 has_crown_color::black,306 has_crown_color::white,307 has_crown_color::red,308 has_crown_color::buff,309 has_wing_pattern::solid,310 has_wing_pattern::spotted,311 has_wing_pattern::striped,312 has_wing_pattern::multi-colored
0,1,0.010638,0.010638,0.007092,0.003546,0.138299,0.065603,0.0,0.005319,0.0,...,0.0,0.005439,0.005439,0.228446,0.0,0.0,0.18602,0.009186,0.025262,0.020669
1,2,0.0,0.011332,0.009444,0.0,0.202095,0.041552,0.01511,0.005666,0.0,...,0.006291,0.0,0.111144,0.008388,0.0,0.046135,0.202572,0.002665,0.021323,0.058639
2,3,0.0,0.0,0.007425,0.0,0.002475,0.0,0.0,0.074247,0.14602,...,0.0,0.0,0.190411,0.012555,0.0,0.010462,0.203609,0.0,0.008853,0.017705
3,4,0.0,0.0,0.003861,0.0,0.003861,0.013514,0.005792,0.07336,0.138998,...,0.004885,0.0,0.190531,0.0,0.0,0.0,0.15275,0.00684,0.036478,0.043317
4,5,0.0,0.035088,0.0,0.0,0.0,0.0,0.102458,0.070177,0.0,...,0.0,0.0,0.204036,0.002458,0.002458,0.0,0.03164,0.002751,0.015132,0.1582


In [12]:
# Two-column lookup table: class name and its numeric id.
classes = (
    pd.DataFrame.from_dict(bird_classes, orient="index")
    .reset_index()
    .set_axis(["class", "id"], axis=1)
)
classes.head()

Unnamed: 0,class,id
0,001.Black_footed_Albatross,1
1,002.Laysan_Albatross,2
2,003.Sooty_Albatross,3
3,004.Groove_billed_Ani,4
4,005.Crested_Auklet,5


In [13]:
# Attach the class name to each attribute row, then discard the duplicate join key.
birds_df = birds_df.merge(classes, left_on="class_id", right_on="id").drop(columns=["id"])

# Put the two identifier columns first; the attribute columns keep their order.
leading = ["class_id", "class"]
cols = leading + [name for name in birds_df.columns if name not in leading]
birds_df = birds_df[cols]
birds_df.head()

Unnamed: 0,class_id,class,1 has_bill_shape::curved_(up_or_down),2 has_bill_shape::dagger,3 has_bill_shape::hooked,4 has_bill_shape::needle,5 has_bill_shape::hooked_seabird,6 has_bill_shape::spatulate,7 has_bill_shape::all-purpose,8 has_bill_shape::cone,...,303 has_crown_color::pink,304 has_crown_color::orange,305 has_crown_color::black,306 has_crown_color::white,307 has_crown_color::red,308 has_crown_color::buff,309 has_wing_pattern::solid,310 has_wing_pattern::spotted,311 has_wing_pattern::striped,312 has_wing_pattern::multi-colored
0,1,001.Black_footed_Albatross,0.010638,0.010638,0.007092,0.003546,0.138299,0.065603,0.0,0.005319,...,0.0,0.005439,0.005439,0.228446,0.0,0.0,0.18602,0.009186,0.025262,0.020669
1,2,002.Laysan_Albatross,0.0,0.011332,0.009444,0.0,0.202095,0.041552,0.01511,0.005666,...,0.006291,0.0,0.111144,0.008388,0.0,0.046135,0.202572,0.002665,0.021323,0.058639
2,3,003.Sooty_Albatross,0.0,0.0,0.007425,0.0,0.002475,0.0,0.0,0.074247,...,0.0,0.0,0.190411,0.012555,0.0,0.010462,0.203609,0.0,0.008853,0.017705
3,4,004.Groove_billed_Ani,0.0,0.0,0.003861,0.0,0.003861,0.013514,0.005792,0.07336,...,0.004885,0.0,0.190531,0.0,0.0,0.0,0.15275,0.00684,0.036478,0.043317
4,5,005.Crested_Auklet,0.0,0.035088,0.0,0.0,0.0,0.0,0.102458,0.070177,...,0.0,0.0,0.204036,0.002458,0.002458,0.0,0.03164,0.002751,0.015132,0.1582


In [14]:
# Prefix the paths stored in the CSV with the data folder so they can be opened
# directly from the notebook's working directory.
images_df = pd.read_csv(glob_path + "data/train_images.csv").assign(
    image_path=lambda frame: glob_path + 'data' + frame['image_path']
)
images_df.head()

Unnamed: 0,image_path,label
0,drive/MyDrive/data/train_images/1.jpg,1
1,drive/MyDrive/data/train_images/2.jpg,1
2,drive/MyDrive/data/train_images/3.jpg,1
3,drive/MyDrive/data/train_images/4.jpg,1
4,drive/MyDrive/data/train_images/5.jpg,1


### Load training metadata and create train/validation split

The `train_images.csv` metadata (image paths and labels) was loaded in the previous cell.  
Here we create a stratified train/validation split so that all 200 classes are represented proportionally in both sets.  
This split will be used to train the CNN on `train_images` and evaluate it on `val_images`.


In [15]:
# Hold out 20% of the rows for validation. Stratifying on the label keeps the
# class proportions equal in both parts; the fixed seed makes the split repeatable.
train_images, val_images = train_test_split(
    images_df,
    test_size=0.2,
    stratify=images_df["label"],
    random_state=42
)

len(train_images), len(val_images)

(3140, 786)

In [16]:
# Peek at the training split; rows keep their original index from images_df.
train_images.head()

Unnamed: 0,image_path,label
1249,drive/MyDrive/data/train_images/1250.jpg,42
3882,drive/MyDrive/data/train_images/3883.jpg,193
686,drive/MyDrive/data/train_images/687.jpg,23
1452,drive/MyDrive/data/train_images/1453.jpg,49
2357,drive/MyDrive/data/train_images/2358.jpg,85


### Define Image Transformations for Training and Validation

Before training a CNN, all images need to be preprocessed in a consistent way.  
Here, we define two sets of transformations:

**Training transforms**
- **Resize to 224×224:** ResNet models expect fixed-size input.
- **Random augmentation:** random resized crop, horizontal flip, rotation, affine translation/scaling, colour jitter and occasional grayscale conversion help the model generalize.
- **Convert to tensor:** Converts the image to a PyTorch tensor with values in `[0,1]`.
- **Normalize with ImageNet statistics:** Since ResNet18 was pretrained on ImageNet, the same normalization must be applied for best performance.

**Validation transforms**
- Same as above but **without augmentation**, to ensure a stable and deterministic evaluation.

These transforms prepare raw images so they can be passed into the CNN during training and validation.


In [17]:
from PIL import Image
import torch
from torch.utils.data import Dataset
import torchvision.transforms as T

# Constants shared by the training and validation pipelines.
INPUT_SIZE = 224
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]

# Training pipeline: fixed-size resize, a stack of random augmentations, then
# tensor conversion and ImageNet normalisation.
train_transform = T.Compose([
    T.Resize((INPUT_SIZE, INPUT_SIZE)),
    T.RandomResizedCrop(INPUT_SIZE, scale=(0.7, 1.0)),
    T.RandomHorizontalFlip(p=0.5),
    T.RandomRotation(20),
    T.RandomAffine(degrees=0, translate=(0.15, 0.15), scale=(0.85, 1.15)),
    T.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4, hue=0.15),
    T.RandomGrayscale(p=0.1),
    T.ToTensor(),
    T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
])

# Validation pipeline: same resize and normalisation, no random steps.
val_transform = T.Compose([
    T.Resize((INPUT_SIZE, INPUT_SIZE)),
    T.ToTensor(),
    T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
])

### Create a custom PyTorch Dataset for bird images

Here we define a `BirdsDataset` class that:
- Reads the image path and label from the DataFrame rows.
- Loads each image with PIL.
- Applies the appropriate transform (train or validation).
- Converts labels from 1–200 to 0–199 so they work with `nn.CrossEntropyLoss`.

This Dataset will be used together with a DataLoader to efficiently feed batches to the CNN.

In [18]:
class BirdsDataset(Dataset):
    """Image dataset yielding ``(image, label, attribute_vector)`` triples.

    Args:
        df: DataFrame with an ``image_path`` column (opened with PIL) and a
            ``label`` column holding 1-based class ids.
        attributes_array: Per-class attribute table; row ``k`` is returned
            alongside the zero-based label ``k``.
        transform: Optional callable applied to the RGB PIL image. When it is
            ``None`` the PIL image is returned unchanged.
        use_attributes: Stored on the instance; this class does not read it.
    """

    def __init__(self, df, attributes_array, transform=None, use_attributes=False):
        # Positional access via iloc below requires a clean 0..n-1 index.
        self.df = df.reset_index(drop=True)
        self.transform = transform
        self.use_attributes = use_attributes
        self.attributes = torch.FloatTensor(attributes_array)

    def __len__(self):
        """Number of rows (images) in the backing DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load the image at position ``idx``.

        Returns:
            ``(pixel_values, label, attr_vector)`` where ``label`` is the
            zero-based class index and ``attr_vector`` is that class's row of
            the attribute table.
        """
        row = self.df.iloc[idx]
        # Labels are stored 1-based; shift to 0-based for loss functions and
        # for indexing the attribute table.
        label = int(row["label"]) - 1

        img = Image.open(row["image_path"]).convert("RGB")
        # `transform` is optional: calling None would raise TypeError.
        pixel_values = self.transform(img) if self.transform is not None else img
        attr_vector = self.attributes[label]

        return pixel_values, label, attr_vector

### Wrap Datasets in DataLoaders

Now we create `DataLoader` objects for the training and validation sets.  
DataLoaders handle:
- Shuffling (for training),
- Batching,
- Parallel loading of images (with `num_workers`).

These will be used directly in the training and evaluation loops.


In [27]:
from torch.utils.data import DataLoader


train_dataset = BirdsDataset(train_images, attributes_array=attributes, transform=train_transform)
val_dataset = BirdsDataset(val_images, attributes_array=attributes, transform=val_transform)

batch_size = 64

# Settings common to both loaders; only shuffling differs between them.
loader_kwargs = dict(batch_size=batch_size, num_workers=4, pin_memory=True)

train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_kwargs)

### Define Device (GPU or CPU)

In [28]:
import torch.nn as nn

# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


## Define two models: Simple CNN and ResNet18

### Building a simple CNN from scratch (baseline CNN)

Before comparing with pretrained models like ResNet18, it's useful to build a classic convolutional neural network from scratch.  
This gives a "true baseline" — a model that only learns from the bird training images, without any prior ImageNet knowledge.

The custom CNN below contains:
- Three convolutional blocks (Conv → BatchNorm → ReLU → MaxPool)
- A flatten layer
- Two fully-connected layers
- A final output layer with 200 logits (one per bird species)

This model is lightweight, easy to understand, and suitable for verifying that the training loop and data pipeline work correctly.


In [29]:
# GPU smoke test: fit a tiny MLP on random data to confirm that training on
# CUDA works before starting the real run.
print(torch.__version__)

# Check CUDA
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}")

if device.type == 'cpu':
    print("‚ùå CUDA NOT AVAILABLE - Using CPU")
    print("Your PyTorch installation doesn't have CUDA support!")
    # In a notebook `exit()` asks the kernel to shut down but does not stop the
    # rest of this cell from running. Raising SystemExit halts the cell here
    # and leaves the kernel usable.
    raise SystemExit("CUDA is required for the GPU smoke test")

# Simple model
model = nn.Sequential(
    nn.Linear(100, 50),
    nn.ReLU(),
    nn.Linear(50, 10)
).to(device)

print(f"Model device: {next(model.parameters()).device}")

# Generate random data
x = torch.randn(32, 100).to(device)
y = torch.randint(0, 10, (32,)).to(device)

print(f"Input device: {x.device}")
print(f"Target device: {y.device}")

# Train for a few steps on the single fixed batch; the loss should fall steadily.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

print("\nTraining for 100 steps...")
for i in range(100):
    optimizer.zero_grad()
    output = model(x)
    loss = criterion(output, y)
    loss.backward()
    optimizer.step()

    if (i + 1) % 20 == 0:
        print(f"Step {i+1}: Loss = {loss.item():.4f}")

print("\n‚úÖ Training worked! Check nvidia-smi to see GPU usage.")

2.9.0+cu126
Device: cuda
Model device: cuda:0
Input device: cuda:0
Target device: cuda:0

Training for 100 steps...
Step 20: Loss = 1.5672
Step 40: Loss = 0.8478
Step 60: Loss = 0.3420
Step 80: Loss = 0.1307
Step 100: Loss = 0.0628

‚úÖ Training worked! Check nvidia-smi to see GPU usage.


In [None]:
# IGNORE THIS: the next cell defines another class with the same name, which
# replaces this one once it is executed.

class ImprovedCNNMultiTask(nn.Module):
    """Four-stage CNN with a shared feature layer and two prediction heads.

    Each stage is Conv-BN-ReLU twice followed by 2x2 max pooling. The flattened
    output feeds a 1024-unit shared layer, from which a classification head and
    an attribute head are computed.

    Args:
        num_classes: Number of outputs of the classification head.
        num_attributes: Number of outputs of the attribute head.
    """

    def __init__(self, num_classes=200, num_attributes=312):
        super().__init__()

        # Spatial size for a 224x224 input: 224 -> 112 -> 56 -> 28 -> 14.
        self.conv1 = self._double_conv(3, 64)
        self.conv2 = self._double_conv(64, 128)
        self.conv3 = self._double_conv(128, 256)
        self.conv4 = self._double_conv(256, 512)

        # Flattened size of the conv output, measured with a dummy forward pass.
        self._to_linear = None
        self._get_conv_output_size()

        # Shared feature layer
        self.feature_fc = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(self._to_linear, 1024),
            nn.ReLU()
        )

        # Task 1: classification head
        self.class_head = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Linear(512, num_classes)
        )

        # Task 2: attribute prediction head
        self.attr_head = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Linear(512, num_attributes)
        )

    @staticmethod
    def _double_conv(in_channels, out_channels):
        """Build one stage: two 3x3 Conv-BN-ReLU layers, then 2x2 max pooling."""
        return nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
            nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
            nn.MaxPool2d(2)
        )

    def _get_conv_output_size(self):
        """Measure the flattened conv output size for a 1x3x224x224 input.

        Stores the result in ``self._to_linear`` and prints it. The probe runs
        in eval mode: in training mode BatchNorm layers update their running
        statistics on every forward pass, so the all-zero dummy batch would
        otherwise leave its mark on them. The previous mode is restored after.
        """
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                probe = torch.zeros(1, 3, 224, 224)
                for stage in (self.conv1, self.conv2, self.conv3, self.conv4):
                    probe = stage(probe)
        finally:
            self.train(was_training)
        self._to_linear = probe.view(1, -1).size(1)
        print(f"Flattened feature size: {self._to_linear}")

    def forward(self, x):
        """Return ``(class_logits, attr_logits)`` for a batch of images."""
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)

        x = torch.flatten(x, 1)
        features = self.feature_fc(x)

        class_logits = self.class_head(features)
        attr_logits = self.attr_head(features)

        return class_logits, attr_logits

In [30]:
class ImprovedCNNMultiTask(nn.Module):
    """Five-block CNN with a shared feature layer and two prediction heads.

    Every block is Conv-BN-ReLU followed by 2x2 max pooling. The conv output is
    pooled to 7x7, flattened, and passed through a 1024-unit shared layer that
    feeds a classification head and an attribute head.

    Args:
        num_classes: Number of outputs of the classification head.
        num_attributes: Number of outputs of the attribute head.
    """

    def __init__(self, num_classes=200, num_attributes=312):
        super().__init__()

        # Convolutional feature extractor
        self.features = nn.Sequential(
            # Block 1: 3 -> 64
            nn.Conv2d(3, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2),  # 224 -> 112

            # Block 2: 64 -> 128
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2),  # 112 -> 56

            # Block 3: 128 -> 256
            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2),  # 56 -> 28

            # Block 4: 256 -> 512
            nn.Conv2d(256, 512, kernel_size=3, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2),  # 28 -> 14

            # Block 5: 512 -> 512
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2),  # 14 -> 7
        )

        # Forces a 7x7 map so the flattened size is always 512 * 7 * 7 = 25,088.
        # For 224x224 inputs the map is already 7x7 and this is a no-op; other
        # input sizes no longer break the Linear layer below. The layer has no
        # parameters, so saved state dicts are unaffected.
        self.pool = nn.AdaptiveAvgPool2d((7, 7))

        # Shared feature layer
        self.feature_fc = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(512 * 7 * 7, 1024),
            nn.ReLU(inplace=True)
        )

        # Classification head
        self.class_head = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(1024, 512),
            nn.ReLU(inplace=True),
            nn.Linear(512, num_classes)
        )

        # Attribute head
        self.attr_head = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(1024, 512),
            nn.ReLU(inplace=True),
            nn.Linear(512, num_attributes)
        )

    def forward(self, x):
        """Return ``(class_logits, attr_logits)`` computed from images only."""
        x = self.features(x)
        x = self.pool(x)
        x = torch.flatten(x, 1)
        features = self.feature_fc(x)

        # Both predictions are derived from the same shared features.
        class_logits = self.class_head(features)
        attr_logits = self.attr_head(features)

        return class_logits, attr_logits

In [31]:
from tqdm import tqdm

def train_one_epoch_multitask(model, loader, optimizer, criterion_class,
                              criterion_attr, full_attributes, device, alpha=0.7):
    """
    Run one optimisation pass over ``loader`` for a two-headed model.

    Args:
        model: Module whose forward returns ``(class_logits, attr_logits)``
        loader: Iterable of batches, either ``(images, labels)`` or
            ``(images, labels, extra)``; the third item is ignored
        optimizer: Optimizer stepped once per batch
        criterion_class: Loss applied to ``(class_logits, labels)``
        criterion_attr: Loss applied to ``(attr_logits, target_attributes)``
        full_attributes: Per-class attribute table, indexed by zero-based label
        device: Device the batches and the attribute table are moved to
        alpha: Weight of the classification loss (attribute loss gets 1 - alpha)

    Returns:
        dict with sample-weighted means 'total_loss', 'class_loss',
        'attr_loss' and the classification 'accuracy'
    """
    model.train()
    # The whole table lives on the device, so per-batch targets are a lookup.
    attributes_gpu = torch.FloatTensor(full_attributes).to(device)

    running = {'total_loss': 0.0, 'class_loss': 0.0, 'attr_loss': 0.0}
    n_correct = 0
    n_seen = 0

    progress_bar = tqdm(loader, desc="Training", leave=False)

    for batch in progress_bar:
        if len(batch) == 2:
            images, labels = batch
        else:
            images, labels, _ = batch

        images = images.to(device, non_blocking=True)
        labels = labels.to(device, non_blocking=True)

        optimizer.zero_grad()
        class_logits, attr_logits = model(images)

        # Attribute targets are the table rows of each sample's class.
        loss_class = criterion_class(class_logits, labels)
        loss_attr = criterion_attr(attr_logits, attributes_gpu[labels])
        loss = alpha * loss_class + (1 - alpha) * loss_attr

        loss.backward()
        optimizer.step()

        # Weight batch means by batch size so the epoch mean is per sample.
        n_batch = images.size(0)
        running['total_loss'] += loss.item() * n_batch
        running['class_loss'] += loss_class.item() * n_batch
        running['attr_loss'] += loss_attr.item() * n_batch

        preds = class_logits.max(1)[1]
        n_correct += (preds == labels).sum().item()
        n_seen += n_batch

        progress_bar.set_postfix({
            'loss': f'{loss.item():.3f}',
            'acc': f'{n_correct / n_seen:.3f}'
        })

    metrics = {name: value / n_seen for name, value in running.items()}
    metrics['accuracy'] = n_correct / n_seen

    return metrics

In [32]:
def evaluate_multitask(model, loader, criterion_class, criterion_attr,
                       full_attributes, device, alpha=0.7):
    """
    Score a two-headed model on ``loader`` without updating its weights.

    Args:
        model: Module whose forward returns ``(class_logits, attr_logits)``
        loader: Iterable of batches, either ``(images, labels)`` or
            ``(images, labels, extra)``; the third item is ignored
        criterion_class: Loss applied to ``(class_logits, labels)``
        criterion_attr: Loss applied to ``(attr_logits, target_attributes)``
        full_attributes: Per-class attribute table, indexed by zero-based label
        device: Device the batches and the attribute table are moved to
        alpha: Weight of the classification loss (attribute loss gets 1 - alpha)

    Returns:
        dict with sample-weighted means 'total_loss', 'class_loss',
        'attr_loss', the classification 'accuracy', and the per-sample
        'predictions' and 'labels' collected over the whole loader
    """
    model.eval()
    attributes_gpu = torch.FloatTensor(full_attributes).to(device)

    sums = {'total_loss': 0.0, 'class_loss': 0.0, 'attr_loss': 0.0}
    n_correct = 0
    n_seen = 0

    # Kept per sample so a confusion matrix can be built afterwards.
    all_preds, all_labels = [], []

    with torch.no_grad():
        for batch in tqdm(loader, desc="Validation", leave=False):
            if len(batch) == 2:
                images, labels = batch
            else:
                images, labels, _ = batch

            images = images.to(device)
            labels = labels.to(device)

            class_logits, attr_logits = model(images)

            # Attribute targets are the table rows of each sample's class.
            loss_class = criterion_class(class_logits, labels)
            loss_attr = criterion_attr(attr_logits, attributes_gpu[labels])
            loss = alpha * loss_class + (1 - alpha) * loss_attr

            # Weight batch means by batch size so the final mean is per sample.
            n_batch = images.size(0)
            sums['total_loss'] += loss.item() * n_batch
            sums['class_loss'] += loss_class.item() * n_batch
            sums['attr_loss'] += loss_attr.item() * n_batch

            preds = class_logits.max(1)[1]
            n_correct += (preds == labels).sum().item()
            n_seen += n_batch

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    metrics = {name: value / n_seen for name, value in sums.items()}
    metrics['accuracy'] = n_correct / n_seen
    metrics['predictions'] = all_preds
    metrics['labels'] = all_labels

    return metrics

In [None]:

# Setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
attributes_full = np.load(glob_path + 'data/attributes.npy')

# Use improved model
model = ImprovedCNNMultiTask(num_classes=200, num_attributes=312).to(device)

# Loss functions: cross-entropy for the class head, MSE for the attribute head
criterion_class = nn.CrossEntropyLoss()
criterion_attr = nn.MSELoss()

# Low learning rate for training from scratch
optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)

# Training config
num_epochs = 25  # More epochs for from-scratch training
alpha = 0.8  # Focus more on classification
best_val_acc = 0.0

# Cosine schedule spanning the whole run. T_max must equal the number of
# scheduler steps: with a shorter T_max the learning rate reaches eta_min
# early and then climbs back up for the remaining epochs.
from torch.optim.lr_scheduler import CosineAnnealingLR
scheduler = CosineAnnealingLR(optimizer, T_max=num_epochs, eta_min=1e-6)

history = {
    'train_loss': [], 'train_acc': [], 'val_loss': [], 'val_acc': [],
    'train_class_loss': [], 'train_attr_loss': [],
    'val_class_loss': [], 'val_attr_loss': []
}

print("="*70)
print(f"TRAINING IMPROVED CNN FROM SCRATCH")
print("="*70)
print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")
print(f"Learning rate: {optimizer.param_groups[0]['lr']}")
print(f"Epochs: {num_epochs}")
print(f"Alpha (class weight): {alpha}")
print("="*70 + "\n")

for epoch in range(1, num_epochs + 1):
    print(f"\nEpoch {epoch}/{num_epochs}")

    train_metrics = train_one_epoch_multitask(
        model, train_loader, optimizer, criterion_class,
        criterion_attr, attributes_full, device, alpha
    )

    val_metrics = evaluate_multitask(
        model, val_loader, criterion_class, criterion_attr,
        attributes_full, device, alpha
    )

    # Update learning rate (once per epoch)
    scheduler.step()

    # Store history
    history['train_loss'].append(train_metrics['total_loss'])
    history['train_acc'].append(train_metrics['accuracy'])
    history['val_loss'].append(val_metrics['total_loss'])
    history['val_acc'].append(val_metrics['accuracy'])
    history['train_class_loss'].append(train_metrics['class_loss'])
    history['train_attr_loss'].append(train_metrics['attr_loss'])
    history['val_class_loss'].append(val_metrics['class_loss'])
    history['val_attr_loss'].append(val_metrics['attr_loss'])

    print(f"Train: Loss={train_metrics['total_loss']:.4f}, Acc={train_metrics['accuracy']:.4f} ({train_metrics['accuracy']*100:.2f}%)")
    print(f"Val:   Loss={val_metrics['total_loss']:.4f}, Acc={val_metrics['accuracy']:.4f} ({val_metrics['accuracy']*100:.2f}%)")
    # Printed after scheduler.step(), so this is the rate for the next epoch.
    print(f"LR: {optimizer.param_groups[0]['lr']:.6f}")

    # Checkpoint whenever validation accuracy improves.
    if val_metrics['accuracy'] > best_val_acc:
        best_val_acc = val_metrics['accuracy']
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'val_acc': val_metrics['accuracy'],
            'history': history
        }, "best_improved_cnn_multitask.pt")
        print(f"‚úÖ New best! Val Acc: {best_val_acc:.4f} ({best_val_acc*100:.2f}%)")

print(f"\nüéâ Training complete! Best Val Acc: {best_val_acc:.4f} ({best_val_acc*100:.2f}%)")

TRAINING IMPROVED CNN FROM SCRATCH
Model parameters: 30,917,120
Learning rate: 5e-05
Epochs: 25
Alpha (class weight): 0.8


Epoch 1/25


Training:  46%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 23/50 [04:14<04:01,  8.93s/it, loss=4.315, acc=0.008]

In [None]:
# NOTE(review): `SimpleCNN` is not defined in any cell above this one — confirm
# it is defined before this cell runs, otherwise this line raises NameError.
CNN_model = SimpleCNN(num_classes=200).to(device)

# Define loss and optimizer
criterion = nn.CrossEntropyLoss()
CNN_optimizer = torch.optim.Adam(CNN_model.parameters(), lr=3e-4)

### Define a baseline CNN model (ResNet18)

As a strong baseline, we use a pretrained `ResNet18` from `torchvision.models`:
- We load ImageNet-pretrained weights.
- We replace the final fully-connected layer so it outputs 200 logits (one per bird class).
- The rest of the network acts as a feature extractor.

This gives a solid starting point for accuracy without heavy custom architecture work.


In [None]:
# `models` is not imported by the cells above; import it here so this cell
# also runs on a fresh kernel.
from torchvision import models

# Load pretrained ResNet18
weights = models.ResNet18_Weights.IMAGENET1K_V1
ResNet_model = models.resnet18(weights=weights)

# Replace the final layer to match 200 classes
num_features = ResNet_model.fc.in_features
ResNet_model.fc = nn.Linear(num_features, 200)

ResNet_model = ResNet_model.to(device)

# Define loss and optimizer
criterion = nn.CrossEntropyLoss()
ResNet_optimizer = torch.optim.Adam(ResNet_model.parameters(), lr=1e-4)

### Define training and validation loops

Here we implement two functions:

- `train_one_epoch`: runs one epoch over the training set, updates weights, and tracks loss and accuracy.
- `evaluate`: runs one full pass over the validation set without gradient updates, and reports loss and accuracy.

These utilities keep the main training loop clean and readable, and allow easy reuse later.

In [None]:
from tqdm import tqdm

def train_one_epoch(model, loader, optimizer, criterion, device):
    """Train ``model`` for one pass over ``loader``.

    Each batch must be ``(pixel_values, labels, attributes)``. The model is
    called as ``model(pixel_values, attributes)`` and must return class logits.
    NOTE(review): the models defined in the cells above take a single input —
    confirm which model this loop is meant for.

    Returns:
        ``(avg_loss, accuracy)`` averaged over all samples seen.
    """
    model.train()
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0

    for pixel_values, labels, attr_batch in tqdm(loader, desc="Train", leave=False):
        pixel_values = pixel_values.to(device)
        labels = labels.to(device)
        attr_batch = attr_batch.to(device)

        optimizer.zero_grad()
        logits = model(pixel_values, attr_batch)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        # Weight the batch-mean loss by batch size for a per-sample average.
        loss_sum += loss.item() * pixel_values.size(0)
        preds = logits.max(1)[1]
        n_correct += (preds == labels).sum().item()
        n_seen += labels.size(0)

    return loss_sum / n_seen, n_correct / n_seen


def evaluate(model, loader, criterion, device):
    """Evaluate ``model`` over ``loader`` with gradients disabled.

    Each batch must be ``(pixel_values, labels, attributes)``. The model is
    called as ``model(pixel_values, attributes)`` and must return class logits.

    Returns:
        ``(avg_loss, accuracy)`` averaged over all samples seen.
    """
    model.eval()
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0

    with torch.no_grad():
        for pixel_values, labels, attr_batch in tqdm(loader, desc="Val", leave=False):
            pixel_values = pixel_values.to(device)
            labels = labels.to(device)
            attr_batch = attr_batch.to(device)

            logits = model(pixel_values, attr_batch)
            loss = criterion(logits, labels)

            # Weight the batch-mean loss by batch size for a per-sample average.
            loss_sum += loss.item() * pixel_values.size(0)
            preds = logits.max(1)[1]
            n_correct += (preds == labels).sum().item()
            n_seen += labels.size(0)

    return loss_sum / n_seen, n_correct / n_seen


### Train the CNN baseline model and monitor accuracy

We run the training for a few epochs.  
For each epoch, we log:
- Training loss and accuracy
- Validation loss and accuracy

We also keep track of the best validation accuracy and save the model weights whenever a new best score is reached.  
This gives me a first baseline performance for the bird classification task.


In [None]:
# Read the attribute matrix from the same base folder as every other data file
# in this notebook; the bare relative path pointed at a different directory.
attributes_full = np.load(glob_path + 'data/attributes.npy')

In [None]:

import matplotlib.pyplot as plt

# ============================================
# TRAINING FUNCTIONS WITH DETAILED METRICS
# ============================================

def train_one_epoch_multitask(model, loader, optimizer, criterion_class,
                              criterion_attr, full_attributes, device, alpha=0.7):
    """Train a two-headed model for one epoch and return detailed metrics.

    Args:
        model: Module whose forward returns ``(class_logits, attr_logits)``.
        loader: Iterable of batches, either ``(images, labels)`` or
            ``(images, labels, extra)``; the third item is ignored.
        optimizer: Optimizer stepped once per batch.
        criterion_class: Loss applied to ``(class_logits, labels)``.
        criterion_attr: Loss applied to ``(attr_logits, target_attributes)``.
        full_attributes: Per-class attribute table, indexed by zero-based label.
        device: Device the batches and the attribute table are moved to.
        alpha: Weight of the classification loss; the attribute loss is
            weighted by ``1 - alpha``.

    Returns:
        dict with sample-weighted means ``'total_loss'``, ``'class_loss'``,
        ``'attr_loss'`` and the classification ``'accuracy'``.
    """
    model.train()

    # Move the attribute table to the device once. Looking targets up there
    # avoids copying the labels to the host and the targets back on every batch.
    attributes_on_device = torch.FloatTensor(full_attributes).to(device)

    # Metrics tracking
    total_loss = 0.0
    total_class_loss = 0.0
    total_attr_loss = 0.0
    class_correct = 0
    total = 0

    progress_bar = tqdm(loader, desc="Training", leave=False)

    for batch in progress_bar:
        # Handle both 2-value and 3-value returns
        if len(batch) == 2:
            images, labels = batch
        else:  # len(batch) == 3
            images, labels, _ = batch
        images = images.to(device)
        labels = labels.to(device)

        # Ground-truth attributes: the table row of each sample's class
        batch_attrs = attributes_on_device[labels]

        # Forward pass
        optimizer.zero_grad()
        class_logits, attr_logits = model(images)

        # Calculate losses
        loss_class = criterion_class(class_logits, labels)
        loss_attr = criterion_attr(attr_logits, batch_attrs)
        loss = alpha * loss_class + (1 - alpha) * loss_attr

        # Backward pass
        loss.backward()
        optimizer.step()

        # Weight batch means by batch size so the epoch mean is per sample
        batch_size = images.size(0)
        total_loss += loss.item() * batch_size
        total_class_loss += loss_class.item() * batch_size
        total_attr_loss += loss_attr.item() * batch_size

        _, preds = class_logits.max(1)
        class_correct += (preds == labels).sum().item()
        total += batch_size

        # Update progress bar with the running accuracy
        current_acc = class_correct / total
        progress_bar.set_postfix({
            'loss': f'{loss.item():.3f}',
            'acc': f'{current_acc:.3f}'
        })

    # Calculate average metrics
    metrics = {
        'total_loss': total_loss / total,
        'class_loss': total_class_loss / total,
        'attr_loss': total_attr_loss / total,
        'accuracy': class_correct / total
    }

    return metrics


def evaluate_multitask(model, loader, criterion_class, criterion_attr,
                       full_attributes, device, alpha=0.7):
    """Evaluate the two-headed model on ``loader`` without updating weights.

    The model is switched to eval mode and run under ``torch.no_grad()``. For
    each batch it must return ``(class_logits, attr_logits)``; attribute targets
    are looked up by indexing ``full_attributes`` with the batch labels.

    Args:
        model: Network called as ``model(images)``.
        loader: Iterable of ``(images, labels)`` or ``(images, labels, extra)``
            batches; a third element is ignored.
        criterion_class: Loss applied to ``(class_logits, labels)``.
        criterion_attr: Loss applied to ``(attr_logits, attribute_targets)``.
        full_attributes: Array indexed by label value to fetch attribute rows.
        device: Device the images, labels and attribute targets are moved to.
        alpha: Weight of the classification loss in the combined loss.

    Returns:
        dict with sample-weighted means ``'total_loss'``, ``'class_loss'``,
        ``'attr_loss'``, the ``'accuracy'``, and the flat per-sample lists
        ``'predictions'`` and ``'labels'`` in loader order.
    """
    model.eval()

    # Metrics tracking
    total_loss = 0.0
    total_class_loss = 0.0
    total_attr_loss = 0.0
    class_correct = 0
    total = 0

    all_preds = []
    all_labels = []

    with torch.no_grad():
        # The bar used to be labelled "Training" (copied from the train loop),
        # which mislabelled the validation pass in the notebook output.
        progress_bar = tqdm(loader, desc="Evaluating", leave=False)

        for batch in progress_bar:
            # Handle both 2-value and 3-value returns
            if len(batch) == 2:
                images, labels = batch
            else:  # len(batch) == 3
                images, labels, _ = batch
            images = images.to(device)
            labels = labels.to(device)

            # Get ground truth attributes: one row of `full_attributes` per label
            batch_attrs = full_attributes[labels.cpu().numpy()]
            batch_attrs = torch.FloatTensor(batch_attrs).to(device)

            # Forward pass
            class_logits, attr_logits = model(images)

            # Calculate losses
            loss_class = criterion_class(class_logits, labels)
            loss_attr = criterion_attr(attr_logits, batch_attrs)
            loss = alpha * loss_class + (1 - alpha) * loss_attr

            # Track metrics (weighted by batch size so the mean is per sample)
            batch_size = images.size(0)
            total_loss += loss.item() * batch_size
            total_class_loss += loss_class.item() * batch_size
            total_attr_loss += loss_attr.item() * batch_size

            _, preds = class_logits.max(1)
            class_correct += (preds == labels).sum().item()
            total += batch_size

            # Store for confusion matrix later
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Calculate average metrics
    metrics = {
        'total_loss': total_loss / total,
        'class_loss': total_class_loss / total,
        'attr_loss': total_attr_loss / total,
        'accuracy': class_correct / total,
        'predictions': all_preds,
        'labels': all_labels
    }

    return metrics


# ============================================
# TRAINING LOOP WITH FULL METRICS DISPLAY
# ============================================



In [None]:
# Train the multi-task CNN for `num_epochs` epochs, record per-epoch metrics in
# `history`, and checkpoint the weights whenever validation accuracy improves.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}\n")

# Load attributes
# NOTE(review): earlier cells load this file via glob_path + 'data/attributes.npy';
# confirm this relative path resolves from the notebook's working directory.
attributes_full = np.load('data/attributes.npy')

# Initialize model
model = CNNMultiTask(num_classes=200, num_attributes=312).to(device)

# Loss functions
criterion_class = nn.CrossEntropyLoss()
criterion_attr = nn.MSELoss()

# Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

# Training configuration
num_epochs = 15
alpha = 0.7  # Weight for classification loss
best_val_acc = 0.0

# Metrics history for plotting (one entry appended per epoch)
history = {
    'train_loss': [],
    'train_acc': [],
    'val_loss': [],
    'val_acc': [],
    'train_class_loss': [],
    'train_attr_loss': [],
    'val_class_loss': [],
    'val_attr_loss': []
}

print("="*70)
print(f"{'TRAINING MULTI-TASK CNN':^70}")
print("="*70)
print(f"Model: CNNMultiTask")
print(f"Num classes: 200")
print(f"Num attributes: 312")
print(f"Loss weight (alpha): {alpha}")
print(f"Learning rate: {optimizer.param_groups[0]['lr']}")
print(f"Batch size: {train_loader.batch_size}")
print(f"Train samples: {len(train_dataset)}")
print(f"Val samples: {len(val_dataset)}")
print("="*70 + "\n")

# Training loop
for epoch in range(1, num_epochs + 1):
    print(f"\n{'='*70}")
    print(f"Epoch {epoch}/{num_epochs}")
    print(f"{'='*70}")

    # Train
    train_metrics = train_one_epoch_multitask(
        model, train_loader, optimizer, criterion_class,
        criterion_attr, attributes_full, device, alpha
    )

    # Validate
    val_metrics = evaluate_multitask(
        model, val_loader, criterion_class, criterion_attr,
        attributes_full, device, alpha
    )

    # Store history
    history['train_loss'].append(train_metrics['total_loss'])
    history['train_acc'].append(train_metrics['accuracy'])
    history['val_loss'].append(val_metrics['total_loss'])
    history['val_acc'].append(val_metrics['accuracy'])
    history['train_class_loss'].append(train_metrics['class_loss'])
    history['train_attr_loss'].append(train_metrics['attr_loss'])
    history['val_class_loss'].append(val_metrics['class_loss'])
    history['val_attr_loss'].append(val_metrics['attr_loss'])

    # Print metrics
    print(f"\n📊 TRAINING METRICS:")
    print(f"   Total Loss:    {train_metrics['total_loss']:.4f}")
    print(f"   Class Loss:    {train_metrics['class_loss']:.4f}")
    print(f"   Attr Loss:     {train_metrics['attr_loss']:.4f}")
    print(f"   Accuracy:      {train_metrics['accuracy']:.4f} ({train_metrics['accuracy']*100:.2f}%)")

    print(f"\n📊 VALIDATION METRICS:")
    print(f"   Total Loss:    {val_metrics['total_loss']:.4f}")
    print(f"   Class Loss:    {val_metrics['class_loss']:.4f}")
    print(f"   Attr Loss:     {val_metrics['attr_loss']:.4f}")
    print(f"   Accuracy:      {val_metrics['accuracy']:.4f} ({val_metrics['accuracy']*100:.2f}%)")

    # Check if best model
    if val_metrics['accuracy'] > best_val_acc:
        # Remember the old record so the reported gain is measured against the
        # previous BEST, not against whatever the previous epoch scored.
        prev_best_val_acc = best_val_acc
        best_val_acc = val_metrics['accuracy']

        # Save checkpoint
        checkpoint = {
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'val_acc': val_metrics['accuracy'],
            'val_loss': val_metrics['total_loss'],
            'train_acc': train_metrics['accuracy'],
            'history': history
        }
        torch.save(checkpoint, "best_multitask_cnn.pt")

        print(f"\n✅ NEW BEST MODEL SAVED!")
        print(f"   Validation Accuracy: {val_metrics['accuracy']:.4f} ({val_metrics['accuracy']*100:.2f}%)")
        print(f"   Improvement: +{(best_val_acc - prev_best_val_acc)*100:.2f}%")
    else:
        # The best accuracy is always one of the recorded values, so its
        # position in the history gives the (1-based) epoch that produced it.
        print(f"\n   Best Val Acc: {best_val_acc:.4f} (Epoch {history['val_acc'].index(best_val_acc) + 1})")

print("\n" + "="*70)
print(f"{'TRAINING COMPLETE':^70}")
print("="*70)
print(f"\n🎉 Best Validation Accuracy: {best_val_acc:.4f} ({best_val_acc*100:.2f}%)")
print(f"📁 Model saved to: best_multitask_cnn.pt\n")

In [None]:
# Short multi-task training run that uses BCE-with-logits for the attribute head.
# Relies on `attributes_full`, `train_loader` and `device` from earlier cells.
num_epochs = 5
best_val_acc = 0.0

model = CNNMultiTask(num_classes=200, num_attributes=312).to(device)

criterion_class = nn.CrossEntropyLoss()
criterion_attr = nn.BCEWithLogitsLoss()  # Or MSELoss if attributes are continuous
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

for epoch in range(num_epochs):
    model.train()

    # The third batch element is not used. It is bound to `_` so that it does
    # not overwrite the notebook-level `attributes` array loaded at the top of
    # the notebook, which the previous loop variable name clobbered.
    for images, labels, _ in train_loader:
        images, labels = images.to(device), labels.to(device)

        # Get ground truth attributes for these classes
        true_attributes = torch.FloatTensor(attributes_full[labels.cpu().numpy()]).to(device)

        optimizer.zero_grad()

        # Forward pass
        class_logits, attr_logits = model(images)

        # Two losses
        loss_class = criterion_class(class_logits, labels)
        loss_attr = criterion_attr(attr_logits, true_attributes)

        # Combined loss (you can weight these)
        loss = 0.7 * loss_class + 0.3 * loss_attr

        loss.backward()
        optimizer.step()

In [None]:
# Train the CNN baseline and keep the weights of the epoch with the highest
# validation accuracy.
num_epochs = 5
best_val_acc = 0.0

for epoch in range(1, num_epochs + 1):
    print(f"Epoch {epoch}/{num_epochs}")

    train_loss, train_acc = train_one_epoch(
        CNN_model, train_loader, CNN_optimizer, criterion, device
    )
    val_loss, val_acc = evaluate(CNN_model, val_loader, criterion, device)

    print(f"  Train  | loss: {train_loss:.4f}, acc: {train_acc:.4f}")
    print(f"  Val    | loss: {val_loss:.4f}, acc: {val_acc:.4f}")

    # Nothing to save unless this epoch beats the best validation accuracy so far.
    if val_acc <= best_val_acc:
        continue

    # New best CNN_model: remember the score and overwrite the checkpoint.
    best_val_acc = val_acc
    torch.save(CNN_model.state_dict(), "best_CNN_baseline.pt")
    print(f"New best model saved with val_acc = {best_val_acc:.4f}")


### Train the ResNet18 baseline model and monitor accuracy


In [None]:
# Train the ResNet18 baseline and keep the weights of the epoch with the highest
# validation accuracy.
num_epochs = 5
best_val_acc = 0.0

for epoch in range(1, num_epochs + 1):
    print(f"Epoch {epoch}/{num_epochs}")

    train_loss, train_acc = train_one_epoch(
        ResNet_model, train_loader, ResNet_optimizer, criterion, device
    )
    val_loss, val_acc = evaluate(ResNet_model, val_loader, criterion, device)

    print(f"  Train  | loss: {train_loss:.4f}, acc: {train_acc:.4f}")
    print(f"  Val    | loss: {val_loss:.4f}, acc: {val_acc:.4f}")

    # Nothing to save unless this epoch beats the best validation accuracy so far.
    if val_acc <= best_val_acc:
        continue

    # New best ResNet_model: remember the score and overwrite the checkpoint.
    best_val_acc = val_acc
    torch.save(ResNet_model.state_dict(), "best_resnet18_baseline.pt")
    print(f"New best model saved with val_acc = {best_val_acc:.4f}")


### Re-load the ResNet18 model (No need to run)

In [None]:
# Rebuild the ResNet18 architecture exactly as it was trained (200-way head) so
# that the saved state dict can be loaded into it.
ResNet_model = models.resnet18(weights=None)  # architecture only, no pretrained weights
num_features = ResNet_model.fc.in_features
ResNet_model.fc = nn.Linear(num_features, 200)

# Restore the weights written by the ResNet18 training loop; `map_location`
# places the tensors on the current `device` while loading.
ResNet_model.load_state_dict(torch.load("best_resnet18_baseline.pt", map_location=device))
ResNet_model = ResNet_model.to(device)
# Switch to inference mode (also the cell's displayed output).
ResNet_model.eval()

## Load `Falconsai/nsfw_image_detection` and adapt it for 200 bird classes

The `Falconsai/nsfw_image_detection` model is a ViT-based image classifier originally trained for 2 classes
(`normal` vs `nsfw`). I reuse the pretrained backbone and:

1. Load the model and its image processor from Hugging Face.
2. Replace the final classification layer (`classifier`) so that it outputs 200 logits (one per bird class).
3. Update the config metadata (`num_labels`, `id2label`, `label2id`) for consistency.

This gives me a strong transformer-based model specialized for my 200 bird classes.


In [None]:
## !pip install transformers

from transformers import AutoModelForImageClassification, AutoImageProcessor
import torch
import torch.nn as nn


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

model_name = "Falconsai/nsfw_image_detection"

# The image processor and the classifier are both loaded from the same
# Hugging Face checkpoint name.
processor = AutoImageProcessor.from_pretrained(model_name)
vit_model = AutoModelForImageClassification.from_pretrained(model_name)

print("Original num_labels:", vit_model.config.num_labels)

# Swap the classification head for a fresh 200-way linear layer that keeps the
# input width of the original head.
num_features = vit_model.classifier.in_features
vit_model.classifier = nn.Linear(num_features, 200)

# Keep the model/config metadata in line with the new head:
# ids 0..199 map to the names "class_1".."class_200".
vit_model.config.num_labels = 200
vit_model.num_labels = 200
vit_model.config.id2label = dict(enumerate(f"class_{n}" for n in range(1, 201)))
vit_model.config.label2id = {
    name: idx for idx, name in vit_model.config.id2label.items()
}

vit_model = vit_model.to(device)
print("Adapted num_labels:", vit_model.config.num_labels)


### Create a Dataset that uses the ViT image processor

For the ViT model, I no longer use the `torchvision` transforms.
Instead, I use the Hugging Face `AutoImageProcessor`, which:

- Resizes the image to the correct resolution (224×224 for ViT)
- Converts it to a tensor
- Applies the exact normalization used during pretraining

I define a `BirdsDatasetViT` class that:
- Takes the same `train_df` / `val_df` as before (with `image_path` and `label`)
- Loads each image with PIL
- Runs the image through the processor to get `pixel_values`
- Returns `(pixel_values, label)` where labels are 0–199


In [None]:
from torch.utils.data import Dataset
from PIL import Image
import os

class BirdsDatasetViT(Dataset):
    """Labelled image dataset whose images are preprocessed by a Hugging Face processor.

    Args:
        df: DataFrame with one row per image; its index is reset on construction.
        processor: Callable invoked as ``processor(images=img, return_tensors="pt")``
            whose result exposes ``"pixel_values"``.
        base_dir: Directory that the stored image paths are joined onto.
        label_col: Column holding the class label.
        path_col: Column holding the image path.
    """

    def __init__(self, df, processor, base_dir=".", label_col="label", path_col="image_path"):
        self.df = df.reset_index(drop=True)
        self.processor = processor
        self.base_dir = base_dir
        self.label_col = label_col
        self.path_col = path_col

    def __len__(self):
        """Return the number of rows (images) in the dataset."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(pixel_values, label)`` for row ``idx``.

        Raises:
            FileNotFoundError: If the resolved image path does not exist.
        """
        record = self.df.iloc[idx]

        # Stored paths may begin with "/"; strip it so that the join below stays
        # relative to `base_dir` instead of becoming an absolute path.
        img_path = os.path.join(self.base_dir, str(record[self.path_col]).lstrip("/"))
        if not os.path.exists(img_path):
            raise FileNotFoundError(f"Image not found: {img_path}")

        rgb_image = Image.open(img_path).convert("RGB")

        # The processor returns a batch of one; squeeze(0) drops that leading
        # batch dimension.
        encoded = self.processor(images=rgb_image, return_tensors="pt")
        pixel_values = encoded["pixel_values"].squeeze(0)

        # Shift the stored label down by one (1–200 → 0–199 per the notebook's labelling).
        return pixel_values, int(record[self.label_col]) - 1


### Create DataLoaders for the ViT-based model

Now I wrap the `BirdsDatasetViT` in PyTorch DataLoaders.
On macOS, using `num_workers=0` avoids multiprocess issues while debugging.
These loaders will feed ViT-ready `pixel_values` and labels into the training loop.


In [None]:
from torch.utils.data import DataLoader

# `base_dir` must be the folder that contains `train_images/`.
# When the notebook already runs from the project root, "." is fine.
base_dir = "."

batch_size = 32

# Training split: batches are reshuffled on every pass.
train_dataset_vit = BirdsDatasetViT(train_images, processor=processor, base_dir=base_dir)
train_loader_vit = DataLoader(train_dataset_vit, batch_size=batch_size, shuffle=True, num_workers=0)

# Validation split: fixed order.
val_dataset_vit = BirdsDatasetViT(val_images, processor=processor, base_dir=base_dir)
val_loader_vit = DataLoader(val_dataset_vit, batch_size=batch_size, shuffle=False, num_workers=0)

### Training and validation loops for the ViT model

The training logic is the same as before, but the forward pass changes slightly:

- For ResNet: `outputs = model(images)`
- For ViT (Hugging Face): `outputs = vit_model(pixel_values=images)`

From `outputs`, I use `outputs.logits` and compute cross-entropy loss as usual.
The rest of the loop (accuracy computation, backprop, logging) is unchanged.


In [None]:
from tqdm import tqdm

def train_one_epoch_vit(model, loader, optimizer, criterion, device):
    """Train ``model`` for one pass over ``loader``.

    The model is called as ``model(pixel_values=...)`` and its ``.logits``
    attribute is fed to ``criterion`` together with the labels.

    Args:
        model: Network whose output exposes ``.logits``.
        loader: Iterable of ``(pixel_values, labels)`` batches.
        optimizer: Optimizer, zeroed and stepped once per batch.
        criterion: Loss applied to ``(logits, labels)``.
        device: Device the batch tensors are moved to.

    Returns:
        ``(avg_loss, accuracy)``: sample-weighted mean loss and the fraction of
        correctly classified samples.
    """
    model.train()
    loss_sum, n_correct, n_seen = 0.0, 0, 0

    for pixel_values, labels in tqdm(loader, desc="Train (ViT)", leave=False):
        pixel_values, labels = pixel_values.to(device), labels.to(device)

        optimizer.zero_grad()
        logits = model(pixel_values=pixel_values).logits

        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        # Weight the batch loss by batch size so the final mean is per sample.
        loss_sum += loss.item() * pixel_values.size(0)
        n_correct += (logits.max(1)[1] == labels).sum().item()
        n_seen += labels.size(0)

    return loss_sum / n_seen, n_correct / n_seen


def evaluate_vit(model, loader, criterion, device):
    """Evaluate ``model`` on ``loader`` without computing gradients.

    The model is put in eval mode and called as ``model(pixel_values=...)``;
    its ``.logits`` attribute is scored against the labels.

    Args:
        model: Network whose output exposes ``.logits``.
        loader: Iterable of ``(pixel_values, labels)`` batches.
        criterion: Loss applied to ``(logits, labels)``.
        device: Device the batch tensors are moved to.

    Returns:
        ``(avg_loss, accuracy)``: sample-weighted mean loss and the fraction of
        correctly classified samples.
    """
    model.eval()
    loss_sum, n_correct, n_seen = 0.0, 0, 0

    with torch.no_grad():
        for pixel_values, labels in tqdm(loader, desc="Val (ViT)", leave=False):
            pixel_values, labels = pixel_values.to(device), labels.to(device)

            logits = model(pixel_values=pixel_values).logits
            loss = criterion(logits, labels)

            # Weight the batch loss by batch size so the final mean is per sample.
            loss_sum += loss.item() * pixel_values.size(0)
            n_correct += (logits.max(1)[1] == labels).sum().item()
            n_seen += labels.size(0)

    return loss_sum / n_seen, n_correct / n_seen


### Fine-tune the Falconsai ViT as a "ceiling" model

Now I fine-tune the adapted ViT model on the bird dataset.
This model:

- Starts from a powerful Vision Transformer backbone
- Has a new classification head for 200 bird classes
- Is expected to perform at least as well as ResNet18, and possibly better,
  giving me an approximate performance "ceiling" for this assignment.

I reuse the same training hyperparameters as a starting point and monitor train/validation accuracy.


In [None]:
# Fine-tune the adapted ViT and keep the weights of the epoch with the highest
# validation accuracy.
criterion_vit = nn.CrossEntropyLoss()
optimizer_vit = torch.optim.Adam(vit_model.parameters(), lr=1e-4)

num_epochs_vit = 5
best_val_acc_vit = 0.0

for epoch in range(1, num_epochs_vit + 1):
    print(f"Epoch {epoch}/{num_epochs_vit} (ViT)")

    train_loss, train_acc = train_one_epoch_vit(
        vit_model, train_loader_vit, optimizer_vit, criterion_vit, device
    )
    val_loss, val_acc = evaluate_vit(vit_model, val_loader_vit, criterion_vit, device)

    print(f"  Train (ViT) | loss: {train_loss:.4f}, acc: {train_acc:.4f}")
    print(f"  Val   (ViT) | loss: {val_loss:.4f}, acc: {val_acc:.4f}")

    # Nothing to save unless this epoch beats the best validation accuracy so far.
    if val_acc <= best_val_acc_vit:
        continue

    best_val_acc_vit = val_acc
    torch.save(vit_model.state_dict(), "vit_nsfw_birds_state_dict.pt")
    print(f" New best ViT model saved with val_acc = {best_val_acc_vit:.4f}")


In [None]:
class BirdsTestDataset(Dataset):
    """Unlabelled image dataset for inference.

    Each item is the preprocessed image only. Preprocessing is done by
    ``processor`` when one is given, otherwise by ``transform``.

    Args:
        df: DataFrame with one row per image; its index is reset on construction.
        transform: Callable applied to the RGB image when no processor is set.
        processor: Callable invoked as ``processor(images=img, return_tensors="pt")``
            whose result exposes ``"pixel_values"``.
        path_col: Column holding the image path.

    Raises:
        ValueError: If neither ``transform`` nor ``processor`` is provided.
    """

    def __init__(self, df, transform=None, processor=None, path_col="image_path"):
        self.df = df.reset_index(drop=True)
        self.transform = transform
        self.processor = processor
        self.path_col = path_col

        if processor is None and transform is None:
            raise ValueError("Must provide either transform or processor")

    def __len__(self):
        """Return the number of rows (images) in the dataset."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the preprocessed image for row ``idx``."""
        img = Image.open(self.df.iloc[idx][self.path_col]).convert("RGB")

        # The processor wins when both are configured.
        if self.processor is None:
            return self.transform(img)

        # The processor returns a batch of one; squeeze(0) drops the batch dimension.
        encoded = self.processor(images=img, return_tensors="pt")
        return encoded["pixel_values"].squeeze(0)

In [None]:
# Sample-submission table for the test set.
test_df = pd.read_csv("data/test_images_sample.csv")  # Should have columns: 'id', 'image_path' (per the original note; verify)
# Table of test image paths; later cells read its "image_path" column.
test_path = pd.read_csv("data/test_images_path.csv")
# Display the table as the cell output.
test_path

In [None]:

import os
# Fix paths (adjust based on your file structure)
# Each stored path is made relative (leading "/" stripped) and placed under the
# local "data" folder. Paths that already carry that prefix are left untouched,
# so re-running this cell no longer produces "data/data/...".
# NOTE(review): this assumes the raw CSV paths never start with "data/" themselves.
test_path["image_path"] = test_path["image_path"].apply(
    lambda x: x if x.startswith(os.path.join("data", ""))
    else os.path.join("data", x.lstrip("/"))
)

# Verify first path exists
print(f"First test image: {test_path.iloc[0]['image_path']}")
print(f"Exists? {os.path.exists(test_path.iloc[0]['image_path'])}")

print(f"\nTest set size: {len(test_path)}")
test_path.head()

In [None]:
def generate_submission_cnn_with_attrs(model, test_df, transform, device,
                                       output_file="submission.csv",
                                       checkpoint_path=None, num_attributes=312):
    """Generate a Kaggle submission CSV for a model called as ``model(images, attrs)``.

    True attributes are unknown at test time, so the model is fed an all-zeros
    attribute tensor for every image. Predicted class indices are shifted from
    0-based to 1-based before they are written out.

    Args:
        model: Network called as ``model(pixel_values, attrs)`` that returns
            class logits.
        test_df: DataFrame with an ``'id'`` column and the image-path column
            read by ``BirdsTestDataset``.
        transform: torchvision-style transform applied to each test image.
        device: torch.device to run inference on.
        output_file: Name of the output CSV file.
        checkpoint_path: Optional path to saved weights to load before
            predicting. Accepts either a plain state dict or a checkpoint dict
            holding the weights under ``'model_state_dict'``. When ``None``
            (default) the model is used as passed in.
        num_attributes: Width of the dummy attribute tensor (default 312).

    Returns:
        DataFrame with columns ``'id'`` and ``'label'``; it is also written to
        ``output_file``.

    Raises:
        KeyError: If ``test_df`` has no ``'id'`` column (checked before
            inference so that no work is wasted).
    """
    # Fail fast: the submission needs the ids, and discovering that they are
    # missing only after the whole inference pass would waste the run.
    if 'id' not in test_df.columns:
        raise KeyError("test_df must contain an 'id' column")

    # Create test dataset and loader
    test_dataset = BirdsTestDataset(test_df, transform=transform)
    test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

    # Optionally restore trained weights; both checkpoint layouts written by
    # this notebook (raw state dict, or dict with 'model_state_dict') are handled.
    if checkpoint_path is not None:
        state = torch.load(checkpoint_path, map_location=device)
        if isinstance(state, dict) and 'model_state_dict' in state:
            state = state['model_state_dict']
        model.load_state_dict(state)

    model.to(device)
    model.eval()

    all_predictions = []

    print("Generating predictions...")
    with torch.no_grad():
        for pixel_values in tqdm(test_loader):
            pixel_values = pixel_values.to(device)
            batch_size = pixel_values.size(0)

            # The true class (and hence its attributes) is unknown for test
            # images, so an all-zeros placeholder is passed instead.
            dummy_attrs = torch.zeros(batch_size, num_attributes).to(device)
            logits = model(pixel_values, dummy_attrs)

            # Get predicted classes
            preds = torch.argmax(logits, dim=1)

            # Convert 0-indexed to 1-indexed (1-200)
            preds = preds.cpu().numpy() + 1
            all_predictions.extend(preds)

    # Create submission DataFrame
    submission_df = pd.DataFrame({
        'id': test_df['id'],
        'label': all_predictions
    })

    # Save to CSV
    submission_df.to_csv(output_file, index=False)
    print(f"✅ Saved predictions to {output_file}")

    return submission_df

In [None]:
cnn_model = SimpleCNN(num_classes=200).to(device)

# Restore the trained baseline weights before predicting; without this the
# submission would come from a randomly initialised network.
# NOTE(review): assumes "best_CNN_baseline.pt" (written by the CNN baseline
# training loop) was saved from a SimpleCNN with the same architecture - confirm.
cnn_model.load_state_dict(torch.load("best_CNN_baseline.pt", map_location=device))

# NOTE(review): generate_submission_cnn_with_attrs calls model(images, attrs);
# confirm that SimpleCNN.forward accepts the extra attribute argument.
submission_cnn = generate_submission_cnn_with_attrs(
    model=cnn_model,
    test_df=test_path,
    transform=val_transform,
    device=device,
    output_file="submission_cnn_baseline.csv"
)

In [None]:
# Read back the submission file written by the previous cell.
predictions_final = pd.read_csv('submission_cnn_baseline.csv')

In [None]:
# Display the loaded predictions table as the cell output.
predictions_final