<a href="https://colab.research.google.com/github/bibhashthapa7/CSCI-335-ML/blob/main/ML_Project_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Bird Species Fine Tuning Project
Author: Bibhash Thapa

# Step 1: Data preview

In [1]:
# Make sure "public_tests.zip" folder is uploaded to the files section !!

!unzip public_tests.zip

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: 00_test_img_input/test/images/0672.jpg  
  inflating: 00_test_img_input/test/images/2085.jpg  
  inflating: 00_test_img_input/test/images/0106.jpg  
  inflating: 00_test_img_input/test/images/1056.jpg  
  inflating: 00_test_img_input/test/images/1159.jpg  
  inflating: 00_test_img_input/test/images/0256.jpg  
  inflating: 00_test_img_input/test/images/2269.jpg  
  inflating: 00_test_img_input/test/images/0664.jpg  
  inflating: 00_test_img_input/test/images/1981.jpg  
  inflating: 00_test_img_input/test/images/0929.jpg  
  inflating: 00_test_img_input/test/images/1888.jpg  
  inflating: 00_test_img_input/test/images/0957.jpg  
  inflating: 00_test_img_input/test/images/0389.jpg  
  inflating: 00_test_img_input/test/images/0323.jpg  
  inflating: 00_test_img_input/test/images/0717.jpg  
  inflating: 00_test_img_input/test/images/0186.jpg  
  inflating: 00_test_img_input/test/images/0468.jpg  
  inflating: 00_t

In [1]:
from pathlib import Path
import pandas as pd

csv_file = "00_test_img_input/train/gt.csv"
csv_path = Path(csv_file)
df = pd.read_csv(csv_path)

print(f"Total images: {len(df)}\n")
print("First 5 records:")
display(df.head())

print("\nClass distribution:")
display(df['class_id'].value_counts().sort_index())

Total images: 2500

First 5 records:


Unnamed: 0,filename,class_id
0,0000.jpg,0
1,0001.jpg,0
2,0002.jpg,0
3,0003.jpg,0
4,0004.jpg,0



Class distribution:


Unnamed: 0_level_0,count
class_id,Unnamed: 1_level_1
0,50
1,50
2,50
3,50
4,50
5,50
6,50
7,50
8,50
9,50


# Step 2: Dataset and DataLoaders

In [2]:
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision import transforms
from PIL import Image

img_dir = "00_test_img_input/train/images"

# Define hyper-parameters
batch_size = 32
val_ratio = 0.2
num_workers = 8

# ImageNet-style normalization
mean = [0.485, 0.456, 0.406]
std  = [0.229, 0.224, 0.225]

In [3]:
# Define transforms
train_transform = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(0.2,0.2,0.2,0.1),
    transforms.RandomRotation(15),
    transforms.ToTensor(),
    transforms.Normalize(mean,std),
    transforms.RandomErasing(p=0.1, scale=(0.02,0.25))
])
val_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean,std)
])


In [4]:
# Custom Dataset
class BirdDataset(Dataset):
    def __init__(self, csv_path, image_folder, transform=None):
        self.df = pd.read_csv(csv_path)
        self.folder = Path(image_folder)
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        img_path = self.folder / row["filename"]
        img = Image.open(img_path).convert("RGB")
        label = int(row["class_id"])
        if self.transform:
            img = self.transform(img)
        return img, label

In [5]:
# Instantiate and split
full_ds = BirdDataset(csv_file, img_dir, transform=train_transform)
val_size = int(val_ratio * len(full_ds))
train_size = len(full_ds) - val_size
train_ds, val_ds = random_split(full_ds, [train_size, val_size])

# Override the validation transform
val_ds.dataset.transform = val_transform

In [6]:
# Create DataLoaders
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=num_workers, pin_memory=True)
val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False, num_workers=num_workers, pin_memory=True)

# Verify that it works
imgs, lbls = next(iter(train_loader))
print(f"Train batch: images {imgs.shape}, labels {lbls.shape}")



Train batch: images torch.Size([32, 3, 224, 224]), labels torch.Size([32])


# Step 3: Model and Optimizer

In [7]:
import torch.nn as nn
import torch.optim as optim
from torchvision import models

# Compute number of target classes from the CSV
num_classes = df['class_id'].nunique()
print(f"Number of classes = {num_classes}")

# Load MobileNetV2 (pretrained on ImageNet)
model = models.mobilenet_v2(pretrained=True)

# Freeze all features, then unfreeze last few blocks
for param in model.features.parameters():
    param.requires_grad = False
for param in model.features[14:].parameters():
    param.requires_grad = True

# Replace pretrained head with a custom two‐layer classifier for better regularization
model.classifier = nn.Sequential(
    nn.Dropout(0.2),
    nn.Linear(model.last_channel, 512),
    nn.BatchNorm1d(512),
    nn.ReLU(inplace=True),
    nn.Dropout(0.5),
    nn.Linear(512, num_classes)
)

# Move model to GPU if available for faster training, otherwise run on CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# AdamW + weight decay
optimizer = optim.AdamW(filter(lambda p: p.requires_grad, model.parameters()),
                        lr=1e-4, weight_decay=1e-4)
criterion = nn.CrossEntropyLoss()

# Cosine LR scheduler
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=15)

print(model)

Number of classes = 50
MobileNetV2(
  (features): Sequential(
    (0): Conv2dNormActivation(
      (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU6(inplace=True)
    )
    (1): InvertedResidual(
      (conv): Sequential(
        (0): Conv2dNormActivation(
          (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
          (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): ReLU6(inplace=True)
        )
        (1): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (2): InvertedResidual(
      (conv): Sequential(
        (0): Conv2dNormActivation(
          (0): Conv2d(16, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (1



# Step 4: Training and Validation

In [10]:
import time
import copy

best_acc = 0.0
best_wts = copy.deepcopy(model.state_dict())

for epoch in range(1, 16):
    start = time.time()

    # Train
    model.train()
    running_loss = correct = total = 0
    for imgs, labels in train_loader:
        imgs, labels = imgs.to(device), labels.to(device)
        optimizer.zero_grad()
        with torch.cuda.amp.autocast(enabled=(device.type=="cuda")):
            outputs = model(imgs)
            loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()*imgs.size(0)
        preds = outputs.argmax(1)
        correct += (preds==labels).sum().item()
        total += labels.size(0)
    train_loss, train_acc = running_loss/total, correct/total

    # Validation
    model.eval()
    running_loss = correct = total = 0
    with torch.no_grad():
        for imgs, labels in val_loader:
            imgs, labels = imgs.to(device), labels.to(device)
            outputs = model(imgs)
            loss = criterion(outputs, labels)
            running_loss += loss.item()*imgs.size(0)
            preds = outputs.argmax(1)
            correct += (preds==labels).sum().item()
            total += labels.size(0)
    val_loss, val_acc = running_loss/total, correct/total

    # Step scheduler
    scheduler.step()

    print(f"Epoch {epoch:2d} | "
          f"Train Loss {train_loss:.4f}, Accuracy {train_acc:.3f} | "
          f"Validation   Loss {val_loss:.4f}, Accuracy {val_acc:.3f} | "
          f"Time {(time.time()-start):.1f}s")

    if val_acc > best_acc:
        best_acc = val_acc
        best_wts = copy.deepcopy(model.state_dict())
        torch.save(best_wts, "birds_model.pt")
    if best_acc >= 0.85:
        print(f"\nReached validation accuracy {best_acc:.3f} ≥ 0.85 — stopping early.")
        break

print(f"\nTraining complete. Best validation accuracy: {best_acc:.3f}")

  with torch.cuda.amp.autocast(enabled=(device.type=="cuda")):


Epoch  1 | Train Loss 1.6637, Accuracy 0.702 | Validation   Loss 1.5436, Accuracy 0.710 | Time 14.5s
Epoch  2 | Train Loss 1.3154, Accuracy 0.801 | Validation   Loss 1.3405, Accuracy 0.746 | Time 14.7s
Epoch  3 | Train Loss 1.0726, Accuracy 0.844 | Validation   Loss 1.1905, Accuracy 0.772 | Time 14.6s
Epoch  4 | Train Loss 0.8577, Accuracy 0.884 | Validation   Loss 1.1067, Accuracy 0.768 | Time 16.1s
Epoch  5 | Train Loss 0.7123, Accuracy 0.922 | Validation   Loss 1.0303, Accuracy 0.782 | Time 14.3s
Epoch  6 | Train Loss 0.5986, Accuracy 0.940 | Validation   Loss 0.9837, Accuracy 0.790 | Time 14.6s
Epoch  7 | Train Loss 0.5127, Accuracy 0.952 | Validation   Loss 0.9534, Accuracy 0.792 | Time 14.4s
Epoch  8 | Train Loss 0.4365, Accuracy 0.967 | Validation   Loss 0.9266, Accuracy 0.790 | Time 14.5s
Epoch  9 | Train Loss 0.3988, Accuracy 0.971 | Validation   Loss 0.9215, Accuracy 0.794 | Time 14.4s
Epoch 10 | Train Loss 0.3644, Accuracy 0.979 | Validation   Loss 0.9047, Accuracy 0.788 | T

# Step 5: Evaluate

In [11]:
model.load_state_dict(torch.load("birds_model.pt"))
model.eval()

MobileNetV2(
  (features): Sequential(
    (0): Conv2dNormActivation(
      (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU6(inplace=True)
    )
    (1): InvertedResidual(
      (conv): Sequential(
        (0): Conv2dNormActivation(
          (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
          (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): ReLU6(inplace=True)
        )
        (1): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (2): InvertedResidual(
      (conv): Sequential(
        (0): Conv2dNormActivation(
          (0): Conv2d(16, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (1): BatchNorm2d(96, eps=

In [12]:
def topk_accuracy(output, target, topk=(1,5)):
    maxk = max(topk)
    batch_size = target.size(0)
    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))
    res = []
    for k in topk:
        correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
        res.append((correct_k / batch_size).item())
    return res

# Run through val_loader once and aggregate top-1/top-5
all_top1, all_top5 = 0.0, 0.0
total = 0
with torch.no_grad():
    for imgs, labels in val_loader:
        imgs, labels = imgs.to(device), labels.to(device)
        outputs = model(imgs)
        top1, top5 = topk_accuracy(outputs, labels, topk=(1,5))
        batch = labels.size(0)
        all_top1 += top1 * batch
        all_top5 += top5 * batch
        total += batch
print(f"Top-1 acc: {all_top1/total:.3f}, Top-5 acc: {all_top5/total:.3f}")


Top-1 acc: 0.798, Top-5 acc: 0.970
