# 1. Preprocessing Dataset

Overfitting: Monitor train vs validation gap; apply dropout/augmentation

Computational efficiency: Start with smaller images/models, scale up if needed

Reproducibility: Set random seeds for consistent results


Must Have 

✅ Architecture details (layers, filters, activations)

✅ Final hyperparameters (LR, epochs, batch size, optimizer, loss)

✅ Training/Validation accuracy plot

✅ Test accuracy

✅ Precision, Recall, F1 per class

✅ Confusion matrix

✅ Brief analysis/observations


### **B. Data Transformations**:
- **For training**: Augmentation (random rotations, flips, color jitter) + normalize
- **For validation/test**: Just resize + normalize (no augmentation)

### **C. DataLoaders**:
- Train loader with shuffling
- Test/validation loader without shuffling

In [25]:
import os
from collections import Counter
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from torchvision.transforms import v2
from PIL import Image
import random
import numpy as np
import torch.nn.functional as F


def set_seed(seed=433):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU plus all CUDA devices), then switches cuDNN to deterministic mode
    with benchmark auto-tuning turned off.

    Args:
        seed: Integer seed passed to each generator. Defaults to 433.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_rng in seeders:
        seed_rng(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

set_seed(433)

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")

# Set matplotlib to display plots inline
%matplotlib inline

# For better plot quality
plt.rcParams['figure.dpi'] = 100

PyTorch version: 2.9.1+cu128
CUDA available: True
CUDA device: NVIDIA GeForce RTX 3050 Ti Laptop GPU


In [26]:
# Locations of the two dataset splits, relative to the notebook's working directory.
fingers_train_dir, fingers_test_dir = 'data/fingers/train', 'data/fingers/test'

# Fail-fast visibility: show whether each split directory is actually present.
train_dir_found = os.path.exists(fingers_train_dir)
test_dir_found = os.path.exists(fingers_test_dir)
print(f"Train directory exists: {train_dir_found}")
print(f"Test directory exists: {test_dir_found}")

Train directory exists: True
Test directory exists: True


In [27]:
# Raw directory listings for both splits (every entry, not only image files).
fingers_train_files = os.listdir(fingers_train_dir)
fingers_test_files = os.listdir(fingers_test_dir)

banner = '=' * 50
print(f"Total training images: {len(fingers_train_files)}")
print(f"Total test images: {len(fingers_test_files)}")
print(f"\n{banner}")
print(f"First 15 training filenames:")
print(f"{banner}")
# Number the preview from 1 so it reads like a list rather than an index dump.
for position, filename in enumerate(fingers_train_files[:15], start=1):
    print(f"{position:2d}. {filename}")

Total training images: 18000
Total test images: 3600

First 15 training filenames:
 1. df16050b-6ce3-4a09-8406-f314c9303090_3R.png
 2. ae08efc1-b453-49f3-80f9-d4b3dacd83e0_5R.png
 3. 401d2efa-d6fd-4116-878c-38f4806d8b97_0L.png
 4. 69a97cd5-f3b5-4246-90cc-14173995ffea_2R.png
 5. f510e888-1eec-43c4-9e16-a591e68e47f9_4R.png
 6. c98821ba-4fcc-440b-b74f-87d34b526efa_4L.png
 7. 76de966c-103d-4879-85b4-6bd1c85f968c_1R.png
 8. fa50c7b1-4bf8-412d-a979-b6cc75029b65_5L.png
 9. 608e588f-ca5a-4e1a-82ba-bed244920c16_3R.png
10. 8fdd76c0-5615-4e87-b88c-496fcc20840d_1R.png
11. 38f0a970-957f-42b4-b5d9-7f1e1d8a3fa5_0R.png
12. 9640b47c-c37f-45ea-925c-21f380662adb_1R.png
13. a64a2165-6442-4430-b4db-8b58f384e471_4R.png
14. 6615d35f-14c1-4909-b069-6d6f936a7ce2_4R.png
15. 572ff08a-f791-4d7b-8a97-1a169b98b2e7_5R.png


### We are going to omit the 'R' or 'L' completely since the assignment only talks about categorising 3, 4, 5 and didn't mention right or left.

In [28]:
def extract_label(filename):
    """Parse the finger-count label encoded in an image filename.

    Filenames look like ``<id>_<digit><hand>.<ext>`` (for example
    ``..._3R.png``). The label is the digit that starts the last
    underscore-separated token of the name; the hand letter is ignored.

    Args:
        filename: Base name of the image file.

    Returns:
        The finger count as an ``int``, or ``None`` (after printing a
        warning) when no leading digit can be read from the last token.
    """
    try:
        # splitext strips only the final extension, so dots elsewhere in the
        # name do not truncate it the way split('.')[0] would.
        stem = os.path.splitext(filename)[0]
        label_part = stem.rsplit('_', 1)[-1]
        # Slicing (not indexing) yields '' for an empty token, which int()
        # rejects with ValueError instead of raising IndexError.
        return int(label_part[:1])
    except (ValueError, TypeError, AttributeError):
        # Only parsing failures are tolerated; a bare except would also hide
        # KeyboardInterrupt and unrelated programming errors.
        print(f"Warning: Could not parse {filename}")
        return None
    
# Spot-check the parser on the first few training filenames.
header = f"{'Filename':<40} -> Label"
print(header)
print("=" * 50)
for sample_name in fingers_train_files[:5]:
    print(f"{sample_name:<40} -> {extract_label(sample_name)}")

Filename                                 -> Label
df16050b-6ce3-4a09-8406-f314c9303090_3R.png -> 3
ae08efc1-b453-49f3-80f9-d4b3dacd83e0_5R.png -> 5
401d2efa-d6fd-4116-878c-38f4806d8b97_0L.png -> 0
69a97cd5-f3b5-4246-90cc-14173995ffea_2R.png -> 2
f510e888-1eec-43c4-9e16-a591e68e47f9_4R.png -> 4


### Checking for Class imbalances

In [29]:
# Parse one label per image file; names that cannot be parsed are dropped.
image_suffixes = ('.jpg', '.png', '.jpeg')
parsed_labels = (
    extract_label(name)
    for name in fingers_train_files
    if name.endswith(image_suffixes)
)
train_labels = [parsed for parsed in parsed_labels if parsed is not None]

# Tally how many training images fall into each finger-count class.
label_counts = Counter(train_labels)

divider = '=' * 50
print(f"\n{divider}")
print(f"Class Distribution in Training Set:")
print(f"{divider}")
for class_num, n_images in sorted(label_counts.items()):
    print(f"Class {class_num}: {n_images:4d} images")
print(f"{divider}")
print(f"Total: {len(train_labels)} images")


Class Distribution in Training Set:
Class 0: 3000 images
Class 1: 3000 images
Class 2: 3000 images
Class 3: 3000 images
Class 4: 3000 images
Class 5: 3000 images
Total: 18000 images


In [30]:
print("Checking image dimensions from random samples...\n")
print(f"{'Filename':<45} {'Width':<8} {'Height':<8} {'Mode':<8}")
print("="*75)

# Filter to image files BEFORE sampling so that non-image directory entries
# cannot shrink (or empty) the sample that is actually inspected.
image_candidates = [f for f in fingers_train_files
                    if f.endswith(('.jpg', '.png', '.jpeg'))]
sample_files = random.sample(image_candidates, min(10, len(image_candidates)))

widths = []
heights = []

for filename in sample_files:
    img_path = os.path.join(fingers_train_dir, filename)
    # The context manager releases the file handle as soon as the metadata
    # has been read, instead of leaving it to garbage collection.
    with Image.open(img_path) as img:
        width, height = img.size
        mode = img.mode
    widths.append(width)
    heights.append(height)
    print(f"{filename:<45} {width:<8} {height:<8} {mode:<8}")

print("="*75)
if not widths:
    # max()/min() raise on empty sequences, so report the situation instead.
    print("\nNo image files were found to inspect.")
else:
    print(f"\nImage Statistics:")
    print(f"  Most common width:  {max(set(widths), key=widths.count)}")
    print(f"  Most common height: {max(set(heights), key=heights.count)}")
    print(f"  Width range: {min(widths)} - {max(widths)}")
    print(f"  Height range: {min(heights)} - {max(heights)}")

    if len(set(widths)) == 1 and len(set(heights)) == 1:
        print(f"\nAll images have the SAME dimensions: {widths[0]}x{heights[0]}")
    else:
        print(f"\nImages have DIFFERENT dimensions - we'll need to resize them")

Checking image dimensions from random samples...

Filename                                      Width    Height   Mode    
584e7e1a-8a38-48b8-8d00-a67734d57d3d_3L.png   128      128      L       
b1da3784-b409-4f5a-ae32-5e3a63ba52d9_3R.png   128      128      L       
20e6622e-c11c-4204-a1e9-140ed481d2b8_4R.png   128      128      L       
4f3a51f6-543b-4712-ad8a-aa5026a7d8bf_5R.png   128      128      L       
4e2b0ae3-e55a-4168-9c03-ef09bc3402dc_0R.png   128      128      L       
65de1b21-a7be-4a2b-b31d-e7eeeca6de77_4R.png   128      128      L       
f11052dd-efe8-441a-8d81-2a81e64ab281_0L.png   128      128      L       
5951b638-5654-45ce-a10a-c07c0d46f142_1L.png   128      128      L       
369b4cce-0e4a-4b06-975c-5ef1919fcc70_3R.png   128      128      L       
d592eb1d-0195-4354-b07b-4290d921cb6f_2L.png   128      128      L       

Image Statistics:
  Most common width:  128
  Most common height: 128
  Width range: 128 - 128
  Height range: 128 - 128

All images have the SAME

In [31]:
class FingerCountDataset(Dataset):
    """Finger-count images stored flat in one directory, labelled by filename.

    Each image filename ends in ``_<digit><hand>.<ext>``; the digit is used
    as the integer class label and the hand letter is ignored.

    Attributes set by the constructor:
        data_dir: Directory the images are read from.
        transform: Optional callable applied to each loaded PIL image.
        image_files: Sorted list of image filenames found in ``data_dir``.
        labels: Integer label for each entry of ``image_files``.
        classes: Sorted list of the distinct labels present.
        num_classes: Number of distinct labels.
    """

    # File extensions (case-sensitive) that count as images.
    IMAGE_EXTENSIONS = ('.jpg', '.png', '.jpeg')

    def __init__(self, data_dir, transform=None):
        """Index the images in ``data_dir`` and parse their labels.

        Args:
            data_dir: Directory containing the image files.
            transform: Optional callable applied to each image in
                ``__getitem__``.

        Raises:
            ValueError: If an image filename carries no parseable label.
        """
        self.data_dir = data_dir
        self.transform = transform

        # os.listdir returns entries in arbitrary order; sorting pins the
        # index -> file mapping so seeded runs see the same sample order on
        # every machine and filesystem.
        self.image_files = sorted(
            f for f in os.listdir(data_dir)
            if f.endswith(self.IMAGE_EXTENSIONS)
        )

        self.labels = [self._extract_label(f) for f in self.image_files]

        self.classes = sorted(set(self.labels))
        self.num_classes = len(self.classes)

        print(f" Loaded {len(self.image_files)} images from {data_dir}")
        print(f"   Number of classes: {self.num_classes} (classes: {self.classes})")

    def _extract_label(self, filename):
        """Return the integer label encoded at the end of ``filename``.

        Raises:
            ValueError: If the last underscore-separated token of the name
                does not start with a digit.
        """
        # splitext removes only the final extension, so extra dots in the
        # name do not cut the label off.
        stem = os.path.splitext(filename)[0]
        label_part = stem.rsplit('_', 1)[-1]
        try:
            # Slicing keeps an empty token from raising IndexError; int('')
            # raises ValueError like any other non-digit.
            return int(label_part[:1])
        except ValueError as exc:
            raise ValueError(
                f"Cannot parse a finger-count label from filename: {filename!r}"
            ) from exc

    def __len__(self):
        """Return the number of indexed images."""
        return len(self.image_files)

    def __getitem__(self, idx):
        """Load one sample.

        Returns:
            A ``(image, label)`` pair: the image converted to RGB (and passed
            through ``transform`` when one was given) and its integer label.
        """
        img_name = self.image_files[idx]
        img_path = os.path.join(self.data_dir, img_name)
        # Files are grayscale; convert to 3-channel RGB. The context manager
        # closes the underlying file once the converted copy exists.
        with Image.open(img_path) as raw_image:
            image = raw_image.convert('RGB')

        label = self.labels[idx]

        if self.transform:
            image = self.transform(image)

        return image, label

In [32]:
# Per-channel normalization statistics (ImageNet standard)
mean = [0.485, 0.456, 0.406]
std = [0.229, 0.224, 0.225]

# Final steps shared by both pipelines: PIL image -> tensor -> normalized.
# Neither pipeline contains a resize step.
to_normalized_tensor = [
    transforms.ToTensor(),
    transforms.Normalize(mean=mean, std=std),
]

# Random augmentations, applied to training images only.
train_augmentations = [
    transforms.RandomRotation(20),
    transforms.RandomHorizontalFlip(p=0.3),
    transforms.ColorJitter(brightness=0.3, contrast=0.3, saturation=0.2),
    transforms.RandomAffine(degrees=0, translate=(0.1, 0.1), scale=(0.9, 1.1)),
]

# Training pipeline: augmentation followed by the shared final steps.
train_transform = transforms.Compose(train_augmentations + to_normalized_tensor)

# Test pipeline: the shared final steps only, no augmentation.
test_transform = transforms.Compose(list(to_normalized_tensor))

for status_line in (
    "Transformations defined",
    "   Training: Augmentation + Normalization",
    "   Test: Normalization only",
):
    print(status_line)

Transformations defined
   Training: Augmentation + Normalization
   Test: Normalization only


In [33]:
# Datasets: augmented pipeline for training, plain pipeline for testing.
train_dataset = FingerCountDataset(fingers_train_dir, transform=train_transform)
test_dataset = FingerCountDataset(fingers_test_dir, transform=test_transform)

# Hyperparameters: current starting values; they will be changed while
# experimenting with different model configurations.
BATCH_SIZE = 64
NUM_WORKERS = 4

# Settings common to both loaders; only the shuffling differs between them.
loader_options = dict(
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    pin_memory=True,
)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_options)

rule = '=' * 60
print(f"\n{rule}")
print(f"Dataset Summary:")
print(f"{rule}")
print(f"Training samples:   {len(train_dataset)}")
print(f"Test samples:       {len(test_dataset)}")
print(f"Training batches:   {len(train_loader)}")
print(f"Test batches:       {len(test_loader)}")
print(f"Batch size:         {BATCH_SIZE}")
print(f"{rule}")

 Loaded 18000 images from data/fingers/train
   Number of classes: 6 (classes: [0, 1, 2, 3, 4, 5])
 Loaded 3600 images from data/fingers/test
   Number of classes: 6 (classes: [0, 1, 2, 3, 4, 5])

Dataset Summary:
Training samples:   18000
Test samples:       3600
Training batches:   282
Test batches:       57
Batch size:         64


### Now, after all of this, let's verify the preprocessing pipeline

In [34]:
# Pull a single batch through the training pipeline to inspect what the model will receive.
images, labels = next(iter(train_loader))

rule = '=' * 60
pixel_min, pixel_max = images.min(), images.max()
first_labels = labels[:10].tolist()

print(f"\n{rule}")
print(f" Pipeline Verification:")
print(f"{rule}")
print(f"Batch shape:      {images.shape}")  # Should be [64, 3, 128, 128]
print(f"Labels shape:     {labels.shape}")  # Should be [64]
print(f"Image data type:  {images.dtype}")
print(f"Image range:      [{pixel_min:.3f}, {pixel_max:.3f}]")
print(f"Sample labels:    {first_labels}")
print(f"{rule}")
print(f"\n PREPROCESSING COMPLETE AND VERIFIED!")
print(f"{rule}")


 Pipeline Verification:
Batch shape:      torch.Size([64, 3, 128, 128])
Labels shape:     torch.Size([64])
Image data type:  torch.float32
Image range:      [-2.118, 2.640]
Sample labels:    [2, 4, 5, 1, 4, 5, 5, 2, 2, 5]

 PREPROCESSING COMPLETE AND VERIFIED!


## 2. CNN architecture

In [35]:
# Architecture 1: Simple CNN (Baseline)
class SimpleCNN(nn.Module):
    """Baseline CNN: three conv blocks followed by a two-layer classifier.

    Each conv block is ``Conv2d(3x3, padding=1) -> BatchNorm2d -> ReLU ->
    MaxPool2d(2, 2)`` with 32, 64 and 128 filters respectively. The flattened
    features go through ``Linear -> ReLU -> Dropout -> Linear``.

    Args:
        num_classes: Number of output logits. Defaults to 6.
        dropout_rate: Dropout probability between the two linear layers.
        in_channels: Channels of the input images. Defaults to 3.
        input_size: Height (= width) in pixels of the square input images.
            Defaults to 128, which gives the original ``128 * 16 * 16``
            flattened feature size.

    Raises:
        ValueError: If ``input_size`` is too small to survive three poolings.
    """

    def __init__(self, num_classes=6, dropout_rate=0.5, in_channels=3, input_size=128):
        super().__init__()

        # Each 2x2 max-pool halves the spatial size (rounding down); three of
        # them reduce it by a factor of 8.
        feature_size = input_size // 8
        if feature_size < 1:
            raise ValueError(f"input_size must be at least 8, got {input_size}")

        # Conv Block 1
        self.conv1 = nn.Conv2d(in_channels, 32, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(32)
        self.pool1 = nn.MaxPool2d(2, 2)  # 128 -> 64 at the default input size

        # Conv Block 2
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(64)
        self.pool2 = nn.MaxPool2d(2, 2)  # 64 -> 32

        # Conv Block 3
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.bn3 = nn.BatchNorm2d(128)
        self.pool3 = nn.MaxPool2d(2, 2)  # 32 -> 16

        # Fully connected layers
        self.fc1 = nn.Linear(128 * feature_size * feature_size, 256)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc2 = nn.Linear(256, num_classes)

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)`` for image batch ``x``."""
        # Conv blocks
        x = self.pool1(F.relu(self.bn1(self.conv1(x))))
        x = self.pool2(F.relu(self.bn2(self.conv2(x))))
        x = self.pool3(F.relu(self.bn3(self.conv3(x))))

        # Flatten everything except the batch dimension
        x = torch.flatten(x, 1)

        # FC layers
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)

        return x


# Architecture 2: Deeper CNN
class DeepCNN(nn.Module):
    """Deeper CNN: four conv blocks followed by a three-layer classifier.

    Every conv block is ``Conv2d(3x3, padding=1) -> BatchNorm2d -> ReLU`` with
    32, 64, 128 and 256 filters. Blocks 2-4 end in ``MaxPool2d(2, 2)``; the
    first block keeps the full resolution. The flattened features go through
    ``Linear(512) -> ReLU -> Dropout -> Linear(128) -> ReLU -> Dropout ->
    Linear(num_classes)``.

    Args:
        num_classes: Number of output logits. Defaults to 6.
        dropout_rate: Dropout probability used after each hidden linear layer.
        in_channels: Channels of the input images. Defaults to 3.
        input_size: Height (= width) in pixels of the square input images.
            Defaults to 128, which gives the original ``256 * 16 * 16``
            flattened feature size.

    Raises:
        ValueError: If ``input_size`` is too small to survive three poolings.
    """

    def __init__(self, num_classes=6, dropout_rate=0.5, in_channels=3, input_size=128):
        super().__init__()

        # Three 2x2 max-pools (blocks 2-4) shrink the spatial size by 8,
        # rounding down.
        feature_size = input_size // 8
        if feature_size < 1:
            raise ValueError(f"input_size must be at least 8, got {input_size}")

        # Conv Block 1 (no pooling)
        self.conv1 = nn.Conv2d(in_channels, 32, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(32)

        # Conv Block 2
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(64)
        self.pool2 = nn.MaxPool2d(2, 2)  # 128 -> 64 at the default input size

        # Conv Block 3
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.bn3 = nn.BatchNorm2d(128)
        self.pool3 = nn.MaxPool2d(2, 2)  # 64 -> 32

        # Conv Block 4
        self.conv4 = nn.Conv2d(128, 256, kernel_size=3, padding=1)
        self.bn4 = nn.BatchNorm2d(256)
        self.pool4 = nn.MaxPool2d(2, 2)  # 32 -> 16

        # FC layers
        self.fc1 = nn.Linear(256 * feature_size * feature_size, 512)
        self.dropout1 = nn.Dropout(dropout_rate)
        self.fc2 = nn.Linear(512, 128)
        self.dropout2 = nn.Dropout(dropout_rate)
        self.fc3 = nn.Linear(128, num_classes)

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)`` for image batch ``x``."""
        x = F.relu(self.bn1(self.conv1(x)))
        x = self.pool2(F.relu(self.bn2(self.conv2(x))))
        x = self.pool3(F.relu(self.bn3(self.conv3(x))))
        x = self.pool4(F.relu(self.bn4(self.conv4(x))))

        # Flatten everything except the batch dimension
        x = torch.flatten(x, 1)

        x = F.relu(self.fc1(x))
        x = self.dropout1(x)
        x = F.relu(self.fc2(x))
        x = self.dropout2(x)
        x = self.fc3(x)

        return x


# Architecture 3: Wide CNN (More filters per layer)
class WideCNN(nn.Module):
    """Wide CNN: three conv blocks with 64/128/256 filters and a two-layer classifier.

    Each conv block is ``Conv2d(3x3, padding=1) -> BatchNorm2d -> ReLU ->
    MaxPool2d(2, 2)``. The flattened features go through
    ``Linear(512) -> ReLU -> Dropout -> Linear(num_classes)``.

    Args:
        num_classes: Number of output logits. Defaults to 6.
        dropout_rate: Dropout probability between the two linear layers.
        in_channels: Channels of the input images. Defaults to 3.
        input_size: Height (= width) in pixels of the square input images.
            Defaults to 128, which gives the original ``256 * 16 * 16``
            flattened feature size.

    Raises:
        ValueError: If ``input_size`` is too small to survive three poolings.
    """

    def __init__(self, num_classes=6, dropout_rate=0.5, in_channels=3, input_size=128):
        super().__init__()

        # Each 2x2 max-pool halves the spatial size (rounding down); three of
        # them reduce it by a factor of 8.
        feature_size = input_size // 8
        if feature_size < 1:
            raise ValueError(f"input_size must be at least 8, got {input_size}")

        # Conv Block 1
        self.conv1 = nn.Conv2d(in_channels, 64, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(64)
        self.pool1 = nn.MaxPool2d(2, 2)  # 128 -> 64 at the default input size

        # Conv Block 2
        self.conv2 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(128)
        self.pool2 = nn.MaxPool2d(2, 2)  # 64 -> 32

        # Conv Block 3
        self.conv3 = nn.Conv2d(128, 256, kernel_size=3, padding=1)
        self.bn3 = nn.BatchNorm2d(256)
        self.pool3 = nn.MaxPool2d(2, 2)  # 32 -> 16

        # FC layers
        self.fc1 = nn.Linear(256 * feature_size * feature_size, 512)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc2 = nn.Linear(512, num_classes)

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)`` for image batch ``x``."""
        x = self.pool1(F.relu(self.bn1(self.conv1(x))))
        x = self.pool2(F.relu(self.bn2(self.conv2(x))))
        x = self.pool3(F.relu(self.bn3(self.conv3(x))))

        # Flatten everything except the batch dimension
        x = torch.flatten(x, 1)

        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)

        return x


# One-line description of each architecture defined in this cell.
architecture_summary = (
    " Three CNN architectures defined:",
    "   1. SimpleCNN - 3 conv blocks, baseline",
    "   2. DeepCNN - 4 conv blocks, deeper network",
    "   3. WideCNN - 3 conv blocks, more filters",
)
for summary_line in architecture_summary:
    print(summary_line)

 Three CNN architectures defined:
   1. SimpleCNN - 3 conv blocks, baseline
   2. DeepCNN - 4 conv blocks, deeper network
   3. WideCNN - 3 conv blocks, more filters


In [36]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)

# Build one instance of each architecture (default settings), in the same
# order as they are defined, and move each to the selected device.
model1, model2, model3 = (
    architecture().to(device)
    for architecture in (SimpleCNN, DeepCNN, WideCNN)
)

## 3. Training Loop

In [37]:
# Grab one training batch and move the images to the models' device.
sample_images, sample_labels = next(iter(train_loader))
sample_images = sample_images.to(device)

print(f"Testing forward pass with sample batch:")
print(f"Input shape: {sample_images.shape}")

# Run the check in eval mode. In training mode a forward pass updates the
# BatchNorm running statistics (even under no_grad) and applies dropout, so
# this sanity check would alter the models before training starts.
checked_models = (model1, model2, model3)
previous_modes = [checked.training for checked in checked_models]
for checked in checked_models:
    checked.eval()

try:
    with torch.no_grad():
        output1 = model1(sample_images)
        output2 = model2(sample_images)
        output3 = model3(sample_images)
finally:
    # Restore whichever mode each model was in before the check.
    for checked, was_training in zip(checked_models, previous_modes):
        checked.train(was_training)

print(f"\n All models working correctly!")
print(f"SimpleCNN output shape: {output1.shape}")
print(f"DeepCNN output shape:   {output2.shape}")
print(f"WideCNN output shape:   {output3.shape}")
print(f"\n Architectures ready for training!")

Testing forward pass with sample batch:
Input shape: torch.Size([64, 3, 128, 128])

 All models working correctly!
SimpleCNN output shape: torch.Size([64, 6])
DeepCNN output shape:   torch.Size([64, 6])
WideCNN output shape:   torch.Size([64, 6])

 Architectures ready for training!


## 4. Evaluation