In [2]:
import numpy as np
from PIL import Image

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, random_split

import torchvision
import torchvision.transforms as transforms

from torchvision.models import ResNet101_Weights

In [None]:
print(torch.__version__)
print(torchvision.__version__)

# Set the device
device = "cuda" if torch.cuda.is_available() else "cpu"
torch.set_default_device(device)

print(torch.get_default_device())

!nvcc --version


In [11]:
# Tell TorchDynamo to suppress compilation errors instead of raising them.
# NOTE(review): presumably this makes torch.compile fall back to eager execution
# when a graph fails to compile — confirm against the TorchDynamo docs.
import torch._dynamo
torch._dynamo.config.suppress_errors = True

This code applies four transformations to the input image: resizing it so that its shorter side is 256 pixels, center-cropping it to 224x224 pixels, converting it into a tensor, and normalizing the pixel values with the ImageNet mean and standard deviation that the pre-trained ResNet weights expect.


In [4]:
# Per-channel statistics of ImageNet, used to normalise the inputs
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]

# Preprocessing pipeline, applied in order to every image
preprocessing_steps = [
    transforms.Resize(256),        # scale so the shorter side is 256 px (aspect ratio kept)
    transforms.CenterCrop(224),    # keep the central 224x224 region
    transforms.ToTensor(),         # PIL image -> tensor
    transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),  # ImageNet normalization
]
transform = transforms.Compose(preprocessing_steps)


- Loads the images and applies the specified transformations.
- random_split divides the dataset into training (80%) and testing (20%) subsets.
- DataLoader creates iterators for loading batches of data from the training and test datasets with specified batch sizes, shuffling for training, and parallel data loading.

In [5]:
# Generator that drives the random split and the training shuffle. It is created
# on the active `device` instead of a hard-coded 'cuda', so this cell also works
# when the notebook has fallen back to the CPU.
generator = torch.Generator(device=device)

# Load images from images/baseset_001, images/baseset_002, images/baseset_003, etc.
# ImageFolder treats each sub-directory of `root` as one class.
train_data = torchvision.datasets.ImageFolder(root='images', transform=transform)

# Split dataset into train and test sets (80% train, 20% test)
train_size = int(0.8 * len(train_data))   # 80% for training
test_size = len(train_data) - train_size  # the remainder, so the two sizes always sum to len(train_data)

# Split the dataset
train_dataset, test_dataset = random_split(train_data, [train_size, test_size], generator=generator)

# Create DataLoaders: shuffle only the training split, keep the test order fixed
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, num_workers=2, generator=generator)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, num_workers=2, generator=generator)


Let's check out the shape of the data!

In [None]:
# Fetch the first sample: `image` has been through `transform`, `label` is its class index.
image, label = train_data[0]
# Display the tensor shape next to the label (expected 3 x 224 x 224 after the crop — TODO confirm)
image.shape, label

Let's predefine the labels.

In [None]:
# Class labels as exposed by the dataset, e.g. ['base1-01', 'base1-02', 'base1-03', ...].
# A predicted index is mapped back to its label with class_names[index].
class_names = train_data.classes
print(f"{len(class_names)} classes")

Set the device to the one we are using: CUDA (GPU) or CPU.

- A pre-trained ResNet-101 model is loaded with weights from ImageNet.
- The final fully connected layer (model.fc) is replaced with a new Linear layer that has as many output units as there are classes in your dataset (len(class_names)).
- The model is moved to the device (GPU or CPU).


In [None]:
# Start from a ResNet-101 with its default pre-trained weights
model = torchvision.models.resnet101(weights=ResNet101_Weights.DEFAULT)

# Replace the classification head so it emits one logit per dataset class
model.fc = nn.Linear(model.fc.in_features, len(class_names))

# Place the model on the selected device explicitly, rather than relying only on
# the global default device configured earlier in the notebook
model = model.to(device)

# Compile the model (optimized kernels). `compiled_model` wraps `model`, so both
# names refer to the same parameters.
compiled_model = torch.compile(model, mode="reduce-overhead")


**nn.CrossEntropyLoss():** This is a commonly used loss function for multi-class classification problems. It combines softmax activation and negative log-likelihood loss in a single function, making it efficient for classification tasks.

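**optim.Adam**: This initializes the Adam optimizer for training the model. Adam is an adaptive variant of stochastic gradient descent: it keeps running estimates of the mean and variance of each parameter's gradient and uses them to scale the updates that minimize the loss. A `ReduceLROnPlateau` scheduler then multiplies the learning rate by 0.3 whenever the validation loss has failed to improve for more than 5 consecutive epochs.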

In [9]:
# Multi-class classification loss computed from the raw logits
loss_function = nn.CrossEntropyLoss()

# Adam over every parameter of the network.
# NOTE(review): lr=0.01 looks high for fine-tuning pre-trained weights with Adam — confirm it is intended.
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Scale the learning rate by `factor` once the monitored value stops improving
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    factor=0.3,
    patience=5,
    cooldown=0,
)

In [None]:
# Number of full passes over the training split
NUM_EPOCHS = 30

for epoch in range(NUM_EPOCHS):
    print(f"Epoch {epoch+1}")

    # ---- Training phase ----
    model.train()
    train_loss_sum = 0.0   # sum of per-sample losses seen this epoch
    train_seen = 0         # number of training samples seen this epoch

    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = compiled_model(images)
        loss = loss_function(outputs, labels)
        loss.backward()
        optimizer.step()

        # `loss` is the mean over the batch; weight it by the batch size so a
        # smaller final batch does not count as much as a full one.
        batch_size = labels.size(0)
        train_loss_sum += loss.item() * batch_size
        train_seen += batch_size

    # ---- Validation phase ----
    # NOTE(review): validation runs on `test_loader`, the same split the final
    # test-accuracy cell uses, and its loss drives the LR scheduler — so the
    # final test accuracy is not measured on fully unseen data.
    model.eval()
    val_loss_sum = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)

            outputs = compiled_model(images)
            loss = loss_function(outputs, labels)

            batch_size = labels.size(0)
            val_loss_sum += loss.item() * batch_size

            predicted = outputs.argmax(dim=1)
            total += batch_size
            correct += (predicted == labels).sum().item()

    # Per-sample averages over the whole epoch
    avg_train_loss = train_loss_sum / train_seen
    avg_val_loss = val_loss_sum / total
    accuracy = 100 * correct / total

    print(f"Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}, Accuracy: {accuracy:.2f}%")

    # Step the scheduler with validation loss
    scheduler.step(avg_val_loss)

In [None]:
# Put the network in evaluation mode before scoring the held-out split
model.eval()

# Running tallies of correct predictions and of samples scored
correct, total = 0, 0

# Gradients are not needed for inference
with torch.no_grad():
    for batch_images, batch_labels in test_loader:
        # Inputs must live on the same device as the model
        batch_images, batch_labels = batch_images.to(device), batch_labels.to(device)

        # Most confident class for each sample in the batch
        predictions = model(batch_images).argmax(dim=1)

        correct += (predictions == batch_labels).sum().item()
        total += batch_labels.size(0)

# Report accuracy as a percentage
accuracy = 100 * correct / total
print(f"Test Accuracy: {accuracy:.2f}%")

In [None]:
import os

def load_and_preprocess_image(image_path):
    """
    Load an image, apply transformations, and prepare it for model inference.

    The image is converted to 3-channel RGB before `transform` is applied, so
    files stored as RGBA, palette or grayscale (common for .webp/.png) still
    match the 3-channel mean/std used by the normalization step.

    Args:
        image_path (str): Path to the image file

    Returns:
        torch.Tensor: Preprocessed image tensor with a leading batch dimension
        of 1, placed on `device`
    """
    # Open inside a context manager so the file handle is released promptly;
    # convert() returns a new in-memory image that stays valid after closing.
    with Image.open(image_path) as image_file:
        rgb_image = image_file.convert('RGB')

    # Apply transformations and prepare for model input
    # .unsqueeze(0) adds a batch dimension, as models expect batched input
    processed_image = transform(rgb_image).unsqueeze(0).to(device)

    return processed_image

# Files to classify
image_paths = [
    './test_images/alakazam.jpg',
    './test_images/abra.jpg',
    './test_images/zapdos.jpg',
    './test_images/fossil.webp',
]

# Build one single-image batch per file before any prediction is made
images = list(map(load_and_preprocess_image, image_paths))

# Inference only: evaluation mode, no gradient tracking
model.eval()

with torch.no_grad():
    for position, path in enumerate(image_paths):
        # Forward pass on the matching preprocessed image
        logits = model(images[position])

        # Translate the top-scoring index into its class label
        predicted_class = class_names[logits.argmax(dim=1).item()]

        # Report the file name alongside the prediction
        print(f"Image: {os.path.basename(path)}")
        print(f"Predicted class: {predicted_class}\n")
        print()

In [None]:
# Bundle the trained weights with the class labels so a loader can map
# predicted indices back to names, then write both to a single file.
checkpoint = dict(
    model_state_dict=model.state_dict(),
    class_names=train_data.classes,
)

torch.save(checkpoint, 'pokemon_model.pth')