Reference: https://docs.pytorch.org/tutorials/beginner/transfer_learning_tutorial.html

Personal Notes
- https://docs.pytorch.org/docs/stable/tensorboard.html
- https://docs.pytorch.org/vision/main/models/convnext.html
- https://github.com/pytorch/vision/blob/main/torchvision/models/convnext.py

In [None]:
import os
import sys

import numpy as np
import torch
import torchvision
from tqdm.notebook import tqdm
import time
from torch.utils.tensorboard import SummaryWriter

# set seed to make notebook reproducable
np.random.seed(42)
torch.cuda.manual_seed_all(42)
torch.manual_seed(42)
generator = torch.Generator().manual_seed(42)
path = 'results/'
best_model_path = os.path.join(path, 'best_model.pt')

##### Hyperparameters

In [2]:
BATCH_SIZE = 32
NUM_EPOCHS = 5
LEARNING_RATE = 1e-3

##### Load Data
Using this dataset: https://www.kaggle.com/datasets/echometerhhwl/pokemon-gen-1-38914

Other datasets we found:
- https://www.kaggle.com/datasets/thedagger/pokemon-generation-one
- https://www.kaggle.com/datasets/lantian773030/pokemonclassification
- https://www.kaggle.com/datasets/kvpratama/pokemon-images-dataset
- https://www.kaggle.com/datasets/adamstanley537/pokemon-gen-1-image-dataset-27000-images

In [None]:
from torchvision import transforms
from torch.utils.data import Dataset
from PIL import Image

# https://discuss.pytorch.org/t/torch-utils-data-dataset-random-split/32209/3
class PokemonDataset(Dataset):
    def __init__(self, subset, transform=None):
        self.subset = subset
        self.transform = transform
        
    def __getitem__(self, index):
        x, y = self.subset[index]
        if self.transform:
            x = self.transform(x)
        return x, y
        
    def __len__(self):
        return len(self.subset)

# Data augmentations
data_transforms = transforms.Compose(
    [
        transforms.Resize(512),
        transforms.RandomRotation(15),
        transforms.RandomHorizontalFlip(),
        transforms.RandomResizedCrop(224),
        transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ]
)

test_transforms = transforms.Compose(
    [
        transforms.Resize(512),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ]
)

In [None]:
data = torchvision.datasets.ImageFolder(root='data/pokemon')
train_set, val_set, test_set = torch.utils.data.random_split(data, [0.7, 0.15, 0.15], generator)

train_set = PokemonDataset(train_set, transform=data_transforms)
val_set = PokemonDataset(val_set, transform=data_transforms)
test_set = PokemonDataset(test_set, transform=test_transforms)

image_datasets = {
    'train': train_set,
    'val': val_set,
    'test': test_set
}
dataloaders = {
    x: torch.utils.data.DataLoader(image_datasets[x], batch_size=BATCH_SIZE, shuffle=True, num_workers=16)
    for x in ['train', 'val', 'test']
}
dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val', 'test']}
class_names = data.classes

##### Training Code

In [4]:
def train(model, optimizer, criterion, scheduler):
    model.train()
    running_loss = 0.0
    running_corrects = 0

    for images, labels in dataloaders['train']:
        images, labels = images.to('cuda'), labels.to('cuda')

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward
        outputs = model(images)
        _, preds = torch.max(outputs, 1)
        loss = criterion(outputs, labels)

        # backward
        loss.backward()
        optimizer.step()

        # statistics
        running_loss += loss.item() * images.size(0)
        running_corrects += torch.sum(preds == labels.data)

    scheduler.step()

    epoch_loss = running_loss / dataset_sizes['train']
    epoch_acc = running_corrects.double() / dataset_sizes['train']
    return epoch_loss, epoch_acc

def validate(model, criterion, option):
    model.eval()
    running_loss = 0.0
    running_corrects = 0

    with torch.no_grad():  # no gradient tracking
        for images, labels in dataloaders[option]:
            images, labels = images.to('cuda'), labels.to('cuda')

            outputs = model(images)
            _, preds = torch.max(outputs, 1)
            loss = criterion(outputs, labels)

            running_loss += loss.item() * images.size(0)
            running_corrects += torch.sum(preds == labels.data)

    epoch_loss = running_loss / dataset_sizes[option]
    epoch_acc = running_corrects.double() / dataset_sizes[option]
    return epoch_loss, epoch_acc

In [None]:

# reference: https://docs.pytorch.org/tutorials/beginner/transfer_learning_tutorial.html
def train_model(model, optimizer, criterion, scheduler, num_epochs=NUM_EPOCHS):
    time_start = time.time()
    best_acc = 0.0
    writer = SummaryWriter()
    print('Training started...\n')

    for epoch in tqdm(range(num_epochs)):
        print(f'Epoch {epoch}/{num_epochs - 1}')
        
        train_loss, train_acc = train(model, optimizer, criterion, scheduler)
        val_loss, val_acc = validate(model, criterion, 'val')

        print(f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f}")
        print(f"Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}")
        print('-' * 10)
        print()

        # log data with tensorboard
        writer.add_scalar('Loss/train', train_loss, epoch)
        writer.add_scalar('Loss/val', val_loss, epoch)
        writer.add_scalar('Accuracy/train', train_acc, epoch)
        writer.add_scalar('Accuracy/val', val_acc, epoch)

        # save best model
        if val_acc > best_acc:
            best_acc = val_acc
            torch.save(model.state_dict(), best_model_path)
    
    time_elapsed = time.time() - time_start
    print(f'Training complete in {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s')
    print(f'Best val Acc: {best_acc:4f}')
    writer.close()

##### Create Model

In [8]:
from torchvision.models import convnext_base, ConvNeXt_Base_Weights

model = convnext_base(weights=ConvNeXt_Base_Weights.DEFAULT)
# model.classifer[2] is the linear layer
num_feats = model.classifier[2].in_features
model.classifier[2] = torch.nn.Linear(num_feats, len(class_names)) # TODO: try dropout later
model = model.to('cuda')

##### Start Training

In [None]:
import warnings
warnings.filterwarnings('ignore', message="Palette images with Transparency expressed in bytes")

In [None]:
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=NUM_EPOCHS)

# train the convnext model
train_model(model, optimizer, criterion, scheduler, NUM_EPOCHS)

##### Load Model

In [9]:
model.load_state_dict(torch.load(best_model_path, weights_only=True))
model.eval()

ConvNeXt(
  (features): Sequential(
    (0): Conv2dNormActivation(
      (0): Conv2d(3, 128, kernel_size=(4, 4), stride=(4, 4))
      (1): LayerNorm2d((128,), eps=1e-06, elementwise_affine=True)
    )
    (1): Sequential(
      (0): CNBlock(
        (block): Sequential(
          (0): Conv2d(128, 128, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3), groups=128)
          (1): Permute()
          (2): LayerNorm((128,), eps=1e-06, elementwise_affine=True)
          (3): Linear(in_features=128, out_features=512, bias=True)
          (4): GELU(approximate='none')
          (5): Linear(in_features=512, out_features=128, bias=True)
          (6): Permute()
        )
        (stochastic_depth): StochasticDepth(p=0.0, mode=row)
      )
      (1): CNBlock(
        (block): Sequential(
          (0): Conv2d(128, 128, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3), groups=128)
          (1): Permute()
          (2): LayerNorm((128,), eps=1e-06, elementwise_affine=True)
          (3): Linear(

##### Accuracy on Test Set

In [11]:
criterion = torch.nn.CrossEntropyLoss()
test_loss, test_acc = validate(model, criterion, 'test')
print(f"Test Loss: {test_loss:.4f} | Test Acc: {test_acc:.4f}")



Test Loss: 1.3471 | Test Acc: 0.6631
