In [17]:
import torch
import torchvision
import torchvision.models as models
import torchvision.transforms as transforms
import numpy as np
#imports for problem 2
import matplotlib.pyplot as plt 
from PIL import Image
import json
from mpl_toolkits.axes_grid1 import ImageGrid

#imports for problem 3
from sklearn.linear_model import LogisticRegression
from torch.utils.data import DataLoader
#from torchvision.datasets import ImageFolder
from sklearn.metrics import accuracy_score, balanced_accuracy_score

## Problem 3: Transfer Learning with a Pre-Trained CNN
Image classification using the Oxford Pet Dataset (37 categories with about 200 images in each of them).
Rather than using the final ‘softmax’ layer of the CNN as output to make predictions as we did in problem 2, we will use the CNN as a feature extractor to classify the Pets dataset. For each image, grab features from the last hidden layer of the neural network, which will be the layer before the 1000-dimensional output layer (around 500–6000 dimensions). You will need to resize the images to a size compatible with your network (usually 224 × 224 × 3, but look at the documentation for the pre-trained system you selected). You should grab the output just after the last hidden layer or after global pooling (if it is 1000-dimensional, you will know you did it wrong).
After you extract these features for all of the images in the dataset, normalize them to unit length by dividing by the L2 norm. Train a linear classifier of your choice with the training CNN features, and then classify the test CNN features. Report mean-per-class accuracy and discuss the classifier you used.

In [18]:
# Preprocessing for the pre-trained network: resize every image to a fixed 224x224
# input, convert to a tensor, then normalize each channel with the mean/std that
# torchvision documents for its ImageNet-pre-trained weights.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Datasets (downloaded into ./data on first use) and their data loaders.
train = torchvision.datasets.OxfordIIITPet(root='./data', split='trainval', transform=transform, download=True)
test = torchvision.datasets.OxfordIIITPet(root='./data', split='test', transform=transform, download=True)
# The loaders feed a one-off feature-extraction pass (no gradient-based training),
# so shuffling buys nothing; a fixed order keeps the extracted features and labels
# in the same order on every run.
train_loader = torch.utils.data.DataLoader(train, batch_size=64, shuffle=False)
test_loader = torch.utils.data.DataLoader(test, batch_size=64, shuffle=False)

print(f'- Trainset size: {len(train)}')
print(f'- Testset size: {len(test)}')
print(f'- Number of classes: {len(train.classes)}')
print(f'- Classes: {train.classes}')

# Load the pre-trained ResNet-18 and print its last three child modules to see
# what sits at the end of the network.
model = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
for layer in list(model.children())[-3:]:
    print(layer)

# Keep every child module except the last one (the final fully connected layer),
# so the network outputs features instead of class scores.
model = torch.nn.Sequential(*list(model.children())[:-1])
# Evaluation mode: batch-norm layers use their stored running statistics.
model.eval()
print('Evaluation mode set')

- Trainset size: 3680
- Testset size: 3669
- Number of classes: 37
- Classes: ['Abyssinian', 'American Bulldog', 'American Pit Bull Terrier', 'Basset Hound', 'Beagle', 'Bengal', 'Birman', 'Bombay', 'Boxer', 'British Shorthair', 'Chihuahua', 'Egyptian Mau', 'English Cocker Spaniel', 'English Setter', 'German Shorthaired', 'Great Pyrenees', 'Havanese', 'Japanese Chin', 'Keeshond', 'Leonberger', 'Maine Coon', 'Miniature Pinscher', 'Newfoundland', 'Persian', 'Pomeranian', 'Pug', 'Ragdoll', 'Russian Blue', 'Saint Bernard', 'Samoyed', 'Scottish Terrier', 'Shiba Inu', 'Siamese', 'Sphynx', 'Staffordshire Bull Terrier', 'Wheaten Terrier', 'Yorkshire Terrier']
Sequential(
  (0): BasicBlock(
    (conv1): Conv2d(256, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
 

In [19]:
def extract_features(feature_model, loader):
    """Run ``feature_model`` over every batch of ``loader`` and collect the results.

    Parameters
    ----------
    feature_model : callable
        Maps a batch of images to a batch of feature tensors.
    loader : iterable
        Yields ``(images, labels)`` batches.

    Returns
    -------
    tuple of numpy.ndarray
        ``(features, labels)``; each batch output is flattened to one row per
        image and all batches are concatenated along axis 0.
    """
    feature_batches = []
    label_batches = []
    # Inference only: without no_grad() every forward pass would record an
    # autograd graph that is never used, wasting memory and time.
    with torch.no_grad():
        for images, labels in loader:
            outputs = feature_model(images)
            # Collapse everything after the batch dimension into one vector per image.
            outputs = outputs.view(outputs.size(0), -1)
            feature_batches.append(outputs.numpy())
            label_batches.append(labels.numpy())
    return np.concatenate(feature_batches, axis=0), np.concatenate(label_batches, axis=0)


# Extract features for the training split
train_features, train_labels = extract_features(model, train_loader)

# Extract features for the test split
test_features, test_labels = extract_features(model, test_loader)

In [22]:
# Report the array shapes produced by the feature-extraction step.
print(f"Train features shape: {train_features.shape}")
print(f"Train labels shape: {train_labels.shape}")
print(f"Test features shape: {test_features.shape}")
print(f"Test labels shape: {test_labels.shape}")

# Scale every feature vector (row) to unit L2 length; keepdims=True keeps the
# norms as a column so the division broadcasts row by row.
train_features_norm = np.linalg.norm(train_features, axis=1, keepdims=True)
test_features_norm = np.linalg.norm(test_features, axis=1, keepdims=True)
train_features = np.divide(train_features, train_features_norm)
test_features = np.divide(test_features, test_features_norm)

# Fit a logistic-regression classifier on the training features, then predict
# a class for every test feature vector.
classifier = LogisticRegression(max_iter=1000).fit(train_features, train_labels)
predicted_labels = classifier.predict(test_features)

print(f"Predicted labels shape: {predicted_labels.shape}")

# Mean-per-class accuracy computed by hand: accuracy restricted to the test
# samples of each label, then averaged over all labels.
class_accuracy = []
for class_index in range(len(train.classes)):
    in_class = test_labels == class_index
    class_accuracy.append(accuracy_score(test_labels[in_class], predicted_labels[in_class]))
print(f"Mean per class accuracy: {np.mean(class_accuracy):.3f}")

# Same metric via scikit-learn, as a cross-check of the manual computation.
accuracy = balanced_accuracy_score(test_labels, predicted_labels)
print(f"Mean-per-class accuracy using sklearn: {accuracy:.3f}")


Train features shape: (3680, 512)
Train labels shape: (3680,)
Test features shape: (3669, 512)
Test labels shape: (3669,)
Predicted labels shape: (3669,)
Mean per class accuracy: 0.894
Mean-per-class accuracy using sklearn: 0.894


In [21]:
# Print the per-class accuracies followed by the dataset's class names.
# Position i of the accuracy list was computed for label i; the names are
# presumably in the same label order (verify against the dataset's class index).
classes = train.classes
print(f'List of each classes accuracies: \n{class_accuracy}')
print(f'corresponding to: \n{classes}')

List of each classes accuracies: 
[0.826530612244898, 0.91, 0.52, 0.9, 0.96, 0.88, 0.82, 0.9318181818181818, 0.9494949494949495, 0.77, 0.91, 0.8041237113402062, 0.92, 0.94, 0.98, 0.97, 0.98, 0.96, 1.0, 1.0, 0.82, 0.87, 0.98, 0.89, 0.93, 0.93, 0.74, 0.83, 0.99, 1.0, 0.98989898989899, 0.99, 0.89, 0.91, 0.5393258426966292, 0.91, 0.95]
corresponding to: 
['Abyssinian', 'American Bulldog', 'American Pit Bull Terrier', 'Basset Hound', 'Beagle', 'Bengal', 'Birman', 'Bombay', 'Boxer', 'British Shorthair', 'Chihuahua', 'Egyptian Mau', 'English Cocker Spaniel', 'English Setter', 'German Shorthaired', 'Great Pyrenees', 'Havanese', 'Japanese Chin', 'Keeshond', 'Leonberger', 'Maine Coon', 'Miniature Pinscher', 'Newfoundland', 'Persian', 'Pomeranian', 'Pug', 'Ragdoll', 'Russian Blue', 'Saint Bernard', 'Samoyed', 'Scottish Terrier', 'Shiba Inu', 'Siamese', 'Sphynx', 'Staffordshire Bull Terrier', 'Wheaten Terrier', 'Yorkshire Terrier']


- Discuss the classifier you used:

I used a logistic regression classifier (scikit-learn's `LogisticRegression`) for this image classification task. It is a linear classifier that is most often introduced for binary classification, but it extends to multi-class problems, and it was interesting to see how it performs on the 37 pet classes when trained on the features extracted from the last hidden layer of the pre-trained CNN (ResNet-18).
The classifier uses the features extracted from ResNet-18 (output of the last hidden layer) as an input to predict the class labels of the images in the Oxford Pet Dataset. These features are first normalized to unit length by dividing by the L2 norm to scale the features and therefore improve the performance of the classifier.
Logistic regression is a linear classifier and may not be the ideal choice for complex datasets with non-linear relationships such as pet images; however, combining it with a CNN feature extractor (ResNet-18) has given acceptable results in this case, with a mean-per-class accuracy of 0.894.


##### Sources:
- https://pytorch.org/vision/stable/models.html
- ChatGPT, StackExchange, StackOverflow
- https://learnopencv.com/pytorch-for-beginners-image-classification-using-pre-trained-models/ 
- https://www.kaggle.com/code/leifuer/intro-to-pytorch-loading-image-data
- https://wandb.ai/shweta/Activation%20Functions/reports/Activation-Functions-Compared-With-Experiments--VmlldzoxMDQwOTQ#the-mish-activation-function
- In-class MNIST Tutorial (google colab), CIFAR Tutorial
- https://pytorch.org/docs/stable/generated/torch.linalg.norm.html#torch.linalg.norm