In [10]:
import torch
import torchvision
import torchvision.models as models
import torchvision.transforms as transforms
import numpy as np
#imports for problem 2
import matplotlib.pyplot as plt 
from PIL import Image
import json
from mpl_toolkits.axes_grid1 import ImageGrid

#imports for problem 3
from sklearn.linear_model import LogisticRegression
from torch.utils.data import DataLoader
#from torchvision.datasets import ImageFolder
from sklearn.metrics import accuracy_score, balanced_accuracy_score

#imports for problem 5
from timm import create_model

## Problem 5: Vision Transformers
Instead of transfer learning with a CNN, I did a transfer learning with a Vision Transformer. I identified a strong vision transformer architecture for transfer learning that was pre-trained on ImageNet-1K (note that some pre-trained models are pre-trained on larger datasets). We suggest using Swin Transformers or later incarnations of them.

In [7]:
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) #Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])
])

#dataset and data loaders
train = torchvision.datasets.OxfordIIITPet(root='./data', split='trainval', transform=transform, download=True)
test = torchvision.datasets.OxfordIIITPet(root='./data', split='test', transform=transform, download=True)
train_loader = torch.utils.data.DataLoader(train, batch_size=64, shuffle=True)
test_loader = torch.utils.data.DataLoader(test, batch_size=64, shuffle=True)

print(f'- Trainset size: {len(train)}')
print(f'- Testset size: {len(test)}')
print(f'- Number of classes: {len(train.classes)}')
print(f'- Classes: {train.classes}')

#load pre-trained model, remove last fully connected layer and set to evaluation mode
model_name = 'swin_tiny_patch4_window7_224'
num_classes = 1000
pretrained = True
model = timm.create_model(model_name, pretrained=pretrained, num_classes=num_classes)
print(list(model.children())[-2])
print(list(model.children())[-1])

model = torch.nn.Sequential(*list(model.children())[:-1])
model.eval()
print('Evaluation mode set')

- Trainset size: 3680
- Testset size: 3669
- Number of classes: 37
- Classes: ['Abyssinian', 'American Bulldog', 'American Pit Bull Terrier', 'Basset Hound', 'Beagle', 'Bengal', 'Birman', 'Bombay', 'Boxer', 'British Shorthair', 'Chihuahua', 'Egyptian Mau', 'English Cocker Spaniel', 'English Setter', 'German Shorthaired', 'Great Pyrenees', 'Havanese', 'Japanese Chin', 'Keeshond', 'Leonberger', 'Maine Coon', 'Miniature Pinscher', 'Newfoundland', 'Persian', 'Pomeranian', 'Pug', 'Ragdoll', 'Russian Blue', 'Saint Bernard', 'Samoyed', 'Scottish Terrier', 'Shiba Inu', 'Siamese', 'Sphynx', 'Staffordshire Bull Terrier', 'Wheaten Terrier', 'Yorkshire Terrier']


  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]
Downloading: "https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window7_224_22kto1k.pth" to /Users/aya/.cache/torch/hub/checkpoints/swin_base_patch4_window7_224_22kto1k.pth


LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
Linear(in_features=1024, out_features=1000, bias=True)
Evaluation mode set


In [8]:
#extract features for training 
train_features = []
train_labels = []
for images, labels in train_loader:
    features = model(images)
    features = features.view(features.size(0), -1)
    train_features.append(features.detach().numpy())
    train_labels.append(labels.detach().numpy())
train_features = np.concatenate(train_features, axis=0)
train_labels = np.concatenate(train_labels, axis=0)

#extract features for test 
test_features = []
test_labels = []
for images, labels in test_loader:
    features = model(images)
    features = features.view(features.size(0), -1)
    test_features.append(features.detach().numpy())
    test_labels.append(labels.detach().numpy())
test_features = np.concatenate(test_features, axis=0)
test_labels = np.concatenate(test_labels, axis=0)

In [9]:
print(f"Train features shape: {train_features.shape}")
print(f"Train labels shape: {train_labels.shape}")
print(f"Test features shape: {test_features.shape}")
print(f"Test labels shape: {test_labels.shape}")
#normalize train and test features
train_features_norm = np.linalg.norm(train_features, axis=1, keepdims=True)
train_features = train_features / train_features_norm
test_features_norm = np.linalg.norm(test_features, axis=1, keepdims=True)
test_features = test_features / test_features_norm

#train logistic regression classifier and predict classes for test data
classifier = LogisticRegression(max_iter=1000)
classifier.fit(train_features, train_labels)
predicted_labels = classifier.predict(test_features)

print(f"Predicted labels shape: {predicted_labels.shape}")

#compute overall accuracy
accuracy = accuracy_score(test_labels, predicted_labels)
print(f"Overall accuracy: {accuracy:.3f}")
#compute mean per class accuracy
class_accuracy = []
for i in range(len(train.classes)):
    class_accuracy.append(accuracy_score(test_labels[test_labels==i], predicted_labels[test_labels==i]))
print(f"Mean per class accuracy: {np.mean(class_accuracy):.3f}")
#computing mean-per-class accuracy using sklearn
accuracy = balanced_accuracy_score(test_labels, predicted_labels)
print(f"Mean-per-class accuracy using sklearn: {accuracy:.3f}")


Train features shape: (3680, 50176)
Train labels shape: (3680,)
Test features shape: (3669, 50176)
Test labels shape: (3669,)
Predicted labels shape: (3669,)
Overall accuracy: 0.938
Mean per class accuracy: 0.938
Mean-per-class accuracy using sklearn: 0.938


In [14]:
print(f'List of each classes accuracies: \n{class_accuracy}')
classes = train.classes
print(f'corresponding to: \n{classes}')

List of each classes accuracies: 
[0.9285714285714286, 0.95, 0.72, 0.98, 0.98, 0.96, 0.89, 0.9886363636363636, 0.9191919191919192, 0.74, 0.96, 0.7835051546391752, 0.99, 1.0, 1.0, 1.0, 1.0, 1.0, 0.9797979797979798, 1.0, 0.83, 0.94, 1.0, 0.93, 0.99, 0.99, 0.75, 0.84, 0.99, 1.0, 1.0, 0.99, 0.95, 0.99, 0.7865168539325843, 0.96, 0.99]
corresponding to: 
['Abyssinian', 'American Bulldog', 'American Pit Bull Terrier', 'Basset Hound', 'Beagle', 'Bengal', 'Birman', 'Bombay', 'Boxer', 'British Shorthair', 'Chihuahua', 'Egyptian Mau', 'English Cocker Spaniel', 'English Setter', 'German Shorthaired', 'Great Pyrenees', 'Havanese', 'Japanese Chin', 'Keeshond', 'Leonberger', 'Maine Coon', 'Miniature Pinscher', 'Newfoundland', 'Persian', 'Pomeranian', 'Pug', 'Ragdoll', 'Russian Blue', 'Saint Bernard', 'Samoyed', 'Scottish Terrier', 'Shiba Inu', 'Siamese', 'Sphynx', 'Staffordshire Bull Terrier', 'Wheaten Terrier', 'Yorkshire Terrier']


In [19]:
#copy pasting mean per class accuracies found in problem 3
class_acc_pb3 = [0.826530612244898, 0.91, 0.52, 0.9, 0.96, 0.88, 0.82, 0.9318181818181818, 0.9494949494949495, 0.77, 0.91, 
0.8041237113402062, 0.92, 0.94, 0.98, 0.97, 0.98, 0.96, 1.0, 1.0, 0.82, 0.87, 0.98, 0.89, 0.93, 0.93, 0.74, 0.83, 0.99, 1.0, 
0.98989898989899, 0.99, 0.89, 0.91, 0.5393258426966292, 0.91, 0.95]
n = len(class_accuracy)
best_vt, best_cnn, same_perf = [], [], []
for i in range(n):
    if class_accuracy[i] > class_acc_pb3[i]:
        best_vt.append(classes[i])
    elif class_accuracy[i] < class_acc_pb3[i]:
        best_cnn.append(classes[i])
    elif class_accuracy[i] == class_acc_pb3[i]:
        same_perf.append(classes[i])

print(f'Using a Vision Transformer (Swin Transformer) gives a better performance for the following classes: \n {best_vt}')
print(f'Using a ResNet18 instead gives a better performance for the following classes: \n {best_cnn}')
print(f'The two methods have the same accuracy for the following classes: \n {same_perf}')

Using a Vision Transformer (Swin Transformer) gives a better performance for the following classes: 
 ['Abyssinian', 'American Bulldog', 'American Pit Bull Terrier', 'Basset Hound', 'Beagle', 'Bengal', 'Birman', 'Bombay', 'Chihuahua', 'English Cocker Spaniel', 'English Setter', 'German Shorthaired', 'Great Pyrenees', 'Havanese', 'Japanese Chin', 'Maine Coon', 'Miniature Pinscher', 'Newfoundland', 'Persian', 'Pomeranian', 'Pug', 'Ragdoll', 'Russian Blue', 'Scottish Terrier', 'Siamese', 'Sphynx', 'Staffordshire Bull Terrier', 'Wheaten Terrier', 'Yorkshire Terrier']
Using a ResNet18 instead gives a better performance for the following classes: 
 ['Boxer', 'British Shorthair', 'Egyptian Mau', 'Keeshond']
The two methods have the same accuracy for the following classes: 
 ['Leonberger', 'Saint Bernard', 'Samoyed', 'Shiba Inu']


- Briefly discuss the Vision Transformer you selected. 

As we are handling an image classification task, I have used Swin Transformer. It is an improved version of the original Vision Transformer model for image classification tasks. 
It 'builds hierarchical feature maps by merging image patches in deeper layers and has linear computation complexity to input image size due to computation of self-attention only within each local window' (https://paperswithcode.com/method/swin-transformer). In other words, its hierarchical design can efficiently handle large images by dividing them into smaller patches and processing them with a window-based mechanism. This allows the transformer to capture both local and global contextual information from the images while reducing computational cost and memory requirement, explaining its improved accuracy and ability to generalize well.
Moreover, it has has been pretrained on ImageNet-1K and has shown good performance, performing better than other models like CNNs (which we have proved above) and other transformer-based architectures.

- Compare performance of the Vision Transformer to the CNN you used earlier in terms of overall performance. 

The Swin transformer has achieved an overall performance of 93.8% compared to 89.4% for the CNN previously used (resnet-18) which is a better performance as expected (+3.4%). Its hierarchical design and ability to capture both local and global contextual information as stated before are probably the reasons to this higher accuracy.

- Are there some images that the CNN gets correct but the Swin Transformer misclassifies, and vice versa?

Using a Vision Transformer (Swin Transformer) gives a better performance for the following classes: 
 ['Abyssinian', 'American Bulldog', 'American Pit Bull Terrier', 'Basset Hound', 'Beagle', 'Bengal', 'Birman', 'Bombay', 'Chihuahua', 'English Cocker Spaniel', 'English Setter', 'German Shorthaired', 'Great Pyrenees', 'Havanese', 'Japanese Chin', 'Maine Coon', 'Miniature Pinscher', 'Newfoundland', 'Persian', 'Pomeranian', 'Pug', 'Ragdoll', 'Russian Blue', 'Scottish Terrier', 'Siamese', 'Sphynx', 'Staffordshire Bull Terrier', 'Wheaten Terrier', 'Yorkshire Terrier']
Using a ResNet18 instead gives a better performance for the following classes: 
 ['Boxer', 'British Shorthair', 'Egyptian Mau', 'Keeshond']
The two methods have the same accuracy for the following classes: 
 ['Leonberger', 'Saint Bernard', 'Samoyed', 'Shiba Inu']


##### Sources:
- https://pytorch.org/vision/stable/models.html
- ChatGPT, StackExchange, StackOverflow
- https://learnopencv.com/pytorch-for-beginners-image-classification-using-pre-trained-models/ 
- https://www.kaggle.com/code/leifuer/intro-to-pytorch-loading-image-data
- https://wandb.ai/shweta/Activation%20Functions/reports/Activation-Functions-Compared-With-Experiments--VmlldzoxMDQwOTQ#the-mish-activation-function
- In-class MNIST Tutorial (google colab), CIFAR Tutorial
- https://pytorch.org/docs/stable/generated/torch.linalg.norm.html#torch.linalg.norm
- https://medium.com/pythoneers/vision-transformers-an-innovative-approach-to-image-processing-3387c398d67f
- https://paperswithcode.com/method/swin-transformer
- https://towardsdatascience.com/batch-norm-explained-visually-how-it-works-and-why-neural-networks-need-it-b18919692739

