In [1]:
!pip install -q pytorch_lightning

[K     |████████████████████████████████| 585 kB 12.8 MB/s 
[K     |████████████████████████████████| 141 kB 55.9 MB/s 
[K     |████████████████████████████████| 596 kB 60.1 MB/s 
[K     |████████████████████████████████| 419 kB 41.2 MB/s 
[?25h

In [2]:
!pip install -q pyspark

[K     |████████████████████████████████| 281.3 MB 38 kB/s 
[K     |████████████████████████████████| 199 kB 53.6 MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [3]:
## Databricks notebook source
import os
import glob
from PIL import Image
import torch
import torch.nn as nn
import torchvision
from torch.utils.data import Dataset, DataLoader
from torchvision.datasets import ImageFolder
from torch.multiprocessing import cpu_count
import pytorch_lightning as pl
from torch.multiprocessing import cpu_count
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import pandas as pd
from sklearn.preprocessing import normalize
from pyspark.sql.functions import col
from joblib import dump, load
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from pyspark.sql.types import DoubleType
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [4]:
# Mount Google Drive at /content/drive so that files stored there can be
# accessed through the local filesystem; the checkpoint, dataset and output
# paths used later in this notebook all live under '/content/drive/My Drive/'.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Rebuild a ResNet-18 backbone (every child module except the last one) and
# load into it the weights saved from the previously trained SimCLR model, so
# the backbone can be used for inference.
resnet18_new = torchvision.models.resnet18()
backbone_new = nn.Sequential(*list(resnet18_new.children())[:-1])
# map_location='cpu' lets the checkpoint load on a runtime without a GPU even
# if it was saved from CUDA tensors; the model is moved to the target device
# later, when embeddings are generated.
# NOTE(review): torch.load unpickles the file - only load checkpoints from a
# trusted source.
ckpt = torch.load(
    '/content/drive/My Drive/individual_rec_models/individ_rec_modelsandhelpers/simclrresnet18embed.pth',
    map_location='cpu',
)
# The backbone weights are stored under the 'resnet18_parameters' key.
backbone_new.load_state_dict(ckpt['resnet18_parameters'])

<All keys matched successfully>

In [6]:
def generate_embeddings(model, device, dl):
    """Embed every example yielded by ``dl`` and return normalised vectors.

    Args:
        model: Module called on each image batch. Its output is flattened from
            dimension 1 onwards, so each example yields a single vector.
        device: Device the model and every image batch are moved to.
        dl: Iterable yielding ``(images, labels)`` batches. Any batch size is
            supported.

    Returns:
        A tuple ``(embeddings, labels)`` of NumPy arrays. ``embeddings`` has
        one row per example and has been passed through ``normalize``;
        ``labels`` is a flat array holding the labels in the same order.

    Raises:
        RuntimeError: If ``dl`` yields no batches (``torch.cat`` rejects an
            empty list).
    """
    model = model.to(device)
    model.eval()

    embedding_batches = []
    label_batches = []
    with torch.no_grad():
        for im, label in dl:
            embed = model(im.to(device)).flatten(start_dim=1)
            # Move each batch to host memory straight away so the accelerator
            # does not have to hold the embeddings of the whole dataset.
            embedding_batches.append(embed.cpu())
            # Flatten to 1-D so labels concatenate for any batch size
            # (calling .item() on a whole batch only works for batch_size=1).
            label_batches.append(torch.as_tensor(label).reshape(-1).cpu())

    embeddings = normalize(torch.cat(embedding_batches, 0).numpy())
    labels = torch.cat(label_batches, 0).numpy()
    return np.asarray(embeddings), labels

In [7]:
# Preprocessing applied to every image before it is embedded: resize to
# 224x224, convert to a tensor, then normalise each of the three channels
# with a fixed mean and standard deviation.
_preprocessing_steps = [
    torchvision.transforms.Resize((224, 224)),
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
]
class_transform = torchvision.transforms.Compose(_preprocessing_steps)

In [8]:
class CustomDataset(Dataset):
    """Dataset over ``<root>/<folder>/<animal_id>/*.jpg`` labelled by animal id.

    Each image's label is looked up from the name of the directory that
    contains it. Only files matching ``*.jpg`` are included.

    Args:
        root: Path to the root data directory.
        folder: Name of the split directory inside ``root`` (e.g. ``'val'``).
        transform: Callable applied to each RGB ``PIL.Image`` before it is
            returned.
        mappings: Optional dict mapping animal-id strings to integer labels,
            typically the mapping produced by the training set. When omitted
            (or empty), a mapping is built from the sorted animal directories
            found in ``<root>/<folder>``.
    """

    def __init__(self, root, folder, transform, mappings=None):
        super().__init__()
        # The path to the root directory and the specific folder in it
        self.root = root
        self.folder = folder
        # The images to be included; sorted so that index -> image is stable
        # regardless of the order in which the filesystem lists entries.
        self.img_dir = sorted(glob.glob(os.path.join(self.root, self.folder) + '/*/*.jpg'))

        # Get the unique animal identifiers
        self.animal_ids = self._get_animal_ids()

        if mappings:
            # Reuse the passed mapping (normally the one produced by the
            # training set) so labels agree across splits.
            self.class_to_idx = mappings
        else:
            # No mapping supplied: number the animal ids found in this split.
            self.animal_int_labels = list(range(len(self.animal_ids)))
            self.class_to_idx = dict(zip(self.animal_ids, self.animal_int_labels))

        # Define the preprocessing function
        self.transform = transform

    def _get_animal_ids(self):
        """Return the sorted animal-id directory names inside ``root/folder``.

        Returns an empty list when the split directory does not exist, which
        matches the empty image list ``glob`` produces in that case.
        """
        split_dir = os.path.join(self.root, self.folder)
        if not os.path.isdir(split_dir):
            return []
        # Only directories are animal ids; stray files in the split directory
        # are ignored. Sorting makes the fallback mapping deterministic.
        return sorted(
            name for name in os.listdir(split_dir)
            if os.path.isdir(os.path.join(split_dir, name))
        )

    def __len__(self):
        """Return the number of images in the dataset."""
        return len(self.img_dir)

    def __getiteminternal__(self, idx):
        """Load, transform and label the image at position ``idx``.

        Returns:
            ``(image, label)`` where ``image`` is the output of
            ``self.transform`` and ``label`` is the integer class index.

        Raises:
            KeyError: If the image's animal id is missing from
                ``class_to_idx``.
        """
        # Get the path to the image
        img_path = self.img_dir[idx]
        # The context manager closes the file handle; convert() returns an
        # independent in-memory image, so it stays usable afterwards.
        with Image.open(img_path) as raw_im:
            im = raw_im.convert('RGB')
        new_im = self.transform(im)
        # The animal id is the name of the directory containing the image
        animal_name = os.path.basename(os.path.dirname(img_path))
        label = self.class_to_idx[animal_name]
        # Return the transformed image and its integer class label
        return new_im, label

    def __getitem__(self, idx):
        """Return the ``(image, label)`` pair at position ``idx``."""
        return self.__getiteminternal__(idx)

In [9]:
# Build the three datasets. The training split is read with ImageFolder, and
# its class-to-index mapping is passed to the validation and test datasets so
# that integer labels agree across all three splits.
data_dir = '/content/drive/My Drive/leopard.coco/processed/'
training_dataset = ImageFolder(os.path.join(data_dir, 'train'), transform=class_transform)
val_dataset, test_dataset = [
    CustomDataset(
        data_dir,
        split,
        transform=class_transform,
        mappings=training_dataset.class_to_idx,
    )
    for split in ('val', 'test')
]

In [10]:
# Wrap the training, validation and test datasets in DataLoaders that share
# the same settings: one example per batch, dataset order preserved (no
# shuffling), four worker processes.
_loader_options = dict(batch_size=1, num_workers=4, shuffle=False)
train_dl = DataLoader(training_dataset, **_loader_options)
val_dl = DataLoader(val_dataset, **_loader_options)
test_dl = DataLoader(test_dataset, **_loader_options)

In [11]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [12]:
# Generate the embeddings and labels for the training dataset
train_embeddings, train_labels = generate_embeddings(
    model=backbone_new,
    device=device,
    dl=train_dl,
)

In [13]:
# Generate the embeddings and labels for the validation and test datasets
val_embeddings, val_labels = generate_embeddings(
    model=backbone_new,
    device=device,
    dl=val_dl,
)
test_embeddings, test_labels = generate_embeddings(
    model=backbone_new,
    device=device,
    dl=test_dl,
)

In [15]:
# Tune a scaler -> PCA -> KNN pipeline on the training embeddings with an
# exhaustive grid search scored by accuracy, then keep the best pipeline.

# Shuffled 5-fold cross validation with a fixed seed; K-fold is used to train
# the classifier since some classes will only have 1 example.
cv = KFold(n_splits=5, shuffle=True, random_state=1)

# The pipeline whose hyper-parameters are searched
pipe_knn = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('pca', PCA()),
    ('KNN', KNeighborsClassifier()),
])

# Search space, keyed by '<pipeline step>__<parameter>'
knn_param_grid = [{
    'pca__n_components': [0.8, 0.9, 0.95, 0.99],
    'KNN__n_neighbors': [1, 3, 5, 10],
    'KNN__weights': ['uniform', 'distance'],
    'KNN__metric': ['euclidean', 'manhattan', 'cosine'],
}]

knn_grid_search = GridSearchCV(
    estimator=pipe_knn,
    param_grid=knn_param_grid,
    scoring='accuracy',
    cv=cv,
)

# Fit the grid search on the training data and keep the best estimator found
clf = knn_grid_search.fit(train_embeddings, train_labels).best_estimator_

In [16]:
# Predict labels for the validation embeddings with the tuned KNN pipeline and
# report the accuracy. accuracy_score takes the true labels first and the
# predictions second.
knn_val_preds = clf.predict(val_embeddings)
print(f'KNN validation accuracy score: {accuracy_score(val_labels, knn_val_preds)}')

KNN validation accuracy score: 0.6017830609212481


In [17]:
# Predict labels for the test embeddings with the tuned KNN pipeline and
# report the accuracy. accuracy_score takes the true labels first and the
# predictions second.
knn_test_preds = clf.predict(test_embeddings)
print(f'KNN test accuracy score: {accuracy_score(test_labels, knn_test_preds)}')

KNN test accuracy score: 0.6196136701337296


In [18]:
# Persist the tuned KNN pipeline to Google Drive as a joblib file
_knn_model_path = '/content/drive/My Drive/individual_rec_models/individ_rec_modelsandhelpers/Panthera_pardus_knn.joblib'
dump(clf, _knn_model_path)

['/content/drive/My Drive/individual_rec_models/individ_rec_modelsandhelpers/Panthera_pardus_knn.joblib']

In [19]:
# Build the integer -> string id lookup by inverting the training set's
# class_to_idx mapping (string id -> integer), so predicted integer labels
# can be translated back to string ids.
res = {idx: name for name, idx in training_dataset.class_to_idx.items()}
# Save the leopard id look up dict for use with the Flask API
dump(res, '/content/drive/My Drive/individual_rec_models/individ_rec_modelsandhelpers/leopard_id_map.joblib')

['/content/drive/My Drive/individual_rec_models/individ_rec_modelsandhelpers/leopard_id_map.joblib']