# Pairwise Binary classification with Deep Learning

In this notebook we will train Multilayer Perceptron Neural Network to detect whether two images are from same class (car brand) or not. To train Perceptron, we will use image embeddings got from trained CNN model (SOTA models such as MobileNet).

# Set up

## Packages and requirements

In [1]:
# Major builtin libraries
import os
import gc
import time
import random
import typing as t
from copy import deepcopy
from collections import defaultdict

In [2]:
import warnings  # If you want to disable warnings
warnings.filterwarnings("ignore")

# For descriptive error messages
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [3]:
# Classic packages for data manipulation and visualization
import numpy as np
import pandas as pd
import polars as pl
import matplotlib.pyplot as plt

In [4]:
# Basic PyTorch
import torch
import torch.nn as nn
import torch.optim as optim  # Optimization algorithms and dynamic learning rate adjusting
import torch.nn.functional as F
# from torch.nn.modules.loss import _Loss  # For writing a custom Loss function
from torch.utils.data import DataLoader, Dataset  # For custom data presentation

In [5]:
# Utils
import joblib  # Pipelining, pickling (dump/load), parallel processing
from tqdm import tqdm  # Progress bar for training process
from tempfile import TemporaryDirectory

# Classic ML tools
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold  # Cross-Validation

In [6]:
# ML Metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from torchmetrics.classification import MulticlassF1Score, F1Score # F1 metric for multiclass

In [7]:
# Torch Computer Vision tools for images processing
from torchvision.io import read_image
from torchvision.transforms.functional import to_pil_image, to_grayscale, to_tensor
from torchvision import models  # Pretrained models

In [8]:
# Albumentations is an OS library for augmentations
import albumentations as A
from albumentations.pytorch import ToTensorV2
# import torchvision.transforms as T  # We can use torch augmentations instead

In [9]:
# Output text colorizing
from colorama import Back, Style

def print_highlighted(text: str, bgcolor=Back.YELLOW) -> None:
    """
    Function to print a text with colored background.
    """
    print(bgcolor + text + Style.RESET_ALL)

In [10]:
import wandb # MLOps platform to simplify and speed up the process of building ML models

In [11]:
wandb.login() # We log in via pop-up,
# wandb.login(key=api_key)  # but you can also log in manually with function args

[34m[1mwandb[0m: Currently logged in as: [33mremainedmind[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

## Configuration

In [12]:
CONFIG = {
    "seed": 2306,
    # "epochs": 20,
    "image_dimension": 256,  # Depends on pretrained model used
    "model_name": "SiamesePerceptron",  # Pretrained model we will use
    "embedding_size": 512,  # Embedding output size
    # "train_batch_size": 200,
    # "val_batch_size": 400,
    "learning_rate": 1e-3,
    "min_lr": 1e-8,
    "min_loss_delta": 1e-7, # To stop training on plateau
    "weight_decay": 1e-7,

}

In [13]:
wandb_run = wandb.init(project="cars-classification-project", config=CONFIG)

In [14]:
config = wandb.config
del CONFIG

Set Seed for Reproducibility

In [15]:
def set_seed(seed=42):
    """
    Sets the seed of the entire notebook so results are the same every time we run.
    This is for REPRODUCIBILITY.
    """
    np.random.seed(seed)
    torch.manual_seed(seed)

    # When running on the CuDNN backend, two further options can be set
    # torch.backends.cudnn.deterministic = True
    # torch.backends.cudnn.benchmark = False  # When False, this option makes CUDA reproducible, BUT the performance might suffer

    # Set a fixed value for the hash seed
    os.environ['PYTHONHASHSEED'] = str(seed)

set_seed(seed=config.seed)

# Data

For our MLP we will use image embeddings - output from CNN models. We already got this data before, so we just load it.

## Set data location

In [16]:
# config.repo = 'car-brands/'  # dataset name on Kaggle
config.repo = 'data/'  # dataset name on local device
# config.repo = 'car_brand_detection/'  # Google Collab

# config.root = '/kaggle/input/' + config.repo
# config.root = 'drive/MyDrive/' + config.repo
config.root = '../'  + config.repo

config.data_path = config.root + 'embeddings_and_labels.csv'
config.test_images_path = config.root + 'images/test'
config.test_labels = config.root + 'test_labels.csv'

config.mlp_model_path = 'saved_instances/SiamesePerceptron.pth'
config.save_model_to = f'{config.model_name}.pth'

In [17]:
df = pd.read_csv(config.data_path)
print(df.head())

          0         1         2         3         4         5         6  \
0 -0.001561  0.159091 -0.326709 -0.226603 -0.097365  0.204614 -0.142634   
1  0.229466 -0.186505 -0.329016 -0.650666  0.115301  0.149208  0.065496   
2 -0.203165  0.330612 -0.413488 -0.128357  0.013811  0.244605 -0.137738   
3 -0.052456  0.385235 -0.423291 -0.007358  0.031012  0.242327  0.118072   
4  0.092909  0.075529 -0.040490  0.137453  0.531280  0.219033 -0.100117   

          7         8         9  ...       503       504       505       506  \
0  0.208764  0.469837  0.069866  ...  0.410457 -0.024682  0.178265 -0.357340   
1  0.191487  0.428920 -0.016147  ...  0.195697  0.171068  0.037157  0.049534   
2  0.365148  0.385303  0.318809  ...  0.413026  0.040102  0.054195 -0.251548   
3  0.235620  0.464901  0.320997  ...  0.412529  0.015200  0.237078 -0.429586   
4  0.161784 -0.077920 -0.265161  ... -0.024520  0.143381 -0.099898  0.137324   

        507       508       509       510       511      label  
0 -

In [18]:
# Last column is a label, rest are embeddings
df.columns[-10:]

Index(['503', '504', '505', '506', '507', '508', '509', '510', '511', 'label'], dtype='object')

In [19]:
embeddings_bag = df[(df.columns[:-1])]
embeddings_bag.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,502,503,504,505,506,507,508,509,510,511
0,-0.001561,0.159091,-0.326709,-0.226603,-0.097365,0.204614,-0.142634,0.208764,0.469837,0.069866,...,0.111356,0.410457,-0.024682,0.178265,-0.35734,-0.073279,0.190458,-0.060786,0.364546,0.824516
1,0.229466,-0.186505,-0.329016,-0.650666,0.115301,0.149208,0.065496,0.191487,0.42892,-0.016147,...,0.13534,0.195697,0.171068,0.037157,0.049534,-0.224402,0.225593,0.180314,0.073213,0.500426


In [20]:
labels = df[df.columns[-1]]

In [21]:
print(labels.unique()[:10])

['Acura_MDX' 'Alfa Romeo_Giulietta' 'Audi_100' 'Audi_80' 'Audi_A1'
 'Audi_A3' 'Audi_A4' 'Audi_A5' 'Audi_A6' 'Audi_A7']


In [22]:
config.num_of_classes = labels.nunique()

Now we apply One Hot Encoding

In [23]:
def apply_label_encoding(labels: t.Union[pd.Series, np.array],
                         encoder_name: os.path,
                         action='encode',
     ):
    """
    One Hot encoding. We apply encoding by replacing the label column in dataframe.
    As for decoding data back, we work with vector-array (as it's most likely to
    be a prediction result)
    """
    encoder = LabelEncoder()
    if action == 'encode':
        # We transform dataframe here. Nothing returns
        # data = data.with_columns(pl.DataFrame(encoder.fit_transform(data[column]), schema=['label']))
        encoder_name = f"{encoder_name}_LEncoder.pkl"
        if encoder_name in os.listdir():
            with open(encoder_name, "rb") as fp:
                encoder: LabelEncoder = joblib.load(fp)
            labels = encoder.transform(labels)
            print_highlighted("Encoded with existing encoder.")
        else:
            labels = encoder.fit_transform(labels)
            with open(encoder_name, "wb") as fp:
                joblib.dump(encoder, fp)
        return labels
    elif action == 'decode':
        # We pass vector here. Result is a vector
        with open(f"{encoder_name}_LEncoder.pkl", "rb") as fp:
            encoder: LabelEncoder = joblib.load(fp)
        return encoder.inverse_transform(labels)

In [24]:
labels = pd.DataFrame(apply_label_encoding(labels, action='encode', encoder_name="embeddings_labels"), columns=['label'])
print(np.unique(labels)[:10])

[43mEncoded with existing encoder.[0m
[0 1 2 3 4 5 6 7 8 9]


In [25]:
apply_label_encoding(labels, action='decode', encoder_name="embeddings_labels")

array(['Acura_MDX', 'Acura_MDX', 'Acura_MDX', ..., 'ZIL_5301_Bychok',
       'ZIL_5301_Bychok', 'ZIL_5301_Bychok'], dtype=object)

## Pytorch Dataset to run model on

In [26]:
from sklearn.model_selection import train_test_split

In [27]:
shuffle_ratio = 21790/24564
train_embeddings_bag, test_embeddings_bag, train_labels, test_labels = train_test_split(embeddings_bag, labels, random_state=config.seed, train_size=shuffle_ratio, shuffle=False)

In [28]:
# If we shuffle the data, we need to reset indexes
train_labels.reset_index(drop=True, inplace=True)
test_labels.reset_index(drop=True, inplace=True)
test_embeddings_bag.reset_index(drop=True, inplace=True)
train_embeddings_bag.reset_index(drop=True, inplace=True)

## Feedforward model to process embeddings pairs

That's the schema of our network

In [29]:
class SiameseNetwork(nn.Module):
    def __init__(self, embedding_size):
        super().__init__()

        self.fc = nn.Sequential(
            nn.Linear(in_features=embedding_size, out_features=1024),
            nn.ReLU(),
            nn.Linear(in_features=1024, out_features=1024),
            nn.ReLU(),
            nn.Dropout(p=0.3, inplace=False),
            nn.Linear(in_features=1024, out_features=1),
            nn.Sigmoid()
        )

    def forward(self, x1, x2):
        square = (x1 - x2)**2
        square = square.to(torch.float32)
        # Pass the inputs through fully connected layers
        output = self.fc(square)
        return output

In [30]:
# But we will use trained and saved model
# perceptron_model = SiameseNetwork(config.embedding_size)

In [31]:
try:
    # Load weights from previously trained
    perceptron_model = torch.load(config.mlp_model_path, map_location=torch.device('cpu'))
except FileNotFoundError:
    print("No trained model found.")

### Device

In [32]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
perceptron_model = perceptron_model.to(device)
torch.cuda.empty_cache()

## Label prediction based of K Nearest

## How values match among nearest neighbors

In [33]:
from sklearn.neighbors import NearestNeighbors

In [34]:
neigh = NearestNeighbors(n_neighbors=7, metric='cosine')

We train Neighbors model on `train_embeddings_bag`, so the model will predict the indexes related with **that** array.

In [35]:
try:
    with open("saved_instances/KNeighbors_model.pkl", "rb") as fp:
        neigh: NearestNeighbors = joblib.load(fp)
    print_highlighted("Saved model was uploaded without training.")
except FileNotFoundError:
    neigh.fit(train_embeddings_bag.to_numpy())
    with open("saved_instances/KNeighbors_model.pkl", "wb") as fp:
        joblib.dump(neigh, fp)
    print_highlighted("Model was trained and saved.")

[43mSaved model was uploaded without training.[0m


In [36]:
found_nearest = neigh.kneighbors(test_embeddings_bag, return_distance=False)

In [37]:
torch.tensor(labels['label'])[found_nearest].shape

torch.Size([2774, 13])

In [38]:
def get_dataset_with_nearest(array_of_nearest: np.array):
    neighbors = []
    with tqdm(array_of_nearest, desc="Processing",unit="row") as process:
        for row_of_nearest in process:
            labels_of_closest = []
            for nearest_label in row_of_nearest:
                labels_of_closest.append(labels.iat[nearest_label, 0])  # Get scalar from current row and first column
            neighbors.append(labels_of_closest)
    label_array = pd.DataFrame(neighbors)
    return label_array

neighbors_labels = get_dataset_with_nearest(found_nearest)

Processing: 100%|██████████| 2774/2774 [00:00<00:00, 3282.13row/s]


In [39]:
print(neighbors_labels.head())

    0    1    2    3    4    5    6    7    8    9    10   11   12
0    0  408  279  280  582  423  279  593    5  194  465  226  143
1  138    0  131  229  138  138  131  229  350  350    5    5  350
2    0    0    0    0  176  176  176  176    0    0  356  176  176
3    1    1    1    1    1    1    1  192  192  481    1  115  192
4    1    1    1  287  287  528    1    1  481    1  225  483   21


In [40]:
neighbors_labels.nunique()

0     756
1     752
2     750
3     747
4     749
5     741
6     731
7     736
8     724
9     718
10    718
11    716
12    718
dtype: int64

All labels are covered.

Now we will check how do the neighbors labels match to each other.

In [41]:
def all_values_same(row):
    return all(row == row[0])

In [42]:
def some_values_same(row, number_of_same=3):
    unique_numbers = set(row)
    all_numbers = list(row)
    [all_numbers.remove(u) for u in unique_numbers]
    # Now we can select any number - list contains only numbers that represent the majority.
    try:
        value_to_compare = all_numbers[0]
    except IndexError:
        value_to_compare = row[0]

    return ((row == value_to_compare).sum() >= number_of_same)

In [43]:
neighbors_labels['are_labels_same'] = neighbors_labels.apply(all_values_same, axis=1)

In [44]:
neighbors_labels['three_are_same'] = neighbors_labels.loc[:, neighbors_labels.columns[:-1]].apply(some_values_same, axis=1)

Let's check how good our Neighbors model works

In [45]:
print(neighbors_labels)

        0    1    2    3    4    5    6    7    8    9   10   11   12  \
0       0  408  279  280  582  423  279  593    5  194  465  226  143   
1     138    0  131  229  138  138  131  229  350  350    5    5  350   
2       0    0    0    0  176  176  176  176    0    0  356  176  176   
3       1    1    1    1    1    1    1  192  192  481    1  115  192   
4       1    1    1  287  287  528    1    1  481    1  225  483   21   
...   ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...   
2769  758  758  151  758  758  757  151  151  755  757  152  151  151   
2770  758  758  758  758  758  758  758  758  758  758  758  758  758   
2771  759  759  759  759  759  759  759  759  759  759  759  759  759   
2772  759  160  759  154  160  160  154  160  160  160  152  677  296   
2773  759  759  438  663  343  343  343  343  343  343  343  343  759   

      are_labels_same  three_are_same  
0               False           False  
1               False            True  
2  

In [46]:
print(f"In {len(neighbors_labels[neighbors_labels['are_labels_same']]) / len(neighbors_labels) * 100}% of data classes have all same class in its' nearest vectors")

In 18.925739005046864% of data classes have all same class in its' nearest vectors


In [47]:
print(f"In {len(neighbors_labels[neighbors_labels['three_are_same']]) / len(neighbors_labels) * 100}% of data classes have at least 3 of same class in its' nearest vectors")

In 96.39509733237203% of data classes have at least 3 of same class in its' nearest vectors


So, when we have target object to predict label for, we can take some odd number of neighbors (e.g. $N=5$), and then do a Vote using majority label! It is enough to have at least $\frac{N-1}{2}$ of same class to make a strong prediction. Let's see does it work at all.

In [48]:
# embeddings_bag = embeddings_bag.to_numpy()
# labels = labels.to_numpy()
# labels = apply_label_encoding(labels, action='decode', encoder_name="embeddings_labels")

In [49]:
from collections import Counter

In [50]:
def predict_with_nearest(all_nearest_indexes, target_labels, labels_of_train_data):

    correct = 0
    total = 0

    for i, row in enumerate(tqdm(all_nearest_indexes)):
        nearest_indexes = row
        nearest_classes = [labels_of_train_data[n].item() for n in nearest_indexes]
        counter = Counter(nearest_classes)
        predicted_class = counter.most_common(1)[0][0]

        target_label = target_labels[i]  # Actual class
        correct += int(target_label == predicted_class)
        total += 1
    print_highlighted(f"Accuracy is: {correct/total}")


In [51]:
# Here we get an array of size B x N, where B is a batch size we want to test at once; N is a number of neighbors (we will take all test dataset)
bag_of_nearest = neigh.kneighbors(test_embeddings_bag, return_distance=False)

In [52]:
predict_with_nearest(bag_of_nearest, test_labels.to_numpy(), train_labels.to_numpy(),)

100%|██████████| 2774/2774 [00:00<00:00, 44366.40it/s]

[43mAccuracy is: 0.8276856524873828[0m





In [53]:
del bag_of_nearest
del neighbors_labels

Let's test our Model by prediction the similarity between objects that are nearest in the vector area. For that case we will use our NearestNeighbors trained algorithm.

We will upload our backbone model to get embeddings of test photos.

In [54]:
config.embedding_model_path = 'saved_instances/ArcFace_mobilenet_v2.pth'

In [55]:
def get_input_feature_size(classifier: nn.Sequential) -> int:
    for module in classifier.modules():
        if isinstance(module, nn.Linear):
            return module.in_features

In [56]:
def get_model(model_name='resnet18', from_path=None, pretrained=True,) -> torch.nn.Module:
    """
        Multipurpose function to load the model. For our task we will use fully trained model. If you don't have such, you may
    download any pretrained model cut last layer - you both `pretrained` and `get_embeddings` set to True.
    :param model_name:
    :param get_embeddings: whether to cut the classifier layer
    :return:
    """
    if from_path:
        try:
            model = torch.load(from_path, map_location=torch.device("cuda" if torch.cuda.is_available() else "cpu"))
            return model
        except FileNotFoundError:
            raise
    elif model_name:
        model = getattr(models, model_name) # We use builtin function
        model = model(
            weights=('DEFAULT' if pretrained else None)
        )

    model.classifier = nn.Sequential(
        # nn.Dropout(p=0.3, inplace=True),
        nn.Linear(in_features=get_input_feature_size(model.classifier),
                  out_features=config.embedding_size, bias=True
                  ),

    )
    return model

In [57]:
embedding_model = get_model(
    from_path=config.embedding_model_path,
)

# Evaluation

In [58]:
f1_score = MulticlassF1Score(num_classes=config.num_of_classes)
f1_score = f1_score.to(device)

## Get embeddings online

Let's build a test dataset of images. Then, we will pass them through the backbone and try to predict a label. So, we will use everything: backbone CNN model, KNeighbors model, Binary Classifier model.

In [59]:
def get_file_path_by_id(file_id, dir=config.root):
    return os.path.join(dir, str(file_id) + ".jpg")

In [60]:
data_transforms = {
    # Only validation is needed.

    "val": A.Compose([
        #         A.ToRGB(),
        A.Resize(config.image_dimension, config.image_dimension),
        A.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
            max_pixel_value=255.0,
            p=1.0
        ),
        ToTensorV2()], p=1.)
}

In [61]:
class CustomImagesDataset(Dataset):
    """
    """
    def __init__(self, data: pd.DataFrame=None, images_path: os.path=None, labels_path:os.path=None, transform_images: A.Compose=None, encoder_name=None):
        """

        """
        super().__init__()
        assert (data is not None) or (labels_path is not None and images_path is not None)

        if data is None:
            data = pd.read_csv(labels_path)
            data['file_path'] = data['id'].apply(get_file_path_by_id, dir=images_path)

        self.images_paths = data['file_path']


        self.encoder_name = encoder_name if encoder_name else self.__hash__()  # We use hash as a unique name
        print_highlighted(f"Label Encoder saved with id `{self.encoder_name}`")
        self.labels = apply_label_encoding(labels=data['label'], action='encode', encoder_name=self.encoder_name)

        # self.labels = data['label']
        #         self.indexes = data['id'].values
        self.transform_images = transform_images
        self.__set_dataset_len()

    def __set_dataset_len(self):
        self.length = self.labels.shape[0] # Number of rows

    def __len__(self):
        """
        We calculate the len in another function, so that we are able to slice.
        """
        return self.length

    def __getitem__(self, index) -> tuple[torch.Tensor, int]:
        """ Function to return item by indexing the dataset """

        if not isinstance(index, int) and isinstance(index, slice):
            # It's not an index, but slice.
            # We will return the part of data by making a copy of the dataset
            index: slice
            self = deepcopy(self)
            self.length = index.stop  # Cut the length of dataset.
            self.labels = self.labels[:self.length]
            return self
        assert self.__len__() >= index

        image = to_pil_image(read_image(self.images_paths[index]))
        if self.transform_images:
            # Albumentations requires us to convert image to Numpy Array
            image = self.transform_images(image=np.array(image))['image']

        label = self.labels[index]
        return image, label

In [62]:
test_images_dataset = CustomImagesDataset(images_path=config.test_images_path, labels_path=config.test_labels, transform_images=data_transforms['val'], encoder_name='test_labels')

[43mLabel Encoder saved with id `test_labels`[0m
[43mEncoded with existing encoder.[0m


In [63]:
len(test_images_dataset)

2775

In [64]:
config.batch_size = 256

In [65]:
test_dataloader = DataLoader(
    test_images_dataset,
    batch_size=config.batch_size,
    shuffle=False,
    num_workers=os.cpu_count() % 4,
)

Note: while using Siamese Network, we are able to pass one vector as `X1` and array of vectors as `X2`. So, we can pass one target vector and batch of nearest vectors - thus we get batch of similarity ratio

In [66]:
# Example
k = 5
with torch.no_grad():
    print(perceptron_model(torch.rand(4, 1, 512), torch.rand(4, k, 512)).shape)
del k

torch.Size([4, 5, 1])


In [67]:
def get_probable_label(weights, labels) -> int:
    """
    This function is supposed to process two vectors:
    `labels` represents sequence of labels of neighbors, and values MAY REPEAT
    `weights` show how probable each value is.
    The main problem to solve here is that it might be two same labels in array (so it's more probable).
    :param weights:
    :param labels:
    :return:
    """
    probs = defaultdict(float)
    for i, label in enumerate(labels):
        probs[label.item()] += weights[i].item()

    label_with_max_proba, _ = max(probs.items(), key=lambda x: x[1])  # Iterate over values, but get the key.
    return label_with_max_proba


In [68]:
@torch.inference_mode()
def predict_with_weighted_nearest(X: torch.tensor, backbone_model, binary_clf_model, device):
    """
        Function to enhance the prediction of KNeighbors model. We still use nearest neighbors to get probable labels,
    and then, we use Binary Classificator MLP to compare neighbors embeddings with our target image embedding.
    :param X: image or sequence of images - normalized 3x256x256 vectors;
    :param backbone_model: CNN network without classifier layer - to get image embedding;
    :param binary_clf_model: embedding classifier that detects whether two vectors are of same class (label);
    :param device:
    :return: probabilities of predicted labels
    """
    backbone_model = backbone_model.to(device)
    binary_clf_model = binary_clf_model.to(device)

    if X.dim() == 3:
        # Means it's one image, not a batch
        X = torch.unsqueeze(X, dim=0)  # Turn it into batch
        print('unsqueezed.')
    elif X.dim() == 4:
        pass

    embedding = backbone_model(X)  # This vector is two-dimensional as it is a batch

    bag_of_nearest_indexes = neigh.kneighbors(embedding, return_distance=False)  # Shape is `B x k`, k are neighbors
    # bag_of_nearest_indexes = torch.tensor(bag_of_nearest_indexes).to(device)
    # print(bag_of_nearest_indexes)

    # Now we select embeddings by their indexes. For the case of indexing array by another array, numpy.take (https://numpy.org/doc/stable/reference/generated/numpy.take.html) works fine.
    # We're indexing the `N x 512` array by the `B x K` array, and the result is the `B x K x 512` array of embeddings.
    # Try following to see:
    # print(train_embeddings_bag.to_numpy().shape, bag_of_nearest_indexes.shape, np.take(train_embeddings_bag.to_numpy(), bag_of_nearest_indexes, axis=0).shape)

    batch_of_nearest_vectors = np.take(train_embeddings_bag.to_numpy(), bag_of_nearest_indexes, axis=0)
    batch_of_nearest_labels = torch.tensor(np.squeeze(np.take(train_labels.to_numpy(), bag_of_nearest_indexes, axis=0))) # Shape of (B, K)

    # Note: `batch_of_nearest` is 3-dimensional. If we are to compare nearest with the target embedding,
    # we have to adjust this vector to same dimension.
    embedding = torch.unsqueeze(embedding, dim=1) # From shape (B, 512) to (B, 1, 512)
    predicted_similarity = binary_clf_model(embedding, batch_of_nearest_vectors)
    predicted_similarity = torch.squeeze(predicted_similarity)  # From (B, K, 1) to (B, K)

    # Now we can get the highest probability, and apply it as our prediction. But there is also
    # more stable way: to sum probabilities of same class firstly.

    # We normalize the proba among the nearest (as they are too close originally), but it's not really necessary
    predicted_similarity = F.normalize(predicted_similarity, dim=1)


    # Now we are going to iterate over nearest to get most probable label per each item
    predictions = []
    for row_of_similarities, row_of_labels in zip(predicted_similarity, batch_of_nearest_labels):
        predictions.append(get_probable_label(weights=row_of_similarities, labels=row_of_labels))
    return torch.tensor(predictions)



In [69]:
def evaluate_models(dataloader) -> pl.DataFrame:
    total = 0
    correct = 0
    preds = []
    all_labels = []
    with tqdm(dataloader, desc="Processing...",unit="batch") as process:
        for x, labels in process:
            y_pred = predict_with_weighted_nearest(X=x, backbone_model=embedding_model, binary_clf_model=perceptron_model, device=device)
            correct += (labels == y_pred).int().sum().item()
            total += labels.size(0)
            preds.extend(list(y_pred.cpu().numpy()))
            all_labels.extend(list(labels.cpu().numpy()))

    print_highlighted(f"Accuracy is: {correct/total}; F1 score: {f1_score(torch.tensor(all_labels), torch.tensor(preds))}")
    return pl.DataFrame(zip(preds, all_labels), schema=["nn_Predicted", "Actual labels"])

In [70]:
print(evaluate_models(test_dataloader))

Processing...: 100%|██████████| 11/11 [02:32<00:00, 13.83s/batch]

[43mAccuracy is: 0.827027027027027; F1 score: 0.8107395768165588[0m
shape: (2_775, 2)
┌──────────────┬───────────────┐
│ nn_Predicted ┆ Actual labels │
│ ---          ┆ ---           │
│ i64          ┆ i64           │
╞══════════════╪═══════════════╡
│ 206          ┆ 0             │
│ 279          ┆ 0             │
│ 138          ┆ 0             │
│ 0            ┆ 0             │
│ …            ┆ …             │
│ 758          ┆ 758           │
│ 759          ┆ 759           │
│ 160          ┆ 759           │
│ 343          ┆ 759           │
└──────────────┴───────────────┘





## Prediction with ArcFace classifier

In [71]:
model = embedding_model

In [72]:
class ArcFaceLoss(nn.Module):
    def __init__(self, embedding_size, num_classes, margin=0.5, scale=64):
        super().__init__()
        self.embedding_size = embedding_size
        self.num_classes = num_classes
        self.margin = margin
        self.scale = scale
        self.weight = nn.Parameter(torch.ones([num_classes, embedding_size], dtype=torch.float32, device=device))
        nn.init.xavier_uniform_(self.weight)

    def forward(self, embeddings, labels):
        # Normalize the embeddings and weights
        labels = labels.to(torch.long)

        embeddings = F.normalize(embeddings, dim=1)
        weights = F.normalize(self.weight, dim=1)

        # Compute the cosine similarity between embeddings and weights
        cosine = F.linear(embeddings, weights)

        # Calculate the theta (angle) values for each class
        theta = torch.acos(torch.clip(cosine, -1 + 1e-7, 1 - 1e-7))

        # Apply the ArcFace margin and calculate logits
        one_hot_labels = F.one_hot(labels, self.num_classes)

        # Easy margin
        target_logits = torch.cos(theta + self.margin * (1 - one_hot_labels.float()))

        logits = self.scale * torch.where(one_hot_labels.bool(), target_logits, cosine)

        probabilities = F.softmax(logits, dim=1)
        return probabilities  # We will calculate the cross-entropy loss later


In [73]:
criterion = ArcFaceLoss(
    embedding_size=config.embedding_size,
    num_classes=config.num_of_classes,
)
try:
    # Load weights
    criterion.load_state_dict(torch.load('saved_instances/Trained_ArcLoss_parameters.pth', map_location=device))
except FileNotFoundError:
    pass

In [74]:
@torch.inference_mode()
def evaluate_arcface_classifier(model, criterion, test_dataloader, device) -> pl.DataFrame:
    model.eval()  # Set the model to evaluation mode
    total = 0
    correct = 0
    preds = []
    all_labels = []

    with tqdm(test_dataloader, desc="Evaluation...", unit="batch") as process:
        for images, labels in process:
            images = images.to(device)
            labels = labels.to(device)
            embeddings = model(images)
            probabilities = criterion(embeddings, labels)
            _, predicted = torch.max(probabilities, 1)

            correct += (labels == predicted).int().sum().item()
            total += labels.size(0)
            preds.extend(list(predicted.cpu().numpy()))
            all_labels.extend(list(labels.cpu().numpy()))

    print_highlighted(f"Accuracy is: {correct/total}; F1 score: {f1_score(torch.tensor(all_labels), torch.tensor(preds))}")

    return pl.DataFrame(zip(preds, all_labels), schema=["ArcFace_Predicted", "Actual labels"])


In [75]:
print(evaluate_arcface_classifier(model, criterion, test_dataloader, device))

Evaluation...: 100%|██████████| 11/11 [02:20<00:00, 12.80s/batch]

[43mAccuracy is: 0.8648648648648649; F1 score: 0.8518863916397095[0m
shape: (2_775, 2)
┌───────────────────┬───────────────┐
│ ArcFace_Predicted ┆ Actual labels │
│ ---               ┆ ---           │
│ i64               ┆ i64           │
╞═══════════════════╪═══════════════╡
│ 206               ┆ 0             │
│ 0                 ┆ 0             │
│ 229               ┆ 0             │
│ 0                 ┆ 0             │
│ …                 ┆ …             │
│ 758               ┆ 758           │
│ 759               ┆ 759           │
│ 758               ┆ 759           │
│ 663               ┆ 759           │
└───────────────────┴───────────────┘





## Real testing on the photos from Internet

Now we will repeat this experiment with real photos from Internet. No models were trained on them

In [76]:
config.test_images_path = '../data/val_dataset_segmented'
config.test_labels = "../data/val_labels.csv"
test_images_dataset = CustomImagesDataset(
    images_path=config.test_images_path, labels_path=config.test_labels, transform_images=data_transforms['val'], encoder_name='test_labels'
)
test_dataloader = DataLoader(
    test_images_dataset,
    batch_size=config.batch_size,
    shuffle=False,
    num_workers=os.cpu_count() % 4,
)

[43mLabel Encoder saved with id `test_labels`[0m
[43mEncoded with existing encoder.[0m


In [77]:
knn_prediction = evaluate_models(test_dataloader)

Processing...: 100%|██████████| 1/1 [00:01<00:00,  1.77s/batch]

[43mAccuracy is: 0.6818181818181818; F1 score: 0.5256410837173462[0m





In [78]:
arcface_prediction = evaluate_arcface_classifier(model, criterion, test_dataloader, device)

Evaluation...: 100%|██████████| 1/1 [00:01<00:00,  1.33s/batch]

[43mAccuracy is: 0.7272727272727273; F1 score: 0.6111111640930176[0m





In [79]:
total_prediction = pl.concat([arcface_prediction.select(["ArcFace_Predicted"]), knn_prediction], how='horizontal',)

In [80]:
print(total_prediction.corr())

shape: (3, 3)
┌───────────────────┬──────────────┬───────────────┐
│ ArcFace_Predicted ┆ nn_Predicted ┆ Actual labels │
│ ---               ┆ ---          ┆ ---           │
│ f64               ┆ f64          ┆ f64           │
╞═══════════════════╪══════════════╪═══════════════╡
│ 1.0               ┆ 0.822567     ┆ 0.662563      │
│ 0.822567          ┆ 1.0          ┆ 0.497723      │
│ 0.662563          ┆ 0.497723     ┆ 1.0           │
└───────────────────┴──────────────┴───────────────┘


That means both models match with each other more than with actual data.

Now we decode prediction to see the result

In [81]:
(pl.DataFrame(list([
    apply_label_encoding(total_prediction.select(column), action='decode', encoder_name='test_labels') for column in total_prediction.columns
]), schema=total_prediction.columns))

ArcFace_Predicted,nn_Predicted,Actual labels
object,object,object
Acura_MDX,Acura_MDX,Acura_MDX
Toyota_Nadia,Toyota_Harrier,Daewoo_LANOS
Toyota_Nadia,Toyota_Funcargo,Daewoo_LANOS
Izh_2126,Geely_CK,Daewoo_NEXIA
KIA_Carens,KIA_Carens,KIA_Carens
Toyota_Allion,Toyota_Allion,KIA_Rio
Land Rover_Discovery,Land Rover_Discovery,Land Rover_Discovery
Land Rover_Range_Rover,Land Rover_Range_Rover,Land Rover_Range_Rover
Land Rover_Range_Rover_Sport,Land Rover_Range_Rover_Sport,Land Rover_Range_Rover_Sport
Lexus_NX,Lexus_NX,Lexus_NX
