### Necessary Imports

In [1]:
import os
import pandas as pd
import torch
import numpy as np
from torch import Tensor
from torch.nn import functional as F
from scipy.optimize import linear_sum_assignment

### Load the data

In [2]:
# Directory that holds every CSV input, one level above the working directory.
DATA = os.path.join(os.pardir, "data")

In [3]:
# Read the six CSV inputs from DATA, in the same order as their target names.
(
    train_df,
    test_df,
    train_embeddings,
    test_embeddings,
    train_labels,
    test_labels,
) = (
    pd.read_csv(os.path.join(DATA, csv_name))
    for csv_name in (
        "train_data.csv",
        "test_data.csv",
        "train_embeddings.csv",
        "test_embeddings.csv",
        "train_labels.csv",
        "test_labels.csv",
    )
)

In [4]:
# One output line per input kind: train shape followed by test shape.
print(
    f"{train_df.shape} {test_df.shape}",
    f"{train_embeddings.shape} {test_embeddings.shape}",
    f"{train_labels.shape} {test_labels.shape}",
    sep="\n",
)

(128, 785) (10000, 785)
(128, 128) (10000, 128)
(128, 1) (10000, 1)


In [5]:
# Drop the 'label' column, convert the remaining columns to float32 and divide by 255.
dtrain, dtest = (
    torch.tensor(
        frame.drop(columns=['label']).to_numpy(),
        dtype=torch.float32,
    ).div(255.0)
    for frame in (train_df, test_df)
)
print(dtrain.size(), dtest.size())

torch.Size([128, 784]) torch.Size([10000, 784])


In [6]:
# Convert both embedding frames to float32 tensors (all columns are kept).
train_embeddings_tensor, test_embeddings_tensor = (
    torch.tensor(frame.to_numpy(), dtype=torch.float32)
    for frame in (train_embeddings, test_embeddings)
)
print(train_embeddings_tensor.size(), test_embeddings_tensor.size())

torch.Size([128, 128]) torch.Size([10000, 128])


### Solution

- Exploit the fact that the network used to generate the embeddings preserves the cosine similarity between the different samples.

$$
d(x_{i},x_{j}) \approx d(f(x_{i}),f(x_{j}))
$$

- The idea is to use the provided train set to create features for each embedding and image in the test data 
by calculating - for each sample - the cosine similarity between it and all the available samples in the training data.

- Since the cosine similarity is preserved, the corresponding test embeddings and images should end up with similar features.

- To find the embedding of a given image $x$, we search for the embedding $y$ with the highest similarity; the index of that embedding is:

$$
\operatorname*{arg\,max}_{y \in D'} d(x,y)
$$

### Calculate the cost matrix

In [7]:
def cos_sim(x : Tensor, y : Tensor) -> Tensor:
    """Pairwise cosine similarity between the rows of ``x`` and the rows of ``y``.

    Every dimension after the first is flattened, each resulting row is
    L2-normalised, and the normalised matrices are multiplied.

    Args:
        x: Tensor whose first dimension indexes samples.
        y: Tensor whose first dimension indexes samples; after flattening it
            must have the same number of features per sample as ``x``.

    Returns:
        Tensor of shape ``(x.size(0), y.size(0))`` where entry ``[i, j]`` is
        the cosine similarity between sample ``i`` of ``x`` and sample ``j``
        of ``y``.
    """
    # reshape (not view) so non-contiguous inputs, e.g. permuted image
    # batches, are flattened instead of raising a RuntimeError.
    x = x.reshape(x.size(0), -1)
    y = y.reshape(y.size(0), -1)

    # Unit-length rows turn the matrix product below into cosine similarity.
    x = F.normalize(x, p=2, dim=1)
    y = F.normalize(y, p=2, dim=1)

    return x @ y.T

In [8]:
# Similarity of every test image to the train images, with the train images
# re-indexed by the flattened train_labels column.
# NOTE(review): presumably train_labels[i] is the row of the image paired with
# train embedding i, so the columns line up with emb_sim - confirm.
x_sim = cos_sim(x=dtest, y=dtrain[train_labels.to_numpy().ravel()])
print(x_sim.size())

torch.Size([10000, 128])


In [9]:
# Similarity of every test embedding to every train embedding.
emb_sim = cos_sim(x=test_embeddings_tensor, y=train_embeddings_tensor)
print(emb_sim.size())

torch.Size([10000, 128])


In [10]:
# Compare the two feature sets: rows index test embeddings, columns index test images.
sim = cos_sim(x=emb_sim, y=x_sim)
print(sim.size())

torch.Size([10000, 10000])


### Prediction

- Naive approach

In [11]:
# For each row of sim, take the column with the highest similarity.
predictions = torch.argmax(sim, dim=1)
# Fraction of rows whose chosen column equals the flattened test label.
(predictions.numpy() == test_labels.to_numpy().ravel()).mean()

0.8362

- Optimize using assignment algorithm

In [12]:
# Turn similarity into a cost so that minimising total cost maximises total similarity.
cost_matrix = np.subtract(1, sim.numpy())
# One-to-one matching between rows and columns of the cost matrix.
row_ind, col_ind = linear_sum_assignment(cost_matrix, maximize=False)

In [13]:
# Fraction of rows whose assigned column equals the flattened test label.
(col_ind == test_labels.to_numpy().ravel()).mean()

1.0