In [None]:
import os

import pandas as pd
import torch
from tqdm import tqdm

<p>We define two PyTorch <code>Dataset</code> classes for loading data in a normalized and unnormalized format. Each of these classes inherits from the base <code>torch.utils.data.Dataset</code> class and overrides its <code>__init__</code>, <code>__len__</code>, and <code>__getitem__</code> methods.</p>

<h5><code>NormalizedDataset</code> Class</h5>

<p>The <code>NormalizedDataset</code> class takes a pandas DataFrame <code>data</code> as input in its constructor and stores it in the <code>self.data</code> attribute. The <code>__len__</code> method simply returns the number of rows in this data. The <code>__getitem__</code> method looks up the row at index <code>idx</code> and returns a dictionary with the normalized full address (key <code>morada</code>, read from the <code>Morada_Full_Normalizada</code> column), the artery ID (key <code>arteria</code>, from <code>ID_ARTERIA</code>), and the door ID (key <code>porta</code>, from <code>ID_PORTA</code>).</p>

<h5><code>UnNormalizedDataset</code> Class</h5>

<p>The <code>UnNormalizedDataset</code> class takes a pandas DataFrame <code>data</code> as input in its constructor and stores it in the <code>self.data</code> attribute. The <code>__len__</code> method simply returns the number of rows in this data. The <code>__getitem__</code> method returns the unnormalized full address (the <code>Morada_Full</code> column) of the row at index <code>idx</code>.</p>

In [None]:
class NormalizedDataset(torch.utils.data.Dataset):
    """Map-style dataset over the table of normalized addresses.

    Each item is a dictionary holding the normalized full address together
    with the artery and door identifiers stored on the same row.
    """

    def __init__(self, data):
        """Keep a reference to ``data``, the pandas DataFrame to index into."""
        self.data = data

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the address record stored at positional index ``idx``.

        Returns:
            dict with keys ``'morada'`` (``Morada_Full_Normalizada`` column),
            ``'arteria'`` (``ID_ARTERIA``) and ``'porta'`` (``ID_PORTA``).
        """
        # Fetch the row once and read the three fields from it.
        row = self.data.iloc[idx]
        return {
            'morada': row['Morada_Full_Normalizada'],
            'arteria': row['ID_ARTERIA'],
            'porta': row['ID_PORTA'],
        }


class UnNormalizedDataset(torch.utils.data.Dataset):
    """Map-style dataset over the table of raw (unnormalized) addresses.

    Each item is the value of the ``Morada_Full`` column for one row.
    """

    def __init__(self, data):
        """Keep a reference to ``data``, the pandas DataFrame to index into."""
        self.data = data

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the unnormalized full address at positional index ``idx``."""
        row = self.data.iloc[idx]
        return row['Morada_Full']

Load the CSV files for the normalized database and the test data, and wrap each one in the corresponding Dataset class defined above. <br>
Then load the tensors holding the embeddings of both datasets (created in the ``3 - DB and Test Data to Tensor.ipynb`` notebook).

In [None]:
# Normalized address database.
# The raw frame keeps its own name; `database_raw` is the Dataset wrapper that
# is used for index-based lookups of the matched address records.
database_df = pd.read_csv('~/data/normalized_database.csv')
database_raw = NormalizedDataset(database_df)


# Test Data
test_raw = pd.read_csv('~/data/test_data.csv')
test_raw_dataset = UnNormalizedDataset(test_raw)

# Tensors
# Unlike pandas.read_csv, torch.load passes a string path to open(), which does
# not expand '~', so the home directory has to be resolved explicitly.
test_data_tensor = torch.load(os.path.expanduser('~/test_data_embeddings.pt'))
database_tensor = torch.load(os.path.expanduser('~/normalized_data_embeddings.pt'))

Using a batch size as large as memory allows, we compute the cosine similarity between each test address and every address in the database.<br>
For each test address we then keep the database entry with the highest similarity, and at the end we write these matches to the results file, together with the score given by the bi-encoder.

In [None]:
batch_size = 512

# Lower bound for the product of the two norms, so an all-zero embedding yields
# a similarity of 0 instead of NaN (same guard as torch's cosine_similarity).
cosine_eps = 1e-8

morada_normalizada_descoberta_bi = []
arteria_descoberta_bi = []
porta_descoberta_bi = []
scores_bi = []

# Fail fast: the result lists become columns of test_raw, so there must be
# exactly one embedding per test row. Without this check the mismatch would
# only surface after the whole similarity loop has run.
if test_data_tensor.shape[0] != len(test_raw):
    raise ValueError(
        'test_data_tensor has {} rows but test_raw has {} rows'.format(
            test_data_tensor.shape[0], len(test_raw)
        )
    )

# No gradients are needed here; this also keeps `.numpy()` usable if the
# embeddings were saved with requires_grad=True.
with torch.no_grad():
    # Loop-invariant pieces of the cosine similarity, computed once.
    database_tensor_t = database_tensor.transpose(1, 0)
    database_norms_t = torch.norm(database_tensor, dim=1, keepdim=True).transpose(1, 0)

    for i in tqdm(range(0, test_data_tensor.shape[0], batch_size)):
        test_data_batch = test_data_tensor[i:i+batch_size, :]
        dot_product = torch.matmul(test_data_batch, database_tensor_t)
        batch_norms = torch.norm(test_data_batch, dim=1, keepdim=True)
        # Broadcasts to one similarity per (test row, database row) pair.
        norm_product = torch.clamp(batch_norms * database_norms_t, min=cosine_eps)
        cosine_similarity_batch = dot_product / norm_product

        values, indices = cosine_similarity_batch.max(dim=1)
        values = values.cpu().numpy()

        # Get the top-1 most similar normalized address for each unnormalized one;
        # its artery and door codes; and the score given by the bi-encoder.
        # The database record is fetched once per match instead of once per field.
        for match_idx, score in zip(indices.tolist(), values):
            match = database_raw[match_idx]
            morada_normalizada_descoberta_bi.append(match['morada'])
            arteria_descoberta_bi.append(match['arteria'])
            porta_descoberta_bi.append(match['porta'])
            scores_bi.append(score)


# Disables pandas' chained-assignment warning for the rest of the session.
# NOTE(review): the assignments below are plain column assignments on test_raw,
# so this is probably unnecessary — confirm nothing later relies on it before removing.
pd.options.mode.chained_assignment = None


# Add the new columns to the test data and write the result to a separate csv file.
test_raw['Morada Descoberta BI'] = morada_normalizada_descoberta_bi
test_raw['Arteria Descoberta BI'] = arteria_descoberta_bi
test_raw['Porta Descoberta BI'] = porta_descoberta_bi
test_raw['Match Confidence BI'] = scores_bi

test_raw.to_csv('test_data_results.csv',encoding = 'utf-8', index = False)