In [None]:
import os

import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from torch.utils.data import DataLoader
from tqdm import tqdm

In [None]:
# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

Initialize the model with the Trained Bi-Encoder for encoding addresses

In [None]:
# Load the fine-tuned bi-encoder from a folder under the user's home directory.
# "~" is expanded explicitly so the argument is an actual filesystem path
# rather than a literal "~/..." string that may not resolve to the local folder.
model = SentenceTransformer(os.path.expanduser('~/Bi-Encoder Fine-Tuned'), device=device)

Load the Test Dataset

In [None]:
# Folder that holds the CSV inputs.
data_folder = '~/data'
# Join folder and file name with a path separator: plain string concatenation
# produced "~/datatest_data.csv", which points at the wrong location.
test_data = pd.read_csv(os.path.join(os.path.expanduser(data_folder), "test_data.csv"))

The code defines a class called ``UnNormalizedDataset`` which inherits from ``torch.utils.data.Dataset``. The purpose of this class is to create a custom dataset that can be used to load data into a PyTorch model.

The ``__init__`` method of the ``UnNormalizedDataset`` class takes a parameter ``data``, which represents the data to be loaded into the model. This data is stored on the class instance as the attribute ``self.data``.

The ``__len__`` method of the ``UnNormalizedDataset`` class returns the length of the data, which is the number of samples in the dataset.

The ``__getitem__`` method of the ``UnNormalizedDataset`` class is used to retrieve a single sample from the dataset. It takes an index ``idx`` as a parameter, which represents the position of the sample to be retrieved. The method reads the "UnnormalizedAddress" value of the sample at that position from the ``self.data`` attribute and returns it.

In [None]:
class UnNormalizedDataset(torch.utils.data.Dataset):
    """Dataset that serves the values of a single DataFrame column.

    Args:
        data: pandas DataFrame holding the samples, one row per sample.
        column: Name of the column whose value ``__getitem__`` returns.
            Defaults to ``'UnnormalizedAddress'``, so existing callers that
            pass only ``data`` behave as before.
    """

    def __init__(self, data, column='UnnormalizedAddress'):
        self.data = data
        self.column = column

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the value of ``self.column`` at positional index ``idx``."""
        # Select the column first: this avoids building a full-row Series for
        # every sample and keeps the column's own dtype.
        return self.data[self.column].iloc[idx]

Wrap the Test Data with the ``UnNormalizedDataset`` class

In [None]:
# Wrap the test DataFrame in the custom Dataset so it can be batched.
dataset = UnNormalizedDataset(data=test_data)

Create the ``DataLoader`` that splits the data into batches

In [None]:
# Batches of 128 samples, served in the dataset's original order (no shuffling).
dataset_dataloader = DataLoader(
    dataset,
    batch_size=128,
    shuffle=False,
)

Create the embeddings for the Test Dataset. <br>
Use batches as large as possible for faster encoding. <br>
Concatenate the batch embeddings into a single tensor and save it.

In [None]:
# Encode the test addresses batch by batch. Each batch's embeddings are kept in
# a list and concatenated once at the end, instead of re-copying the growing
# tensor with torch.cat on every iteration.
encoded_batches = []

# Iterating the DataLoader directly (rather than iter(...)) lets tqdm read its
# length and show the total number of batches.
for batch in tqdm(dataset_dataloader):
    encoded_batches.append(
        model.encode(batch, batch_size=128, device=device, convert_to_tensor=True)
    )

# With no batches at all, keep the empty tensor the accumulation used to start from.
if encoded_batches:
    un_normalized_encoded = torch.cat(encoded_batches, 0)
else:
    un_normalized_encoded = torch.Tensor().to(device=device)

# The built-in file open used when saving does not expand "~", so resolve the
# home directory explicitly before writing.
torch.save(un_normalized_encoded, os.path.expanduser('~/test_data_embeddings.pt'))

Load the Normalized Database

In [None]:
# Folder that holds the CSV inputs.
data_folder = '~/data'
# Join folder and file name with a path separator: plain string concatenation
# produced "~/datanormalized_database.csv", which points at the wrong location.
normalized_data = pd.read_csv(os.path.join(os.path.expanduser(data_folder), "normalized_database.csv"))

The code defines a class called ``NormalizedDataset`` which inherits from ``torch.utils.data.Dataset``. The purpose of this class is to create a custom dataset that can be used to load data into a PyTorch model.

The ``__init__`` method of the ``NormalizedDataset`` class takes a parameter ``data``, which represents the data to be loaded into the model. This data is stored on the class instance as the attribute ``self.data``.

The ``__len__`` method of the ``NormalizedDataset`` class returns the length of the data, which is the number of samples in the dataset.

The ``__getitem__`` method of the ``NormalizedDataset`` class is used to retrieve a single sample from the dataset. It takes an index ``idx`` as a parameter, which represents the position of the sample to be retrieved. The method reads the "NormalizedAddress" value of the sample at that position from the ``self.data`` attribute and returns it.

In [None]:
class NormalizedDataset(torch.utils.data.Dataset):
    """Dataset that serves the values of a single DataFrame column.

    Args:
        data: pandas DataFrame holding the samples, one row per sample.
        column: Name of the column whose value ``__getitem__`` returns.
            Defaults to ``'NormalizedAddress'``, so existing callers that
            pass only ``data`` behave as before.
    """

    def __init__(self, data, column='NormalizedAddress'):
        self.data = data
        self.column = column

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the value of ``self.column`` at positional index ``idx``."""
        # Select the column first: this avoids building a full-row Series for
        # every sample and keeps the column's own dtype.
        return self.data[self.column].iloc[idx]

Load the Normalized Data with the ``NormalizedDataset`` class

In [None]:
# Wrap the normalized DataFrame in the custom Dataset so it can be batched.
normalized_dataset = NormalizedDataset(data=normalized_data)

Create the ``DataLoader`` that splits the data into batches

In [None]:
# Batches of 128 samples, served in the dataset's original order (no shuffling).
dataset_dataloader = DataLoader(
    normalized_dataset,
    batch_size=128,
    shuffle=False,
)

Create the embeddings for the Normalized Data. <br>
Use batches as large as possible for faster encoding. <br>
Concatenate the batch embeddings into a single tensor and save it.

In [None]:
# Encode the normalized addresses batch by batch. Each batch's embeddings are
# kept in a list and concatenated once at the end, instead of re-copying the
# growing tensor with torch.cat on every iteration.
encoded_batches = []

# Iterating the DataLoader directly (rather than iter(...)) lets tqdm read its
# length and show the total number of batches.
for batch in tqdm(dataset_dataloader):
    encoded_batches.append(
        model.encode(batch, batch_size=128, device=device, convert_to_tensor=True)
    )

# With no batches at all, keep the empty tensor the accumulation used to start from.
if encoded_batches:
    normalized_encoded = torch.cat(encoded_batches, 0)
else:
    normalized_encoded = torch.Tensor().to(device=device)

# The built-in file open used when saving does not expand "~", so resolve the
# home directory explicitly before writing.
torch.save(normalized_encoded, os.path.expanduser('~/normalized_data_embeddings.pt'))