In [None]:
import pandas as pd
import torch
from tqdm.notebook import tqdm
import numpy as np
from sentence_transformers import SentenceTransformer

In [None]:
# Model name handed to SentenceTransformer when the embedding function is built.
EMBEDDING_MODEL = "thenlper/gte-large"

# Column separator used when reading the titles and frequency files.
DELIMITER = " "

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Number of titles handed to the embedding function per batch.
batch_size = 128

# Number of most-similar items kept per item.
K = 1000

## Datasets

Uncomment the dataset you want to work on.

In [None]:
# # ml-1m
# def ml_preprocessing(title):
#     title = " ".join(title.split(" ")[:-1]).strip()
#     if title.endswith(", The"):
#         title = "The " + title[:-5] 
#     if title.endswith(", A"):
#         title = "A " + title[:-3] 
#     return title

# data_path = "ML-1M/ml-1m.txt"
# titles_path = "ML-1M/ml-1m-titles.txt"
# title_freq_path = "ML-1M/ml-1m-train_item_freq.txt"
# similarity_indices_out = f"ML-1M/ml-1m-similarity-indices-{EMBEDDING_MODEL.replace('/','_')}.pt"
# similarity_values_out = f"ML-1M/ml-1m-similarity-values-{EMBEDDING_MODEL.replace('/','_')}.pt"
# embeddings_out = f"ML-1M/ml-1m-embeddings-{EMBEDDING_MODEL.replace('/','_')}.pt"
# timestamp_path = "ML-1M/ml-1m_timestamp.txt"
# preprocessing_title = ml_preprocessing

In [None]:
# Beauty
# The model name contains '/', which would be read as a path separator,
# so it is replaced before being embedded in the output file names.
_model_tag = EMBEDDING_MODEL.replace('/', '_')

# Inputs
data_path = "Beauty/Beauty.txt"
titles_path = "Beauty/Beauty-titles.txt"
title_freq_path = "Beauty/Beauty-train_item_freq.txt"

# Outputs (one set per embedding model)
similarity_indices_out = f"Beauty/Beauty-similarity-indices-{_model_tag}.pt"
similarity_values_out = f"Beauty/Beauty-similarity-values-{_model_tag}.pt"
timestamp_path = f"Beauty/Beauty-{_model_tag}_timestamp.txt"


def preprocessing_title(t):
    """Beauty titles need no clean-up; return the title unchanged."""
    return t

In [None]:
# # Tools
# data_path = "Tools/Tools.txt"
# titles_path = "Tools/Tools-titles.txt"
# title_freq_path = "Tools/Tools-train_item_freq.txt"
# similarity_indices_out = f"Tools/Tools-similarity-indices-{EMBEDDING_MODEL.replace('/','_')}.pt"
# similarity_values_out = f"Tools/Tools-similarity-values-{EMBEDDING_MODEL.replace('/','_')}.pt"
# timestamp_path = "Tools/Tools_timestamp.txt"
# preprocessing_title = lambda t: t

In [None]:
# # HomeKitchen
# data_path = "HomeKitchen/HomeKitchen.txt"
# titles_path = "HomeKitchen/HomeKitchen-titles.txt"
# title_freq_path = "HomeKitchen/HomeKitchen-train_item_freq.txt"
# similarity_indices_out = f"HomeKitchen/HomeKitchen-similarity-indices-{EMBEDDING_MODEL.replace('/','_')}.pt"
# similarity_values_out = f"HomeKitchen/HomeKitchen-similarity-values-{EMBEDDING_MODEL.replace('/','_')}.pt"
# timestamp_path = "HomeKitchen/HomeKitchen_timestamp.txt"
# preprocessing_title = lambda t: t

In [None]:
# # Pet Supplies
# data_path = "PetSupplies/Pet.txt"
# titles_path = "PetSupplies/Pet-titles.txt"
# title_freq_path = "PetSupplies/Pet-train_item_freq.txt"
# similarity_indices_out = f"PetSupplies/Pet-similarity-indices-{EMBEDDING_MODEL.replace('/','_')}.pt"
# similarity_values_out = f"PetSupplies/Pet-similarity-values-{EMBEDDING_MODEL.replace('/','_')}.pt"
# timestamp_path = "PetSupplies/Pet_timestamp.txt"
# preprocessing_title = lambda t: t

In [None]:
# # Steam
# data_path = "Steam/steam.txt"
# titles_path = "Steam/steam-titles.txt"
# title_freq_path = "Steam/steam-train_item_freq.txt"
# similarity_indices_out = f"Steam/steam-similarity-indices-{EMBEDDING_MODEL.replace('/','_')}.pt"
# similarity_values_out = f"Steam/steam-similarity-values-{EMBEDDING_MODEL.replace('/','_')}.pt"
# timestamp_path = f"Steam/steam_timestamp.txt"
# preprocessing_title = lambda t: t

## Calculate Similarities

In [None]:
def sentence_transformer(model_name, batch_size, device):
    """Build an embedding function backed by a SentenceTransformer model.

    Args:
        model_name: Name or path passed to ``SentenceTransformer``.
        batch_size: Number of sentences encoded per batch. It is used both to
            slice the input (one progress-bar tick per slice) and as the
            ``batch_size`` of ``model.encode``.
        device: Device string passed to ``SentenceTransformer`` (e.g. 'cuda').

    Returns:
        A function ``embed(sentences)`` that takes a sequence of strings and
        returns a single tensor with one embedding row per sentence, in input
        order.
    """
    model = SentenceTransformer(model_name, device=device)

    def embed(sentences):
        """Encode ``sentences`` batch by batch and concatenate the results."""
        if len(sentences) == 0:
            # torch.cat([]) raises, so return an explicit empty (0, dim) tensor.
            dim = model.get_sentence_embedding_dimension() or 0
            return torch.empty((0, dim), device=device)

        embeddings = []
        for start in tqdm(range(0, len(sentences), batch_size)):
            batch = sentences[start:start + batch_size]
            embeddings.append(
                model.encode(
                    batch,
                    # Without this, encode() re-splits every slice using its
                    # own default batch size and the configured value has no
                    # effect on the actual forward passes.
                    batch_size=batch_size,
                    convert_to_numpy=False,
                    convert_to_tensor=True,
                )
            )
        return torch.cat(embeddings, dim=0)

    return embed


embedding_func = sentence_transformer(model_name=EMBEDDING_MODEL, batch_size=batch_size, device=device)

In [None]:
# Load the item titles: one row per item, columns `id` and `title`.
# A backslash escapes characters in the file (presumably the spaces inside
# titles, since the separator is a single space; verify against the data).
titles_df = pd.read_csv(
    titles_path,
    sep=DELIMITER,
    names=['id', 'title'],
    escapechar="\\",
)
titles_df

In [None]:
# Load the per-item frequency file (columns `id` and `freq`) and attach the
# frequency of each item to its title row.
id_to_freq_df = pd.read_csv(title_freq_path, names=['id', 'freq'], delimiter=DELIMITER)
id_to_freq_series = pd.Series(id_to_freq_df.freq.values, index=id_to_freq_df.id)
id_to_freq = id_to_freq_series.to_dict()

# Look the frequency up through the `id` column. Assigning the id-indexed
# Series directly would align it with the frame's positional RangeIndex, so
# row i would receive the frequency of id == i rather than of its own id.
# Items missing from the frequency file get NaN.
titles_df = titles_df.assign(freq=titles_df['id'].map(id_to_freq))[['id', 'freq', 'title']]
titles_df

In [None]:
# Apply the dataset-specific title clean-up to every title.
# Series.map calls the function once per element; wrapping it in np.vectorize
# would call it twice per element (once to probe the output type) and box each
# result in a 0-d array.
# NOTE: this overwrites `title`, so re-running the cell applies the
# preprocessing again on already-processed titles.
titles_df = titles_df.assign(title=titles_df['title'].map(preprocessing_title))
titles_df

In [None]:
# Embed every title, keeping the row order of titles_df so that embedding
# row i corresponds to titles_df row i.
titles_list = titles_df['title'].to_list()
titles_embeddings = embedding_func(titles_list)
titles_embeddings

In [None]:
# Sanity check: expected to be (number of titles, embedding dimension).
titles_embeddings.shape

In [None]:
def get_similarity_matrix(emebddings, eps=1e-8, top_k=None, chunk_size=1000):
    """Compute cosine similarities between all rows of ``emebddings``.

    Args:
        emebddings: 2-D float tensor with one embedding per row. (The
            misspelled parameter name is kept so keyword callers keep working.)
        eps: Lower bound applied to each row norm before dividing, so an
            all-zero row yields zeros instead of NaNs.
        top_k: ``None`` to return the dense similarity matrix; otherwise the
            number of most-similar rows to keep per row. Values larger than
            the number of rows are clamped to the number of rows.
        chunk_size: Number of query rows scored per matrix product in top-k
            mode. It bounds the temporary score matrix to
            ``(chunk_size, num_embeddings)``.

    Returns:
        ``(similarity_values, similarity_indices)``.

        * ``top_k is None``: both are ``(n, n)``. ``similarity_values[i, j]``
          is the cosine similarity of rows i and j (with 1e-7 added on the
          diagonal); every row of ``similarity_indices`` is ``arange(n)``.
        * otherwise: both are ``(n, k)`` with ``k = min(top_k, n)``, sorted by
          decreasing similarity; ``similarity_indices[i, j]`` is the row index
          of the j-th most similar embedding to row i.

        Both tensors live on the same device as ``emebddings``.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1 in top-k mode.
    """
    n_embeddings = emebddings.shape[0]
    row_norms = emebddings.norm(dim=1, keepdim=True)  # (num_embeddings, 1)
    embeddings_normalized = emebddings / row_norms.clamp(min=eps)

    if top_k is None:
        similarity_values = embeddings_normalized @ embeddings_normalized.T
        # Work around numerical precision issues where
        # similarity[i, i] < similarity[i, k != i]: nudge the diagonal up.
        similarity_values.diagonal().add_(1e-7)
        similarity_indices = (
            torch.arange(n_embeddings, device=emebddings.device)
            .unsqueeze(dim=0)
            .repeat(n_embeddings, 1)
        )
        return similarity_values, similarity_indices

    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")

    # topk raises when k exceeds the number of candidates.
    k = min(top_k, n_embeddings)

    value_list = []
    indices_list = []
    # split() always yields at least one chunk, unlike chunk(n // 1000), which
    # is asked for zero chunks (and raises) when there are fewer than 1000 rows.
    for chunk in embeddings_normalized.split(chunk_size):
        similarity_out = chunk @ embeddings_normalized.T
        values, indices = torch.topk(similarity_out, k=k, dim=-1, sorted=True)
        value_list.append(values)
        indices_list.append(indices)

    similarity_values = torch.cat(value_list, dim=0)
    similarity_indices = torch.cat(indices_list, dim=0)
    return similarity_values, similarity_indices

In [None]:
# Keep, for every title, its K most similar titles: the similarity scores and
# the row indices of those titles.
similarity_values, similarity_indices = get_similarity_matrix(
    titles_embeddings,
    top_k=K,
)
print(similarity_indices)
similarity_values

Save the top-K similarity indices and values (the embeddings themselves are not written to disk)

In [None]:
# Persist the neighbour indices and their similarity scores as two separate
# tensor files at the paths chosen in the dataset cell.
torch.save(obj=similarity_indices, f=similarity_indices_out)
torch.save(obj=similarity_values, f=similarity_values_out)

In [None]:
# Write the current time to timestamp_path via the shell, formatted as
# hour(12h)_minute_day_month. The file is overwritten on every run.
! echo `date +'%I_%M_%d_%m'` > {timestamp_path}