In [1]:
import pandas as pd
import torch
from tqdm.notebook import tqdm
import numpy as np
from sentence_transformers import SentenceTransformer
import os

In [2]:
EMBEDDING_MODEL = 'thenlper/gte-large'
DELIMITER=" "
device = 'cuda' if torch.cuda.is_available() else 'cpu'
batch_size = 128
K = 1000

## Datasets

Uncomment the dataset you want work on.

In [17]:
# ml-1m
def ml_preprocessing(title):
    title = " ".join(title.split(" ")[:-1]).strip()
    if title.endswith(", The"):
        title = "The " + title[:-5] 
    if title.endswith(", A"):
        title = "A " + title[:-3] 
    return title
data_abs_path = "C:\\Users\\xenx9\\Desktop\\Dev\\level4-recsys-finalproject-hackathon-recsys-06-lv3\\data"
save_abs_path = "C:\\Users\\xenx9\\Desktop\\Dev\\level4-recsys-finalproject-hackathon-recsys-06-lv3\\save"
data_path = os.path.join(data_abs_path,"ratings.csv")
titles_path = os.path.join(data_abs_path,"movies.csv")
title_freq_path = os.path.join(save_abs_path,"results/ml-20m-train_item_freq.csv")
similarity_indices_out = os.path.join(save_abs_path,f"models/ml-20m-similarity-indices-{EMBEDDING_MODEL.replace('/','_')}.pt")
similarity_values_out = os.path.join(save_abs_path,f"models/ml-20m-similarity-values-{EMBEDDING_MODEL.replace('/','_')}.pt")
embeddings_out = os.path.join(save_abs_path,f"models/ml-20m-embeddings-{EMBEDDING_MODEL.replace('/','_')}.pt")
timestamp_path = os.path.join(save_abs_path,"results/ml-20m_timestamp.csv")
preprocessing_title = ml_preprocessing

## Calcualte Similarities

In [4]:
def sentence_transformer(model_name, batch_size, device):
    model = SentenceTransformer(model_name, device=device)
    def embed(sentences):
        embeddings = []
        batches = [sentences[x:x+batch_size] for x in range(0, len(sentences), batch_size)]
        for batch in tqdm(batches):
            embeddings.append(model.encode(batch, convert_to_numpy=False, convert_to_tensor=True))
        return torch.cat(embeddings, dim=0)
    return embed

embedding_func = sentence_transformer(model_name=EMBEDDING_MODEL, batch_size=batch_size, device=device)

In [25]:
titles_df = pd.read_csv(titles_path)
titles_df.columns = ['id', 'title', 'genres']

In [8]:
rating_df = pd.read_csv(data_path, escapechar="\\")
rating_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


In [18]:
freq_df:pd.Series=rating_df.groupby('movieId')['userId'].count()
freq_df=freq_df.reset_index()
freq_df.columns=['id','freq']
freq_df.to_csv(title_freq_path)

In [27]:
id_to_freq

{'id': 'freq',
 '1': '49695',
 '2': '22243',
 '3': '12735',
 '4': '2756',
 '5': '12161',
 '6': '23899',
 '7': '12961',
 '8': '1415',
 '9': '3960',
 '10': '29005',
 '11': '18162',
 '12': '3845',
 '13': '1461',
 '14': '6022',
 '15': '2910',
 '16': '17394',
 '17': '20667',
 '18': '5203',
 '19': '20938',
 '20': '4084',
 '21': '24116',
 '22': '9928',
 '23': '4250',
 '24': '7971',
 '25': '21553',
 '26': '2755',
 '27': '1642',
 '28': '3154',
 '29': '8520',
 '30': '1281',
 '31': '9435',
 '32': '44980',
 '33': '65',
 '34': '32255',
 '35': '1493',
 '36': '20809',
 '37': '85',
 '38': '552',
 '39': '26254',
 '40': '886',
 '41': '4689',
 '42': '2467',
 '43': '2552',
 '44': '10697',
 '45': '8933',
 '46': '3205',
 '47': '43249',
 '48': '13046',
 '49': '236',
 '50': '47006',
 '51': '27',
 '52': '9320',
 '53': '150',
 '54': '956',
 '55': '939',
 '56': '70',
 '57': '2439',
 '58': '11549',
 '59': '191',
 '60': '6313',
 '61': '2281',
 '62': '19381',
 '63': '2334',
 '64': '2086',
 '65': '4864',
 '66': '237

In [35]:
id_to_freq_df = pd.read_csv(title_freq_path)
id_to_freq_series = pd.Series(id_to_freq_df.freq.values, index=id_to_freq_df.id)
id_to_freq = id_to_freq_series.to_dict()
titles_df['freq'] = titles_df['id'].map(id_to_freq)
titles_df = titles_df[['id', 'freq', 'title']]
titles_df

Unnamed: 0,id,freq,title
0,1,49695.0,Toy Story (1995)
1,2,22243.0,Jumanji (1995)
2,3,12735.0,Grumpier Old Men (1995)
3,4,2756.0,Waiting to Exhale (1995)
4,5,12161.0,Father of the Bride Part II (1995)
...,...,...,...
27273,131254,1.0,Kein Bund für's Leben (2007)
27274,131256,1.0,"Feuer, Eis & Dosenbier (2002)"
27275,131258,1.0,The Pirates (2014)
27276,131260,1.0,Rentun Ruusu (2001)


In [36]:
titles_df['title'] = titles_df['title'].apply(np.vectorize(preprocessing_title))
titles_df

Unnamed: 0,id,freq,title
0,1,49695.0,Toy Story
1,2,22243.0,Jumanji
2,3,12735.0,Grumpier Old Men
3,4,2756.0,Waiting to Exhale
4,5,12161.0,Father of the Bride Part II
...,...,...,...
27273,131254,1.0,Kein Bund für's Leben
27274,131256,1.0,"Feuer, Eis & Dosenbier"
27275,131258,1.0,The Pirates
27276,131260,1.0,Rentun Ruusu


In [37]:
titles_list = titles_df['title'].tolist()
titles_embeddings = embedding_func(titles_list)
titles_embeddings

  0%|          | 0/214 [00:00<?, ?it/s]

tensor([[ 0.0158,  0.0059,  0.0099,  ..., -0.0152,  0.0070, -0.0213],
        [ 0.0255, -0.0008, -0.0285,  ..., -0.0450,  0.0036, -0.0115],
        [ 0.0090, -0.0062,  0.0036,  ..., -0.0454, -0.0286, -0.0186],
        ...,
        [ 0.0230,  0.0168, -0.0091,  ..., -0.0248, -0.0359, -0.0196],
        [-0.0136, -0.0051, -0.0161,  ..., -0.0299, -0.0142, -0.0155],
        [-0.0347,  0.0038, -0.0233,  ..., -0.0054,  0.0006,  0.0184]],
       device='cuda:0')

In [38]:
titles_embeddings.shape

torch.Size([27278, 1024])

In [39]:
def get_similarity_matrix(emebddings, eps=1e-8, top_k=None):
    embeddings_norm = emebddings.norm(dim=1).unsqueeze(dim=1)  # (num_embeddings, 1)
    embeddings_normalized = emebddings /  torch.max(embeddings_norm, eps * torch.ones_like(embeddings_norm))
    if top_k is None:
        similarity_values = embeddings_normalized @ embeddings_normalized.T
        # fix numerical percison issues - where similarity_matrix[i,i] < similarity_matrix[i, k != i]
        similarity_values += torch.diag(torch.full((similarity_values.shape[0],), 1e-7, device=device))
        similarity_indices = torch.arange(similarity_values.shape[0]).unsqueeze(dim=0).repeat(similarity_values.shape[0], 1)

    else:
        n_embeddings = emebddings.shape[0]
        chunks = n_embeddings // 1000
        value_list = []
        indices_list = []
        for chunk in embeddings_normalized.chunk(chunks):
            similarity_out = chunk @ embeddings_normalized.T 
            values, indices = torch.topk(similarity_out, dim= -1, k=top_k, sorted=True)
            value_list.append(values)
            indices_list.append(indices)
        similarity_values = torch.cat(value_list, dim=0)
        similarity_indices = torch.cat(indices_list, dim=0)

    return similarity_values, similarity_indices

In [40]:
similarity_values, similarity_indices = get_similarity_matrix(titles_embeddings, top_k=K)
print(similarity_indices)
similarity_values

tensor([[    0, 15401,  3027,  ...,  6982,  4126, 10143],
        [    1,  9492,  2090,  ..., 18412,  4879, 20194],
        [    2,  3361, 14588,  ...,  7663,  3930, 17937],
        ...,
        [27275, 12561,  8502,  ..., 15820,   364, 13912],
        [27276, 20902, 18042,  ..., 22503, 21279, 21140],
        [ 4635, 10447, 27277,  ..., 26350, 10518, 17530]], device='cuda:0')


tensor([[1.0000, 0.9507, 0.9446,  ..., 0.7911, 0.7911, 0.7911],
        [1.0000, 0.8434, 0.8407,  ..., 0.7982, 0.7982, 0.7982],
        [1.0000, 0.9973, 0.8954,  ..., 0.7854, 0.7854, 0.7854],
        ...,
        [1.0000, 0.9568, 0.9363,  ..., 0.7925, 0.7924, 0.7924],
        [1.0000, 0.8671, 0.8542,  ..., 0.7742, 0.7742, 0.7742],
        [1.0000, 1.0000, 1.0000,  ..., 0.7983, 0.7983, 0.7983]],
       device='cuda:0')

Save all embeddings and similarities

In [41]:
torch.save(similarity_indices, similarity_indices_out)
torch.save(similarity_values, similarity_values_out)

In [42]:
! echo `date +'%I_%M_%d_%m'` > {timestamp_path}