<a href="https://colab.research.google.com/github/cchummer/ml-dl-scratch/blob/main/collab_pmf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

In [3]:
ratings_df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/ML+DL/movielens_ratings.csv')
ratings_df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


# Preprocessing

In [4]:
# Grab movie names from other csv
names_df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/ML+DL/movielens_movies.csv')
names_df

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [5]:
# Merge into ratings df
ratings_df = ratings_df.merge(names_df, on='movieId')
ratings_df

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
...,...,...,...,...,...,...
100831,610,160341,2.5,1479545749,Bloodmoon (1997),Action|Thriller
100832,610,160527,4.5,1479544998,Sympathy for the Underdog (1971),Action|Crime|Drama
100833,610,160836,3.0,1493844794,Hazard (2005),Action|Drama|Thriller
100834,610,163937,3.5,1493848789,Blair Witch (2016),Horror|Thriller


In [6]:
from sklearn.model_selection import train_test_split

#np.random.seed(42)
trn_df,val_df = train_test_split(ratings_df, test_size=0.25)

In [None]:
trn_df

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
6156,221,1080,4.5,1111178181,Monty Python's Life of Brian (1979),Comedy
2162,173,316,3.0,843397907,Stargate (1994),Action|Adventure|Sci-Fi
32079,587,440,4.0,953138612,Dave (1993),Comedy|Romance
8659,322,1265,4.0,1217676332,Groundhog Day (1993),Comedy|Fantasy|Romance
7965,368,1214,4.0,975828806,Alien (1979),Horror|Sci-Fi
...,...,...,...,...,...,...
31048,33,339,5.0,939719385,While You Were Sleeping (1995),Comedy|Romance
60395,19,1891,2.0,965705454,"Ugly, The (1997)",Horror|Thriller
100647,606,33310,4.5,1244571715,"Common Thread, A (a.k.a. Sequins) (Brodeuses) ...",Drama|Romance
16210,19,3809,3.0,965706415,What About Bob? (1991),Comedy


In [7]:
n_users = len(ratings_df.userId.unique())
n_movies = len(ratings_df.movieId.unique())
n_factors = 50

print(n_users)
print(n_movies)

610
9724


# Probabilistic Matric Factorization (dot-product based)

In [None]:
# Utilizing pytorch DataSet + DataLoader functionality for easy batch size control + ID mapping
class myCollabDataSet(Dataset):
  def __init__(self, df, user_col, item_col, score_col):

    # Create mappings of user and item ids to 0-based indices so they will play nicely with embedding lookups. Also allows for use of non-numeric columns (movie title, etc)
    self.user_col = user_col
    self.item_col = item_col
    self.score_col = score_col

    self.data = df
    self.user_mapping = {user_id: i for i, user_id in enumerate(self.data[self.user_col].unique())}
    self.item_mapping = {item_id: i for i, item_id in enumerate(self.data[self.item_col].unique())}

  def __len__(self):
    return len(self.data)

  def __getitem__(self, idx):
    user_id = self.data.iloc[idx][self.user_col]
    item_id = self.data.iloc[idx][self.item_col]
    rating = self.data.iloc[idx][self.score_col]

    # Return our index values rather than the raw ID's (which are potentially non-numeric)
    # This has the effect of indices rather than ID's being returned to forward() in the model below
    user_idx = self.user_mapping[user_id]
    item_idx = self.item_mapping[item_id]

    #user_idx_tensor = torch.tensor(user_idx, dtype=torch.long)
    #item_idx_tensor = torch.tensor(item_idx, dtype=torch.long)
    #ratings_tensor = torch.tensor(rating, dtype=torch.float32)

    #return [user_idx, item_idx, ratings_tensor]
    return torch.tensor([user_idx, item_idx, rating], dtype=torch.float32)

In [None]:
# Quick function to turn dataframes into optimized DataLoaders
def create_data_loaders(trn_df, val_df, user_col, item_col, score_col, batch_size=64):
  '''
  user_col: str name of column in dataframe containing user ids
  item_col: str name of column in dataframe containing item ids
  score_col: str name of column in dataframe containing ratings
  '''
  trn_ds = myCollabDataSet(trn_df, user_col, item_col, score_col)
  val_ds = myCollabDataSet(val_df, user_col, item_col, score_col)

  trn_dl = DataLoader(trn_ds, batch_size=batch_size, shuffle=True)
  val_dl = DataLoader(val_ds, batch_size=batch_size, shuffle=True)

  return trn_dl, val_dl, trn_ds, val_ds

In [8]:
def sigmoid_range(x, low, high):
  '''
  Sigmoid function with range `(low, high)`
  https://github.com/fastai/fastai/blob/master/fastai/layers.py#L100
  '''
  return torch.sigmoid(x) * (high - low) + low

In [None]:
# Define our model
class DotProductBias(nn.Module):
  def __init__(self, n_users, n_items, n_factors, y_range=(0,5.5)):
    super().__init__()
    self.user_factors = nn.Embedding(n_users, n_factors)
    self.user_bias = nn.Embedding(n_users, 1)
    self.item_factors = nn.Embedding(n_items, n_factors)
    self.item_bias = nn.Embedding(n_items, 1)
    self.y_range = y_range

    # Initialize embeddings and biases
    nn.init.normal_(self.user_factors.weight, std=0.01)
    nn.init.normal_(self.item_factors.weight, std=0.01)
    nn.init.normal_(self.user_bias.weight, std=0.01)
    nn.init.normal_(self.item_bias.weight, std=0.01)

  def forward(self, x):

    user_idx = x[:, 0].long()
    item_idx = x[:, 1].long()
    #ratings = x[:, 2]

    users = self.user_factors(user_idx)
    items = self.item_factors(item_idx)
    users_bias = self.user_bias(user_idx).squeeze()
    items_bias = self.item_bias(item_idx).squeeze()

    dot_product = torch.sum(users * items, dim=1)
    bias = users_bias + items_bias

    prediction = dot_product + bias
    return sigmoid_range(prediction, *self.y_range)

In [None]:
trn_dl, val_dl, trn_ds, val_ds = create_data_loaders(trn_df, val_df, 'userId', 'title', 'rating')
test_model = DotProductBias(n_users, n_movies, n_factors)

In [None]:
# test again using whole dataset just for lols
trn_dl, val_dl, trn_ds, val_ds = create_data_loaders(ratings_df, val_df, 'userId', 'title', 'rating')
test_model = DotProductBias(n_users, n_movies, n_factors)

In [9]:
def train_pytorch_model(model, train_loader, optimizer, criterion, epochs=5):
  for epoch in range(epochs):

    model.train()
    #running_loss = 0.0
    for batch in train_loader:

      targets = batch[:, 2]

      optimizer.zero_grad()

      outputs = model(batch)
      loss = criterion(outputs, targets)

      loss.backward()
      optimizer.step()

      #running_loss += loss.item()

    #print(f"Epoch {epoch+1}, Loss: {running_loss / len(train_loader)}")
    print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}')

In [None]:
#optimizer = torch.optim.Adam(test_model.parameters(), lr=0.005, weight_decay=0.1)
optimizer = torch.optim.Adam(test_model.parameters(), lr=0.005)
criterion = nn.MSELoss()

In [None]:
train_pytorch_model(test_model, trn_dl, optimizer, criterion, epochs=10)

Epoch [1/10], Loss: 1.1064
Epoch [2/10], Loss: 0.3781
Epoch [3/10], Loss: 0.2302
Epoch [4/10], Loss: 0.1744
Epoch [5/10], Loss: 0.2189
Epoch [6/10], Loss: 0.1946
Epoch [7/10], Loss: 0.2011
Epoch [8/10], Loss: 0.2308
Epoch [9/10], Loss: 0.1249
Epoch [10/10], Loss: 0.1170


In [None]:
# Extract item bias embeddings from the model
item_bias_embeddings = test_model.item_bias.weight.squeeze().detach().numpy()

# Map the item bias embeddings back to their original item IDs
item_biases = [(item_id, item_bias_embeddings[item_idx]) for item_id, item_idx in trn_ds.item_mapping.items()]

# Sort items based on bias values
sorted_item_biases = sorted(item_biases, key=lambda x: x[1], reverse=True)

# Output the items with the highest bias
num_top_items = 10  # Adjust as needed
top_items_with_bias = sorted_item_biases[:num_top_items]
print("Top items with highest bias:")
for item_id, bias_value in top_items_with_bias:
  print(f"Item ID: {item_id}, Bias Value: {bias_value}")

Top items with highest bias:
Item ID: Yojimbo (1961), Bias Value: 1.0827828645706177
Item ID: Shining, The (1980), Bias Value: 0.9985012412071228
Item ID: Spotlight (2015), Bias Value: 0.9933627247810364
Item ID: Jaws (1975), Bias Value: 0.9353036284446716
Item ID: Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981), Bias Value: 0.9299892783164978
Item ID: Star Wars: Episode VI - Return of the Jedi (1983), Bias Value: 0.8900007605552673
Item ID: Indiana Jones and the Last Crusade (1989), Bias Value: 0.8669297695159912
Item ID: Old Boy (2003), Bias Value: 0.8666865825653076
Item ID: Star Wars: Episode V - The Empire Strikes Back (1980), Bias Value: 0.86404949426651
Item ID: Star Wars: Episode IV - A New Hope (1977), Bias Value: 0.844457745552063


In [None]:
def calculate_item_item_similarity(model):
  model.eval()

  # Get item embeddings
  item_embeddings = model.item_factors.weight  # shape: (num_items, embedding_size)

  # Normalize item embeddings to unit length
  item_norms = torch.norm(item_embeddings, dim=1, keepdim=True)  # shape: (num_items, 1)
  item_embeddings_normalized = item_embeddings / item_norms

  # Calculate cosine similarities between all pairs of item embeddings
  cosine_similarities = torch.matmul(item_embeddings_normalized, item_embeddings_normalized.T)  # shape: (num_items, num_items)

  return cosine_similarities

In [None]:
cosine_sims = calculate_item_item_similarity(test_model)
print(cosine_sims)

tensor([[ 1.0000,  0.0631,  0.3525,  ...,  0.1955,  0.0486,  0.0423],
        [ 0.0631,  1.0000,  0.2073,  ..., -0.0404,  0.2447,  0.0211],
        [ 0.3525,  0.2073,  1.0000,  ..., -0.0030, -0.0297,  0.0110],
        ...,
        [ 0.1955, -0.0404, -0.0030,  ...,  1.0000,  0.0270,  0.2171],
        [ 0.0486,  0.2447, -0.0297,  ...,  0.0270,  1.0000, -0.0261],
        [ 0.0423,  0.0211,  0.0110,  ...,  0.2171, -0.0261,  1.0000]],
       grad_fn=<MmBackward0>)


In [None]:
# Choose movie to inspect cosine similarities
base_item_id = 'Lawrence of Arabia (1962)'
item_idx = trn_ds.item_mapping[base_item_id]

similar_items = torch.argsort(cosine_sims[item_idx], descending=True)

# Print top 5 similar items
top_k = 5
for i in range(1, top_k + 1):  # Skip the first item (itself)
    similar_item_idx = similar_items[i].item()
    similar_item_id = next(key for key, val in trn_ds.item_mapping.items() if val == similar_item_idx)

    similarity_score = cosine_sims[item_idx, similar_item_idx].item()
    print(f"Item: {base_item_id} is similar to Item: {similar_item_id} with similarity score: {similarity_score:.4f}")

Item: Lawrence of Arabia (1962) is similar to Item: Z (1969) with similarity score: 0.5245
Item: Lawrence of Arabia (1962) is similar to Item: Igby Goes Down (2002) with similarity score: 0.5042
Item: Lawrence of Arabia (1962) is similar to Item: Bridge on the River Kwai, The (1957) with similarity score: 0.5003
Item: Lawrence of Arabia (1962) is similar to Item: King of Kong, The (2007) with similarity score: 0.4934
Item: Lawrence of Arabia (1962) is similar to Item: I, Tonya (2017) with similarity score: 0.4923


In [None]:
def test_model_preds(model, val_dl):
  model.eval()

  with torch.no_grad():
    for batch in val_dl:

      target = batch[:, 2].numpy()

      predictions = model(batch).numpy()
      for pred, actual in zip(predictions, target):
        print(f"Predicted: {pred.item():.2f}, Actual: {actual.item()}")

In [None]:
test_model_preds(test_model, val_dl)

# Incorporating More Features
Now we will create embeddings for the movie 'genre' values given in the CSV we grabbed the names from

In [10]:
# Define a new dataset which will also hold genre data
class myCollabDataSetWithGenre(Dataset):
  def __init__(self, df, user_col, item_col, genre_col, score_col):

    self.user_col = user_col
    self.item_col = item_col
    self.genre_col = genre_col
    self.score_col = score_col

    self.data = df
    self.user_mapping = {user_id: i for i, user_id in enumerate(self.data[self.user_col].unique())}
    self.item_mapping = {item_id: i for i, item_id in enumerate(self.data[self.item_col].unique())}

    # Create our mapping of unique genres
    self.genres_list = self._get_unique_genres()
    self.genre_mapping = {genre: i for i, genre in enumerate(self.genres_list)}

  def __len__(self):
    return len(self.data)

  def __getitem__(self, idx):
    user_id = self.data.iloc[idx][self.user_col]
    item_id = self.data.iloc[idx][self.item_col]
    genres = self.data.iloc[idx][self.genre_col]
    rating = self.data.iloc[idx][self.score_col]

    user_idx = self.user_mapping[user_id]
    genre_idxs = [self.genre_mapping[genre] for genre in genres.split('|')] # Is now a list
    item_idx = self.item_mapping[item_id]

    # First thought here was to write [user_idx, item_idx, genre_idxs, rating], but using [] + creates a flat list, concatenating the elements of genre_idxs
    # rather than the list itself. Easier to parse inside the model
    return torch.tensor([user_idx, item_idx, rating] + genre_idxs, dtype=torch.float32)

  # Helper function called in initialization
  def _get_unique_genres(self):
    unique_genres = set()
    for genres in self.data[self.genre_col].unique():
      unique_genres.update(genres.split('|'))
    return list(unique_genres)



In [11]:
# Needed to handle variable length of samples' genre index lists. Pad tensors per batch
def collate_fn_with_padding(batch):

  # Extract individual components from the batch
  batch_size = len(batch)

  user_idxs = torch.zeros(batch_size, dtype=torch.long)
  item_idxs = torch.zeros(batch_size, dtype=torch.long)
  ratings = torch.zeros(batch_size, dtype=torch.float32)
  max_num_genres = max(len(item) - 3 for item in batch)  # Calculate max length of genre indices

  genre_idxs_padded = []

  for i, item in enumerate(batch):

    user_idxs[i] = item[0]
    item_idxs[i] = item[1]
    ratings[i] = item[2]
    genre_idxs = torch.tensor(item[3:], dtype=torch.long)
    padded_genre_idxs = torch.cat([genre_idxs, torch.zeros(max_num_genres - len(genre_idxs), dtype=torch.long)])
    genre_idxs_padded.append(padded_genre_idxs)

  genre_idxs_padded = torch.stack(genre_idxs_padded, dim=0)

  # Concatenate all tensors into a single tensor
  batch_tensor = torch.cat([user_idxs.unsqueeze(1),
                            item_idxs.unsqueeze(1),
                            ratings.unsqueeze(1),
                            genre_idxs_padded], dim=1)

  return batch_tensor

In [12]:
def create_data_loaders_with_genre(trn_df, val_df, user_col, item_col, genre_col, score_col, batch_size=64):
  '''
  genre_col: str name of column in dataframe containing genres
  '''
  trn_ds = myCollabDataSetWithGenre(trn_df, user_col, item_col, genre_col, score_col)
  val_ds = myCollabDataSetWithGenre(val_df, user_col, item_col, genre_col, score_col)

  trn_dl = DataLoader(trn_ds, batch_size=batch_size, shuffle=True, collate_fn=collate_fn_with_padding)
  val_dl = DataLoader(val_ds, batch_size=batch_size, shuffle=True, collate_fn=collate_fn_with_padding)

  return trn_dl, val_dl, trn_ds, val_ds

In [13]:
# Define our model
class DPBWithItemFeatures(nn.Module):
  '''
    Similar dot-product model but with room for an extra categorical/feature (currently genres), which is taken into account for the item
    Could easily be modified to handle more features, for either users or items
    '''
  def __init__(self, n_users, n_items, n_genres, n_factors, y_range=(0,5.5)):
    super().__init__()
    self.user_factors = nn.Embedding(n_users, n_factors)
    self.user_bias = nn.Embedding(n_users, 1)
    self.item_factors = nn.Embedding(n_items, n_factors)
    self.item_bias = nn.Embedding(n_items, 1)
    self.genre_factors = nn.Embedding(n_genres, n_factors)
    self.genre_bias = nn.Embedding(n_genres, 1)
    self.y_range = y_range

    # Initialize embeddings and biases
    nn.init.normal_(self.user_factors.weight, std=0.01)
    nn.init.normal_(self.item_factors.weight, std=0.01)
    nn.init.normal_(self.genre_factors.weight, std=0.01)
    nn.init.normal_(self.user_bias.weight, std=0.01)
    nn.init.normal_(self.item_bias.weight, std=0.01)
    nn.init.normal_(self.genre_bias.weight, std=0.01)

  def forward(self, x):

    user_idx = x[:, 0].long()
    item_idx = x[:, 1].long()
    # ratings = x[:, 2].long()
    genre_idxs = x[:, 3:].long() # Assuming genres now take up the 4th column and onward

    users = self.user_factors(user_idx)
    items = self.item_factors(item_idx)
    users_bias = self.user_bias(user_idx).squeeze()
    items_bias = self.item_bias(item_idx).squeeze()

    # Embedding lookup for genres
    genres_embedded = self.genre_factors(genre_idxs)

    # Currently summing biases of all the genres of the sample. Could also average
    genre_bias = self.genre_bias(genre_idxs).squeeze().sum(dim=1)

    # Multiple ways to use of the genre embeddings, especially if they are of different size than the item embeddings
    #item_with_genre = items.unsqueeze(1) * genres_embedded
    items_with_genre = torch.cat([items.unsqueeze(1) * genres_embedded, items.unsqueeze(1)], dim=1)

    # Sum the effect of each genre's factors on the item factors, reduce dimensionality from (n_samples, n_genres, n_item_factors) to (n_samples, n_item_factors)
    # This is assuming item embedding and genre embedding sizes are equal
    items_with_genre = items_with_genre.sum(dim=1) # or mean

    dot_product = torch.sum(users * items_with_genre, dim=1)
    bias = users_bias + items_bias + genre_bias

    prediction = dot_product + bias
    return sigmoid_range(prediction, *self.y_range)

In [14]:
# Get the number of unique genres, to create embeddings
unique_genres = set()
for genres in ratings_df.genres.unique():
  unique_genres.update(genres.split('|'))
n_genres = len(list(unique_genres))

n_genres

20

In [15]:
trn_dl, val_dl, trn_ds, val_ds = create_data_loaders_with_genre(trn_df, val_df, 'userId', 'title', 'genres', 'rating')
test_model = DPBWithItemFeatures(n_users, n_movies, n_genres, n_factors)

In [16]:
optimizer = torch.optim.Adam(test_model.parameters(), lr=0.005)
criterion = nn.MSELoss()

In [17]:
train_pytorch_model(test_model, trn_dl, optimizer, criterion, epochs=10)

  genre_idxs = torch.tensor(item[3:], dtype=torch.long)


Epoch [1/10], Loss: 1.0116
Epoch [2/10], Loss: 0.6965
Epoch [3/10], Loss: 0.3993
Epoch [4/10], Loss: 0.2266
Epoch [5/10], Loss: 0.2437
Epoch [6/10], Loss: 0.1276
Epoch [7/10], Loss: 0.2102
Epoch [8/10], Loss: 0.1971
Epoch [9/10], Loss: 0.1270
Epoch [10/10], Loss: 0.1238


In [18]:
def calculate_mse(model, data_loader, criterion):
    model.eval()
    total_loss = 0.0
    total_samples = 0

    with torch.no_grad():
        for batch in data_loader:
            targets = batch[:, 2]
            outputs = model(batch)
            loss = criterion(outputs, targets)
            total_loss += loss.item() * len(batch)
            total_samples += len(batch)

    mse = total_loss / total_samples
    return mse

In [19]:
val_mse = calculate_mse(test_model, val_dl, criterion)
print(f"Validation MSE: {val_mse:.4f}")

  genre_idxs = torch.tensor(item[3:], dtype=torch.long)


Validation MSE: 1.5849


In [21]:
# Get + sort genre biases, re-use of code above for inspecting item biases

genre_bias_embeddings = test_model.genre_bias.weight.squeeze().detach().numpy()

# Map back to their original genre ids/names
genre_biases = [(genre_id, genre_bias_embeddings[genre_idx]) for genre_id, genre_idx in trn_ds.genre_mapping.items()]

sorted_genre_biases = sorted(genre_biases, key=lambda x: x[1], reverse=True)

num_top_items = 10
top_genres_with_bias = sorted_genre_biases[:num_top_items]
print("Genres with highest bias:")
for genre_id, bias_value in top_genres_with_bias:
  print(f"Genre: {genre_id}, Bias Value: {bias_value}")

Genres with highest bias:
Genre: Documentary, Bias Value: 0.6345683932304382
Genre: (no genres listed), Bias Value: 0.36528003215789795
Genre: Drama, Bias Value: 0.35985830426216125
Genre: Film-Noir, Bias Value: 0.3524704575538635
Genre: Animation, Bias Value: 0.3234733045101166
Genre: War, Bias Value: 0.2525827884674072
Genre: Western, Bias Value: 0.25197747349739075
Genre: Crime, Bias Value: 0.20416249334812164
Genre: Mystery, Bias Value: 0.17719349265098572
Genre: Musical, Bias Value: 0.16158488392829895


In [22]:
# Inspect genre embedding similarities via cosine similarity

def calculate_genre_similarity(model):
  model.eval()

  genre_embeddings = model.genre_factors.weight  # shape: (num_genres, embedding_size)

  # Normalize embeddings to unit length
  genre_norms = torch.norm(genre_embeddings, dim=1, keepdim=True)  # shape: (num_items, 1)
  genre_embeddings_normalized = genre_embeddings / genre_norms

  # Calculate cosine similarities between all pairs of item embeddings
  cosine_similarities = torch.matmul(genre_embeddings_normalized, genre_embeddings_normalized.T)  # shape: (num_items, num_items)

  return cosine_similarities

cosine_sims = calculate_genre_similarity(test_model)
print(cosine_sims)

tensor([[ 1.0000,  0.7889,  0.4872,  0.4474,  0.3936,  0.3485,  0.8915,  0.7624,
          0.5958,  0.1909,  0.6946,  0.7559,  0.8436,  0.4353,  0.7580,  0.6281,
          0.4470,  0.4450,  0.5169,  0.6176],
        [ 0.7889,  1.0000,  0.5852,  0.4227,  0.3263,  0.2702,  0.7929,  0.4924,
          0.5677,  0.1674,  0.6302,  0.6135,  0.7551,  0.5137,  0.5854,  0.6482,
          0.4749,  0.4880,  0.4060,  0.4449],
        [ 0.4872,  0.5852,  1.0000,  0.1504,  0.1763,  0.3498,  0.4347,  0.2886,
          0.0975,  0.0654,  0.3293,  0.3131,  0.4524,  0.3161,  0.2768,  0.2129,
          0.3873,  0.2798,  0.1961,  0.2714],
        [ 0.4474,  0.4227,  0.1504,  1.0000,  0.2321,  0.1605,  0.3502,  0.1360,
          0.2575, -0.0447,  0.4125,  0.2598,  0.2517,  0.3275,  0.4823,  0.3878,
          0.2976,  0.2543,  0.3103,  0.4846],
        [ 0.3936,  0.3263,  0.1763,  0.2321,  1.0000,  0.1912,  0.4368,  0.2868,
         -0.0029,  0.0319,  0.3156,  0.3525,  0.3241,  0.1977,  0.2288,  0.2979,
      

In [24]:
# Choose genre to inspect cosine similarities
base_genre_id = 'Crime'
genre_idx = trn_ds.genre_mapping[base_genre_id]

similar_genres = torch.argsort(cosine_sims[genre_idx], descending=True)

# Print top 5 similar genres
top_k = 5
for i in range(1, top_k + 1):  # Skip the first genre (itself)
    similar_genre_idx = similar_genres[i].item()
    similar_genre_id = next(key for key, val in trn_ds.genre_mapping.items() if val == similar_genre_idx)

    similarity_score = cosine_sims[genre_idx, similar_genre_idx].item()
    print(f"Genre: {base_genre_id} is similar to genre: {similar_genre_id} with similarity score: {similarity_score:.4f}")

Genre: Crime is similar to genre: Mystery with similarity score: 0.4846
Genre: Crime is similar to genre: Horror with similarity score: 0.4823
Genre: Crime is similar to genre: Thriller with similarity score: 0.4474
Genre: Crime is similar to genre: Documentary with similarity score: 0.4227
Genre: Crime is similar to genre: (no genres listed) with similarity score: 0.4125
