<a href="https://colab.research.google.com/github/cchummer/ml-dl-scratch/blob/main/collab_pmf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive; the CSV files read below live under that mount point
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

In [None]:
# Load the raw ratings table (one row per user/movie rating) from the mounted Drive folder
ratings_df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/ML+DL/movielens_ratings.csv')
# Bare expression so the notebook renders the frame as the cell output
ratings_df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


# Preprocessing - DataLoaders

In [None]:
# Grab movie names from other csv: this table carries movieId, title and genres,
# and is joined onto the ratings in the next cell
names_df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/ML+DL/movielens_movies.csv')
# Bare expression so the notebook renders the frame as the cell output
names_df

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [None]:
# Attach title/genres to every rating by joining the two frames on their shared movieId key.
# 'inner' is the pandas default and is spelled out here only for readability.
ratings_df = pd.merge(ratings_df, names_df, how='inner', on='movieId')
ratings_df

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
...,...,...,...,...,...,...
100831,610,160341,2.5,1479545749,Bloodmoon (1997),Action|Thriller
100832,610,160527,4.5,1479544998,Sympathy for the Underdog (1971),Action|Crime|Drama
100833,610,160836,3.0,1493844794,Hazard (2005),Action|Drama|Thriller
100834,610,163937,3.5,1493848789,Blair Witch (2016),Horror|Thriller


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the ratings for validation.
# random_state pins the shuffle so the split (and every number derived from it) is the same
# after Restart & Run All; without it each run trains and evaluates on different rows.
trn_df, val_df = train_test_split(ratings_df, test_size=0.25, random_state=42)

In [None]:
# Inspect the training split; row labels are the original (now shuffled) index values
trn_df

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
19875,240,539,3.0,849122301,Sleepless in Seattle (1993),Comedy|Drama|Romance
86066,377,900,2.5,1340344005,"American in Paris, An (1951)",Musical|Romance
8937,577,1275,4.0,945965886,Highlander (1986),Action|Adventure|Fantasy
59406,297,1407,1.0,900872213,Scream (1996),Comedy|Horror|Mystery|Thriller
78372,401,5882,5.0,1510449639,Treasure Planet (2002),Adventure|Animation|Children|Sci-Fi|IMAX
...,...,...,...,...,...,...
98284,599,7193,2.0,1519257655,"Adventures of Ford Fairlane, The (1990)",Action|Comedy
15201,103,3176,4.0,1431957558,"Talented Mr. Ripley, The (1999)",Drama|Mystery|Thriller
37692,448,6539,3.5,1066291868,Pirates of the Caribbean: The Curse of the Bla...,Action|Adventure|Comedy|Fantasy
33352,563,783,4.0,1441846374,"Hunchback of Notre Dame, The (1996)",Animation|Children|Drama|Musical|Romance


In [None]:
# Count distinct users and movies over the full (unsplit) ratings frame.
# dropna=False keeps the count identical to len(<column>.unique()).
n_users = ratings_df['userId'].nunique(dropna=False)
n_movies = ratings_df['movieId'].nunique(dropna=False)
# Number of latent factors handed to the model constructor further down
n_factors = 50

print(n_users, n_movies, sep='\n')

610
9724


In [None]:
# Utilizing pytorch DataSet + DataLoader functionality for easy batch size control + ID mapping
class myCollabDataSet(Dataset):
  '''
  Dataset of (user, item, rating) triples backed by a pandas DataFrame.

  Raw user and item IDs are translated to 0-based indices so they can be used directly
  for embedding lookups; this also allows non-numeric ID columns (movie title, etc).
  '''
  def __init__(self, df, user_col, item_col, score_col, user_mapping=None, item_mapping=None):
    '''
    df: dataframe holding one rating per row
    user_col: str name of column in dataframe containing user ids
    item_col: str name of column in dataframe containing item ids
    score_col: str name of column in dataframe containing ratings
    user_mapping: optional dict {raw user id: index}. When omitted, ids are numbered in
      order of first appearance in `df`. Pass the mapping of another dataset (e.g. the
      training set) so both datasets index the same embedding rows.
    item_mapping: optional dict {raw item id: index}; same semantics as `user_mapping`.
    '''
    self.user_col = user_col
    self.item_col = item_col
    self.score_col = score_col

    self.data = df

    if user_mapping is None:
      user_mapping = {user_id: i for i, user_id in enumerate(df[user_col].unique())}
    if item_mapping is None:
      item_mapping = {item_id: i for i, item_id in enumerate(df[item_col].unique())}
    self.user_mapping = user_mapping
    self.item_mapping = item_mapping

  def __len__(self):
    '''Number of rating rows in the underlying dataframe.'''
    return len(self.data)

  def __getitem__(self, idx):
    '''
    Return the row at position `idx` as a float32 tensor [user_idx, item_idx, rating].

    Raises KeyError if the row holds an id that is missing from the mapping.
    '''
    # Positional scalar access per column: avoids building a full row Series (three times)
    # just to read three values, and is independent of the dataframe's index labels.
    user_id = self.data[self.user_col].iat[idx]
    item_id = self.data[self.item_col].iat[idx]
    rating = self.data[self.score_col].iat[idx]

    # Return our index values rather than the raw ID's (which are potentially non-numeric)
    # This has the effect of indices rather than ID's being returned to forward() in the model
    user_idx = self.user_mapping[user_id]
    item_idx = self.item_mapping[item_id]

    # Indices travel in the same float32 tensor as the rating and are cast back with .long()
    # by the model; float32 represents integers exactly only up to 2**24.
    return torch.tensor([user_idx, item_idx, float(rating)], dtype=torch.float32)

In [None]:
# Quick function to turn dataframes into DataLoaders
def create_data_loaders(trn_df, val_df, user_col, item_col, score_col, batch_size=64):
  '''
  Build train/validation datasets and DataLoaders that share one ID -> index mapping.

  trn_df: dataframe of training ratings
  val_df: dataframe of validation ratings
  user_col: str name of column in dataframe containing user ids
  item_col: str name of column in dataframe containing item ids
  score_col: str name of column in dataframe containing ratings
  batch_size: number of rows per batch for both loaders

  Returns (trn_dl, val_dl, trn_ds, val_ds).
  '''
  trn_ds = myCollabDataSet(trn_df, user_col, item_col, score_col)
  val_ds = myCollabDataSet(val_df, user_col, item_col, score_col)

  # Each dataset numbers ids by first appearance in its own frame, so the same user/item
  # would otherwise get a different embedding index in validation than in training.
  # Keep the training numbering and append ids that only occur in the validation frame.
  user_mapping = dict(trn_ds.user_mapping)
  for user_id in val_ds.user_mapping:
    user_mapping.setdefault(user_id, len(user_mapping))
  item_mapping = dict(trn_ds.item_mapping)
  for item_id in val_ds.item_mapping:
    item_mapping.setdefault(item_id, len(item_mapping))

  # The datasets look ids up in these attributes on every __getitem__, so rebinding them
  # here is enough to make both loaders emit consistent indices.
  trn_ds.user_mapping = val_ds.user_mapping = user_mapping
  trn_ds.item_mapping = val_ds.item_mapping = item_mapping

  trn_dl = DataLoader(trn_ds, batch_size=batch_size, shuffle=True)
  # Evaluation does not benefit from shuffling; a fixed order keeps its output reproducible
  val_dl = DataLoader(val_ds, batch_size=batch_size, shuffle=False)

  return trn_dl, val_dl, trn_ds, val_ds

# Probabilistic Matrix Factorization (dot-product based)

In [None]:
def sigmoid_range(x, low, high):
  '''
  Squash `x` into the open interval `(low, high)` using a rescaled sigmoid.
  See https://github.com/fastai/fastai/blob/master/fastai/layers.py#L100
  '''
  span = high - low
  unit = torch.sigmoid(x)  # values in (0, 1)
  return unit * span + low

In [None]:
# Define our model
class DotProductBias(nn.Module):
  '''
  Collaborative-filtering model: rating = sigmoid_range(user . item + user_bias + item_bias).

  n_users / n_items: number of rows in the user / item embedding tables
  n_factors: length of each latent factor vector
  y_range: (low, high) bounds the prediction is squashed into
  '''
  def __init__(self, n_users, n_items, n_factors, y_range=(0,5.5)):
    super().__init__()
    self.user_factors = nn.Embedding(n_users, n_factors)
    self.user_bias = nn.Embedding(n_users, 1)
    self.item_factors = nn.Embedding(n_items, n_factors)
    self.item_bias = nn.Embedding(n_items, 1)
    self.y_range = y_range

    # Small-variance normal init for every table (factors first, then biases)
    for table in (self.user_factors, self.item_factors, self.user_bias, self.item_bias):
      nn.init.normal_(table.weight, std=0.01)

  def forward(self, x):
    '''
    x: 2-D batch whose column 0 holds user indices and column 1 item indices
       (any further columns, e.g. the rating, are ignored here).
    Returns one prediction per row.
    '''
    # Indices arrive inside a float tensor, so cast them back before the lookups
    user_idx, item_idx = x[:, 0].long(), x[:, 1].long()

    interaction = (self.user_factors(user_idx) * self.item_factors(item_idx)).sum(dim=1)
    offset = self.user_bias(user_idx).squeeze() + self.item_bias(item_idx).squeeze()

    return sigmoid_range(interaction + offset, *self.y_range)

In [None]:
# Build loaders from the train/validation split, keying items by movie title, and create a fresh model.
# NOTE(review): n_movies was counted from movieId while items here are keyed by 'title' —
# confirm the number of distinct titles never exceeds n_movies.
trn_dl, val_dl, trn_ds, val_ds = create_data_loaders(trn_df, val_df, 'userId', 'title', 'rating')
test_model = DotProductBias(n_users, n_movies, n_factors)

In [None]:
# test again using whole dataset just for lols
# NOTE(review): this rebinds trn_dl/val_dl/trn_ds/val_ds and test_model from the previous cell.
# Training now uses all of ratings_df, which includes the rows held out in val_df.
trn_dl, val_dl, trn_ds, val_ds = create_data_loaders(ratings_df, val_df, 'userId', 'title', 'rating')
test_model = DotProductBias(n_users, n_movies, n_factors)

In [None]:
def train_pytorch_model(model, train_loader, optimizer, criterion, epochs=5):
  '''
  Train `model` for `epochs` passes over `train_loader`, printing the mean batch loss per epoch.

  model: nn.Module called with the whole batch tensor
  train_loader: iterable of 2-D batch tensors whose column 2 holds the target rating
  optimizer: torch optimizer already bound to the model's parameters
  criterion: loss callable taking (outputs, targets)
  epochs: number of passes over the loader
  '''
  for epoch in range(epochs):

    model.train()
    running_loss = 0.0
    n_batches = 0
    for batch in train_loader:

      targets = batch[:, 2]

      optimizer.zero_grad()

      outputs = model(batch)
      loss = criterion(outputs, targets)

      loss.backward()
      optimizer.step()

      running_loss += loss.item()
      n_batches += 1

    # Report the epoch average rather than whichever mini-batch happened to come last:
    # a single batch loss is noisy and is undefined when the loader yields nothing.
    epoch_loss = running_loss / n_batches if n_batches else float('nan')
    print(f'Epoch [{epoch+1}/{epochs}], Loss: {epoch_loss:.4f}')

In [None]:
#optimizer = torch.optim.Adam(test_model.parameters(), lr=0.005, weight_decay=0.1)
# Active configuration: Adam without weight decay (the weight-decay variant above is disabled),
# with mean-squared error between predicted and actual ratings as the loss
optimizer = torch.optim.Adam(test_model.parameters(), lr=0.005)
criterion = nn.MSELoss()

In [None]:
# Train on trn_dl for the function's default of 5 epochs
train_pytorch_model(test_model, trn_dl, optimizer, criterion)

Epoch [1/5], Loss: 0.5723
Epoch [2/5], Loss: 0.4699
Epoch [3/5], Loss: 0.2417
Epoch [4/5], Loss: 0.3395
Epoch [5/5], Loss: 0.1613


In [None]:
# Pull the learned per-item bias column out of the model as a flat numpy array
item_bias_embeddings = test_model.item_bias.weight.detach().squeeze().numpy()

# Pair each original item ID with the bias stored at its embedding index
item_biases = [
  (raw_id, item_bias_embeddings[row])
  for raw_id, row in trn_ds.item_mapping.items()
]

# Largest bias first
sorted_item_biases = sorted(item_biases, key=lambda pair: pair[1], reverse=True)

# Report only the head of the ranking
num_top_items = 10  # Adjust as needed
top_items_with_bias = sorted_item_biases[0:num_top_items]
print("Top items with highest bias:")
for item_id, bias_value in top_items_with_bias:
  print(f"Item ID: {item_id}, Bias Value: {bias_value}")

Top items with highest bias:
Item ID: Shawshank Redemption, The (1994), Bias Value: 0.93940269947052
Item ID: Fight Club (1999), Bias Value: 0.8286858797073364
Item ID: Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981), Bias Value: 0.8281322717666626
Item ID: Dark Knight, The (2008), Bias Value: 0.7702516913414001
Item ID: Godfather, The (1972), Bias Value: 0.7614440321922302
Item ID: Philadelphia Story, The (1940), Bias Value: 0.7269240617752075
Item ID: Lord of the Rings: The Two Towers, The (2002), Bias Value: 0.7254716157913208
Item ID: Star Wars: Episode IV - A New Hope (1977), Bias Value: 0.7241013646125793
Item ID: Three Billboards Outside Ebbing, Missouri (2017), Bias Value: 0.7037439346313477
Item ID: One Flew Over the Cuckoo's Nest (1975), Bias Value: 0.7022022604942322


In [None]:
def test_model_preds(model, val_dl):
  model.eval()

  with torch.no_grad():
    for batch in val_dl:

      target = batch[:, 2].numpy()

      predictions = model(batch).numpy()
      for pred, actual in zip(predictions, target):
        print(f"Predicted: {pred.item():.2f}, Actual: {actual.item()}")

In [None]:
# NOTE(review): this prints one line per rating in the *training* loader (trn_dl), not val_dl —
# confirm that evaluating on the training data is intended here
test_model_preds(test_model, trn_dl)