<a href="https://colab.research.google.com/github/cchummer/ml-dl-scratch/blob/main/collab_nn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

In [3]:
ratings_df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/ML+DL/movielens_ratings.csv')
ratings_df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


# Preprocessing - DataLoaders

In [4]:
# Grab movie names from other csv
names_df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/ML+DL/movielens_movies.csv')
names_df

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [5]:
# Merge into ratings df
ratings_df = ratings_df.merge(names_df, on='movieId')
ratings_df

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
...,...,...,...,...,...,...
100831,610,160341,2.5,1479545749,Bloodmoon (1997),Action|Thriller
100832,610,160527,4.5,1479544998,Sympathy for the Underdog (1971),Action|Crime|Drama
100833,610,160836,3.0,1493844794,Hazard (2005),Action|Drama|Thriller
100834,610,163937,3.5,1493848789,Blair Witch (2016),Horror|Thriller


In [6]:
from sklearn.model_selection import train_test_split

#np.random.seed(42)
trn_df,val_df = train_test_split(ratings_df, test_size=0.25)

In [None]:
trn_df

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
71869,28,3107,2.0,1242290922,Backdraft (1991),Action|Drama
14550,95,2985,4.0,1105401143,RoboCop (1987),Action|Crime|Drama|Sci-Fi|Thriller
26934,607,589,4.0,963078960,Terminator 2: Judgment Day (1991),Action|Sci-Fi
10272,483,1732,5.0,1178293982,"Big Lebowski, The (1998)",Comedy|Crime
46724,495,7438,5.0,1458635650,Kill Bill: Vol. 2 (2004),Action|Drama|Thriller
...,...,...,...,...,...,...
68898,305,160565,4.5,1494085428,The Purge: Election Year (2016),Action|Horror|Sci-Fi
28881,534,160,3.5,1459788743,Congo (1995),Action|Adventure|Mystery|Sci-Fi
80530,572,2289,3.0,979923863,"Player, The (1992)",Comedy|Crime|Drama
94080,292,26322,3.0,1323631597,Gone in 60 Seconds (1974),Action|Crime|Drama


In [7]:
n_users = len(ratings_df.userId.unique())
n_movies = len(ratings_df.movieId.unique())
n_factors = 50

print(n_users)
print(n_movies)

610
9724


In [8]:
# Utilizing pytorch DataSet + DataLoader functionality for easy batch size control + ID mapping
class myCollabDataSet(Dataset):
  def __init__(self, df, user_col, item_col, score_col):

    # Create mappings of user and item ids to 0-based indices so they will play nicely with embedding lookups. Also allows for use of non-numeric columns (movie title, etc)
    self.user_col = user_col
    self.item_col = item_col
    self.score_col = score_col

    self.data = df
    self.user_mapping = {user_id: i for i, user_id in enumerate(self.data[self.user_col].unique())}
    self.item_mapping = {item_id: i for i, item_id in enumerate(self.data[self.item_col].unique())}

  def __len__(self):
    return len(self.data)

  def __getitem__(self, idx):
    user_id = self.data.iloc[idx][self.user_col]
    item_id = self.data.iloc[idx][self.item_col]
    rating = self.data.iloc[idx][self.score_col]

    # Return our index values rather than the raw ID's (which are potentially non-numeric)
    # This has the effect of indices rather than ID's being returned to forward() in the model below
    user_idx = self.user_mapping[user_id]
    item_idx = self.item_mapping[item_id]

    #user_idx_tensor = torch.tensor(user_idx, dtype=torch.long)
    #item_idx_tensor = torch.tensor(item_idx, dtype=torch.long)
    #ratings_tensor = torch.tensor(rating, dtype=torch.float32)

    #return [user_idx, item_idx, ratings_tensor]
    return torch.tensor([user_idx, item_idx, rating], dtype=torch.float32)

In [11]:
# Quick function to turn dataframes into optimized DataLoaders
def create_collab_data_loaders(trn_df, val_df, user_col, item_col, score_col, batch_size=64):
  '''
  user_col: str name of column in dataframe containing user ids
  item_col: str name of column in dataframe containing item ids
  score_col: str name of column in dataframe containing ratings
  '''
  trn_ds = myCollabDataSet(trn_df, user_col, item_col, score_col)
  val_ds = myCollabDataSet(val_df, user_col, item_col, score_col)

  trn_dl = DataLoader(trn_ds, batch_size=batch_size, shuffle=True)
  val_dl = DataLoader(val_ds, batch_size=batch_size, shuffle=True)

  return trn_dl, val_dl, trn_ds, val_ds

In [12]:
trn_dl, val_dl, trn_ds, val_ds = create_collab_data_loaders(trn_df, val_df, 'userId', 'movieId', 'rating')

# The Model

In [13]:
def get_emb_size(n_cat):
  '''
  Quickly calculate number of factors for embedding layer of the given column in the dataframe
  https://github.com/fastai/fastai/blob/master/fastai/tabular/model.py#L12
  '''

  n_factors = min(600, round(1.6 * n_cat**0.56))
  return int(n_factors)

In [14]:
def sigmoid_range(x, low, high):
  '''
  Sigmoid function with range `(low, high)`
  https://github.com/fastai/fastai/blob/master/fastai/layers.py#L100
  '''
  return torch.sigmoid(x) * (high - low) + low

In [16]:
class CollaborativeFilteringModel(nn.Module):
  def __init__(self, user_size, item_size, hidden_dim=128, y_range=(0,5.5)):
    super(CollaborativeFilteringModel, self).__init__()
    self.user_embedding = nn.Embedding(*user_size)
    self.item_embedding = nn.Embedding(*item_size)

    self.fc_layers = nn.Sequential(
      nn.Linear(user_size[1] + item_size[1], hidden_dim),
      nn.ReLU(),
      nn.Linear(hidden_dim, 1)  # Output is a single rating prediction
      )

    self.y_range = y_range

  def forward(self, x):

    user_embedded = self.user_embedding(x[:, 0].long())
    item_embedded = self.item_embedding(x[:, 1].long())

    # Concatenate user and item embeddings
    embedded = torch.cat([user_embedded, item_embedded], dim=1)

    # Pass through layers
    output = self.fc_layers(embedded)

    # Scale to y_range
    output = sigmoid_range(output, *self.y_range)

    return output.squeeze()

In [21]:
# Training loop
def train_model(model, dataloader, optimizer, criterion, epochs):
  model.train()
  for epoch in range(epochs):
    for batch in dataloader:

      user_indices = batch[:, 0].long()
      item_indices = batch[:, 1].long()
      ratings = batch[:, 2]

      # Forward pass
      predictions = model(batch)

      # Compute loss
      loss = criterion(predictions, ratings.float())

      # Zero gradients, backward pass, and optimize
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

    print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}')

In [22]:
model = CollaborativeFilteringModel((n_users, get_emb_size(n_users)), (n_movies, get_emb_size(n_movies)))

In [23]:
# Loss function
criterion = nn.MSELoss()

# Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [24]:
train_model(model, trn_dl, optimizer, criterion, epochs=15)

Epoch [1/15], Loss: 0.5588
Epoch [2/15], Loss: 0.7721
Epoch [3/15], Loss: 0.8205
Epoch [4/15], Loss: 0.8501
Epoch [5/15], Loss: 0.6977
Epoch [6/15], Loss: 0.4671
Epoch [7/15], Loss: 0.5701
Epoch [8/15], Loss: 0.4121
Epoch [9/15], Loss: 0.4169
Epoch [10/15], Loss: 0.2343
Epoch [11/15], Loss: 0.3941
Epoch [12/15], Loss: 0.5923
Epoch [13/15], Loss: 0.3335
Epoch [14/15], Loss: 0.1751
Epoch [15/15], Loss: 0.1895


In [27]:
def test_model_preds(model, val_dl):
  model.eval()

  with torch.no_grad():
    for batch in val_dl:

      target = batch[:, 2].numpy()

      predictions = model(batch).numpy()
      for pred, actual in zip(predictions, target):
        print(f"Predicted: {pred.item():.2f}, Actual: {actual.item()}")

In [None]:
test_model_preds(model, trn_dl)