<a href="https://colab.research.google.com/github/cchummer/ml-dl-scratch/blob/main/collab_scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

In [3]:
ratings_df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/ML+DL/movielens_ratings.csv')
ratings_df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


# Preprocessing - DataLoaders

In [4]:
# Grab movie names from other csv
names_df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/ML+DL/movielens_movies.csv')
names_df

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [5]:
# Merge into ratings df
ratings_df = ratings_df.merge(names_df, on='movieId')
ratings_df

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
...,...,...,...,...,...,...
100831,610,160341,2.5,1479545749,Bloodmoon (1997),Action|Thriller
100832,610,160527,4.5,1479544998,Sympathy for the Underdog (1971),Action|Crime|Drama
100833,610,160836,3.0,1493844794,Hazard (2005),Action|Drama|Thriller
100834,610,163937,3.5,1493848789,Blair Witch (2016),Horror|Thriller


In [6]:
from sklearn.model_selection import train_test_split

#np.random.seed(42)
trn_df,val_df = train_test_split(ratings_df, test_size=0.25)

In [7]:
trn_df

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
28435,221,104,2.5,1111176006,Happy Gilmore (1996),Comedy
85883,82,6586,3.5,1084466393,American Wedding (American Pie 3) (2003),Comedy
11414,597,2116,4.0,941640505,"Lord of the Rings, The (1978)",Adventure|Animation|Children|Fantasy
34644,275,2671,5.0,1049076756,Notting Hill (1999),Comedy|Romance
80012,380,65514,5.0,1494696245,Ip Man (2008),Action|Drama|War
...,...,...,...,...,...,...
44020,514,1721,4.0,1533954277,Titanic (1997),Drama|Romance
63780,312,3476,4.0,1043176096,Jacob's Ladder (1990),Horror|Mystery
61349,561,2294,2.0,1491094477,Antz (1998),Adventure|Animation|Children|Comedy|Fantasy
18620,263,2424,4.0,941591986,You've Got Mail (1998),Comedy|Romance


In [7]:
n_users = len(ratings_df.userId.unique())
n_movies = len(ratings_df.movieId.unique())
n_factors = 50

print(n_users)
print(n_movies)

610
9724


In [9]:
# Utilizing pytorch DataSet + DataLoader functionality for easy batch size control + ID mapping
class myCollabDataSet(Dataset):
  def __init__(self, df, user_col, item_col, score_col):

    # Create mappings of user and item ids to 0-based indices so they will play nicely with embedding lookups. Also allows for use of non-numeric columns (movie title, etc)
    self.user_col = user_col
    self.item_col = item_col
    self.score_col = score_col

    self.data = df
    self.user_mapping = {user_id: i for i, user_id in enumerate(self.data[self.user_col].unique())}
    self.item_mapping = {item_id: i for i, item_id in enumerate(self.data[self.item_col].unique())}

  def __len__(self):
    return len(self.data)

  def __getitem__(self, idx):
    user_id = self.data.iloc[idx][self.user_col]
    item_id = self.data.iloc[idx][self.item_col]
    rating = self.data.iloc[idx][self.score_col]

    # Return our index values rather than the raw ID's (which are potentially non-numeric)
    # This has the effect of indices rather than ID's being returned to forward() in the model below
    user_idx = self.user_mapping[user_id]
    item_idx = self.item_mapping[item_id]
    user_idx = torch.tensor(user_idx, dtype=torch.long)
    item_idx = torch.tensor(item_idx, dtype=torch.long)

    return torch.tensor([user_idx, item_idx, rating], dtype=torch.float32)

In [13]:
# Quick function to turn dataframes into optimized DataLoaders
def create_data_loaders(trn_df, val_df, user_col, item_col, score_col, batch_size=64):
  '''
  user_col: str name of column in dataframe containing user ids
  item_col: str name of column in dataframe containing item ids
  score_col: str name of column in dataframe containing ratings
  '''
  trn_ds = myCollabDataSet(trn_df, user_col, item_col, score_col)
  val_ds = myCollabDataSet(val_df, user_col, item_col, score_col)

  trn_dl = DataLoader(trn_ds, batch_size=batch_size, shuffle=True)
  val_dl = DataLoader(val_ds, batch_size=batch_size, shuffle=True)

  return trn_dl, val_dl, trn_ds, val_ds

In [11]:
# https://github.com/fastai/fastai/blob/master/fastai/layers.py#L100
def sigmoid_range(x, low, high):
    '''Sigmoid function with range `(low, high)`'''
    return torch.sigmoid(x) * (high - low) + low

In [12]:
# Define our model
class DotProductBias(nn.Module):
    def __init__(self, n_users, n_items, n_factors, y_range=(0,5.5)):
        super().__init__()
        self.user_factors = nn.Embedding(n_users, n_factors)
        self.user_bias = nn.Embedding(n_users, 1)
        self.item_factors = nn.Embedding(n_items, n_factors)
        self.item_bias = nn.Embedding(n_items, 1)
        self.y_range = y_range

    def forward(self, x):

        user_idx = x[:, 0].long()
        item_idx = x[:, 1].long()

        users = self.user_factors(user_idx)
        items = self.item_factors(item_idx)

        res = (users * items).sum(dim=1, keepdim=True)
        res += self.user_bias(x[:,0].long()) + self.item_bias(x[:,1].long())
        return sigmoid_range(res, *self.y_range)

In [14]:
trn_dl, val_dl, trn_ds, val_ds = create_data_loaders(trn_df, val_df, 'userId', 'title', 'rating')
test_model = DotProductBias(n_users, n_movies, n_factors)

In [15]:
def train_pytorch_model(model, train_loader, optimizer, criterion, epochs=5):
  for epoch in range(epochs):

    model.train()
    running_loss = 0.0
    for batch in train_loader:

      target = batch[:, 2].unsqueeze(1) # testing unsqueeze
      optimizer.zero_grad()

      output = model(batch)
      loss = criterion(output, target)

      loss.backward()
      optimizer.step()

      running_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {running_loss / len(train_loader)}")

In [23]:
optimizer = torch.optim.Adam(test_model.parameters(), lr=0.005, weight_decay=0.1)
criterion = nn.MSELoss()

In [24]:
train_pytorch_model(test_model, trn_dl, optimizer, criterion)

Epoch 1, Loss: 1.4623701878005477
Epoch 2, Loss: 1.4607103387313043
Epoch 3, Loss: 1.4599942600263154
Epoch 4, Loss: 1.4601926227713196
Epoch 5, Loss: 1.460554000437764


In [25]:
# Extract item bias embeddings from the model
item_bias_embeddings = test_model.item_bias.weight.squeeze().detach().numpy()

# Map the item bias embeddings back to their original item IDs
item_biases = [(item_id, item_bias_embeddings[item_idx]) for item_id, item_idx in trn_ds.item_mapping.items()]

# Sort items based on bias values
sorted_item_biases = sorted(item_biases, key=lambda x: x[1], reverse=True)

# Output the items with the highest bias
num_top_items = 10  # Adjust as needed
top_items_with_bias = sorted_item_biases[:num_top_items]
print("Top items with highest bias:")
for item_id, bias_value in top_items_with_bias:
  print(f"Item ID: {item_id}, Bias Value: {bias_value}")

Top items with highest bias:
Item ID: Matrix, The (1999), Bias Value: 0.14043059945106506
Item ID: Shawshank Redemption, The (1994), Bias Value: 0.1370498687028885
Item ID: Forrest Gump (1994), Bias Value: 0.13639456033706665
Item ID: Star Wars: Episode V - The Empire Strikes Back (1980), Bias Value: 0.12206882983446121
Item ID: Pulp Fiction (1994), Bias Value: 0.10089034587144852
Item ID: The Night Before (2015), Bias Value: 0.09882864356040955
Item ID: Star Wars: Episode IV - A New Hope (1977), Bias Value: 0.08970396965742111
Item ID: Silence of the Lambs, The (1991), Bias Value: 0.0855785384774208
Item ID: American Beauty (1999), Bias Value: 0.0838729664683342
Item ID: Green Mile, The (1999), Bias Value: 0.08351507037878036


In [26]:
def test_model_preds(model, val_dl):
  model.eval()

  with torch.no_grad():
    for batch in val_dl:

      user_idx = batch[:, 0].long()
      item_idx = batch[:, 1].long()
      target = batch[:, 2].float()

      predictions = model(batch).squeeze()
      for pred, actual in zip(predictions, target):
        print(f"Predicted: {pred.item()}, Actual: {actual.item()}")

In [None]:
test_model_preds(test_model, trn_dl)

In [13]:
from fastai.learner import Learner
from fastai.callback.schedule import fit_one_cycle
from fastai.data.core import DataLoaders

In [None]:
both_dls = DataLoaders(trn_dl, val_dl)

learn = Learner(both_dls, test_model, loss_func=nn.MSELoss())

learn.fit_one_cycle(5, 5e-3)