In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import copy
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import torch.utils.data as data_utils

In [2]:
# Location of the MovieLens 25M files, relative to the working directory.
resource_path = os.path.join(".", "data", "ml-25m")
ratings_csv, movies_csv = (
    os.path.join(resource_path, file_name)
    for file_name in ("ratings.csv", "movies.csv")
)

In [3]:
# Read the raw ratings table from disk and preview its first rows.
ratings = pd.read_csv(filepath_or_buffer=ratings_csv)
ratings.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [4]:
# The timestamp column is not used further down, so remove it.
# Rebinding (instead of mutating in place) and ignoring an already-missing
# column keeps this cell safe to re-run: the in-place version raises KeyError
# the second time it is executed.
ratings = ratings.drop(columns=['timestamp'], errors='ignore')
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,296,5.0
1,1,306,3.5
2,1,307,5.0
3,1,665,5.0
4,1,899,3.5


In [5]:
# Ids of the `n_movies` most frequently rated movies, most-rated first.
n_movies = 500
rating_counts = ratings['movieId'].value_counts()
top_n_movies = rating_counts.index[:n_movies].tolist()

In [6]:
# Flag the rows whose movie is among the most-rated ones.
# `isin` is vectorised; the previous row-wise `apply` with a lambda scanned the
# Python list once per row (and shadowed the builtin `id`), which is very slow
# on a table of this size.
ratings['top_n'] = ratings['movieId'].isin(top_n_movies)

ratings

Unnamed: 0,userId,movieId,rating,top_n
0,1,296,5.0,True
1,1,306,3.5,False
2,1,307,5.0,False
3,1,665,5.0,False
4,1,899,3.5,False
...,...,...,...,...
25000090,162541,50872,4.5,True
25000091,162541,55768,2.5,False
25000092,162541,56176,2.0,False
25000093,162541,58559,4.0,True


In [7]:
# Keep only the flagged rows, then discard the helper flag column.
ratings = ratings.loc[ratings['top_n']].drop(columns=['top_n'])
ratings

Unnamed: 0,userId,movieId,rating
0,1,296,5.0
5,1,1088,4.0
9,1,1250,4.0
11,1,1653,4.0
12,1,2011,2.5
...,...,...,...
25000084,162541,8961,4.0
25000087,162541,33794,4.0
25000088,162541,41566,4.0
25000090,162541,50872,4.5


In [8]:
# Encoder between raw user ids and consecutive integer labels, plus
# shorthand callables for both directions.
user_le = LabelEncoder().fit(ratings['userId'])
userId_to_Label, userLabel_to_Id = user_le.transform, user_le.inverse_transform

# Same for movie ids.
movie_le = LabelEncoder().fit(ratings['movieId'])
movieId_to_Label, movieLabel_to_Id = movie_le.transform, movie_le.inverse_transform

In [9]:
# Add the encoded label columns next to the raw id columns.
for label_col, id_col, encode in (
    ('userLabel', 'userId', userId_to_Label),
    ('movieLabel', 'movieId', movieId_to_Label),
):
    ratings[label_col] = encode(ratings[id_col])

# `movie_ratings` is the name the remaining cells work with.
movie_ratings = ratings

ratings.head()

Unnamed: 0,userId,movieId,rating,userLabel,movieLabel
0,1,296,5.0,0,53
5,1,1088,4.0,0,144
9,1,1250,4.0,0,182
11,1,1653,4.0,0,232
12,1,2011,2.5,0,258


In [10]:
# movies = pd.read_csv(movies_csv)
# movies.head()

In [11]:
# def genres_list(genres:str):
#     return genres.split('|')

# movies['genres'] = movies.genres.apply(genres_list)
# movies.head()

In [12]:
# movie_ratings = ratings.merge(movies)
# del ratings, movies

# movie_ratings.userId.astype(str)
# movie_ratings.movieId.astype(str)

# movie_ratings.head()

In [13]:
# movie_ratings.astype({
#     'userId': str,
#     'movieId': str
# })

In [14]:
# movie_ratings.info()

In [15]:
# Number of distinct user / movie labels; these size the embedding tables.
n_users = movie_ratings["userLabel"].nunique()
n_movies = movie_ratings["movieLabel"].nunique()
(n_users, n_movies)

(161913, 500)

In [16]:
# Ratings as a column vector: one row per observation, trailing axis of size 1.
rating_values = movie_ratings['rating'].to_numpy()
target = torch.tensor(rating_values).unsqueeze(-1)
target

tensor([[5.0000],
        [4.0000],
        [4.0000],
        ...,
        [4.0000],
        [4.5000],
        [4.0000]], dtype=torch.float64)

In [17]:
# Integer index pairs: column 0 is the user label, column 1 the movie label.
label_columns = ["userLabel", "movieLabel"]
features = torch.tensor(movie_ratings[label_columns].to_numpy())
features

tensor([[     0,     53],
        [     0,    144],
        [     0,    182],
        ...,
        [161912,    431],
        [161912,    443],
        [161912,    453]])

In [18]:
# Bundle the (userLabel, movieLabel) index pairs with their ratings.
dataset = data_utils.TensorDataset(features, target)
# Note: the dataset keeps its own references to both tensors, so this only
# removes the notebook-level names; it does not free memory.
del features, target
total_obs = len(dataset)

# Hold out a fixed fraction of the observations for evaluation.
test_size = 0.2
test_obs = int(total_obs * test_size)
train_obs = total_obs - test_obs
split = [train_obs, test_obs]

# Seed the split so the same rows land in train/test on every run; with the
# global RNG the partition (and therefore every reported loss) changes between
# kernel restarts.
split_seed = 42
train_set, test_set = data_utils.random_split(
    dataset, split, generator=torch.Generator().manual_seed(split_seed)
)
len(train_set), len(test_set)

(8755790, 2188947)

In [19]:
bs = 50
# Training batches are reshuffled every epoch.
train_loader = DataLoader(train_set, batch_size=bs, shuffle=True)
# Evaluation gains nothing from shuffling: a fixed order keeps the batch
# composition (including a short final batch) identical between epochs, so
# validation losses are directly comparable.
test_loader = DataLoader(test_set, batch_size=bs, shuffle=False)

In [20]:
class EmbeddingNet(nn.Module):
    """Dot-product embedding model with per-user and per-movie bias terms.

    The prediction for a (user, movie) pair is the dot product of their
    factor vectors plus both biases, squashed into ``y_range`` by a sigmoid.

    Args:
        n_users: number of rows in the user embedding tables.
        n_movies: number of rows in the movie embedding tables.
        n_factors: length of each factor vector.
        y_range: ``(low, high)`` bounds of the predicted value.
    """

    def __init__(self, n_users, n_movies, n_factors = 10, y_range=(0.0,5.0)):
        super().__init__()
        self.user_factors = nn.Embedding(n_users, n_factors)
        self.user_bias = nn.Embedding(n_users, 1)
        self.movie_factors = nn.Embedding(n_movies, n_factors)
        self.movie_bias = nn.Embedding(n_movies, 1)
        self.y_range = y_range

        # Cast every parameter to float64 so outputs have the same dtype as
        # float64 targets passed to the loss.
        self.double()

    def sigmoid_range(self, x, low, high):
        """Map ``x`` through a sigmoid rescaled to lie between ``low`` and ``high``."""
        return torch.sigmoid(x) * (high - low) + low

    def forward(self, user_idx, movie_idx):
        """Predict a value for each (user, movie) index pair.

        Args:
            user_idx: integer tensor of user indices, any shape ``S``.
            movie_idx: integer tensor of movie indices, same shape as ``user_idx``.

        Returns:
            Tensor of shape ``S + (1,)`` with values bounded by ``y_range``.
        """
        users = self.user_factors(user_idx)
        movies = self.movie_factors(movie_idx)
        # The factor axis is always the last one, so reduce over dim=-1. This
        # gives the same result as dim=1 for 1-D index tensors and stays
        # correct for scalar or multi-dimensional index tensors.
        interaction = (users * movies).sum(dim=-1, keepdim=True)
        scores = interaction + self.user_bias(user_idx) + self.movie_bias(movie_idx)
        low, high = self.y_range
        return self.sigmoid_range(scores, low, high)

In [21]:
# training loop parameters
learning_rate = 1e-3
weight_decay = 3e-4

n_epochs = 30
best_loss = np.inf
best_weights = None

n_factors = 50
model = EmbeddingNet(n_users, n_movies, n_factors)

criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
scheduler = optim.lr_scheduler.LinearLR(optimizer, start_factor=0.5, total_iters=10)

In [22]:
# Display the module's repr to check the sizes of the embedding tables.
model

EmbeddingNet(
  (user_factors): Embedding(161913, 50)
  (user_bias): Embedding(161913, 1)
  (movie_factors): Embedding(500, 50)
  (movie_bias): Embedding(500, 1)
)

In [21]:
# Per-epoch metrics, stored as plain Python floats (losses) and lists of
# floats (learning rates) so they can be plotted or saved directly.
train_history = []
val_history = []
lr_history = []

best_loss = np.inf
for i in range(n_epochs):
    # ---- train phase ----
    model.train()
    train_loss_sum = 0.0
    n_train_obs = 0
    for features, targets in tqdm(train_loader, desc=f'Epoch {i+1}'):
        optimizer.zero_grad()
        output = model(features[:, 0], features[:, 1])
        loss = criterion(output, targets)

        loss.backward()
        optimizer.step()

        # `criterion` returns the batch mean; weight it by the batch size so a
        # short final batch does not skew the epoch average.
        batch_obs = targets.shape[0]
        train_loss_sum += loss.item() * batch_obs
        n_train_obs += batch_obs

    # Record the learning rate that was actually used during this epoch,
    # i.e. before the scheduler advances to the next epoch's value.
    lr_history.append(scheduler.get_last_lr())
    scheduler.step()

    train_loss = train_loss_sum / n_train_obs if n_train_obs else float('nan')
    train_history.append(train_loss)

    # ---- validation phase ----
    model.eval()
    val_loss_sum = 0.0
    n_val_obs = 0
    with torch.no_grad():
        for features, targets in test_loader:
            output = model(features[:, 0], features[:, 1])
            batch_obs = targets.shape[0]
            val_loss_sum += criterion(output, targets).item() * batch_obs
            n_val_obs += batch_obs
    # NaN (empty loader) never compares as "better", so it cannot become the
    # best checkpoint.
    val_loss = val_loss_sum / n_val_obs if n_val_obs else float('nan')
    val_history.append(val_loss)

    # Keep a copy of the weights with the lowest validation loss seen so far.
    if val_loss < best_loss:
        best_weights = copy.deepcopy(model.state_dict())
        best_loss = val_loss
    # Report the 1-based epoch number, matching the progress-bar label.
    print("Epoch {}, train_loss: {}, val_loss: {}"
          .format(i + 1, train_loss, val_loss))

Epoch 1:   0%|          | 0/400002 [00:00<?, ?it/s]

KeyboardInterrupt: 