In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
import torch
import torch.nn as nn
import torch.nn.functional as F

The MovieLens dataset (ml-latest-small) describes 5-star rating and free-text tagging activity from MovieLens, a movie recommendation service. It contains 100004 ratings and 1296 tag applications across 9125 movies. https://grouplens.org/datasets/movielens/.

# MovieLens Dataset

In [2]:
PATH = Path("ml-latest-small")
list(PATH.iterdir())

[PosixPath('ml-latest-small/links.csv'),
 PosixPath('ml-latest-small/tags.csv'),
 PosixPath('ml-latest-small/ratings.csv'),
 PosixPath('ml-latest-small/README.txt'),
 PosixPath('ml-latest-small/movies.csv')]

In [3]:
data = pd.read_csv(PATH/"ratings.csv")
data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
#Split train and validation set
cutoff = np.quantile(data.timestamp.values, 0.8)
print(cutoff)
df_train = data[data.timestamp<cutoff].copy()
df_val = data[data.timestamp>=cutoff].copy()

1458635171.0


In [5]:
df_train

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
99529,609,892,3.0,847221080
99530,609,1056,3.0,847221080
99531,609,1059,3.0,847221054
99532,609,1150,4.0,847221054


# Encode Data

In [6]:
def proc_col(col):
    """
    Encodes a pandas column with values between 0 and n-1.
    where n = number of unique values
    """
    uniq = col.unique()
    name2idx = {o:i for i,o in enumerate(uniq)}
    return name2idx, np.array([name2idx[x] for x in col]), len(uniq)

In [7]:
def encode_data(df_train):
    """
    Encodes rating data with continous user and movie ids using 
    the helpful fast.ai function from above.
    
    Inputs:
      train_csv: a csv file with columns user_id,movie_id,rating 
    
    Returns:
      df: a dataframe with the encode data
      num_users
      num_movies    
    """
    num_users = proc_col(df_train.userId)[2]
    num_movies = proc_col(df_train.movieId)[2]

    df_train["userId"] = proc_col(df_train.userId)[1]
    df_train["movieId"] = proc_col(df_train.movieId)[1]
    return df_train, num_users, num_movies

In [8]:
def encode_valid(df_val, df_train):
    """ 
    Encodes df_val with the same encoding as df_train.
    
    Returns:
    df_val: dataframe with the same encoding as df_train
    """
    name2idx_user = proc_col(df_train.userId)[0]
    name2idx_movie = proc_col(df_train.movieId)[0]

    df_val["userId"] = np.array([name2idx_user.get(x,-1) for x in df_val.userId])
    df_val["movieId"] = np.array([name2idx_movie.get(x,-1) for x in df_val.movieId])
    df_val = df_val[df_val.userId!=-1][df_val.movieId!=-1]
    return df_val

In [9]:
df, num_users, num_movies = encode_data(df_train)
print(f'num_users : {num_users}')
print(f'num_movies : {num_movies}')
df

num_users : 522
num_movies : 7867


Unnamed: 0,userId,movieId,rating,timestamp
0,0,0,4.0,964982703
1,0,1,4.0,964981247
2,0,2,4.0,964982224
3,0,3,5.0,964983815
4,0,4,5.0,964982931
...,...,...,...,...
99529,521,328,3.0,847221080
99530,521,2314,3.0,847221080
99531,521,1103,3.0,847221054
99532,521,6257,4.0,847221054


In [10]:
df_val = encode_valid(df_val, df_train)
df_val

  df_val = df_val[df_val.userId!=-1][df_val.movieId!=-1]


Unnamed: 0,userId,movieId,rating,timestamp
1434,15,1,2.5,1510577970
1436,15,47,3.5,1510571970
1440,15,260,5.0,1510571946
1441,15,293,3.0,1510571962
1442,15,296,4.0,1510571877
...,...,...,...,...
81964,519,4447,3.5,1518478841
81965,519,4963,4.0,1518478565
81966,519,5989,4.0,1518379967
81967,519,6378,4.0,1518478826


# Embedding Layers

In [11]:
K = 5
emb_user = nn.Embedding(num_users, K)
print(emb_user.weight)
emb_movie = nn.Embedding(num_movies, K)
print(emb_movie.weight)

Parameter containing:
tensor([[ 0.4284,  1.5713,  1.4015,  0.4404, -0.7480],
        [-0.2484,  0.0766,  0.3217, -1.1611,  0.9414],
        [-1.4290,  0.1911, -0.9322, -0.2163, -0.1046],
        ...,
        [-0.8933, -0.1744, -0.5780,  0.8272, -1.1731],
        [ 0.0166,  0.6908,  0.7352,  0.3467,  0.2442],
        [-1.3345,  0.4457,  0.9240, -0.8816,  0.9973]], requires_grad=True)
Parameter containing:
tensor([[-2.0865,  0.3503,  0.5092,  0.5147,  0.5890],
        [-0.2526,  0.0429, -0.9205, -0.5692,  0.9632],
        [-1.3464, -0.6203,  0.3141,  1.7333, -0.2812],
        ...,
        [-0.3597,  0.8787,  2.0811, -0.0090,  1.2414],
        [ 0.4437, -0.7448, -0.9028, -0.1094,  2.4766],
        [-0.4268,  1.8368, -0.9168, -0.7838, -1.5734]], requires_grad=True)


# ML Model

In [12]:
class MF(nn.Module):
    def __init__(self, num_users, num_items, emb_size=100):
        super(MF, self).__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)
        # initlializing weights
        self.user_emb.weight.data.uniform_(0,0.5)
        self.item_emb.weight.data.uniform_(0,0.5)
        
    def forward(self, u, v):
        u = self.user_emb(u)
        v = self.item_emb(v)
        return (u*v).sum(1)   

In [13]:
model = MF(num_users, num_movies, emb_size=100) 
model

MF(
  (user_emb): Embedding(522, 100)
  (item_emb): Embedding(7867, 100)
)

# Train Model

In [14]:
def train_epocs(model, df_train, df_val, epochs=10, lr=0.01, wd=0.0):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
    for i in range(epochs):
        model.train()
        users = torch.LongTensor(df_train.userId.values) 
        items = torch.LongTensor(df_train.movieId.values)
        ratings = torch.FloatTensor(df_train.rating.values)
    
        y_hat = model(users, items)
        loss = F.mse_loss(y_hat, ratings)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        testloss = valid_loss(model,df_val)
        print("train loss %.3f valid loss %.3f" % (loss.item(), testloss)) 

In [15]:
def valid_loss(model,df_val):
    model.eval()
    users = torch.LongTensor(df_val.userId.values) 
    items = torch.LongTensor(df_val.movieId.values) 
    ratings = torch.FloatTensor(df_val.rating.values) 
    y_hat = model(users, items)
    loss = F.mse_loss(y_hat, ratings)
    return loss.item()

In [16]:
train_epocs(model, df_train, df_val, epochs=20, lr=0.001, wd=1e-5)

train loss 8.627 valid loss 8.110
train loss 8.360 valid loss 7.854
train loss 8.099 valid loss 7.605
train loss 7.845 valid loss 7.362
train loss 7.597 valid loss 7.125
train loss 7.355 valid loss 6.894
train loss 7.119 valid loss 6.670
train loss 6.890 valid loss 6.452
train loss 6.667 valid loss 6.240
train loss 6.450 valid loss 6.035
train loss 6.239 valid loss 5.835
train loss 6.035 valid loss 5.642
train loss 5.836 valid loss 5.455
train loss 5.644 valid loss 5.274
train loss 5.458 valid loss 5.099
train loss 5.277 valid loss 4.930
train loss 5.103 valid loss 4.767
train loss 4.934 valid loss 4.609
train loss 4.771 valid loss 4.457
train loss 4.614 valid loss 4.311
