In [19]:
import pandas as pd 
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

## Encode Train and Validation Data

In [13]:
# Read the gzip-compressed train and validation CSVs from the data folder
PATH = './proc_data/'
train, val = pd.read_csv(PATH + "train.csv.gz"), pd.read_csv(PATH + "val.csv.gz")

In [20]:
# here is a handy function modified from fast.ai
def proc_col(col, train_col=None):
    """Map the values of a pandas column to contiguous integer ids.

    The id vocabulary is built from `train_col` when it is given, otherwise
    from `col` itself. Entries of `col` that are missing from the vocabulary
    are encoded as -1.

    Returns a tuple `(name2idx, encoded, n_unique)`: the value -> id dict,
    a numpy array holding the id of every entry of `col`, and the number of
    distinct values in the vocabulary.
    """
    vocab_source = col if train_col is None else train_col
    uniq = vocab_source.unique()
    name2idx = dict(zip(uniq, range(len(uniq))))
    encoded = np.array([name2idx.get(value, -1) for value in col])
    return name2idx, encoded, len(uniq)

In [21]:
def encode_data(df, train=None):
    """Encode rating data with contiguous user and movie ids.

    If `train` is provided, `df` is encoded with the same id vocabulary as
    `train`, and rows whose user or movie does not occur in `train` are
    dropped. Without `train`, the vocabulary is built from `df` itself.

    Returns a new DataFrame; the frame passed in as `df` is not modified.
    """
    df = df.copy()
    for col_name in ["user_id", "movie_id"]:
        train_col = train[col_name] if train is not None else None
        _, col, _ = proc_col(df[col_name], train_col)
        df[col_name] = col
        # proc_col encodes values missing from the vocabulary as -1; drop them.
        # The explicit copy makes the filtered frame independent, so the column
        # assignment in the next iteration (and in callers) does not write into
        # a slice and trigger pandas' SettingWithCopyWarning.
        df = df[df[col_name] >= 0].copy()
    return df

In [22]:
# Encode the training ratings; with no reference frame the id vocabulary is built from `train` itself
train_enc = encode_data(train)

In [32]:
# Preview the first rows of the encoded training frame
train_enc.head()

Unnamed: 0,user_id,movie_id,rating
0,0,0,3
1,1,0,5
2,2,0,1
3,3,0,3
4,4,0,4


In [23]:
# Encode the validation ratings with the training vocabulary; rows with users/movies unseen in `train` are dropped
val_enc = encode_data(val, train)

In [33]:
# Preview the first rows of the encoded validation frame
val_enc.head()

Unnamed: 0,user_id,movie_id,rating
0,10669,0,4
2,57243,0,4
3,21505,0,4
4,10201,0,4
5,30056,0,5


## Load with torch

In [34]:
batch_size = 50
# DataLoader fetches samples by integer position, but `DataFrame[i]` is a column
# lookup, so passing the frames directly fails as soon as a batch is requested.
# Wrap the columns in tensors instead: every batch is a (users, items, ratings) triple.
train_ds = torch.utils.data.TensorDataset(
    torch.LongTensor(train_enc['user_id'].values),
    torch.LongTensor(train_enc['movie_id'].values),
    torch.FloatTensor(train_enc['rating'].values),
)
val_ds = torch.utils.data.TensorDataset(
    torch.LongTensor(val_enc['user_id'].values),
    torch.LongTensor(val_enc['movie_id'].values),
    torch.FloatTensor(val_enc['rating'].values),
)
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
# for test we use shuffle=False
test_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False)

## Base Model - Matrix factorization model

In [53]:
class MF(nn.Module):
    """Matrix-factorization model: the prediction for a (user, item) pair is
    the dot product of the user's and the item's embedding vectors."""

    def __init__(self, num_users, num_items, emb_size=100):
        super().__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)
        # overwrite the default init with small non-negative weights
        for table in (self.user_emb, self.item_emb):
            table.weight.data.uniform_(0, 0.05)

    def forward(self, u, v):
        """Return one prediction per (u[i], v[i]) pair of index tensors."""
        user_vecs = self.user_emb(u)
        item_vecs = self.item_emb(v)
        return torch.sum(user_vecs * item_vecs, dim=1)

In [54]:
# Number of distinct users / movies in the training split (used to size the embedding tables)
num_users, num_items = (len(train[name].unique()) for name in ('user_id', 'movie_id'))
print(num_users, num_items)

232624 17454


In [62]:
def train_epocs(model, epochs=10, lr=0.01, wd=0.0, unsqueeze=False):
    """Train `model` on the encoded training ratings, then print the validation loss.

    Every "epoch" is a single full-batch Adam step on the MSE between the
    model's predictions and the ratings in `train_enc`.

    Args:
        model: module called as `model(users, items)` with two index tensors.
        epochs: number of full-batch optimisation steps.
        lr: Adam learning rate.
        wd: Adam weight decay.
        unsqueeze: if True, reshape the ratings from (N,) to (N, 1) so they
            match a model whose output has a trailing dimension of size 1.
    """
    parameters = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.Adam(parameters, lr=lr, weight_decay=wd)
    model.train()

    # The training data does not change between epochs, so build the tensors
    # once instead of converting the DataFrame columns on every iteration.
    users = torch.LongTensor(train_enc['user_id'].values)  #.cuda()
    items = torch.LongTensor(train_enc['movie_id'].values) #.cuda()
    ratings = torch.FloatTensor(train_enc['rating'].values)  #.cuda()
    if unsqueeze:
        # add a trailing dimension so y_hat and the ratings have the same shape for the loss
        ratings = ratings.unsqueeze(1)

    for i in range(epochs):
        y_hat = model(users, items)
        loss = F.mse_loss(y_hat, ratings)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        print('loss after epoch %d: %.3f' %((i+1), loss.item())) # used to be loss.data[0]
    test_loss(model, unsqueeze)

In [63]:
def test_loss(model, unsqueeze=False):
    """Print the MSE of `model` on the encoded validation ratings (`val_enc`).

    Switches the model to eval mode (and leaves it there).

    Args:
        model: module called as `model(users, items)` with two index tensors.
        unsqueeze: if True, reshape the ratings from (N,) to (N, 1) so they
            match a model whose output has a trailing dimension of size 1.
    """
    model.eval()
    users = torch.LongTensor(val_enc['user_id'].values) # .cuda()
    items = torch.LongTensor(val_enc['movie_id'].values) #.cuda()
    ratings = torch.FloatTensor(val_enc['rating'].values) #.cuda()
    if unsqueeze:
        ratings = ratings.unsqueeze(1)
    # Evaluation needs no gradients; skipping autograd avoids building a graph
    # over the whole validation set.
    with torch.no_grad():
        y_hat = model(users, items)
        loss = F.mse_loss(y_hat, ratings)
    print("test loss %.3f " % loss.item())

In [59]:
# Fresh matrix-factorization model with 100-dimensional user and item embeddings
model = MF(num_users, num_items, emb_size=100)

In [58]:
# 10 full-batch steps with learning rate 0.1; the validation loss is printed at the end
train_epocs(model, epochs=10, lr=0.1)

loss after epoch 1: 14.152
loss after epoch 2: 6.686
loss after epoch 3: 1.685
loss after epoch 4: 5.869
loss after epoch 5: 1.596
loss after epoch 6: 1.497
loss after epoch 7: 3.145
loss after epoch 8: 3.583
loss after epoch 9: 2.653
loss after epoch 10: 1.329
test loss 2.225 


In [None]:
# Re-create the model so the next run starts from freshly initialised weights
model = MF(num_users, num_items, emb_size=100)

In [64]:
# 10 full-batch steps with a smaller learning rate (0.05)
train_epocs(model, epochs=10, lr=0.05)

loss after epoch 1: 14.152
loss after epoch 2: 11.410
loss after epoch 3: 6.662
loss after epoch 4: 2.419
loss after epoch 5: 1.648
loss after epoch 6: 4.180
loss after epoch 7: 3.971
loss after epoch 8: 2.136
loss after epoch 9: 0.995
loss after epoch 10: 1.080
test loss 1.772 


In [67]:
# Re-create the model so the next run starts from freshly initialised weights
model = MF(num_users, num_items, emb_size=100)

In [68]:
# 15 full-batch steps with learning rate 0.01
train_epocs(model, epochs=15, lr=0.01)

loss after epoch 1: 14.155
loss after epoch 2: 14.035
loss after epoch 3: 13.734
loss after epoch 4: 12.867
loss after epoch 5: 11.892
loss after epoch 6: 11.262
loss after epoch 7: 10.322
loss after epoch 8: 9.064
loss after epoch 9: 8.184
loss after epoch 10: 7.156
loss after epoch 11: 5.965
loss after epoch 12: 4.935
loss after epoch 13: 3.942
loss after epoch 14: 3.010
loss after epoch 15: 2.256
test loss 2.172 


## MF with bias

In [69]:
class MF_bias(nn.Module):
    """Matrix factorization with additive bias terms: the prediction is the
    dot product of the user and item embeddings plus a per-user and a
    per-item scalar bias."""

    def __init__(self, num_users, num_items, emb_size=100):
        super().__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.user_bias = nn.Embedding(num_users, 1)
        self.item_emb = nn.Embedding(num_items, emb_size)
        self.item_bias = nn.Embedding(num_items, 1)
        # init: small non-negative factors first, then biases centred on zero
        for factors in (self.user_emb, self.item_emb):
            factors.weight.data.uniform_(0, 0.05)
        for offsets in (self.user_bias, self.item_bias):
            offsets.weight.data.uniform_(-0.01, 0.01)

    def forward(self, u, v):
        """Return one prediction per (u[i], v[i]) pair of index tensors."""
        interaction = torch.sum(self.user_emb(u) * self.item_emb(v), dim=1)
        # the bias tables have embedding size 1; squeeze drops that axis
        user_offset = torch.squeeze(self.user_bias(u))
        item_offset = torch.squeeze(self.item_bias(v))
        return interaction + user_offset + item_offset

In [72]:
# Matrix-factorization model with bias terms, 100-dimensional embeddings
model_bias = MF_bias(num_users, num_items, emb_size=100) #.cuda()

In [73]:
# 20 full-batch steps with learning rate 0.01 and weight decay 1e-5
train_epocs(model_bias, epochs=20, lr=0.01, wd=1e-5)

loss after epoch 1: 14.155
loss after epoch 2: 13.912
loss after epoch 3: 13.563
loss after epoch 4: 12.325
loss after epoch 5: 11.598
loss after epoch 6: 10.790
loss after epoch 7: 9.655
loss after epoch 8: 8.630
loss after epoch 9: 7.738
loss after epoch 10: 6.433
loss after epoch 11: 5.541
loss after epoch 12: 4.512
loss after epoch 13: 3.595
loss after epoch 14: 2.795
loss after epoch 15: 2.177
loss after epoch 16: 1.726
loss after epoch 17: 1.469
loss after epoch 18: 1.396
loss after epoch 19: 1.442
loss after epoch 20: 1.560
test loss 2.581 


## Neural Network Model

In [74]:
# Note: the user and item embeddings are concatenated rather than multiplied together,
# so the two embeddings could potentially have different sizes.
# We could probably get better results by continuing to tune the regularization.
    
class CollabFNet(nn.Module):
    """Feed-forward collaborative-filtering network: the user and item
    embeddings are concatenated and passed through one hidden linear layer
    (ReLU + dropout) to a single output unit."""

    def __init__(self, num_users, num_items, emb_size=100, n_hidden=10):
        super().__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)
        # the hidden layer sees both embeddings side by side, hence 2 * emb_size inputs
        self.lin1 = nn.Linear(emb_size*2, n_hidden)
        self.lin2 = nn.Linear(n_hidden, 1)
        self.drop1 = nn.Dropout(0.1)
        self.drop2 = nn.Dropout(0.1)

    def forward(self, u, v):
        """Return predictions with a trailing dimension of size 1 for index tensors u, v."""
        joint = torch.cat([self.user_emb(u), self.item_emb(v)], dim=1)
        hidden = self.drop1(F.relu(joint))
        hidden = self.drop2(F.relu(self.lin1(hidden)))
        return self.lin2(hidden)

In [75]:
# Neural-network model with 100-dimensional embeddings and the default hidden size
model_nn = CollabFNet(num_users, num_items, emb_size=100) #.cuda()

In [77]:
# unsqueeze=True: CollabFNet ends in a Linear(n_hidden, 1) layer, so the ratings need a matching trailing dimension
train_epocs(model_nn, epochs=20, lr=0.01, wd=1e-5, unsqueeze=True) 

loss after epoch 1: 15.206
loss after epoch 2: 12.686
loss after epoch 3: 10.734
loss after epoch 4: 8.810
loss after epoch 5: 7.016
loss after epoch 6: 5.400
loss after epoch 7: 3.976
loss after epoch 8: 2.899
loss after epoch 9: 2.206
loss after epoch 10: 1.941
loss after epoch 11: 2.083
loss after epoch 12: 2.478
loss after epoch 13: 2.904
loss after epoch 14: 3.169
loss after epoch 15: 3.198
loss after epoch 16: 3.025
loss after epoch 17: 2.737
loss after epoch 18: 2.428
loss after epoch 19: 2.162
loss after epoch 20: 1.979
test loss 1.227 
