Parameters in the model
Matrix R: containing the ratings 
Matrix U: Containing the embeddings of the users
Matrix M: Containing the embedding of the movies


In [117]:
import torch
import pandas as pd
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
import numpy as np
from scipy.sparse import rand as sprand
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
from torch.utils import data
import math

In [2]:
# The grand dataset
ratings_dataset = pd.read_csv('ratings_small.csv', usecols = ['userId','movieId','rating'])

#ratings

In [3]:
# Creating new indices

def proc_col(col, train_col=None):
    """Encodes a pandas column with continous ids. """
    if train_col is not None:
        uniq = train_col.unique()
    else:
        uniq = col.unique()
    name2idx = {o:i for i,o in enumerate(uniq)}
    return name2idx, np.array([name2idx.get(x, -1) for x in col]), len(uniq)

In [4]:
def encode_data(ratings_dataset, train=None):
    """ Encodes rating data with continous user and movie ids. 
    If train is provided, encodes df with the same encoding as train.
    """
    ratings_dataset = ratings_dataset.copy()
    for col_name in ["userId", "movieId"]:
        train_col = None
        if train is not None:
            train_col = train[col_name]
        _,col,_ = proc_col(ratings_dataset[col_name], train_col)
        ratings_dataset[col_name] = col
        ratings_dataset = ratings_dataset[ratings_dataset[col_name] >= 0]
    return ratings_dataset

In [5]:
ratings_dataset_e=encode_data(ratings_dataset)

In [6]:
# split train and validation before encoding
trn_len = ratings_dataset_e.shape[0]- int(0.3*ratings_dataset_e.shape[0])
val_len = int(0.2*ratings_dataset_e.shape[0])
test_len = int(0.1*ratings_dataset_e.shape[0])

train = ratings_dataset_e[:trn_len-1].copy()
val = ratings_dataset_e[trn_len:-test_len].copy()
test = ratings_dataset_e[trn_len:-val_len].copy()

print(len(train))
print(len(val))
print(len(test))

70002
20001
10001


In [7]:
train.shape[0]+ val.shape[0]+ test.shape[0] == ratings_dataset_e.shape[0]

True

In [8]:
ratings_dataset_e.max

<bound method DataFrame.max of         userId  movieId  rating
0            0        0     2.5
1            0        1     3.0
2            0        2     3.0
3            0        3     2.0
4            0        4     4.0
5            0        5     2.0
6            0        6     2.0
7            0        7     2.0
8            0        8     3.5
9            0        9     2.0
10           0       10     2.5
11           0       11     1.0
12           0       12     4.0
13           0       13     4.0
14           0       14     3.0
15           0       15     2.0
16           0       16     2.0
17           0       17     2.5
18           0       18     1.0
19           0       19     3.0
20           1       20     4.0
21           1       21     5.0
22           1       22     5.0
23           1       23     4.0
24           1       24     4.0
25           1       25     3.0
26           1       26     3.0
27           1       27     4.0
28           1       28     3.0
29       

In [9]:
import torch
from torch.utils import data

# Formatting the data 

class MovieLens(Dataset):
  #'Characterizes a dataset for PyTorch'
    def __init__(self,users,movies,ratings):
        'Initialization'
        self.movies=movies
        self.users=users
        self.ratings=ratings
        
    def __len__(self):
     #   'Denotes the total number of samples'
        return len(self.ratings)

    def __getitem__(self, index):
     #   'Generates one sample of data'
        # Select sample
        
        U = self.users[index]
        V =self.movies[index]
        y = self.ratings[index]
        #obs = {'movieId':movieId,'userId':userId,'rating':rating}
        #obs = self.movieLens.drop('timestamp',axis=1)
        #obs = obs.iloc[index,:].as_matrix()

        return [U,V,y]
    

In [10]:
users = torch.LongTensor(train.userId.values)
movies = torch.LongTensor(train.movieId.values)
ratings = torch.FloatTensor(train.rating.values)

train_dataset = MovieLens(users, movies, ratings)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=100, shuffle=True)

In [11]:
#Valid loader
users_val = torch.LongTensor(val.userId.values)
movies_val = torch.LongTensor(val.movieId.values)
ratings_val = torch.FloatTensor(val.rating.values)

val_dataset = MovieLens(users_val, movies_val, ratings_val)

val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=100, shuffle=True)



# test loader
users_test = torch.LongTensor(test.userId.values)
movies_test = torch.LongTensor(test.movieId.values)
ratings_test = torch.FloatTensor(test.rating.values)

test_dataset = MovieLens(users_test, movies_test, ratings_test)

test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=100, shuffle=True)




In [12]:
# Printing one batch from the train dataloader
t1= iter(train_loader)
next(t1)


[tensor([ 287,   41,   59,   64,  293,  204,  471,  343,  198,   29,
           22,  312,  210,  305,  305,  439,   89,  370,  383,  101,
          261,  451,  231,  460,  307,  199,   14,  175,  101,   55,
          427,  351,  125,  267,  298,  354,  460,  265,  284,  387,
          471,   72,  191,   82,  415,  404,   48,   16,  184,  158,
          305,  449,  345,  291,  467,  188,   14,  298,  452,  352,
          387,  266,  305,  467,  231,  127,  412,  387,   95,   77,
           91,  160,  291,   92,  235,  383,  467,  387,  479,  151,
          469,  247,  344,  110,  380,   94,  212,  198,  478,  343,
          293,  305,   18,  345,  293,  446,  137,   82,  456,   62]),
 tensor([   56,   106,  1289,  2990,  3385,  2397,   658,    78,   487,
          2873,  2335,  4670,  5265,   714,   535,    71,   512,  1536,
          1240,    79,  1107,  3098,    24,   151,    21,   131,   782,
          1502,   243,  2309,  1038,   344,    70,   111,  6254,   202,
           860,    6

In [146]:
class Model(nn.Module):
    def __init__(self, num_user, num_movie, emb_size=100):
        super(Model, self).__init__()
        self.userEmb = nn.Embedding(num_user, emb_size)
        self.movieEmb = nn.Embedding(num_movie, emb_size)
        
        
        
    def forward(self, u, v):
        U = self.userEmb(u)
        V = self.movieEmb(v)
        r_max=max(ratings_dataset.rating)
        r_min=min(ratings_dataset.rating)
        x = F.sigmoid((U*V).sum(1))*(r_max-r_min)+r_min
        #x = (F.sigmoid((U*V).sum(1))*(r_max-r_min)+r_min)*0+2.7  #Test for randomness 

        
        return x

In [147]:
num_user = len(ratings_dataset.userId.unique()) 
num_movie = len(ratings_dataset.movieId.unique())

In [148]:
emb_size = 100
model=Model(num_user,num_movie,emb_size)
model.parameters()

<generator object Module.parameters at 0x11f0d38e0>

In [149]:
[p for p in model.parameters()]

[Parameter containing:
 tensor([[ 9.4142e-01, -4.2587e-01, -1.2396e+00,  ..., -6.9094e-01,
          -2.2144e-01, -1.3675e+00],
         [ 1.7064e+00, -3.9091e-01,  1.7707e+00,  ...,  1.1880e-01,
          -1.3292e+00,  4.0134e-01],
         [ 2.7273e-01,  5.2757e-01,  3.0223e-01,  ..., -1.4078e+00,
          -8.8698e-01,  1.4852e+00],
         ...,
         [-3.3048e-01, -5.2376e-01,  1.5771e+00,  ...,  1.3146e+00,
          -6.9357e-01, -7.4196e-01],
         [-8.0809e-01, -2.6729e-01,  1.4588e+00,  ...,  5.1459e-01,
          -3.8066e-01,  5.6025e-01],
         [-9.0032e-02,  8.0352e-01,  5.9829e-01,  ...,  1.6233e+00,
          -9.1059e-01, -2.1023e-01]]), Parameter containing:
 tensor([[-5.5235e-01,  1.1528e+00,  7.5984e-01,  ...,  5.6182e-01,
          -1.4772e+00,  5.6303e-01],
         [-6.0080e-01,  1.3087e+00,  4.1121e-02,  ..., -2.0679e+00,
           9.1013e-01, -8.2162e-01],
         [ 1.4930e+00,  2.2936e+00,  1.4063e+00,  ..., -3.8578e-01,
          -1.0713e+00,  2.0182e

In [150]:
def test_loss(model):
    model.eval() # go to evaluation mode
    
    running_loss = 0.
    for j, data in enumerate(val_loader):
        users, items, ratings = data
        users = Variable(users)
        items = Variable(items)
        ratings = Variable(ratings).float()
            
       
        y_hat = model(users,items)
        loss_now = F.mse_loss(y_hat, ratings)
        running_loss+= loss_now.item()
        
    print("validation loss", ": ", running_loss/len(val_loader)) # j means this many iterations till end

In [151]:
def train_epocs(model, epochs=10, lr=0.01, wd=0.0):
    parameters = filter(lambda p: p.requires_grad, model.parameters()) # get all parameters which need grad
    optimizer = torch.optim.Adam(parameters, lr=lr, weight_decay=wd)
    model.train() # into training mode
    for i in range(epochs):
        running_loss = 0.0
        
        for j, data in enumerate(train_loader):
            users, items, ratings = data
            users = Variable(users)
            items = Variable(items)
            ratings = Variable(ratings).float()
        

            y_hat = model(users,items)
            loss = F.mse_loss(y_hat, ratings)
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        print("training loss for epoch ",i+1, ": ", running_loss/j+1) # used to be loss.data[0]
    test_loss(model)

In [152]:
train_epocs(model, epochs=10, lr=0.01)

training loss for epoch  1 :  6.996513071911676
training loss for epoch  2 :  6.0784609825270515
training loss for epoch  3 :  5.567420956066677
training loss for epoch  4 :  5.22586089606796
training loss for epoch  5 :  4.904556610839707
training loss for epoch  6 :  4.615669218472072
training loss for epoch  7 :  4.339291962385177
training loss for epoch  8 :  4.155947249276297
training loss for epoch  9 :  4.013659203563417
training loss for epoch  10 :  3.917520156928471
validation loss :  6.164130286790838


training loss for epoch  1 :  tensor(79.6216)
training loss for epoch  2 :  tensor(16.7696)
validation loss :  tensor(98.1841)


In [153]:
train_epocs(model, epochs=5, lr=0.001, wd=1e-5)

training loss for epoch  1 :  3.8320383638995033
training loss for epoch  2 :  3.6992450504643575
training loss for epoch  3 :  3.5054739369664873
training loss for epoch  4 :  3.2740686551162175
training loss for epoch  5 :  3.103051311118262
validation loss :  1.9451256730959783


In [24]:
train_epocs(model, epochs=5, lr=0.0001, wd=1e-6) # lower learning rate # lower regularization

training loss for epoch  1 :  1.1302297324527588
training loss for epoch  2 :  1.1195393176509865
training loss for epoch  3 :  1.1111403022387198
training loss for epoch  4 :  1.1041365050551082
training loss for epoch  5 :  1.0981343592330814
validation loss :  14.090488851366944
