Parameters in the model
Matrix R: containing the ratings 
Matrix U: Containing the embeddings of the users
Matrix M: Containing the embedding of the movies


In [1]:
import torch
import pandas as pd
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
import numpy as np
from scipy.sparse import rand as sprand
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
from torch.utils import data


In [2]:
# The grand dataset
ratings_dataset = pd.read_csv('ratings_small.csv', usecols = ['userId','movieId','rating'])

#ratings

In [3]:
# Creating new indices

def proc_col(col, train_col=None):
    """Encodes a pandas column with continous ids. """
    if train_col is not None:
        uniq = train_col.unique()
    else:
        uniq = col.unique()
    name2idx = {o:i for i,o in enumerate(uniq)}
    return name2idx, np.array([name2idx.get(x, -1) for x in col]), len(uniq)

In [4]:
def encode_data(ratings_dataset, train=None):
    """ Encodes rating data with continous user and movie ids. 
    If train is provided, encodes df with the same encoding as train.
    """
    ratings_dataset = ratings_dataset.copy()
    for col_name in ["userId", "movieId"]:
        train_col = None
        if train is not None:
            train_col = train[col_name]
        _,col,_ = proc_col(ratings_dataset[col_name], train_col)
        ratings_dataset[col_name] = col
        ratings_dataset = ratings_dataset[ratings_dataset[col_name] >= 0]
    return ratings_dataset

In [5]:
ratings_dataset_e=encode_data(ratings_dataset)

In [6]:
# split train and validation before encoding
trn_len = ratings_dataset_e.shape[0]- int(0.3*ratings_dataset_e.shape[0])
val_len = int(0.2*ratings_dataset_e.shape[0])
test_len = int(0.1*ratings_dataset_e.shape[0])

train = ratings_dataset_e[:trn_len-1].copy()
val = ratings_dataset_e[trn_len:-test_len].copy()
test = ratings_dataset_e[trn_len:-val_len].copy()

print(len(train))
print(len(val))
print(len(test))

70002
20001
10001


In [7]:
train.shape[0]+ val.shape[0]+ test.shape[0] == ratings_dataset_e.shape[0]

True

In [8]:
ratings_dataset_e.max

<bound method DataFrame.max of         userId  movieId  rating
0            0        0     2.5
1            0        1     3.0
2            0        2     3.0
3            0        3     2.0
4            0        4     4.0
5            0        5     2.0
6            0        6     2.0
7            0        7     2.0
8            0        8     3.5
9            0        9     2.0
10           0       10     2.5
11           0       11     1.0
12           0       12     4.0
13           0       13     4.0
14           0       14     3.0
15           0       15     2.0
16           0       16     2.0
17           0       17     2.5
18           0       18     1.0
19           0       19     3.0
20           1       20     4.0
21           1       21     5.0
22           1       22     5.0
23           1       23     4.0
24           1       24     4.0
25           1       25     3.0
26           1       26     3.0
27           1       27     4.0
28           1       28     3.0
29       

In [9]:
import torch
from torch.utils import data

# Formatting the data 

class MovieLens(Dataset):
  #'Characterizes a dataset for PyTorch'
    def __init__(self,users,movies,ratings):
        'Initialization'
        self.movies=movies
        self.users=users
        self.ratings=ratings
        
    def __len__(self):
     #   'Denotes the total number of samples'
        return len(self.ratings)

    def __getitem__(self, index):
     #   'Generates one sample of data'
        # Select sample
        
        U = self.users[index]
        V =self.movies[index]
        y = self.ratings[index]
        #obs = {'movieId':movieId,'userId':userId,'rating':rating}
        #obs = self.movieLens.drop('timestamp',axis=1)
        #obs = obs.iloc[index,:].as_matrix()

        return [U,V,y]
    

In [10]:
users = torch.LongTensor(train.userId.values)
movies = torch.LongTensor(train.movieId.values)
ratings = torch.FloatTensor(train.rating.values)

train_dataset = MovieLens(users, movies, ratings)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=100, shuffle=True)

In [11]:
#Valid loader
users_val = torch.LongTensor(val.userId.values)
movies_val = torch.LongTensor(val.movieId.values)
ratings_val = torch.FloatTensor(val.rating.values)

val_dataset = MovieLens(users_val, movies_val, ratings_val)

val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=100, shuffle=True)



# test loader
users_test = torch.LongTensor(test.userId.values)
movies_test = torch.LongTensor(test.movieId.values)
ratings_test = torch.FloatTensor(test.rating.values)

test_dataset = MovieLens(users_test, movies_test, ratings_test)

test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=100, shuffle=True)




In [12]:
# Printing one batch from the train dataloader
t1= iter(train_loader)
next(t1)


[tensor([ 164,   14,  176,  471,  400,  430,   47,  325,  184,  293,
          354,  214,  129,  262,  303,  249,  274,  401,  405,  357,
          208,  370,  202,  455,  366,  251,  101,  104,  291,  410,
          305,  110,  467,  270,  295,  118,   14,  426,   21,  241,
          314,  246,   54,  375,  239,  193,  381,   72,  383,   22,
          459,   59,  256,  220,  440,  127,  426,  299,  343,  462,
           82,  433,   14,  211,  413,  384,  422,  176,  262,  229,
          184,  109,  430,  152,  304,  261,  310,  198,  368,  479,
           55,  318,  479,  293,  480,  383,  451,  422,  252,   20,
           11,  310,  129,  239,  452,  189,  219,   72,   76,   98]),
 tensor([ 2128,   630,   798,   700,   581,   224,  3182,    96,   435,
          4645,   732,   435,   240,  4204,   323,  1512,  1757,  4749,
          4168,    81,   672,  1516,   160,   202,  1020,   417,   953,
           362,   495,    79,  2073,    23,   870,  2075,   313,  3657,
           773,  707

In [13]:
class Model(nn.Module):
    def __init__(self, num_user, num_movie, emb_size=100):
        super(Model, self).__init__()
        self.userEmb = nn.Embedding(num_user, emb_size)
        self.movieEmb = nn.Embedding(num_movie, emb_size)
        
        
    def forward(self, u, v):
        U = self.userEmb(u)
        V = self.movieEmb(v)
        return (U*V).sum(1)

In [14]:
num_user = len(ratings_dataset.userId.unique()) 
num_movie = len(ratings_dataset.movieId.unique())

In [15]:
emb_size = 100
model=Model(num_user,num_movie,emb_size)
model.parameters()

<generator object Module.parameters at 0x106d734c0>

In [16]:
[p for p in model.parameters()]

[Parameter containing:
 tensor([[-8.3762e-01,  4.0872e-01, -1.1977e+00,  ...,  7.5657e-01,
           7.2198e-01, -1.9205e+00],
         [ 6.3992e-02, -5.2978e-02,  9.9496e-01,  ...,  2.3100e-01,
          -4.1503e-01,  2.0637e+00],
         [ 5.2997e-01,  6.1431e-01, -3.1933e-01,  ..., -1.1856e+00,
          -1.2023e+00, -2.7081e+00],
         ...,
         [ 1.1961e+00, -1.2884e-01,  1.6270e+00,  ..., -3.0326e-01,
           1.2082e-01,  7.7062e-02],
         [-1.6689e+00, -1.0780e-01,  7.0758e-02,  ..., -1.4046e+00,
           1.0838e+00, -1.2293e+00],
         [ 3.0049e-01, -8.4128e-01,  1.2243e+00,  ...,  1.7267e+00,
          -3.7154e-01, -5.2564e-02]]), Parameter containing:
 tensor([[ 3.6193e-01, -4.2789e-01, -3.3172e-01,  ..., -1.5869e+00,
           6.1288e-01,  1.3990e-01],
         [-1.1924e+00, -1.0613e+00,  3.7470e-01,  ...,  4.4404e-01,
           1.4038e+00,  9.8005e-01],
         [-7.3969e-01,  1.6161e+00, -1.1826e+00,  ..., -2.9731e-01,
           1.5401e+00,  3.1059e

In [17]:
def test_loss(model):
    model.eval() # go to evaluation mode
    
    running_loss = 0.
    for j, data in enumerate(val_loader):
        users, items, ratings = data
        users = Variable(users)
        items = Variable(items)
        ratings = Variable(ratings).float()
            
       
        y_hat = model(users,items)
        loss_now = F.mse_loss(y_hat, ratings)
        running_loss+= loss_now.data[0]
        
    print("validation loss", ": ", running_loss/len(val_loader)) # j means this many iterations till end

In [18]:
def train_epocs(model, epochs=10, lr=0.01, wd=0.0):
    parameters = filter(lambda p: p.requires_grad, model.parameters()) # get all parameters which need grad
    optimizer = torch.optim.Adam(parameters, lr=lr, weight_decay=wd)
    model.train() # into training mode
    for i in range(epochs):
        running_loss = 0.0
        
        for j, data in enumerate(train_loader):
            users, items, ratings = data
            users = Variable(users)
            items = Variable(items)
            ratings = Variable(ratings).float()
        

            y_hat = model(users,items)
            loss = F.mse_loss(y_hat, ratings)
            
            #optimizer.zero_grad()
            #loss.backward()
            #optimizer.step()
            running_loss += loss.data[0]
        print("training loss for epoch ",i+1, ": ", running_loss/j+1) # used to be loss.data[0]
    test_loss(model)

In [19]:
train_epocs(model, epochs=10, lr=0.01)



training loss for epoch  1 :  tensor(116.0545)
training loss for epoch  2 :  tensor(116.2166)
training loss for epoch  3 :  tensor(116.0447)
training loss for epoch  4 :  tensor(116.0785)
training loss for epoch  5 :  tensor(116.2470)
training loss for epoch  6 :  tensor(116.7777)
training loss for epoch  7 :  tensor(116.4843)
training loss for epoch  8 :  tensor(116.8681)
training loss for epoch  9 :  tensor(116.1272)
training loss for epoch  10 :  tensor(116.1020)


  


validation loss :  tensor(115.1415)


training loss for epoch  1 :  tensor(79.6216)
training loss for epoch  2 :  tensor(16.7696)
validation loss :  tensor(98.1841)
