Parameters in the model
Matrix R: containing the ratings 
Matrix U: Containing the embeddings of the users
Matrix M: Containing the embedding of the movies


In [19]:
import torch
import pandas as pd
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
import numpy as np
from scipy.sparse import rand as sprand
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
from torch.utils import data


In [20]:
# The grand dataset
ratings_dataset = pd.read_csv('ratings_small.csv', usecols = ['userId','movieId','rating'])

#ratings

In [21]:
# Creating new indices

def proc_col(col, train_col=None):
    """Encodes a pandas column with continous ids. """
    if train_col is not None:
        uniq = train_col.unique()
    else:
        uniq = col.unique()
    name2idx = {o:i for i,o in enumerate(uniq)}
    return name2idx, np.array([name2idx.get(x, -1) for x in col]), len(uniq)

In [22]:
def encode_data(ratings_dataset, train=None):
    """ Encodes rating data with continous user and movie ids. 
    If train is provided, encodes df with the same encoding as train.
    """
    ratings_dataset = ratings_dataset.copy()
    for col_name in ["userId", "movieId"]:
        train_col = None
        if train is not None:
            train_col = train[col_name]
        _,col,_ = proc_col(ratings_dataset[col_name], train_col)
        ratings_dataset[col_name] = col
        ratings_dataset = ratings_dataset[ratings_dataset[col_name] >= 0]
    return ratings_dataset

In [23]:
ratings_dataset_e=encode_data(ratings_dataset)

In [24]:
# split train and validation before encoding
trn_len = ratings_dataset_e.shape[0]- int(0.3*ratings_dataset_e.shape[0])
val_len = int(0.2*ratings_dataset_e.shape[0])
test_len = int(0.1*ratings_dataset_e.shape[0])

train = ratings_dataset_e[:trn_len-1].copy()
val = ratings_dataset_e[trn_len:-test_len].copy()
test = ratings_dataset_e[trn_len:-val_len].copy()

print(len(train))
print(len(val))
print(len(test))

70002
20001
10001


In [25]:
train.shape[0]+ val.shape[0]+ test.shape[0] == ratings_dataset_e.shape[0]

True

In [26]:
import torch
from torch.utils import data

# Formatting the data 

class MovieLens(Dataset):
  #'Characterizes a dataset for PyTorch'
    def __init__(self,users,movies,ratings):
        'Initialization'
        self.movies=movies
        self.users=users
        self.ratings=ratings
        
    def __len__(self):
     #   'Denotes the total number of samples'
        return len(self.ratings)

    def __getitem__(self, index):
     #   'Generates one sample of data'
        # Select sample
        
        U = self.users[index]
        V =self.movies[index]
        y = self.ratings[index]
        #obs = {'movieId':movieId,'userId':userId,'rating':rating}
        #obs = self.movieLens.drop('timestamp',axis=1)
        #obs = obs.iloc[index,:].as_matrix()

        return [U,V,y]
    

In [27]:
users = torch.LongTensor(train.userId.values)
movies = torch.LongTensor(train.movieId.values)
ratings = torch.FloatTensor(train.rating.values)

train_dataset = MovieLens(users, movies, ratings)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=100, shuffle=True)

In [28]:
#Valid loader
users_val = torch.LongTensor(val.userId.values)
movies_val = torch.LongTensor(val.movieId.values)
ratings_val = torch.FloatTensor(val.rating.values)

val_dataset = MovieLens(users_val, movies_val, ratings_val)

val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=100, shuffle=True)



# test loader
users_test = torch.LongTensor(test.userId.values)
movies_test = torch.LongTensor(test.movieId.values)
ratings_test = torch.FloatTensor(test.rating.values)

test_dataset = MovieLens(users_test, movies_test, ratings_test)

test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=100, shuffle=True)




In [29]:
# Printing one batch from the train dataloader
t1= iter(train_loader)
next(t1)


[tensor([176, 372, 322,  90,  29, 281, 469, 349, 110, 456,  69, 456, 186, 212,
         387,  92, 305, 477, 352, 231, 293, 101, 379,  72,  22, 231,  14, 451,
         432, 110, 239,  14,  47, 194,  96, 291, 267, 231, 451,  72, 211,  30,
         305, 305, 379, 312, 471, 451, 357, 422, 410,  56,  14, 462, 346, 145,
         357, 416, 327, 421, 211, 246, 219, 290, 277, 338, 164,  93, 471, 354,
          91, 310, 109, 298, 194,  76, 362, 212, 438, 164,  25, 158, 404, 212,
         467,  55, 462, 460, 211, 148, 401,  98,  80,  17, 261,   3, 460,   2,
         180, 444]),
 tensor([ 859,  898,  473, 3274, 2323,  441, 2030,  933,  818,  321, 3621, 1454,
          489, 3174, 4786,   91,  728,  492, 4213,  159, 2210, 2583,  365,   15,
         2426, 4607,  840, 2126,   91, 4525, 1428,  701, 1581, 1055, 1289, 4301,
          436, 4419, 4759, 3704, 3724, 2175,  177, 1068,  346, 1336,  977,  364,
          656, 2176,  179,  673,  954,  645,  143,  876, 4610, 2995,  184,  392,
         4112, 2123, 

In [30]:
n_hidden=20

class Model(nn.Module):
    def __init__(self, num_user, num_movie, emb_size=50):
        super(Model, self).__init__()
        self.userEmb = nn.Embedding(num_user, emb_size)
        self.movieEmb = nn.Embedding(num_movie, emb_size)
        
        self.lin = nn.Linear(in_features = emb_size*2, out_features= n_hidden)
        self.lin2 = nn.Linear(in_features = n_hidden, out_features=1)
        
        self.drop1 = nn.Dropout(0.25)
        # dropout??
        # other stuff??
        # ??????
       
        
    def forward(self, u, v):
        
        U = self.userEmb(u)
        V = self.movieEmb(v)
        x = torch.cat([U, V], dim=1)
      
    
        x = self.drop1(x)
        x = self.lin(x)
        
        x = F.relu(x)
        
        x = self.lin2(x)
      
        return x
    
    
    

In [31]:
num_user = len(ratings_dataset_e.userId.unique()) 
num_movie = len(ratings_dataset_e.movieId.unique())

In [32]:
emb_size = 100
model=Model(num_user,num_movie,emb_size)
model.parameters()

<generator object Module.parameters at 0x121c32a98>

In [33]:
[p for p in model.parameters()]

[Parameter containing:
 tensor([[ 0.5475,  1.0680, -0.7663,  ..., -1.3638,  0.1031,  0.0401],
         [ 1.3107, -1.5354,  0.0470,  ..., -0.6671, -0.1503, -1.7279],
         [-0.6035,  0.1632, -0.9866,  ..., -0.3765, -1.4632,  1.2275],
         ...,
         [ 1.4701, -0.6156,  1.7890,  ..., -0.2021,  0.6346,  0.7224],
         [-0.1341, -0.7536, -0.9832,  ..., -0.1153, -1.5532,  1.0675],
         [ 0.3156,  0.3769, -0.1437,  ..., -0.5575,  0.0591, -0.0433]],
        requires_grad=True), Parameter containing:
 tensor([[ 1.3306,  1.0489, -0.3052,  ...,  0.7721, -0.6707,  0.1450],
         [ 0.9294, -0.0330, -0.1173,  ...,  0.7009, -0.7201, -0.6672],
         [-1.3018,  0.4468, -1.5852,  ..., -1.6017, -0.6585,  0.7772],
         ...,
         [ 2.1443,  0.9254, -2.8560,  ..., -0.8815, -0.2108, -0.3349],
         [ 0.1611,  0.5628, -0.8110,  ...,  1.1310, -0.6468, -0.9489],
         [ 0.3549,  0.1230, -1.2544,  ..., -0.0491, -0.0114,  1.3473]],
        requires_grad=True), Parameter conta

In [34]:
def test_loss(model):
    model.eval() # go to evaluation mode
    
    running_loss = 0.
    for j, data in enumerate(val_loader):
        users, items, ratings = data
        users = Variable(users)
        items = Variable(items)
        ratings = Variable(ratings).float()
            
        
        y_hat = model(users, items)
        loss_now = F.mse_loss(y_hat, ratings.unsqueeze(1))
        running_loss+= loss_now.item()
    print("validation loss", ": ", running_loss/len(val_loader)) # j means this many iterations till end
    

In [35]:
def train_epocs(model, epochs=2, lr=0.01, wd=0.0):
    parameters = filter(lambda p: p.requires_grad, model.parameters()) # get all parameters which need grad
    optimizer = torch.optim.Adam(parameters, lr=lr, weight_decay=wd)
    model.train() # into training mode
    for i in range(epochs):
        running_loss = 0.0
        
        for j, data in enumerate(train_loader):
            users, items, ratings = data
            users = Variable(users)
            items = Variable(items)
            ratings = Variable(ratings).float()
      

            y_hat = model(users, items)
            loss = F.mse_loss(y_hat, ratings.unsqueeze(1))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        print("training loss for epoch ",i+1, ": ", running_loss/j+1) # used to be loss.data[0]
    test_loss(model)

In [None]:
train_epocs(model, epochs=5, lr=0.01)