Parameters in the model
Matrix R: containing the ratings 
Matrix U: Containing the embeddings of the users
Matrix M: Containing the embedding of the movies


In [1]:
import torch
import pandas as pd
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
import numpy as np
from scipy.sparse import rand as sprand
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
from torch.utils import data


In [2]:
# The grand dataset
ratings_dataset = pd.read_csv('ratings_small.csv', usecols = ['userId','movieId','rating'])

#ratings

In [3]:
# Creating new indices

def proc_col(col, train_col=None):
    """Encodes a pandas column with continous ids. """
    if train_col is not None:
        uniq = train_col.unique()
    else:
        uniq = col.unique()
    name2idx = {o:i for i,o in enumerate(uniq)}
    return name2idx, np.array([name2idx.get(x, -1) for x in col]), len(uniq)

In [4]:
def encode_data(ratings_dataset, train=None):
    """ Encodes rating data with continous user and movie ids. 
    If train is provided, encodes df with the same encoding as train.
    """
    ratings_dataset = ratings_dataset.copy()
    for col_name in ["userId", "movieId"]:
        train_col = None
        if train is not None:
            train_col = train[col_name]
        _,col,_ = proc_col(ratings_dataset[col_name], train_col)
        ratings_dataset[col_name] = col
        ratings_dataset = ratings_dataset[ratings_dataset[col_name] >= 0]
    return ratings_dataset

In [5]:
ratings_dataset_e=encode_data(ratings_dataset)

In [6]:
# split train and validation before encoding
trn_len = ratings_dataset_e.shape[0]- int(0.3*ratings_dataset_e.shape[0])
val_len = int(0.2*ratings_dataset_e.shape[0])
test_len = int(0.1*ratings_dataset_e.shape[0])

train = ratings_dataset_e[:trn_len-1].copy()
val = ratings_dataset_e[trn_len:-test_len].copy()
test = ratings_dataset_e[trn_len:-val_len].copy()

print(len(train))
print(len(val))
print(len(test))

70002
20001
10001


In [7]:
train.shape[0]+ val.shape[0]+ test.shape[0] == ratings_dataset_e.shape[0]

True

In [8]:
import torch
from torch.utils import data

# Formatting the data 

class MovieLens(Dataset):
  #'Characterizes a dataset for PyTorch'
    def __init__(self,users,movies,ratings):
        'Initialization'
        self.movies=movies
        self.users=users
        self.ratings=ratings
        
    def __len__(self):
     #   'Denotes the total number of samples'
        return len(self.ratings)

    def __getitem__(self, index):
     #   'Generates one sample of data'
        # Select sample
        
        U = self.users[index]
        V =self.movies[index]
        y = self.ratings[index]
        #obs = {'movieId':movieId,'userId':userId,'rating':rating}
        #obs = self.movieLens.drop('timestamp',axis=1)
        #obs = obs.iloc[index,:].as_matrix()

        return [U,V,y]
    

In [9]:
users = torch.LongTensor(train.userId.values)
movies = torch.LongTensor(train.movieId.values)
ratings = torch.FloatTensor(train.rating.values)

train_dataset = MovieLens(users, movies, ratings)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=100, shuffle=True)

In [10]:
#Valid loader
users_val = torch.LongTensor(val.userId.values)
movies_val = torch.LongTensor(val.movieId.values)
ratings_val = torch.FloatTensor(val.rating.values)

val_dataset = MovieLens(users_val, movies_val, ratings_val)

val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=100, shuffle=True)



# test loader
users_test = torch.LongTensor(test.userId.values)
movies_test = torch.LongTensor(test.movieId.values)
ratings_test = torch.FloatTensor(test.rating.values)

test_dataset = MovieLens(users_test, movies_test, ratings_test)

test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=100, shuffle=True)




In [11]:
# Printing one batch from the train dataloader
t1= iter(train_loader)
next(t1)


[tensor([ 474,  361,  262,  261,  477,   22,  212,  312,  238,  477,
          239,  414,  293,  377,  352,  252,  232,   93,  100,  264,
           96,   22,  345,  200,   55,  230,  344,   22,  460,   22,
          379,  459,  267,  222,  262,  429,  392,   22,  171,  284,
           14,  101,  104,  387,  185,  452,  456,  109,  262,  284,
           22,  293,   68,  406,  305,  379,  357,  368,  148,  407,
          231,  104,  211,  389,  445,  291,  460,  406,  119,  211,
          400,  456,  219,   25,  456,  168,  479,  211,  211,  104,
          432,   67,  387,  188,   14,  151,  288,  406,   97,  346,
           55,  451,  406,  123,  399,  175,  213,  310,  242,  243]),
 tensor([  553,   402,   110,  5819,  1550,   124,   282,  1162,  2592,
           387,  5293,   148,    19,  1652,   323,  1079,   714,  3181,
          2251,   160,  3579,   776,    18,   126,   111,    49,   186,
          2428,  2216,   900,  1690,   696,   496,  3596,   489,  2471,
          5762,   58

In [12]:
class Model(nn.Module):
    def __init__(self, num_user, num_movie, emb_size=100):
        super(Model, self).__init__()
        self.userEmb = nn.Embedding(num_user, emb_size)
        self.movieEmb = nn.Embedding(num_movie, emb_size)
        
        
    def forward(self, u, v):
        U = self.userEmb(u)
        V = self.movieEmb(v)
        return (U*V).sum(1)

In [13]:
num_user = len(train.userId.unique()) 
num_movie = len(train.movieId.unique())

In [14]:
emb_size = 100
model=Model(num_user,num_movie,emb_size)
model.parameters()

<generator object Module.parameters at 0x119413af0>

In [15]:
[p for p in model.parameters()]

[Parameter containing:
 tensor([[ 9.2559e-01,  1.3112e-01, -2.0608e-01,  ...,  1.3571e+00,
           5.4470e-01,  2.0903e-01],
         [-2.7771e-01, -1.4926e+00, -9.5142e-01,  ...,  2.2129e+00,
           2.5391e+00, -3.4938e-01],
         [-3.9371e-01,  1.7334e+00, -1.5802e+00,  ..., -9.0567e-01,
          -2.9499e-01, -1.9444e+00],
         ...,
         [ 1.4395e+00,  2.7408e-01, -1.6173e-01,  ..., -1.3254e-01,
           1.1755e+00, -3.3505e-02],
         [ 5.5287e-01,  5.8283e-01, -5.9406e-02,  ...,  1.1176e-01,
          -1.1838e+00, -1.1384e+00],
         [-4.6642e-01, -1.1818e-01,  2.2425e-02,  ...,  4.5563e-01,
           1.5919e+00, -6.2725e-01]]), Parameter containing:
 tensor([[ 1.3931e+00,  5.0361e-01,  8.7920e-01,  ...,  2.4264e-01,
           1.3643e-01,  6.5278e-01],
         [ 1.1166e+00,  2.2333e+00,  4.1351e-01,  ...,  2.2008e-01,
           5.0890e-02,  4.3604e-01],
         [-5.0180e-01,  7.9520e-01,  1.1026e+00,  ...,  2.1945e-01,
           3.7534e-01, -4.2002e

In [29]:
def test_loss(model):
    model.eval() # go to evaluation mode
    
    running_loss = 0.
    for j, data in enumerate(val_loader):
        users, items, ratings = data
        users = Variable(users)
        items = Variable(items)
        ratings = Variable(ratings).float()
            
        
        y_hat = model(users, items)
        loss_now = F.mse_loss(y_hat, ratings)
        running_loss+= loss_now.data[0]
        
        print(running_loss)
    print("validation loss", ": ", running_loss/len(val_loader)) # j means this many iterations till end

In [30]:
def train_epocs(model, epochs=2, lr=0.01, wd=0.0):
    parameters = filter(lambda p: p.requires_grad, model.parameters()) # get all parameters which need grad
    optimizer = torch.optim.Adam(parameters, lr=lr, weight_decay=wd)
    model.train() # into training mode
    for i in range(epochs):
        running_loss = 0.0
        
        for j, data in enumerate(train_loader):
            users, items, ratings = data
            users = Variable(users)
            items = Variable(items)
            ratings = Variable(ratings).float()
      

            y_hat = model(users, items)
            loss = F.mse_loss(y_hat, ratings)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            running_loss += loss.data[0]
        print("training loss for epoch ",i+1, ": ", running_loss/j+1) # used to be loss.data[0]
    test_loss(model)

In [31]:
train_epocs(model, epochs=2, lr=0.01)



training loss for epoch  1 :  tensor(2.8671)
training loss for epoch  2 :  tensor(2.7537)


RuntimeError: index out of range at /Users/soumith/minicondabuild3/conda-bld/pytorch_1524590658547/work/aten/src/TH/generic/THTensorMath.c:343