In [1]:
import numpy as np
import pandas as pd 
import torch 
import torch.nn as nn
import torch.nn.parallel 
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable

# Importing the datasets

In [2]:
movies = pd.read_csv('movies.dat', sep='::', header=None, engine='python', encoding='latin-1')

In [3]:
users = pd.read_csv('users.dat', sep='::', header=None, engine='python', encoding='latin-1')

In [4]:
ratings = pd.read_csv('ratings.dat', sep='::', header=None, engine='python', encoding='latin-1')

# Prepare the training set

In [5]:
training_set = pd.read_csv('u1.base', delimiter='\t', header=None)
training_set.shape 

(80000, 4)

In [6]:
training_set = np.array(training_set, dtype='int64')
training_set

array([[        1,         1,         5, 874965758],
       [        1,         2,         3, 876893171],
       [        1,         3,         4, 878542960],
       ...,
       [      943,      1188,         3, 888640250],
       [      943,      1228,         3, 888640275],
       [      943,      1330,         3, 888692465]], dtype=int64)

In [7]:
test_set = pd.read_csv('u1.test', delimiter='\t')

In [8]:
test_set = np.array(test_set, dtype='int64')
test_set

array([[        1,        10,         3, 875693118],
       [        1,        12,         5, 878542960],
       [        1,        14,         5, 874965706],
       ...,
       [      459,       934,         3, 879563639],
       [      460,        10,         3, 882912371],
       [      462,       682,         5, 886365231]], dtype=int64)

## Getting the number of users and movies


In [9]:
nb_users = int(max(max(training_set[:, 0], ), max(test_set[:, 0])))
nb_movies = int(max(max(training_set[:, 1], ), max(test_set[:, 1])))

## Converting the data into an array with users in lines and movies in columns


In [10]:
def convert(data):
  new_data = []
  for id_users in range(1, nb_users + 1):
    id_movies = data[:, 1] [data[:, 0] == id_users]
    id_ratings = data[:, 2] [data[:, 0] == id_users]
    ratings = np.zeros(nb_movies)
    ratings[id_movies - 1] = id_ratings
    new_data.append(list(ratings))
  return new_data
training_set = convert(training_set)
test_set = convert(test_set)

## Converting the data into Torch tensors


In [11]:
training_set = torch.FloatTensor(training_set)
test_set = torch.FloatTensor(test_set)

# Create the architecture of the NN

In [13]:
#stack autoencoders
class SAE(nn.Module):

    def __init__(self,):
        super(SAE, self).__init__()
        #full connection
        #nn.Linear() it needs the number of inputs and the number of hidden neurons
        self.fc1 = nn.Linear(nb_movies, 20)
        
        #second full connection
        self.fc2 = nn.Linear(20,10)
        
        #third full connection where we start to decode
        self.fc3 = nn.Linear(10, 20)
        
        #fourth full connection
        self.fc4 = nn.Linear(20,nb_movies)
        
        #the activation function sigmoid
        self.activation = nn.Sigmoid()
    
    #for encoding
    def forward(self, x):
        #the activation function will activate the neurons for encodin and returns the vector encoded
        x = self.activation(self.fc1(x))
        #encoding in the 2 layer
        x = self.activation(self.fc2(x))
        #decoding in the 3 layer
        x = self.activation(self.fc3(x))
        #decoding in the 4 layer
        x = self.fc4(x)
        
        return x
           

In [14]:
sae = SAE()
criterion = nn.MSELoss()
#root mean square optimizer( all the parameters tha define the architecture of the NN, learning rate, the dacay to reduce
#   the learning rate after certain number of epochs)
optimizer = optim.RMSprop(sae.parameters(), lr=0.01, weight_decay= 0.5)

# Training the SAE

In [17]:
NUMBER_EPOCHS = 200

for epoch in range(1,NUMBER_EPOCHS+1):
    train_loss = 0
    #to save the users who at least rated one movie
    s = 0.
    
    for id_user in range(nb_users):
        input_user = Variable(training_set[id_user]).unsqueeze(0)
        target_user = input_user.clone()
        
        #to check if ht euser rated at least one movie
        if torch.sum(target_user.data > 0) > 0:
            output_prediction = sae.forward(input_user)
            #to reduce computations so we dont compute the gradient with the respective target
            target_user.require_grad = False
            #for the not rated movies so they stay the same
            output_prediction[target_user == 0] = 0
            
            loss = criterion(output_prediction, target_user)
            #to avoid the mean is 0 it's added 1e-10 and it is the average odf the rated movies
            mean_corrector = nb_movies/float(torch.sum(target_user.data > 0) + 1e-10) 
            
            #to decide to increase or decrease the value of the weights
            loss.backward()
            
            train_loss += np.sqrt(loss.data*mean_corrector)
            s += 1.
            
            #decides the intensity to upgrade the weights
            optimizer.step()
            
    print('epoch: ', str(epoch), ', loss: ', str(train_loss/s))
    #the loss represents the variation between the predicted rating and the real ones

epoch:  1 , loss:  tensor(0.9128)
epoch:  2 , loss:  tensor(0.9127)
epoch:  3 , loss:  tensor(0.9125)
epoch:  4 , loss:  tensor(0.9124)
epoch:  5 , loss:  tensor(0.9125)
epoch:  6 , loss:  tensor(0.9117)
epoch:  7 , loss:  tensor(0.9119)
epoch:  8 , loss:  tensor(0.9118)
epoch:  9 , loss:  tensor(0.9120)
epoch:  10 , loss:  tensor(0.9116)
epoch:  11 , loss:  tensor(0.9115)
epoch:  12 , loss:  tensor(0.9110)
epoch:  13 , loss:  tensor(0.9113)
epoch:  14 , loss:  tensor(0.9108)
epoch:  15 , loss:  tensor(0.9108)
epoch:  16 , loss:  tensor(0.9107)
epoch:  17 , loss:  tensor(0.9106)
epoch:  18 , loss:  tensor(0.9105)
epoch:  19 , loss:  tensor(0.9107)
epoch:  20 , loss:  tensor(0.9105)
epoch:  21 , loss:  tensor(0.9103)
epoch:  22 , loss:  tensor(0.9103)
epoch:  23 , loss:  tensor(0.9101)
epoch:  24 , loss:  tensor(0.9099)
epoch:  25 , loss:  tensor(0.9098)
epoch:  26 , loss:  tensor(0.9098)
epoch:  27 , loss:  tensor(0.9094)
epoch:  28 , loss:  tensor(0.9095)
epoch:  29 , loss:  tensor(0.

In [16]:
nb_epoch = 200
for epoch in range(1, nb_epoch + 1):
  train_loss = 0
  s = 0.
  for id_user in range(nb_users):
    input = Variable(training_set[id_user]).unsqueeze(0)
    target = input.clone()
    if torch.sum(target.data > 0) > 0:
      output = sae(input)
      target.require_grad = False
      output[target == 0] = 0
      loss = criterion(output, target)
      mean_corrector = nb_movies/float(torch.sum(target.data > 0) + 1e-10)
      loss.backward()
      train_loss += np.sqrt(loss.data*mean_corrector)
      s += 1.
      optimizer.step()
  print('epoch: '+str(epoch)+'loss: '+ str(train_loss/s))

epoch: 1loss: tensor(1.7646)
epoch: 2loss: tensor(1.0966)
epoch: 3loss: tensor(1.0533)
epoch: 4loss: tensor(1.0386)
epoch: 5loss: tensor(1.0307)
epoch: 6loss: tensor(1.0265)
epoch: 7loss: tensor(1.0237)
epoch: 8loss: tensor(1.0221)
epoch: 9loss: tensor(1.0206)
epoch: 10loss: tensor(1.0198)
epoch: 11loss: tensor(1.0188)
epoch: 12loss: tensor(1.0185)
epoch: 13loss: tensor(1.0176)
epoch: 14loss: tensor(1.0176)
epoch: 15loss: tensor(1.0169)
epoch: 16loss: tensor(1.0170)
epoch: 17loss: tensor(1.0167)
epoch: 18loss: tensor(1.0165)
epoch: 19loss: tensor(1.0163)
epoch: 20loss: tensor(1.0164)
epoch: 21loss: tensor(1.0160)
epoch: 22loss: tensor(1.0158)
epoch: 23loss: tensor(1.0156)
epoch: 24loss: tensor(1.0157)
epoch: 25loss: tensor(1.0156)
epoch: 26loss: tensor(1.0156)
epoch: 27loss: tensor(1.0151)
epoch: 28loss: tensor(1.0150)
epoch: 29loss: tensor(1.0130)
epoch: 30loss: tensor(1.0119)
epoch: 31loss: tensor(1.0102)
epoch: 32loss: tensor(1.0080)
epoch: 33loss: tensor(1.0069)
epoch: 34loss: tens

# Testing the SAE

In [18]:
test_loss = 0
s = 0.
for id_user in range(nb_users):
    #the ratings
    input = Variable(training_set[id_user]).unsqueeze(0)
    #contains the real rating of the movies
    target = Variable(test_set[id_user]).unsqueeze(0)
    if torch.sum(target.data > 0) > 0:
        output = sae(input)
        target.require_grad = False
        output[target == 0] = 0
        loss = criterion(output, target)
        mean_corrector = nb_movies/float(torch.sum(target.data > 0) + 1e-10)
        test_loss += np.sqrt(loss.data*mean_corrector)
        s += 1.
print('test loss: '+str(test_loss/s))

test loss: tensor(0.9509)
