# Part 1 - Dataset preparation

In [1]:
# Libraries
import numpy as np
import pandas as pd
import torch 
import torch.nn as nn 
import torch.nn.parallel 
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable 

In [2]:
#DATASETS
movies = pd.read_csv("ml-1m/ml-1m/movies.dat", sep = "::", header = None, engine = "python", encoding = "latin-1")
users = pd.read_csv("ml-1m/ml-1m/users.dat", sep = "::", header = None, engine = "python", encoding = "latin-1")
ratings = pd.read_csv("ml-1m/ml-1m/ratings.dat", sep = "::", header = None, engine = "python", encoding = "latin-1")

In [3]:
# Prepare the training set and the testing set
training_set = pd.read_csv("ml-100k/ml-100k/u1.base", sep = "\t", header=  None)
training_set = np.array(training_set, dtype = "int")

test_set = pd.read_csv("ml-100k/ml-100k/u1.test", sep = "\t", header=  None)
test_set = np.array(test_set, dtype = "int")

# to obtain the number of users and number of films 
nb_users = int(max(max(training_set[:,0]),max(test_set[:,0])))
nb_movies = int(max(max(training_set[:,1]),max(test_set[:,1])))

In [4]:
# convert the data into an array X[u,i] with users u in row and movies i in column 
def convert(data):
    new_data = []
    for id_users in range(1,nb_users+1):
        id_movies = data[:,1][data[:,0] == id_users]
        id_ratings = data[:,2][data[:,0] == id_users]
        ratings = np.zeros(nb_movies)
        ratings[id_movies-1] = id_ratings
        new_data.append(list(ratings))
    return new_data

# Part 2 - CREATION OF THE MODEL

In [5]:
# Use the function to extract the data
training_set = convert(training_set)
test_set = convert(test_set)

# Convert data to Torch tensors
training_set = torch.FloatTensor(training_set)
test_set = torch.FloatTensor(test_set)

#Creating the architecture of the Neural Network
class SAE(nn.Module):
    def __init__(self, ):
        super(SAE, self).__init__()
        self.fc1 = nn.Linear(nb_movies, 20) #first hidden layer that has 20 neurons
        self.fc2 = nn.Linear(20, 10)
        self.fc3 = nn.Linear(10, 20)  #fc = full conected
        self.fc4 = nn.Linear(20, nb_movies)
        self.activation = nn.Sigmoid()
    
    def forward(self,x):
        x = self.activation(self.fc1(x))
        x = self.activation(self.fc2(x))
        x = self.activation(self.fc3(x))
        x = self.fc4(x) ##To replicate the input, we cannot put a sigmoid output unless we first encode the input in the appropriate range, but that is not what we are looking for in this case.
 
        return x
    

sae = SAE()
criterion = nn.MSELoss()
optimizer = optim.RMSprop(sae.parameters(), lr= 0.01, weight_decay= 0.5 ) #weight_decay = LR decay to adjust as iterations progress the lr // parameters is inherited from Module to transmit the parameters.

# Training the SAE
nb_epoch = 200
for epoch in range(1,nb_epoch+1):
    train_loss = 0 
    s = 0.
    for id_user in range(nb_users):
        input = Variable(training_set[id_user]).unsqueeze(0) #We need the input to be an array, not a list of vectors, so we add a dimension with Variable that we have imported and then the unsqueeze method.
        target = input.clone()
        if torch.sum(target.data > 0) >0: # we add a vector that gives you true if it is greater than 0, so if there are 27 rated films it will be greater than 1 and will pass if there are no ratings it does not pass.
            output = sae.forward(input)
            target.requires_grad = False
            output[target == 0] = 0
            loss = criterion(output, target)
            mean_corrector = nb_movies/float(torch.sum(target.data > 0)+ 1e-10) #We want to see an overall average error rate for rating films, so we look at how many films you have rated out of the total, the 1e-10 is so that it never crashes, if the sum is 0.
            loss.backward() #propagate error backwards // only decides the direction of the weights.
            train_loss += np.sqrt(loss.data*mean_corrector) #where is the error in loss.data. We correct the data with the previous factor. Be careful the MSE is squared, we need to square root it.  ## sum(errors) / n_valued_films
            s += 1.
            optimizer.step()  #The optimiser decides HOW MUCH to multiply the weights to get to what we want, but not the direction.
            
    print("Epoch: " + str(epoch) + ", Loss: " + str(train_loss/s))     
            
            

  from .autonotebook import tqdm as notebook_tqdm


Epoch: 1, Loss: tensor(1.7715)
Epoch: 2, Loss: tensor(1.0967)
Epoch: 3, Loss: tensor(1.0536)
Epoch: 4, Loss: tensor(1.0383)
Epoch: 5, Loss: tensor(1.0310)
Epoch: 6, Loss: tensor(1.0268)
Epoch: 7, Loss: tensor(1.0239)
Epoch: 8, Loss: tensor(1.0220)
Epoch: 9, Loss: tensor(1.0208)
Epoch: 10, Loss: tensor(1.0195)
Epoch: 11, Loss: tensor(1.0189)
Epoch: 12, Loss: tensor(1.0184)
Epoch: 13, Loss: tensor(1.0177)
Epoch: 14, Loss: tensor(1.0177)
Epoch: 15, Loss: tensor(1.0172)
Epoch: 16, Loss: tensor(1.0169)
Epoch: 17, Loss: tensor(1.0166)
Epoch: 18, Loss: tensor(1.0165)
Epoch: 19, Loss: tensor(1.0164)
Epoch: 20, Loss: tensor(1.0162)
Epoch: 21, Loss: tensor(1.0160)
Epoch: 22, Loss: tensor(1.0158)
Epoch: 23, Loss: tensor(1.0158)
Epoch: 24, Loss: tensor(1.0159)
Epoch: 25, Loss: tensor(1.0156)
Epoch: 26, Loss: tensor(1.0158)
Epoch: 27, Loss: tensor(1.0155)
Epoch: 28, Loss: tensor(1.0152)
Epoch: 29, Loss: tensor(1.0128)
Epoch: 30, Loss: tensor(1.0121)
Epoch: 31, Loss: tensor(1.0095)
Epoch: 32, Loss: 

# PART 3 - EVALUATION OF THE MODEL 

In [6]:
# Evaluate the test assembly
test_loss = 0 
s = 0.
for id_user in range(nb_users):
    input = Variable(training_set[id_user]).unsqueeze(0)  #You need the input to be an array, not a list of vectors, so you add a dimension with Variable that we have imported and then the unsqueeze method.
    target = Variable(test_set[id_user]).unsqueeze(0) 
    if torch.sum(target.data > 0) >0: # we add a vector that gives you true if it is greater than 0, so if there are 27 rated films it will be greater than 1 and will pass if there are no ratings it does not pass.
        output = sae.forward(input)
        target.requires_grad = False #the target set remains unchanged we save caclulus
        output[target == 0] = 0 # this makes sense in the test, but in the future we would have to remove it to see ALL RATINGS not just the test score.
        test_loss = criterion(output, target)
        mean_corrector = nb_movies/float(torch.sum(target.data > 0)+ 1e-10) #We want to see an overall average error rate for rating films, so we look at how many films you have rated out of the total, the 1e-10 is so that it never crashes, if the sum is 0.
        test_loss += np.sqrt(loss.data*mean_corrector)
        s += 1.
    
print("Test Loss: " + str(train_loss/s))     
            

Test Loss: tensor(1.8757)


We can see how in this case we have an average error of 1.87 stars in the films we have evaluated, in this case it is quite large, if it were not a case study to see how an autoencoder works now we would have to modify parameters to reduce this error to less than 1 average star. 