In [13]:
#import libraries
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable

In [14]:
#import movies dataset
movies = pd.read_csv('movies.dat',sep='::',header=None,engine='python',encoding='latin-1')
movies

Unnamed: 0,0,1,2
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


In [15]:
#import users dataset
users = pd.read_csv('users.dat',sep='::',header=None,engine='python',encoding='latin-1')
users

Unnamed: 0,0,1,2,3,4
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,02460
4,5,M,25,20,55455
...,...,...,...,...,...
6035,6036,F,25,15,32603
6036,6037,F,45,1,76006
6037,6038,F,56,1,14706
6038,6039,F,45,0,01060


In [16]:
#import ratings dataset
ratings = pd.read_csv('ratings.dat',sep='::',header=None,engine='python',encoding='latin-1')
ratings

Unnamed: 0,0,1,2,3
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [17]:
#prepare training and test set
training_set = pd.read_csv('u1.base',delimiter='\t')
training_set = np.array(training_set, dtype='int')

test_set = pd.read_csv('u1.test',delimiter='\t')
test_set = np.array(test_set, dtype='int')

In [18]:
#get number of users and movies
nb_users = int(max(max(training_set[:,0]),max(test_set[:,0])))
nb_movies = int(max(max(training_set[:,1]),max(test_set[:,1])))

In [19]:
#convert training and test set into 2D array (row - users, column - movies, cells - ratings)
def convert(data):
  new_data = [] #final array
  for id_users in range(1,nb_users+1):
    id_movies = data[:,1][data[:,0]==id_users] #get all movies by first user
    id_ratings = data[:,2][data[:,0]==id_users] #get all movie ratings by first user
    ratings = np.zeros(nb_movies)
    ratings[id_movies-1] = id_ratings
    new_data.append(list(ratings))
  return new_data

training_set = convert(training_set)
test_set = convert(test_set)

In [20]:
#convert list of lists to torch tensors
training_set = torch.FloatTensor(training_set) #multidimensional array of float type
test_set = torch.FloatTensor(test_set)

In [28]:
#create architecture of neural network
class SAE(nn.Module):
  def __init__(self,): #self refers to object
    super(SAE,self).__init__() #inherits all super class methods and attributes
    self.fc1 = nn.Linear(nb_movies,20) #20 nodes in first hidden layer (vector of 20 elements)
    self.fc2 = nn.Linear(20,10) #10 nodes in second hidden layer
    self.fc3 = nn.Linear(10,20) #20 nodes in third hidden layer
    self.fc4 = nn.Linear(20,nb_movies) #nb_movies nodes in output layer
    self.activation = nn.Sigmoid() #sigmoid activation function

  def forward(self, x):
    x = self.activation(self.fc1(x)) #first encoding
    x = self.activation(self.fc2(x)) #second encoding
    x = self.activation(self.fc3(x)) #first decoding
    x = self.fc4(x) #final decoding
    return x

sae = SAE()
criterion = nn.MSELoss() #Mean Squared Error - Loss function
optimizer = optim.RMSprop(sae.parameters(), lr=0.01, weight_decay = 0.5)

In [29]:
#training the sae
nb_epoch = 200
for epoch in range(1, nb_epoch + 1):
    train_loss = 0
    s = 0.
    for id_user in range(nb_users):
        input = Variable(training_set[id_user]).unsqueeze(0)
        target = input.clone()
        if torch.sum(target.data > 0) > 0:
            output = sae(input)
            target.require_grad = False
            output[target == 0] = 0
            loss = criterion(output, target)
            mean_corrector = nb_movies/float(torch.sum(target.data > 0) + 1e-10)
            loss.backward()
            train_loss += np.sqrt(loss.data.item()*mean_corrector)
            s += 1.
            optimizer.step()
    print('epoch: '+str(epoch)+' loss: '+str(train_loss/s))

epoch: 1 loss: 1.7712826487309508
epoch: 2 loss: 1.096651467844884
epoch: 3 loss: 1.053627224867357
epoch: 4 loss: 1.038421864541439
epoch: 5 loss: 1.0307147183497407
epoch: 6 loss: 1.0267619938935633
epoch: 7 loss: 1.0237954220351269
epoch: 8 loss: 1.0218886602975539
epoch: 9 loss: 1.0207672528120615
epoch: 10 loss: 1.0197975462894702
epoch: 11 loss: 1.0191915884070457
epoch: 12 loss: 1.0182892103824326
epoch: 13 loss: 1.0177978719324958
epoch: 14 loss: 1.0175949246044507
epoch: 15 loss: 1.0170892005993362
epoch: 16 loss: 1.0165928215301354
epoch: 17 loss: 1.0166956124706217
epoch: 18 loss: 1.0166847291712078
epoch: 19 loss: 1.0163216217384985
epoch: 20 loss: 1.0159068768028847
epoch: 21 loss: 1.0160491290648215
epoch: 22 loss: 1.0159949360513065
epoch: 23 loss: 1.0159071683895753
epoch: 24 loss: 1.015535393466843
epoch: 25 loss: 1.015514676477142
epoch: 26 loss: 1.0154406474547784
epoch: 27 loss: 1.0154464490279826
epoch: 28 loss: 1.0151072343136873
epoch: 29 loss: 1.0130686977526824

In [31]:
#testing the sae
test_loss = 0
s = 0.
for id_user in range(nb_users):
    input = Variable(training_set[id_user]).unsqueeze(0)
    target = Variable(test_set[id_user]).unsqueeze(0)
    if torch.sum(target.data > 0) > 0:
        output = sae(input)
        target.require_grad = False
        output[target == 0] = 0
        loss = criterion(output, target)
        mean_corrector = nb_movies/float(torch.sum(target.data > 0) + 1e-10)
        test_loss += np.sqrt(loss.data.item()*mean_corrector)
        s += 1.
print('loss: '+str(test_loss/s))

loss: 0.9517372162765255
