## Importing the libraries

In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable

## Importing the dataset


In [2]:
movies = pd.read_csv('datasets/movies.dat', sep = '::', header = None, engine = 'python', encoding = 'latin-1')
users = pd.read_csv('datasets/users.dat', sep = '::', header = None, engine = 'python', encoding = 'latin-1')
ratings = pd.read_csv('datasets/ratings.dat', sep = '::', header = None, engine = 'python', encoding = 'latin-1')

## Preparing the training set and the test set


In [3]:
training_set = pd.read_csv('datasets/u1.base', delimiter = '\t')
training_set = np.array(training_set, dtype = 'int')
test_set = pd.read_csv('datasets/u1.test', delimiter = '\t')
test_set = np.array(test_set, dtype = 'int')

In [5]:
training_set

array([[        1,         2,         3, 876893171],
       [        1,         3,         4, 878542960],
       [        1,         4,         3, 876893119],
       ...,
       [      943,      1188,         3, 888640250],
       [      943,      1228,         3, 888640275],
       [      943,      1330,         3, 888692465]])

In [6]:
test_set

array([[        1,        10,         3, 875693118],
       [        1,        12,         5, 878542960],
       [        1,        14,         5, 874965706],
       ...,
       [      459,       934,         3, 879563639],
       [      460,        10,         3, 882912371],
       [      462,       682,         5, 886365231]])

In [7]:
len(training_set)

79999

In [8]:
len(test_set)

19999

In [9]:
training_set.shape

(79999, 4)

In [10]:
max(training_set[:,0],)

943

In [11]:
max(test_set[:,0])

462

In [14]:
max(max(training_set[:, 0], ), max(test_set[:, 0]))

943

In [15]:
max(max(training_set[:, 1], ), max(test_set[:, 1]))

1682

## Getting the number of users and movies


In [12]:
nb_users = int(max(max(training_set[:, 0], ), max(test_set[:, 0])))
nb_movies = int(max(max(training_set[:, 1], ), max(test_set[:, 1])))

In [13]:
nb_users

943

In [14]:
nb_movies

1682

In [16]:
training_set[:, 1] [training_set[:, 0] == 1]

array([  2,   3,   4,   5,   7,   8,   9,  11,  13,  15,  16,  18,  19,
        21,  22,  25,  26,  28,  29,  30,  32,  34,  35,  37,  38,  40,
        41,  42,  43,  45,  46,  48,  50,  52,  55,  57,  58,  59,  63,
        66,  68,  71,  75,  77,  79,  83,  87,  88,  89,  93,  94,  95,
        99, 101, 105, 106, 109, 110, 111, 115, 116, 119, 122, 123, 124,
       126, 127, 131, 133, 135, 136, 137, 138, 139, 141, 142, 144, 146,
       147, 149, 152, 153, 156, 158, 162, 165, 166, 167, 168, 169, 172,
       173, 176, 178, 179, 181, 182, 187, 191, 192, 194, 195, 197, 198,
       199, 203, 204, 205, 207, 211, 216, 217, 220, 223, 231, 234, 237,
       238, 239, 240, 244, 245, 246, 247, 249, 251, 256, 257, 261, 263,
       268, 269, 270, 271])

In [19]:
len(training_set[:,1][training_set[:, 0] == 1])

134

## Converting the data into an array with users in lines and movies in columns


In [20]:
def convert(data):
  new_data = []
  for id_users in range(1, nb_users + 1):
    id_movies = data[:, 1] [data[:, 0] == id_users]
    id_ratings = data[:, 2] [data[:, 0] == id_users]
    ratings = np.zeros(nb_movies)
    ratings[id_movies - 1] = id_ratings
    new_data.append(list(ratings))
  return new_data

In [21]:
training_set = convert(training_set)
test_set = convert(test_set)

## Converting the data into Torch tensors


In [22]:
training_set = torch.FloatTensor(training_set)
test_set = torch.FloatTensor(test_set)

In [23]:
training_set.shape

torch.Size([943, 1682])

In [29]:
training_set

tensor([[0., 3., 4.,  ..., 0., 0., 0.],
        [4., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [5., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 5., 0.,  ..., 0., 0., 0.]])

In [46]:
test_set.shape

torch.Size([943, 1682])

## Creating the architecture of the Neural Network


In [31]:
class SAE(nn.Module):
    def __init__(self, ):
        super(SAE, self).__init__()
        self.fc1 = nn.Linear(nb_movies, 20)
        self.fc2 = nn.Linear(20, 10)
        self.fc3 = nn.Linear(10, 20)
        self.fc4 = nn.Linear(20, nb_movies)
        self.activation = nn.Sigmoid()
    def forward(self, x):
        x = self.activation(self.fc1(x))
        x = self.activation(self.fc2(x))
        x = self.activation(self.fc3(x))
        x = self.fc4(x)
        return x


In [32]:
sae = SAE()
criterion = nn.MSELoss()
optimizer = optim.RMSprop(sae.parameters(), lr = 0.01, weight_decay = 0.5)

## Training the SAE


In [33]:
nb_epoch = 50
for epoch in range(1, nb_epoch + 1):
  train_loss = 0
  s = 0.
  for id_user in range(nb_users):
    input = Variable(training_set[id_user]).unsqueeze(0)
    target = input.clone()
    if torch.sum(target.data > 0) > 0:
      output = sae(input)
      target.require_grad = False
      output[target == 0] = 0
      loss = criterion(output, target)
      mean_corrector = nb_movies/float(torch.sum(target.data > 0) + 1e-10)
      loss.backward()
      train_loss += np.sqrt(loss.data*mean_corrector)
      s += 1.
      optimizer.step()
  print('epoch: '+str(epoch)+'loss: '+ str(train_loss/s))

epoch: 1loss: tensor(1.7710)
epoch: 2loss: tensor(1.0968)
epoch: 3loss: tensor(1.0536)
epoch: 4loss: tensor(1.0384)
epoch: 5loss: tensor(1.0308)
epoch: 6loss: tensor(1.0268)
epoch: 7loss: tensor(1.0241)
epoch: 8loss: tensor(1.0219)
epoch: 9loss: tensor(1.0208)
epoch: 10loss: tensor(1.0198)
epoch: 11loss: tensor(1.0188)
epoch: 12loss: tensor(1.0186)
epoch: 13loss: tensor(1.0178)
epoch: 14loss: tensor(1.0176)
epoch: 15loss: tensor(1.0174)
epoch: 16loss: tensor(1.0168)
epoch: 17loss: tensor(1.0167)
epoch: 18loss: tensor(1.0165)
epoch: 19loss: tensor(1.0164)
epoch: 20loss: tensor(1.0163)
epoch: 21loss: tensor(1.0159)
epoch: 22loss: tensor(1.0161)
epoch: 23loss: tensor(1.0159)
epoch: 24loss: tensor(1.0158)
epoch: 25loss: tensor(1.0159)
epoch: 26loss: tensor(1.0157)
epoch: 27loss: tensor(1.0153)
epoch: 28loss: tensor(1.0150)
epoch: 29loss: tensor(1.0128)
epoch: 30loss: tensor(1.0120)
epoch: 31loss: tensor(1.0100)
epoch: 32loss: tensor(1.0100)
epoch: 33loss: tensor(1.0055)
epoch: 34loss: tens

## Testing the SAE


In [34]:
test_loss = 0
s = 0.
for id_user in range(nb_users):
  input = Variable(training_set[id_user]).unsqueeze(0)
  target = Variable(test_set[id_user]).unsqueeze(0)
  if torch.sum(target.data > 0) > 0:
    output = sae(input)
    target.require_grad = False
    output[target == 0] = 0
    loss = criterion(output, target)
    mean_corrector = nb_movies/float(torch.sum(target.data > 0) + 1e-10)
    test_loss += np.sqrt(loss.data*mean_corrector)
    s += 1.
print('test loss: '+str(test_loss/s))

test loss: tensor(1.0053)


In [35]:
user_id = 0
movie_title = movies.iloc[:nb_movies, 1:2]
user_rating = training_set.data.numpy()[user_id, :].reshape(-1,1)

user_target = test_set.data.numpy()[user_id, :].reshape(-1,1)
user_input = Variable(training_set[user_id]).unsqueeze(0)

predicted = sae(user_input)
predicted = predicted.data.numpy().reshape(-1,1)

result_array = np.hstack([movie_title, user_target, predicted])

result_array = result_array[result_array[:, 1] > 0]

result_df = pd.DataFrame(data=result_array, columns=['Movie', 'Target Rating', 'Predicted'])

In [53]:
result_df

Unnamed: 0,Movie,Target Rating,Predicted
0,GoldenEye (1995),3,3.61411
1,Dracula: Dead and Loving It (1995),5,4.06234
2,Nixon (1995),5,3.48698
3,Sense and Sensibility (1995),3,3.05664
4,Money Train (1995),4,2.96668
5,Assassins (1995),4,3.60112
6,Powder (1995),3,3.29176
7,Now and Then (1995),2,3.00231
8,Dangerous Minds (1995),3,3.39532
9,Wings of Courage (1995),4,3.23101
