In [1]:
import numpy as np
import pandas as pd 
import torch 
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable

# Importin the dataset

In [2]:
movies = pd.read_csv(filepath_or_buffer='movies.dat', sep='::', header=None, engine='python', encoding='latin-1')

In [3]:
movies.head

<bound method NDFrame.head of          0                                   1                             2
0        1                    Toy Story (1995)   Animation|Children's|Comedy
1        2                      Jumanji (1995)  Adventure|Children's|Fantasy
2        3             Grumpier Old Men (1995)                Comedy|Romance
3        4            Waiting to Exhale (1995)                  Comedy|Drama
4        5  Father of the Bride Part II (1995)                        Comedy
...    ...                                 ...                           ...
3878  3948             Meet the Parents (2000)                        Comedy
3879  3949          Requiem for a Dream (2000)                         Drama
3880  3950                    Tigerland (2000)                         Drama
3881  3951             Two Family House (2000)                         Drama
3882  3952               Contender, The (2000)                Drama|Thriller

[3883 rows x 3 columns]>

In [4]:
users = pd.read_csv(filepath_or_buffer='users.dat', sep='::', header=None, engine='python', encoding='latin-1')

In [5]:
users.head
# first column the id of the user
# second column to the gender
# third column to the age
# fourth column to the work group

<bound method NDFrame.head of          0  1   2   3      4
0        1  F   1  10  48067
1        2  M  56  16  70072
2        3  M  25  15  55117
3        4  M  45   7  02460
4        5  M  25  20  55455
...    ... ..  ..  ..    ...
6035  6036  F  25  15  32603
6036  6037  F  45   1  76006
6037  6038  F  56   1  14706
6038  6039  F  45   0  01060
6039  6040  M  25   6  11106

[6040 rows x 5 columns]>

In [6]:
ratings = pd.read_csv(filepath_or_buffer='ratings.dat', sep='::', header=None, engine='python', encoding='latin-1')
ratings.head
# first column to the user
# second column to the movie
# third column to the ratings(1-5)
# fourth column date of the rating

<bound method NDFrame.head of             0     1  2          3
0           1  1193  5  978300760
1           1   661  3  978302109
2           1   914  3  978301968
3           1  3408  4  978300275
4           1  2355  5  978824291
...       ...   ... ..        ...
1000204  6040  1091  1  956716541
1000205  6040  1094  5  956704887
1000206  6040   562  5  956704746
1000207  6040  1096  4  956715648
1000208  6040  1097  4  956715569

[1000209 rows x 4 columns]>

# Preparing the training set and test set

In [7]:
training_set = pd.read_csv('u1.base', delimiter='\t', )
training_set
# first column the id user
# second column to the movie
# third column to the rating
# fourth column to the timestamp

Unnamed: 0,1,1.1,5,874965758
0,1,2,3,876893171
1,1,3,4,878542960
2,1,4,3,876893119
3,1,5,3,889751712
4,1,7,4,875071561
...,...,...,...,...
79994,943,1067,2,875501756
79995,943,1074,4,888640250
79996,943,1188,3,888640250
79997,943,1228,3,888640275


In [8]:
training_set = training_set.iloc[:, :-1].values

In [9]:
training_set = np.array(training_set, dtype='int64')
training_set

array([[   1,    2,    3],
       [   1,    3,    4],
       [   1,    4,    3],
       ...,
       [ 943, 1188,    3],
       [ 943, 1228,    3],
       [ 943, 1330,    3]], dtype=int64)

In [10]:
test_set = pd.read_csv('u1.test', delimiter='\t')
test_set = test_set.iloc[:, :-1]
test_set = np.array(test_set, dtype='int64')
test_set

array([[  1,  10,   3],
       [  1,  12,   5],
       [  1,  14,   5],
       ...,
       [459, 934,   3],
       [460,  10,   3],
       [462, 682,   5]], dtype=int64)

# Getting the number of users and movies

In [11]:
nb_users = int(max(max(training_set[:,0]), max(test_set[:, 0])))
nb_movies = int(max(max(training_set[:,1]), max(test_set[:, 1])))

print(nb_users, nb_movies)

943 1682


# Converting the training set and the test set where rows are users and columns are the movies

In [12]:
def convert(data):
    # we will create a list of list bc pytorch works with it
    new_data = []
    
    for id_users in range(1,nb_users+1):
        # to extract all the movies that are rated from the user
        id_movies = data[:,1][data[:,0]==id_users]
        # the ratings from the movies of each user
        id_ratings = data[:,2][data[:,0]==id_users]
        
        # initialize a list with 1682 movies with 0 and then replace it with the real ratings
        list_ratings = np.zeros(nb_movies)
        # id_movies - 1 bc movies start from 1 but the list_ratings starts from 0
        list_ratings[id_movies-1] = id_ratings
        
        new_data.append(list_ratings)
    
    return new_data

In [13]:
training_set = convert(training_set)
training_set

[array([0., 3., 4., ..., 0., 0., 0.]),
 array([4., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([4., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([3., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([5., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([5., 0., 0., ..., 0., 0., 0.]),
 array([0., 2., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([5., 0., 0., ..., 0., 0., 0.]),
 array([3., 0., 0., ..., 

In [14]:
test_set = convert(test_set)
test_set

[array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([4., 3., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([4., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 3., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([1., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([4., 0., 0., ..., 0., 0., 0.]),
 array([5., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([3., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([5., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 

# Converting the data into Torch tensors

In [15]:
training_set_torch = torch.FloatTensor(training_set)
test_set_torch = torch.FloatTensor(test_set)

In [16]:
training_set_torch

tensor([[0., 3., 4.,  ..., 0., 0., 0.],
        [4., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [5., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 5., 0.,  ..., 0., 0., 0.]])

In [17]:
test_set_torch

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

# Converting the ratings into binary ratings 1-Liked, 0 not liked

In [18]:
# -1 for a movie that wasn't rated
training_set_torch[training_set_torch == 0] = -1
training_set_torch[training_set_torch == 1] = 0
training_set_torch[training_set_torch == 2] = 0
training_set_torch[training_set_torch >= 3] = 1

In [19]:
test_set_torch[test_set_torch == 0] = -1
test_set_torch[test_set_torch == 1] = 0
test_set_torch[test_set_torch == 2] = 0
test_set_torch[test_set_torch >= 3] = 1

# Creating the architecture of the NN

In [27]:
class RBM():
    
    def __init__(self, number_visible_nodes, number_hidden_nodes):
        # initialize a tensor of weights
        self.W = torch.randn(number_hidden_nodes, number_visible_nodes)
        
        # this is the sigmoid activation function
        # initialize the vector of biases  p(number_hidden_nodes/number_visible_nodes)
        self.a = torch.randn(1, number_hidden_nodes)
        
        # initialize the vector of biases  p(number_visible_nodes/number_hidden_nodes)
        self.b = torch.randn(1, number_visible_nodes)
        
    # it will activate the hidden nodes
    def sample_hidden_nodes(self, x):
        
        x = torch.FloatTensor(x)
        # the product of the visible nodes and the weights(product of 2 torch tensors)
        wx = torch.mm(x, self.W.t())
        # the activation function 
        activation = wx + self.a.expand_as(wx)
        # the probability that the hidden nodes get activated given the value of the visible node
        p_h_given_v = torch.sigmoid(activation)
        
        # torch.bernoulli(p_h_given_v), it does the sampling of the hidden neurons, it means which ones got activated
        return p_h_given_v, torch.bernoulli(p_h_given_v)
    
    def sample_visible_nodes(self, y): 
        wy = torch.mm(y, self.W)
        activation = wy + self.b.expand_as(wy)
        # the probability that the visible nodes get activated given the value of the hidden nodes
        p_v_given_h = torch.sigmoid(activation)
        
        return p_v_given_h, torch.bernoulli(p_v_given_h)
    
    # v0 is the row of observations
    # vk after k sampling
    # ph0 the probability of the hidden node for that row
    # phk the probability of the hidden node after k samplig given vk
    def train(self, v0, vk, ph0, phk):
        self.W += (torch.mm(v0.t(), ph0) - torch.mm(vk.t(), phk))
        self.b += torch.sum((v0 - vk), 0)
        self.a += torch.sum((pho - phk), 0)
        

# Create a RBM object

In [28]:
# the number of visible nodes are the number of movies
number_visible_nodes = len(training_set[0])
# correspond to the number of features we want to have
number_hidden_nodes = 100

batch_size = 100

In [29]:
rbm = RBM(number_visible_nodes, number_hidden_nodes)

# Training the RBM

In [38]:
number_epochs = 10
for epoch in range (1,number_epochs + 1):
    train_lost = 0
    # counter for loss
    s = 0.
    for id_user in range(0, nb_users - batch_size, batch_size):
        # the inputs
        vk = training_set[id_user:id_user+batch_size]
        # the targets
        v0 = training_set[id_user:id_user+batch_size]
        # getting the probability of the hidden nodes
        # to return only the first element of the function that is going to be used
        ph0,_ = rbm.sample_hidden_nodes(v0)
        
        # for the contrastive divergence
        for k in range(10):
            # to get only the second element returned from the function
            _, hk = rbm.sample_hidden_nodes(vk)
            # update of the visible node
            _, vk = rbm.sample_visible_nodes(hk)
            # freezing the inputs that equals -1
            vk[v0<0] = v0[v0<0]
        
        phk,_ = rbm.sample_hidden_nodes(vk)
        
        rbm.train(v0, vk, ph0, phk)
        
        train_lost += torch.mean(torch.abs(v0[v0>=0] - vk[vk>=0]))
        
        s += 1. 
        
        # str(train_lost/s) to normalize the train loss
        print('epoch: ', str(epoch), '\t', 'loss: ', str(train_lost/s))
                  
            
    
    

TypeError: '<' not supported between instances of 'list' and 'int'

In [None]:
test_loss = 0
s = 0.
for id_user in range(nb_users):
    v = training_set[id_user:id_user+1]
    vt = test_set[id_user:id_user+1]
    if len(vt[vt>=0]) > 0:
        _,h = rbm.sample_h(v)
        _,v = rbm.sample_v(h)
        test_loss += torch.mean(torch.abs(vt[vt>=0] - v[vt>=0]))
        s += 1.
print('test loss: '+str(test_loss/s))