### Recommender system

Goal: predict whether a user will like a movie or not — a binary Yes/No outcome — using a Restricted Boltzmann Machine (RBM).

Importing the libraries

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable
from pathlib import Path
import os

Importing the dataset

In [None]:
# Dataset folder, resolved relative to the directory the notebook is run from.
cwd = Path.cwd()
path = cwd / 'datasets/Movie Recommender'

# The .dat files separate fields with '::'; a multi-character separator
# requires pandas' python parsing engine. The file has no header row.
movies = pd.read_csv(
    path / 'ml-1m/movies.dat',
    sep='::',
    header=None,
    engine='python',
    encoding='latin-1',
)

# Columns: movie id | title (with release year) | '|'-separated genres
movies.head()

Unnamed: 0,0,1,2
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [None]:
# Same '::'-separated, header-less layout as the other ml-1m files.
users = pd.read_csv(
    path / 'ml-1m/users.dat',
    sep='::',
    header=None,
    engine='python',
    encoding='latin-1',
)

# Columns (per the MovieLens 1M README):
# user id | gender | age group code | occupation code | zip code
users.head()

Unnamed: 0,0,1,2,3,4
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [None]:
# Same '::'-separated, header-less layout as the other ml-1m files.
ratings = pd.read_csv(
    path / 'ml-1m/ratings.dat',
    sep='::',
    header=None,
    engine='python',
    encoding='latin-1',
)

# Columns: user id | movie id | rating (1 = disliked ... 5 = liked) | timestamp
ratings.head()

Unnamed: 0,0,1,2,3
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


Preparing the training set and the test set

In [None]:
# u1.base is tab-separated and has no header line. header=None is required:
# without it pandas uses the first rating as column names and silently drops
# it from the data (79,999 rows instead of 80,000).
# Columns: user id | movie id | rating | timestamp
training_set = pd.read_csv(os.path.join(path, 'ml-100k/u1.base'), delimiter='\t', header=None)
training_set = np.array(training_set, dtype='int')

In [None]:
# Shape check: (number of ratings, 4 columns).
# NOTE(review): ml-100k's u1.base is documented as an 80,000-rating split; a
# count of 79,999 here would mean read_csv consumed the first data row as a
# header — confirm against the loading cell.
training_set.shape

(79999, 4)

In [None]:
# u1.test is tab-separated and has no header line. header=None is required:
# without it pandas uses the first rating as column names and silently drops
# it from the data (19,999 rows instead of 20,000).
# Columns: user id | movie id | rating | timestamp
test_set = pd.read_csv(os.path.join(path, 'ml-100k/u1.test'), delimiter='\t', header=None)
test_set = np.array(test_set, dtype='int')

In [None]:
# Shape check: (number of ratings, 4 columns).
# NOTE(review): ml-100k's u1.test is documented as a 20,000-rating split; a
# count of 19,999 here would mean read_csv consumed the first data row as a
# header — confirm against the loading cell.
test_set.shape

(19999, 4)

Getting the number of users and movies

In [None]:
# The largest id found in either split is used as the number of users / movies
# (ids are treated as 1-based by the conversion step that follows).
# ndarray.max() is vectorised; the builtin max() would iterate over every
# element of the column in Python.
nb_users = int(max(training_set[:, 0].max(), test_set[:, 0].max()))
nb_movies = int(max(training_set[:, 1].max(), test_set[:, 1].max()))

Converting the data into a matrix with one row per user and one column per movie

In [None]:
def convert(data, n_users=None, n_movies=None):
  """Turn a (user id, movie id, rating, ...) table into a dense rating matrix.

  Args:
    data: 2-D integer numpy array; column 0 holds 1-based user ids, column 1
      holds 1-based movie ids and column 2 holds the ratings.
    n_users: number of rows to produce. Defaults to the notebook-level
      `nb_users` when omitted.
    n_movies: number of columns to produce. Defaults to the notebook-level
      `nb_movies` when omitted.

  Returns:
    A list with one entry per user id 1..n_users; each entry is a list of
    n_movies floats holding that user's rating for every movie, with 0 for
    movies the user has not rated.
  """
  if n_users is None:
    n_users = nb_users
  if n_movies is None:
    n_movies = nb_movies

  new_data = []
  for id_user in range(1, n_users + 1):
    # Row mask for this user, computed once and reused for both columns.
    rated_by_user = data[:, 0] == id_user
    id_movies = data[rated_by_user, 1]
    id_ratings = data[rated_by_user, 2]

    # Movie ids are 1-based, hence the -1 when used as column indices.
    user_ratings = np.zeros(n_movies)
    user_ratings[id_movies - 1] = id_ratings
    new_data.append(list(user_ratings))

  return new_data

# Replace both rating tables with their dense user-by-movie form.
# Note: this rebinds the names to plain lists, so the cell cannot be re-run
# without reloading the raw arrays first (convert indexes its input as a 2-D array).
training_set, test_set = convert(training_set), convert(test_set)

Converting the data into Torch tensors

In [None]:
# Build float32 tensors from the nested rating lists
# (same result as the legacy torch.FloatTensor(...) constructor).
training_set = torch.tensor(training_set, dtype=torch.float32)
test_set = torch.tensor(test_set, dtype=torch.float32)

Converting the ratings into binary ratings 1 (Liked) or 0 (Not Liked)

In [None]:
# Classify every entry from the original star values before overwriting anything.
unrated = training_set == 0                           # 0 marks "no rating"
disliked = (training_set == 1) | (training_set == 2)  # `|` combines boolean masks element-wise
liked = training_set >= 3

# -1 = not rated, 0 = not liked (1-2 stars), 1 = liked (3-5 stars)
training_set[unrated] = -1
training_set[disliked] = 0
training_set[liked] = 1

In [None]:
# Same encoding as the training set: -1 = not rated, 0 = not liked, 1 = liked.
# Masks are taken from the original star values before anything is overwritten.
unrated = test_set == 0
disliked = (test_set == 1) | (test_set == 2)
liked = test_set >= 3

test_set[unrated] = -1
test_set[disliked] = 0
test_set[liked] = 1

Creating the architecture of the Neural Network

In [None]:
class RBM:
  """Restricted Boltzmann Machine with Bernoulli-sampled hidden and visible units.

  Attributes:
    W: (nh, nv) weight matrix linking hidden and visible units.
    a: (1, nh) bias added to the hidden activations.
    b: (1, nv) bias added to the visible activations.
  """

  def __init__(self, nv, nh):
    """Initialise all parameters from a standard normal distribution.

    Args:
      nv: number of visible units.
      nh: number of hidden units.
    """
    # Draw order (W, then a, then b) matters for seeded reproducibility.
    self.W = torch.randn(nh, nv)
    self.a = torch.randn(1, nh)
    self.b = torch.randn(1, nv)

  def sample_h(self, x):
    """Sample the hidden layer given a batch of visible vectors.

    Args:
      x: (batch, nv) tensor of visible values.

    Returns:
      (p_h_given_v, h): the (batch, nh) activation probabilities and a
      Bernoulli sample drawn from them.
    """
    # The (1, nh) bias broadcasts across the batch dimension.
    hidden_input = torch.mm(x, self.W.t()) + self.a
    p_h_given_v = torch.sigmoid(hidden_input)
    h_sample = torch.bernoulli(p_h_given_v)
    return p_h_given_v, h_sample

  def sample_v(self, y):
    """Sample the visible layer given a batch of hidden vectors.

    Args:
      y: (batch, nh) tensor of hidden values.

    Returns:
      (p_v_given_h, v): the (batch, nv) activation probabilities and a
      Bernoulli sample drawn from them.
    """
    # The (1, nv) bias broadcasts across the batch dimension.
    visible_input = torch.mm(y, self.W) + self.b
    p_v_given_h = torch.sigmoid(visible_input)
    v_sample = torch.bernoulli(p_v_given_h)
    return p_v_given_h, v_sample

  def train(self, v0, vk, ph0, phk):
    """Update W, a and b in place from the data and reconstruction statistics.

    Args:
      v0: (batch, nv) input visible vectors.
      vk: (batch, nv) visible vectors after the sampling steps.
      ph0: (batch, nh) hidden probabilities given v0.
      phk: (batch, nh) hidden probabilities given vk.

    No learning rate is applied; the raw differences are added to the parameters.
    """
    data_term = torch.mm(v0.t(), ph0)
    model_term = torch.mm(vk.t(), phk)
    # The products are (nv, nh); transpose to match W's (nh, nv) layout.
    self.W += (data_term - model_term).t()
    # Summing over the batch axis gives 1-D vectors that broadcast onto (1, n).
    self.b += torch.sum(v0 - vk, 0)
    self.a += torch.sum(ph0 - phk, 0)

In [None]:
nv = len(training_set[0])   # visible units: one per movie column
nh = 100                    # hidden units
batch_size = 100            # users per mini-batch in the training loop

# Seed torch's global RNG so the random weight initialisation (and the
# Bernoulli sampling that follows) is reproducible on Restart & Run All.
torch.manual_seed(0)
rbm = RBM(nv, nh)

Training the RBM

In [None]:
nb_epoch = 10

for epoch in range(nb_epoch):
  train_loss = 0
  s = 0.  # number of mini-batches processed in this epoch

  # Walk over every user. The stop value is nb_users (not nb_users - batch_size)
  # so the final, possibly shorter, batch is trained on as well; slicing past
  # the end of the tensor simply yields the remaining rows.
  for id_user in range(0, nb_users, batch_size):
    v0 = training_set[id_user: id_user + batch_size]  # input ratings, never modified
    vk = v0                                           # start of the sampling chain
    unrated = v0 < 0
    rated = v0 >= 0
    ph0, _ = rbm.sample_h(v0)

    # 10 rounds of alternating hidden/visible sampling.
    for k in range(10):
      _, hk = rbm.sample_h(vk)
      _, vk = rbm.sample_v(hk)
      # Keep the "not rated" marker (-1) fixed so missing entries are not learned.
      vk[unrated] = v0[unrated]

    phk, _ = rbm.sample_h(vk)
    rbm.train(v0, vk, ph0, phk)
    # Mean absolute difference between input and reconstruction on rated entries.
    train_loss += torch.mean(torch.abs(v0[rated] - vk[rated]))
    s += 1.

  print('epoch: ' + str(epoch) + ' loss: ' + str(train_loss/s))

epoch: 0 loss: tensor(0.3331)
epoch: 1 loss: tensor(0.2505)
epoch: 2 loss: tensor(0.2506)
epoch: 3 loss: tensor(0.2494)
epoch: 4 loss: tensor(0.2498)
epoch: 5 loss: tensor(0.2473)
epoch: 6 loss: tensor(0.2495)
epoch: 7 loss: tensor(0.2500)
epoch: 8 loss: tensor(0.2438)
epoch: 9 loss: tensor(0.2500)


Testing the RBM

In [None]:
test_loss = 0
s = 0.  # number of users that contributed to the loss

for id_user in range(nb_users):
  # The training ratings drive the hidden units; the test ratings are the target.
  v = training_set[id_user: id_user + 1]
  vt = test_set[id_user: id_user + 1]
  known = vt >= 0

  # Users without any rating in the test split cannot be scored.
  if not known.any():
    continue

  # One hidden/visible sampling pass produces the predicted ratings.
  _, h = rbm.sample_h(v)
  _, v = rbm.sample_v(h)
  test_loss += torch.mean(torch.abs(vt[known] - v[known]))
  s += 1.

print('Test loss: ' + str(test_loss/s))

Test loss: tensor(0.2354)
