### Recommender Systems 
-- predicting if a user like a movie or not. 

In [3]:
import pandas as pd 
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.parallel
import torch.utils.data
from torch.autograd import Variable

### Importing datasets 

-- details regarding the dataset is given in readme file attached  

In [4]:
movies = pd.read_csv('DataSet/ml-1m/movies.dat', sep = '::', header = None, engine= 'python', 
                     encoding='latin-1')
print(movies.head())
print(movies.shape)

   0                                   1                             2
0  1                    Toy Story (1995)   Animation|Children's|Comedy
1  2                      Jumanji (1995)  Adventure|Children's|Fantasy
2  3             Grumpier Old Men (1995)                Comedy|Romance
3  4            Waiting to Exhale (1995)                  Comedy|Drama
4  5  Father of the Bride Part II (1995)                        Comedy
(3883, 3)


In [5]:
user = pd.read_csv('DataSet/ml-1m/users.dat', sep = '::', header = None, engine= 'python', 
                     encoding='latin-1')
print(user.head())
print(user.shape)

   0  1   2   3      4
0  1  F   1  10  48067
1  2  M  56  16  70072
2  3  M  25  15  55117
3  4  M  45   7  02460
4  5  M  25  20  55455
(6040, 5)


In [6]:
rating = pd.read_csv('DataSet/ml-1m/ratings.dat', sep = '::', header = None, engine= 'python', 
                     encoding='latin-1')
print(rating.head())
print(rating.shape)

   0     1  2          3
0  1  1193  5  978300760
1  1   661  3  978302109
2  1   914  3  978301968
3  1  3408  4  978300275
4  1  2355  5  978824291
(1000209, 4)


In [7]:
# we are taking our training set from u1.base of 'ml - 100k' data and u1.test as test set
train = pd.read_csv('DataSet/ml-100k/u1.base', sep= '\t')
train.head()
# User||MoviesID||Rating||Timestamp

Unnamed: 0,1,1.1,5,874965758
0,1,2,3,876893171
1,1,3,4,878542960
2,1,4,3,876893119
3,1,5,3,889751712
4,1,7,4,875071561


In [8]:
train = np.array(train, dtype='int')
test = pd.read_csv('DataSet/ml-100k/u1.test', sep= '\t')
test = np.array(test, dtype='int')

### Data Preprocessing 

Number of users and movies 

In [9]:
nb_users = int(max(max(train[:, 0]), max(test[:,0])))
nb_movies = int(max(max(train[:, 1]), max(test[:,1])))
print(nb_users)
print(nb_movies)

943
1682


Preparing the data into an array with user as rows and movies as columns so as to put in our BMs

In [10]:
def convert(data):
    new_df = []
    for id_users in range(1, nb_users + 1):
        id_movies = data[:, 1][data[:, 0] == id_users]
        id_rating = data[:, 2][data[:, 0] == id_users]
        ratings = np.zeros(nb_movies)
        ratings[id_movies-1] = id_rating
        new_df.append(list(ratings))
    return new_df

In [11]:
train = convert(train)
test = convert(test)
# len(test[0])

Converting into tensor for pytorch

In [12]:
train = torch.FloatTensor(train)
test = torch.FloatTensor(test)


In [13]:
# converting all the rating to binary because we are predicting whether a user like a movie or not
# Our RBM predict the values of unrated movies.
train[train == 0] = -1
train[train == 1] = 0
train[train == 2] = 0
train[train >= 3] = 1
test[test == 0] = -1
test[test == 1] = 0
test[test == 2] = 0
test[test >= 3] = 1

In [14]:
train[0]

tensor([-1.,  1.,  1.,  ..., -1., -1., -1.])

### Building RBM

In [15]:
class RBM():
    def __init__(self, nv, nh):
        self.W = torch.randn(nh, nv)
        self.a = torch.randn(1, nh)
        self.b = torch.randn(1, nv)

    def sample_h(self, x):
        wx = torch.mm(x, self.W.t())
        activation = wx + self.a.expand_as(wx)
        p_h_given_v = torch.sigmoid(activation)
        return p_h_given_v, torch.bernoulli(p_h_given_v)
    # the torch.bernaulli return a tensor containing all the binary value of node depending upon the value of probability. whether they 
    # are activated or not.

    def sample_v(self, y):
        wy = torch.mm(y, self.W)
        activation = wy + self.b.expand_as(wy)
        p_v_given_h = torch.sigmoid(activation)
        return p_v_given_h, torch.bernoulli(p_v_given_h)
    # this function return the values of visible nodes in a form of vector or tensor

    # contrastive divergence of Gibbs Sampling 
    #  vk --> visible node obtained after k CDs 
    def train(self, v0, vk, ph0, phk):
        self.W += (torch.mm(v0.t(), ph0) - torch.mm(vk.t(), phk)).t()
        self.b += torch.sum((v0 - vk), 0)
        self.a += torch.sum((ph0 - phk), 0)   

In [16]:
nv = len(train[0])
# nh depends upon our size
nh = 500
batch_size = 50

In [17]:
rbm = RBM(nv, nh)
rbm.W.shape

torch.Size([500, 1682])

Training 

In [18]:
epoch = 20
for i in range(0, epoch):
    train_loss = 0
    cnt = 0. # cnt in float
    for id_users in range(0, nb_users - batch_size, batch_size):
        vk = train[id_users:id_users+batch_size]
        # v0 is the target which is used for comparision
        v0 = train[id_users:id_users+batch_size]
        ph0,_ = rbm.sample_h(v0)
        for k in range(20):
            _,hk = rbm.sample_h(vk)
            _,vk = rbm.sample_v(hk)
            # updation should not be done on the movies which user doesn't watch
            vk[v0 < 0] = v0[v0<0]
        phk,_ = rbm.sample_h(vk)   
        rbm.train(v0, vk, ph0, phk)
        train_loss += torch.mean(torch.abs(vk[vk>=0] - v0[v0>=0]))
        cnt += 1.
    print(f"epoch : {i} || loss: {train_loss/cnt} ")   # printing the cumlative normalised loss 


epoch : 0 || loss: 0.33229896426200867 
epoch : 1 || loss: 0.2547893524169922 
epoch : 2 || loss: 0.24982108175754547 
epoch : 3 || loss: 0.24887531995773315 
epoch : 4 || loss: 0.24731135368347168 
epoch : 5 || loss: 0.24653227627277374 
epoch : 6 || loss: 0.24604511260986328 
epoch : 7 || loss: 0.24607239663600922 
epoch : 8 || loss: 0.24602049589157104 
epoch : 9 || loss: 0.24376767873764038 
epoch : 10 || loss: 0.24484126269817352 
epoch : 11 || loss: 0.24502597749233246 
epoch : 12 || loss: 0.24501731991767883 
epoch : 13 || loss: 0.24506384134292603 
epoch : 14 || loss: 0.24542362987995148 
epoch : 15 || loss: 0.24472488462924957 
epoch : 16 || loss: 0.2463141828775406 
epoch : 17 || loss: 0.24213004112243652 
epoch : 18 || loss: 0.2434638887643814 
epoch : 19 || loss: 0.24396030604839325 


Testing

In [19]:
test_loss = 0
cnt = 0. # cnt in float
for id_users in range(nb_users):
    v = train[id_users:id_users+1]
    # vt is the target which is used for comparision
    vt = test[id_users:id_users+1]
    if len(vt[vt >= 0]) > 0:
        _,h = rbm.sample_h(v)
        _,v = rbm.sample_v(h)
        test_loss += torch.mean(torch.abs(v[vt>=0] - vt[vt>=0]))
        cnt += 1.
print(f"test loss: {test_loss/cnt} ")   # printing the cumlative normalised loss 


test loss: 0.2472657859325409 
