# RBM (Restricted Boltzmann Machine)

### Setup notebook

In [None]:
# Importing the libraries
import sys, os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable
import torch.nn.functional as F

from mymods.lauthom import *


### Data

In [None]:
# Look up the MovieLens 'movies', 'users' and 'ratings' files.
# NOTE(review): get_path is not defined in this file; presumably it comes from the
# star import of mymods.lauthom and searches the '*/*' pattern - confirm.
get_path('*/*', 'movies')
get_path('*/*', 'users')
get_path('*/*', 'ratings')

In [None]:
def read_file(filename, **kwargs):
    """Find a file two directory levels below ``../../`` and read it as CSV.

    Args:
        filename: file name (or relative pattern) appended to the glob ``*/*/``.
        **kwargs: forwarded unchanged to ``pandas.read_csv``.

    Returns:
        The ``DataFrame`` produced by ``pandas.read_csv``.

    Raises:
        FileNotFoundError: if no file matches the glob pattern.
    """
    from pathlib import Path
    pattern = '*/*/' + str(filename)
    # sort the matches so the same file is picked on every run when several exist
    matches = sorted(Path('../../').glob(pattern))
    if not matches:
        raise FileNotFoundError(
            "no file matching '{}' below '../../'".format(pattern))
    return pd.read_csv(matches[0], **kwargs)

In [None]:
# Importing the dataset
# all three .dat files are read with the same options:
# '::' separator (needs the python engine), no header row, latin-1 encoding
_dat_options = dict(sep='::', header=None, engine='python', encoding='latin-1')
movies = read_file('movies.dat', **_dat_options)
users = read_file('users.dat', **_dat_options)
ratings = read_file('ratings.dat', **_dat_options)

In [None]:
# Preview the first rows of the movies table
movies.head()

### Train test sets

In [None]:
# Look up the 'u1' train/test split files.
# NOTE(review): get_path is not defined in this file; presumably provided by the
# star import of mymods.lauthom - confirm.
get_path('*/*', 'u1')

In [None]:
# The MovieLens 100K u1.base / u1.test files are tab-separated and ship without a
# header row; header=None keeps the first rating from being consumed as column names.
training_set = read_file('../../_data/ml-100k/u1.base', delimiter='\t', header=None)
test_set = read_file('../../_data/ml-100k/u1.test', delimiter='\t', header=None)

In [None]:
# Preparing the training set and the test set
# last feature is set type; 0 = train, 1 = test
# concatenate assumes axis = 0
# hstack assumes axis = 1 unless inputs are 1d, then axis = 0
# vstack assumes axis = 0 after adding an axis if inputs are 1d
# append flattens array


# turn the DataFrame into an integer array
training_set = np.array(training_set, dtype='int')
# peek at the first rows; an ndarray has no DataFrame-style .head(), so slice instead
training_set[:5]


test_set = np.array(test_set, dtype='int')
test_set.shape

In [None]:
# Example: rows belonging to user 1, restricted to the last two columns (2 and 3)
# NOTE(review): presumably these columns are rating and timestamp - confirm
user_id = 1
trn = training_set[training_set[:, 0] == user_id][:, 2:4]
tst = test_set[test_set[:, 0] == user_id][:, 2:4]

# For the first 10 values of the test selection's second column, report whether
# the value occurs anywhere in the train selection and in the test selection
for t in tst[:10, 1]:
    print(t, t in trn, t in tst)

#### Maximum # users and movies

In [None]:
# Highest user id (column 0) and movie id (column 1) seen in either split
nb_users = int(max(training_set[:, 0].max(), test_set[:, 0].max()))
nb_movies = int(max(training_set[:, 1].max(), test_set[:, 1].max()))
nb_users, nb_movies

### Converting the data into an array 

 - users in lines
 - movies in columns

In [None]:
# Converting the data into an array with users in lines and movies in columns
# return nested list: user_list(movie_rating_list) - format required by pytorch
def convert(data, n_users=None, n_movies=None):
    """Turn rating rows into a dense users x movies nested list.

    Args:
        data: 2-D integer array whose columns 0, 1 and 2 hold the user id,
            the movie id and the rating; ids are 1-based.
        n_users: number of users (rows) to produce; defaults to the
            module-level ``nb_users``.
        n_movies: number of movies (columns) to produce; defaults to the
            module-level ``nb_movies``.

    Returns:
        A list with one entry per user id ``1..n_users``; each entry is a list
        of ``n_movies`` ratings in which movies without a rating are 0.
    """
    if n_users is None:
        n_users = nb_users
    if n_movies is None:
        n_movies = nb_movies

    user_ids, movie_ids, rating_values = data[:, 0], data[:, 1], data[:, 2]
    new_data = []
    for id_user in range(1, n_users + 1):
        # compute the row mask of this user once and reuse it for both columns
        is_user = user_ids == id_user
        ratings = np.zeros(n_movies)
        # movie ids are 1-based, list positions 0-based
        ratings[movie_ids[is_user] - 1] = rating_values[is_user]
        new_data.append(list(ratings))
    return new_data
# Replace both raw rating arrays by the nested per-user rating lists built by convert
training_set = convert(training_set)
test_set = convert(test_set)

In [None]:
# convert returns plain nested lists, which have no .shape attribute;
# measure both sets through NumPy instead
np.shape(training_set), np.shape(test_set)

#### Sanity check train and test set

In [None]:
# both sets must have the same shape, i.e. cover the same users and movies
assert np.shape(training_set) == np.shape(test_set)

# number of movies/ratings per user: training set first, then test set
print(len(training_set[0]), len(test_set[0]), sep='\n')

### 

In [None]:
# Turn the nested rating lists into 32-bit float Torch tensors
training_set = torch.tensor(training_set, dtype=torch.float32)
test_set = torch.tensor(test_set, dtype=torch.float32)

In [None]:
# Recode the ratings in place: 0 (no rating) -> -1, 1 and 2 -> 0, 3 and up -> 1.
# The 0 -> -1 step runs first so that the zeros written for ratings 1 and 2
# are not confused with missing ratings.
for _subset in (training_set, test_set):
    _subset[_subset == 0] = -1
    _subset[(_subset == 1) | (_subset == 2)] = 0
    _subset[_subset >= 3] = 1

### Build model

args: nv = number of visible neurons (input layer), nh = number of hidden neurons

In [None]:
class RBM():
    """Restricted Boltzmann Machine with Bernoulli-sampled units.

    Attributes:
        W: weight matrix of shape (nh, nv).
        a: hidden-layer bias of shape (1, nh).
        b: visible-layer bias of shape (1, nv).
    """

    def __init__(self, nv, nh):
        """Initialize weights and biases with random normal values.

        Args:
            nv: number of visible (input) units.
            nh: number of hidden units.
        """
        self.W = torch.randn(nh, nv)
        self.a = torch.randn(1, nh)
        self.b = torch.randn(1, nv)

    def __repr__(self):
        """Show the layer sizes instead of the default object address."""
        nh, nv = self.W.shape
        return 'RBM(nv={}, nh={})'.format(nv, nh)

    def probability(self, activation):
        """Return the sigmoid probabilities and a Bernoulli sample drawn from them."""
        sigmoid = torch.sigmoid(activation)
        return sigmoid, torch.bernoulli(sigmoid)

    def activation(self, input, weight, bias):
        """Return ``input @ weight`` plus the bias expanded over the batch rows."""
        wi = torch.mm(input, weight)
        return wi + bias.expand_as(wi)

    def sample_h(self, x):
        """Return probabilities and binary samples of the hidden layer given visible ``x``."""
        return self.probability(self.activation(x, self.W.t(), self.a))

    def sample_v(self, y):
        """Return probabilities and binary samples of the visible layer given hidden ``y``."""
        return self.probability(self.activation(y, self.W, self.b))

    def train(self, v0, vk, ph0, phk):
        """Update weights and biases in place from state 0 and state k.

        Args:
            v0: initial visible batch, shape (batch, nv).
            vk: visible batch after k sampling steps, shape (batch, nv).
            ph0: hidden probabilities given ``v0``, shape (batch, nh).
            phk: hidden probabilities given ``vk``, shape (batch, nh).
        """
        # W is (nh, nv), so the products must be hidden x visible;
        # v.t() @ ph would be (nv, nh) and cannot be added to W
        self.W += torch.mm(ph0.t(), v0) - torch.mm(phk.t(), vk)
        self.b += torch.sum((v0 - vk), 0)
        self.a += torch.sum((ph0 - phk), 0)

In [None]:
# Hyperparameters
N_VIS = len(training_set[0])    # one visible unit per movie column
N_HID, N_EPOCH = 100, 20        # hidden units, training epochs
BATCH_SIZE, N_WALKS = 100, 15   # users per batch, sampling steps per batch

rbm = RBM(nv=N_VIS, nh=N_HID)
print(rbm)

### 

In [None]:
# Training the RBM
# train on rated movies - exclude unrated movies (ratings with value -1)
# loss = mean absolute difference between true and reconstructed ratings,
# averaged over the batches of an epoch
for epoch in range(1, N_EPOCH + 1):

    cum_train_loss = 0.
    u = 0.  # number of batches processed, used to average the cumulative loss

    # batchwise: start indices 0, BATCH_SIZE, 2 * BATCH_SIZE, ...
    # NOTE: users after the last start index + BATCH_SIZE are not visited
    for id_user in range(0, nb_users - BATCH_SIZE, BATCH_SIZE):
        vk = training_set[id_user:id_user+BATCH_SIZE]
        v0 = vk.clone()
        unrated = v0 < 0
        rated = v0 >= 0
        ph0, _ = rbm.sample_h(v0)

        # sample back and forth between hidden and visible layer N_WALKS times
        for k in range(N_WALKS):
            _, hk = rbm.sample_h(vk)
            _, vk = rbm.sample_v(hk)
            vk[unrated] = v0[unrated]  # do not update vk for unrated movies

        # hidden probabilities of the final visible state
        phk, _ = rbm.sample_h(vk)

        # update weights
        rbm.train(v0, vk, ph0, phk)

        # vk is v_hat or inferred rating of rated movies
        # loss = difference in ratings; 0-1, 1-0, 0-0, 1-1
        # 25% loss = 1 out of 4 movies are misqualified
        cum_train_loss += torch.mean(torch.abs(v0[rated] - vk[rated])).item()
        u += 1.

    # report once per epoch; max() guards against an epoch without any batch
    print('epoch: ' + str(epoch) + ' loss: ' + str(cum_train_loss / max(u, 1.)))

### 

In [None]:
# Testing the RBM
# the train row of a user is the input, the test row holds the ratings to predict
# loss = mean absolute difference on the movies rated in the test set,
# averaged over the users that have at least one test rating
cum_test_loss = 0.
u = 0.  # number of users that contributed to the loss

for id_user in range(nb_users):
    v = training_set[id_user:id_user+1]
    vt = test_set[id_user:id_user+1]
    rated = vt >= 0

    # a user without test ratings has nothing to compare; the mean over an
    # empty selection would be nan and spoil the running total
    if not rated.any():
        continue

    # one sampling step: visible -> hidden -> visible
    _, h = rbm.sample_h(v)
    _, v = rbm.sample_v(h)

    cum_test_loss += torch.mean(torch.abs(vt[rated] - v[rated])).item()
    u += 1.

# report the average once; max() guards against a test set without any rating
print('test loss: ' + str(cum_test_loss / max(u, 1.)))