Explanation: https://medium.com/machine-learning-researcher/boltzmann-machine-c2ce76d94da5

#Bernoulli-RBM-Recommender-System

**Short introduction:**

**1. Connect to Drive**

In [1]:
#from google.colab import drive
#drive.mount('/content/drive')

In [None]:
#%cd '/content/drive/My Drive/music_recommender/data'

/content/drive/My Drive/music_recommender/data


**2. Import Packages**

In [2]:
# Importing the libraries
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable
from sklearn.model_selection import train_test_split

**3. Load data and reducing to subset**

In [None]:
# load data
dtf_origin = pd.read_csv("train.csv")

# just looking at heavy users with more than 1250 songs listend
dtf_origin = dtf_origin.groupby('user_id').filter(lambda x : len(x)>1250)

Unnamed: 0,genre_id,ts_listen,media_id,album_id,context_type,release_date,platform_name,platform_family,media_duration,listen_type,user_gender,user_id,artist_id,user_age,is_listened
0,25471,1480597215,222606,41774,12,20040704,1,0,223,0,0,9241,55164,29,0
1,25571,1480544735,250467,43941,0,20060301,2,1,171,0,0,16547,55830,30,1
2,16,1479563953,305197,48078,1,20140714,2,1,149,1,1,7665,2704,29,1
3,7,1480152098,900502,71521,0,20001030,0,0,240,0,1,1580,938,30,0
4,7,1478368974,542335,71718,0,20080215,0,0,150,0,1,1812,2939,24,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7558829,0,1480398097,136334560,14581358,23,19910101,1,0,268,1,0,2592,129,24,0
7558830,0,1479973465,136591154,14617606,2,20161118,0,0,974,0,0,9028,63401,22,0
7558831,0,1479993377,136647128,14624304,4,20161012,0,2,175,1,0,393,617,26,0
7558832,0,1479936918,136647132,14624304,4,20161012,0,0,126,1,1,4507,221,29,1


In [None]:
# create users
users = dtf_origin[["user_id","user_gender","user_age"]]
users = users.drop_duplicates(subset=['user_id'])
users = users.sort_values(by = ['user_id'],ignore_index=True)


Unnamed: 0,user_id,user_gender,user_age
0,0,0,26
1,1,1,27
2,2,0,19
3,3,0,20
4,4,1,19
...,...,...,...
1171,1171,0,26
1172,1172,1,29
1173,1173,1,29
1174,1174,1,26


In [None]:
# create items
songs = dtf_origin[["media_id","genre_id","release_date","artist_id"]]
songs = songs.drop_duplicates(subset=['media_id'])
songs["old"] = songs["release_date"].apply(lambda x: 1 if x < 20140101 else 0) #Songs older as 2014 are count as "old".
songs.drop('release_date', axis=1, inplace=True)
songs = songs.sort_values(by = ['media_id'], ignore_index=True)
songs.insert(0, 'new_media_id', range(0, 0 + len(songs)))

Unnamed: 0,new_media_id,media_id,genre_id,artist_id,old
0,0,200224,25471,671,1
1,1,200226,25471,671,1
2,2,200230,25471,671,1
3,3,200559,9,15277,1
4,4,200756,7,54158,1
...,...,...,...,...,...
199383,199383,137099686,0,1651095,0
199384,199384,137099688,0,1651095,0
199385,199385,137100814,0,6864161,0
199386,199386,137148948,0,382937,0


In [None]:
#create ratings
ratings = dtf_origin[["user_id","media_id","is_listened","ts_listen"]]
ratings = ratings.sort_values(by = ['user_id'],ignore_index=True)
ratings = pd.merge(ratings, songs, how="left", left_on= 'media_id', right_on= "media_id")
ratings = ratings.drop(['media_id', "genre_id", "artist_id", "old"], axis=1)

Unnamed: 0,user_id,is_listened,ts_listen,new_media_id
0,0,1,1478799317,176641
1,0,1,1479483851,42174
2,0,1,1480286605,100761
3,0,1,1479483572,42183
4,0,1,1479482673,42182
...,...,...,...,...
2255499,1175,0,1480006853,184473
2255500,1175,0,1480012804,184478
2255501,1175,1,1480012826,184479
2255502,1175,1,1479756009,156346


**4. Descriptive analysis**

**5. Split into train & test set**

In [None]:
# Split & preparing the training set and the test set
full_dataset = ratings[["user_id","new_media_id","is_listened","ts_listen"]]
training_set, test_set = train_test_split(full_dataset, test_size=0.2)
training_set = training_set.sort_values(by = ['user_id'],ignore_index=True)
test_set = test_set.sort_values(by = ['user_id'],ignore_index=True)
training_set = np.array(training_set, dtype = 'int')
test_set = np.array(test_set, dtype = 'int')

**6. Create User-Song matrix**

In [None]:
# Getting the number of users and movies
nb_users = int(max(max(training_set[:,0]), max(test_set[:,0])))
nb_songs = int(max(max(training_set[:,1]), max(test_set[:,1])))

In [None]:
# Converting the data into an array with users in lines and movies in columns
def convert(data):
    new_data = []
    for id_users in range(0, nb_users):
        id_songs = data[:,1][data[:,0] == id_users]
        id_ratings = data[:,2][data[:,0] == id_users]
        ratings = np.zeros(nb_songs)
        ratings[id_songs - 1] = id_ratings
        new_data.append(list(ratings))
    return new_data
training_set = convert(training_set)
test_set = convert(test_set)

In [None]:
# Converting the data into Torch tensors
training_set = torch.FloatTensor(training_set)
test_set = torch.FloatTensor(test_set)

In [None]:
# Converting the ratings into binary ratings 1 (Liked) or 0 (Not Liked)
training_set[training_set == 0] = -1
test_set[test_set == 0] = -1

**7. Train and test RBM** 

In [None]:
# Creating the architecture of the Neural Network
class RBM():
    def __init__(self, nv, nh):
        self.W = torch.randn(nh, nv)
        self.a = torch.randn(1, nh)
        self.b = torch.randn(1, nv)
    def sample_h(self, x):
        wx = torch.mm(x, self.W.t())
        activation = wx + self.a.expand_as(wx)
        p_h_given_v = torch.sigmoid(activation)
        return p_h_given_v, torch.bernoulli(p_h_given_v)
    def sample_v(self, y):
        wy = torch.mm(y, self.W)
        activation = wy + self.b.expand_as(wy)
        p_v_given_h = torch.sigmoid(activation)
        return p_v_given_h, torch.bernoulli(p_v_given_h)
    def grad(self, v0, vk, ph0, phk):
        self.W += (torch.mm(v0.t(), ph0) - torch.mm(vk.t(), phk)).t()
        self.b += torch.sum((v0 - vk), 0)
        self.a += torch.sum((ph0 - phk), 0)
    # Training the RBM
    def train(self, batch_size, nb_epoch):
        for epoch in range(1, nb_epoch + 1):
            train_loss = 0
            s = 0.
            for id_user in range(0, nb_users - batch_size, batch_size):
                vk = training_set[id_user:id_user+batch_size]
                v0 = training_set[id_user:id_user+batch_size]
                ph0,_ = self.sample_h(v0)
                for k in range(10):
                    _,hk = self.sample_h(vk)
                    _,vk = self.sample_v(hk)
                    vk[v0<0] = v0[v0<0]
                phk,_ = self.sample_h(vk)
                self.grad(v0, vk, ph0, phk)
                train_loss += torch.mean(torch.abs(v0[v0>=0] - vk[v0>=0]))
                s += 1.
            print('epoch: '+str(epoch)+' Train loss: '+str(train_loss/s))
    # Testing the RBM
    def test(self):
        test_loss = 0
        s = 0.
        for id_user in range(nb_users):
            v = training_set[id_user:id_user+1]
            vt = test_set[id_user:id_user+1]
            if len(vt[vt>=0]) > 0:
                _,h = self.sample_h(v) #hidden
                _,v = self.sample_v(h) #visible
                test_loss += torch.mean(torch.abs(vt[vt>=0] - v[vt>=0]))
                s += 1.
        print('test loss: '+str(test_loss/s))
      


In [None]:
# training
nv = len(training_set[0])
nh = 200
batch_size = 128
nb_epoch = 30
rbm = RBM(nv, nh)
rbm.train(batch_size, nb_epoch)

epoch: 1 Train loss: tensor(0.3596)
epoch: 2 Train loss: tensor(0.1510)
epoch: 3 Train loss: tensor(0.0843)
epoch: 4 Train loss: tensor(0.0584)
epoch: 5 Train loss: tensor(0.0407)
epoch: 6 Train loss: tensor(0.0279)
epoch: 7 Train loss: tensor(0.0212)
epoch: 8 Train loss: tensor(0.0169)
epoch: 9 Train loss: tensor(0.0141)
epoch: 10 Train loss: tensor(0.0155)
epoch: 11 Train loss: tensor(0.0214)
epoch: 12 Train loss: tensor(0.0203)
epoch: 13 Train loss: tensor(0.0168)
epoch: 14 Train loss: tensor(0.0143)
epoch: 15 Train loss: tensor(0.0119)
epoch: 16 Train loss: tensor(0.0106)
epoch: 17 Train loss: tensor(0.0095)
epoch: 18 Train loss: tensor(0.0085)
epoch: 19 Train loss: tensor(0.0076)
epoch: 20 Train loss: tensor(0.0072)
epoch: 21 Train loss: tensor(0.0067)
epoch: 22 Train loss: tensor(0.0088)
epoch: 23 Train loss: tensor(0.0087)
epoch: 24 Train loss: tensor(0.0077)
epoch: 25 Train loss: tensor(0.0071)
epoch: 26 Train loss: tensor(0.0068)
epoch: 27 Train loss: tensor(0.0062)
epoch: 28 

In [None]:
rbm.test()

test loss: tensor(0.0812)


**8. Predict if next song (original test set) is listend**

In [None]:
# Load test.csv and transform it to an identical subset as train
dtf_origin_test = pd.read_csv("test.csv")
dtf_origin_test = pd.merge(dtf_origin_test, users, how="inner", left_on= 'user_id', right_on= "user_id")
dtf_origin_test = dtf_origin_test.sort_values(by = ['user_id'],ignore_index=True)
dtf_origin_test.insert(0, 'new_id_user', range(0, 0 + len(dtf_origin_test)))
dtf_origin_test = pd.merge(dtf_origin_test, songs, how="left", left_on= 'media_id', right_on= "media_id")
dtf_origin_test

# this set contains only user which we took also in the training part. Challenging: Test set contains songs that were not on the train part.

Unnamed: 0,new_id_user,sample_id,genre_id_x,ts_listen,media_id,album_id,context_type,release_date,platform_name,platform_family,...,user_gender_x,user_id,artist_id_x,user_age_x,user_gender_y,user_age_y,new_media_id,genre_id_y,artist_id_y,old
0,0,14561,0,1480614542,117678828,12219078,1,20151012,0,2,...,0,0,468920,26,0,26,161946.0,0.0,468920.0,0.0
1,1,6026,0,1480614639,103870206,10804138,1,20150716,0,2,...,1,1,4052518,27,1,27,146797.0,0.0,4052518.0,0.0
2,2,9627,0,1480609115,110692486,11516526,1,20151204,1,2,...,0,2,5328949,19,0,19,154758.0,0.0,5328949.0,0.0
3,3,6064,3645,1480629074,4628432,425785,1,20091102,0,0,...,0,3,57456,20,0,20,,,,
4,4,8065,14,1480571507,71152679,6985314,1,20130930,0,0,...,1,4,1435566,19,1,19,103847.0,14.0,1435566.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1171,1171,9772,7054,1480608734,75305754,7453206,1,20140221,2,1,...,0,1171,298954,26,0,26,112076.0,7054.0,298954.0,0.0
1172,1172,4799,4310,1480619367,67575448,6615707,1,20130524,0,0,...,1,1172,379709,29,1,29,95443.0,4310.0,379709.0,1.0
1173,1173,13852,0,1480508731,120007466,12510040,1,20160304,0,0,...,1,1173,5021724,29,1,29,165396.0,0.0,5021724.0,0.0
1174,1174,14656,734,1480536975,3747667,351800,1,20090710,2,1,...,1,1174,48,26,1,26,36740.0,734.0,48.0,1.0


In [None]:
#predict if next song y is listend by user x
def predict(new_id_user, new_media_id):
  v = training_set[new_id_user:new_id_user+1] #new_id_user
  _,h = rbm.sample_h(v)
  _,v = rbm.sample_v(h)
  prediction = v[0:1, new_media_id:new_media_id+1] #new_media_id
  print ("Will the song be listend over 30 sec?:  ", prediction)

predict(0,161946)

Will the song be listend over 30 sec?:   tensor([[0.]])


In [None]:
#Have a look at whole tensor (prediction one and zero for all 200'000 songs for user x)
v = training_set[new_id_user:new_id_user+1] #new_id_user
_,h = rbm.sample_h(v)
_,v = rbm.sample_v(h)

torch.set_printoptions(profile="full")
v #predicted outcome for user i () of
# v.numel() #count predictions (=songs)

tensor([[1., 1., 1., 0., 1., 0., 1., 1., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1.,
         0., 1., 1., 1., 1., 0., 1., 1., 1., 0., 1., 0., 1., 0., 0., 1., 0., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 0., 0., 1., 0.,
         1., 1., 1., 1., 1., 1., 1., 0., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 0., 0., 0., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0.,
         1., 0., 0., 0., 1., 0., 0., 0., 1., 1., 1., 1., 0., 0., 1., 1., 0., 1.,
         1., 0., 0., 1., 1., 1., 1., 1., 0., 0., 1., 1., 1., 0., 1., 1., 0., 1.,
         1., 1., 1., 1., 0., 0., 1., 1., 1., 1., 1., 0., 0., 1., 0., 1., 1., 0.,
         1., 1., 0., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 0., 0., 0., 1., 1., 0., 1., 1., 1., 1., 1., 0., 0., 1., 1.,
         1., 1., 1., 1., 1.,