# Recommendation System using Content Based Filtering and RBM.

In [None]:
! pip install surprise

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise (from surprise)
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m37.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp310-cp310-linux_x86_64.whl size=3095461 sha256=8c6376509923fd62e2c178f7b89c5de97f2b87dadcccf869623e574dc173f4dc
  Stored in directory: /root/.cache/pip/wheels/a5/ca/a8/4e28def53797fdc4363ca4af740db15a9c2f1595ebc51fb445
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.

In [None]:
import pandas as pd
import numpy as np
import math
import numpy as np
import os
import csv
import random
import heapq
import re
from collections import defaultdict

from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise import AlgoBase
from surprise import PredictionImpossible
from surprise.model_selection import train_test_split
from surprise.model_selection import LeaveOneOut
from surprise import KNNBaseline

## Data Preparation

In [None]:
from google.colab import files

# Ask for the Kaggle API token through Colab's upload widget.
print("Please upload your kaggle.json file.")
uploaded = files.upload()

# Echo every received file with its size so a wrong or empty upload is noticed immediately.
for fn, content in uploaded.items():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(content)))

Please upload your kaggle.json file.


Saving kaggle.json to kaggle.json
User uploaded file "kaggle.json" with length 71 bytes


In [None]:
from google.colab import drive
# Mount Google Drive; the preprocessed destinations CSV is read from under this mount point later on.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Download the Indonesia Tourism Destination dataset from Kaggle.

In [None]:
! cp ./kaggle.json ~/.kaggle/
! kaggle datasets download -d aprabowo/indonesia-tourism-destination

Downloading indonesia-tourism-destination.zip to /content
  0% 0.00/158k [00:00<?, ?B/s]
100% 158k/158k [00:00<00:00, 103MB/s]


In [None]:
! unzip ./indonesia-tourism-destination.zip

Archive:  ./indonesia-tourism-destination.zip
  inflating: package_tourism.csv     
  inflating: tourism_rating.csv      
  inflating: tourism_with_id.csv     
  inflating: user.csv                


In [None]:
# Load the tables explored below: the ratings and the raw destinations table were extracted
# from the Kaggle archive; the preprocessed destinations table is read from Google Drive.
rating_df = pd.read_csv("./tourism_rating.csv")
tourism_2_df = pd.read_csv("./tourism_with_id.csv")
tourism_df = pd.read_csv(
    "./drive/MyDrive/Dataset/preprocessed_indonesia_tourism_destination_df.csv"
)

In [None]:
# Preview the first rows of the preprocessed destinations table.
tourism_df.head()

Unnamed: 0,Place_Id,Place_Name,Description,Category,City,Price,Rating,Lat,Long
0,1,Monumen Nasional,Monumen Nasional atau yang populer disingkat d...,Budaya,Jakarta,20000,4.6,-6.175392,106.827153
1,2,Kota Tua,"Kota tua di Jakarta, yang juga bernama Kota Tu...",Budaya,Jakarta,0,4.6,-6.137645,106.817125
2,3,Dunia Fantasi,Dunia Fantasi atau disebut juga Dufan adalah t...,Taman Hiburan,Jakarta,270000,4.6,-6.125312,106.833538
3,4,Taman Mini Indonesia Indah (TMII),Taman Mini Indonesia Indah merupakan suatu kaw...,Taman Hiburan,Jakarta,10000,4.5,-6.302446,106.895156
4,5,Atlantis Water Adventure,Atlantis Water Adventure atau dikenal dengan A...,Taman Hiburan,Jakarta,94000,4.5,-6.12419,106.839134


In [None]:
# Column dtypes and non-null counts of the destinations table.
tourism_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 437 entries, 0 to 436
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Place_Id     437 non-null    int64  
 1   Place_Name   437 non-null    object 
 2   Description  437 non-null    object 
 3   Category     437 non-null    object 
 4   City         437 non-null    object 
 5   Price        437 non-null    int64  
 6   Rating       437 non-null    float64
 7   Lat          437 non-null    float64
 8   Long         437 non-null    float64
dtypes: float64(3), int64(2), object(4)
memory usage: 30.9+ KB


In [None]:
# Preview the first rows of the ratings table (one row per user/place rating).
rating_df.head()

Unnamed: 0,User_Id,Place_Id,Place_Ratings
0,1,179,3
1,1,344,2
2,1,5,5
3,1,373,3
4,1,101,4


In [None]:
# Column dtypes and non-null counts of the ratings table.
rating_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   User_Id        10000 non-null  int64
 1   Place_Id       10000 non-null  int64
 2   Place_Ratings  10000 non-null  int64
dtypes: int64(3)
memory usage: 234.5 KB


In [None]:
# Summary statistics of the ratings table, including the range of the rating values.
rating_df.describe()

Unnamed: 0,User_Id,Place_Id,Place_Ratings
count,10000.0,10000.0,10000.0
mean,151.2927,219.4164,3.0665
std,86.137374,126.228335,1.379952
min,1.0,1.0,1.0
25%,77.0,108.75,2.0
50%,151.0,220.0,3.0
75%,226.0,329.0,4.0
max,300.0,437.0,5.0


## Data Modelling.

In [None]:
class TourismDataset:
    """File-backed access to the tourism ratings and the destination metadata.

    Two CSV files are read, both with a header line:

    * ``ratingsPath``: rows are read as ``user, place, rating``.
    * ``tourismPath``: this class reads column 0 as the place ID, column 1 as
      the place name, column 5 as the price, column 7 as the latitude and
      column 8 as the longitude.

    Both paths are class attributes and can be overridden per instance.
    """

    # Legacy attribute names kept for backward compatibility; nothing in this class reads them.
    movieID_to_name = {}
    name_to_movieID = {}
    ratingsPath = './tourism_rating.csv'
    tourismPath = "./drive/MyDrive/Dataset/preprocessed_indonesia_tourism_destination_df.csv"

    def __init__(self):
        # Start with empty lookups so the name/ID getters return their "not found"
        # value instead of raising AttributeError when loadData() was not called yet.
        self.tourismID_to_name = {}
        self.name_to_tourismID = {}

    def loadData(self):
        """Load the ratings into a Surprise dataset and (re)build the ID <-> name lookups.

        Returns:
            The object returned by ``Dataset.load_from_file`` for ``ratingsPath``.
        """
        self.tourismID_to_name = {}
        self.name_to_tourismID = {}

        reader = Reader(line_format='user item rating', sep=',', skip_lines=1)

        ratingsDataset = Dataset.load_from_file(self.ratingsPath, reader=reader)

        with open(self.tourismPath, newline='', encoding='ISO-8859-1') as csvfile:
            tourismReader = csv.reader(csvfile)
            next(tourismReader, None)  # Skip header line (tolerates an empty file)
            for row in tourismReader:
                tourismID = int(row[0])
                tourismName = row[1]
                self.tourismID_to_name[tourismID] = tourismName
                self.name_to_tourismID[tourismName] = tourismID

        return ratingsDataset

    def _readColumn(self, columnIndex, convert):
        """Map every place ID to ``convert(row[columnIndex])`` read from ``tourismPath``.

        The result is a ``defaultdict(int)``, so looking up an unknown place ID
        yields ``0`` rather than raising ``KeyError``.
        """
        values = defaultdict(int)
        with open(self.tourismPath, newline='', encoding='ISO-8859-1') as csvfile:
            tourismReader = csv.reader(csvfile)
            next(tourismReader, None)  # Skip header line (tolerates an empty file)
            for row in tourismReader:
                values[int(row[0])] = convert(row[columnIndex])
        return values

    def getPrice(self):
        """Return a ``defaultdict`` mapping place ID -> price as ``int``."""
        return self._readColumn(5, int)

    def getLat(self):
        """Return a ``defaultdict`` mapping place ID -> latitude as ``float``."""
        return self._readColumn(7, float)

    def getLong(self):
        """Return a ``defaultdict`` mapping place ID -> longitude as ``float``."""
        return self._readColumn(8, float)

    def getTravellingPlaceName(self, tourismID):
        """Return the name stored for ``tourismID``, or ``""`` when it is unknown."""
        return self.tourismID_to_name.get(tourismID, "")

    def getTravellingPlaceID(self, tourismName):
        """Return the ID stored for ``tourismName``, or ``0`` when it is unknown."""
        return self.name_to_tourismID.get(tourismName, 0)

    def getUserRatings(self, user):
        """Return every ``(placeID, rating)`` pair that ``user`` has in ``ratingsPath``.

        The whole file is scanned, so the result is complete even when the
        user's rows are not stored contiguously.

        Args:
            user: Integer user ID to look up.

        Returns:
            List of ``(int placeID, float rating)`` tuples in file order; empty
            when the user has no ratings.
        """
        userRatings = []
        with open(self.ratingsPath, newline='') as csvfile:
            ratingReader = csv.reader(csvfile)
            next(ratingReader, None)  # Skip header line (tolerates an empty file)
            for row in ratingReader:
                if int(row[0]) == user:
                    userRatings.append((int(row[1]), float(row[2])))

        return userRatings

In [None]:
class EvaluatedAlgorithm:
    """Pairs a recommender with a display name and scores it on held-out ratings."""

    def __init__(self, algorithm, name):
        self.algorithm, self.name = algorithm, name

    def Evaluate(self, evaluationData, doTopN, n=10, verbose=True):
        """Fit the algorithm on the train split and score it on the test split.

        Args:
            evaluationData: Object providing ``GetTrainSet()`` and ``GetTestSet()``.
            doTopN: Accepted for interface compatibility; not used here.
            n: Accepted for interface compatibility; not used here.
            verbose: When true, print a progress line before fitting.

        Returns:
            Dict with the keys ``"RMSE"`` and ``"MAE"``.
        """
        if verbose:
            print("Evaluating accuracy...")

        # Train on the training split, then predict the held-out ratings.
        self.algorithm.fit(evaluationData.GetTrainSet())
        heldOutPredictions = self.algorithm.test(evaluationData.GetTestSet())

        return {
            "RMSE": RecommenderMetrics.RMSE(heldOutPredictions),
            "MAE": RecommenderMetrics.MAE(heldOutPredictions),
        }

    def GetAlgorithm(self):
        """Return the wrapped algorithm object."""
        return self.algorithm

    def GetName(self):
        """Return the display name given at construction."""
        return self.name

In [None]:
import numpy as np
import tensorflow.compat.v1 as tf
from tensorflow.python.framework import ops

# The RBM below is built from placeholders and run through a Session, so eager execution is turned off.
tf.compat.v1.disable_eager_execution()

class RBM(object):
    """Restricted Boltzmann Machine over one-hot encoded ratings (TensorFlow 1.x graph mode).

    Each input row is a flattened vector holding, for every item, a group of
    ``ratingValues`` binary slots. Training uses one step of contrastive
    divergence per mini-batch.

    Args:
        visibleDimensions: Length of one flattened input row.
        epochs: Number of passes over the training rows.
        hiddenDimensions: Number of hidden units.
        ratingValues: Number of slots per item in the flattened input.
        learningRate: Step size applied to the weight and bias updates.
        batchSize: Number of rows fed per update step.
    """

    def __init__(self, visibleDimensions, epochs=20, hiddenDimensions=50, ratingValues=10, learningRate=0.001, batchSize=100):

        self.visibleDimensions = visibleDimensions
        self.epochs = epochs
        self.hiddenDimensions = hiddenDimensions
        self.ratingValues = ratingValues
        self.learningRate = learningRate
        self.batchSize = batchSize

    def Train(self, X):
        """Build a fresh graph and session, then train on the rows of ``X``.

        ``X`` itself is never modified: rows are shuffled on a private copy, so
        callers can keep indexing ``X`` by row after training.

        Args:
            X: 2-D array-like with ``visibleDimensions`` columns.
        """
        ops.reset_default_graph()

        self.MakeGraph()

        # Release the session left over from an earlier Train() call before replacing it.
        previousSession = getattr(self, "sess", None)
        if previousSession is not None:
            previousSession.close()

        init = tf.global_variables_initializer()
        self.sess = tf.Session()
        self.sess.run(init)

        # np.array() copies, so the in-place shuffles below stay local to this method.
        trX = np.array(X)
        for epoch in range(self.epochs):
            np.random.shuffle(trX)

            for i in range(0, trX.shape[0], self.batchSize):
                self.sess.run(self.update, feed_dict={self.X: trX[i:i+self.batchSize]})

            print("Trained epoch ", epoch)

    def GetRecommendations(self, inputUser):
        """Return the reconstructed visible layer for the first row of ``inputUser``.

        Args:
            inputUser: Batch of flattened input rows (a list holding one row is enough).

        Returns:
            1-D array of length ``visibleDimensions`` with sigmoid activations.
        """
        # The reconstruction op is created once in MakeGraph(); building it here would
        # add new nodes to the graph on every call.
        rec = self.sess.run(self.reconstruction, feed_dict={self.X: inputUser})
        return rec[0]

    def MakeGraph(self):
        """Create the placeholders, variables, training ops and reconstruction op."""

        tf.set_random_seed(0)

        # Create variables for the graph, weights and biases
        self.X = tf.placeholder(tf.float32, [None, self.visibleDimensions], name="X")

        # Initialize weights uniformly in [-maxWeight, maxWeight). maxWeight must be
        # positive so that minval < maxval.
        maxWeight = 4.0 * np.sqrt(6.0 / (self.hiddenDimensions + self.visibleDimensions))
        self.weights = tf.Variable(
            tf.random_uniform([self.visibleDimensions, self.hiddenDimensions], minval=-maxWeight, maxval=maxWeight),
            dtype=tf.float32, name="weights")

        self.hiddenBias = tf.Variable(tf.zeros([self.hiddenDimensions], tf.float32), name="hiddenBias")
        self.visibleBias = tf.Variable(tf.zeros([self.visibleDimensions], tf.float32), name="visibleBias")

        # Perform Gibbs Sampling for Contrastive Divergence, per the paper we assume k=1 instead of iterating over the
        # forward pass multiple times since it seems to work just fine

        # Forward pass
        # Sample hidden layer given visible...
        # Get tensor of hidden probabilities
        hProb0 = tf.nn.sigmoid(tf.matmul(self.X, self.weights) + self.hiddenBias)
        # Sample from all of the distributions: 1 where the probability beats a uniform draw, else 0
        hSample = tf.nn.relu(tf.sign(hProb0 - tf.random_uniform(tf.shape(hProb0))))
        # Stitch it together
        forward = tf.matmul(tf.transpose(self.X), hSample)

        # Backward pass
        # Reconstruct visible layer given hidden layer sample
        v = tf.matmul(hSample, tf.transpose(self.weights)) + self.visibleBias

        # Build up our mask for missing ratings
        vMask = tf.sign(self.X) # Make sure everything is 0 or 1
        vMask3D = tf.reshape(vMask, [tf.shape(v)[0], -1, self.ratingValues]) # Reshape into arrays of individual ratings
        vMask3D = tf.reduce_max(vMask3D, axis=[2], keepdims=True) # Use reduce_max to either give us 1 for ratings that exist, and 0 for missing ratings

        # Extract rating vectors for each individual group of ratingValues binary slots
        v = tf.reshape(v, [tf.shape(v)[0], -1, self.ratingValues])
        vProb = tf.nn.softmax(v * vMask3D) # Apply softmax activation function
        vProb = tf.reshape(vProb, [tf.shape(v)[0], -1]) # And shove them back into the flattened state. Reconstruction is done now.
        # Stitch it together to define the backward pass and updated hidden biases
        hProb1 = tf.nn.sigmoid(tf.matmul(vProb, self.weights) + self.hiddenBias)
        backward = tf.matmul(tf.transpose(vProb), hProb1)

        # Now define what each update step will do...
        # Run the forward and backward passes, and update the weights
        weightUpdate = self.weights.assign_add(self.learningRate * (forward - backward))
        # Update hidden bias, minimizing the divergence in the hidden nodes
        hiddenBiasUpdate = self.hiddenBias.assign_add(self.learningRate * tf.reduce_mean(hProb0 - hProb1, 0))
        # Update the visible bias, minimizing divergence in the visible results
        visibleBiasUpdate = self.visibleBias.assign_add(self.learningRate * tf.reduce_mean(self.X - vProb, 0))

        self.update = [weightUpdate, hiddenBiasUpdate, visibleBiasUpdate]

        # Inference path used by GetRecommendations(): visible -> hidden -> visible,
        # using probabilities only (no sampling).
        hidden = tf.nn.sigmoid(tf.matmul(self.X, self.weights) + self.hiddenBias)
        self.reconstruction = tf.nn.sigmoid(tf.matmul(hidden, tf.transpose(self.weights)) + self.visibleBias)

In [None]:
class RBMAlgorithm(AlgoBase):
    """Surprise algorithm that predicts ratings with a Restricted Boltzmann Machine.

    Ratings are one-hot encoded into 10 slots per item, an ``RBM`` is trained
    on the resulting user rows, and the expected rating for every user/item
    pair is precomputed in ``fit`` and looked up in ``estimate``.

    Args:
        epochs: Training epochs passed to the RBM.
        hiddenDim: Number of hidden units passed to the RBM.
        learningRate: Learning rate passed to the RBM.
        batchSize: Mini-batch size passed to the RBM.
        sim_options: Accepted for interface compatibility; not used.
    """

    def __init__(self, epochs=20, hiddenDim=100, learningRate=0.001, batchSize=100, sim_options=None):
        AlgoBase.__init__(self)
        self.epochs = epochs
        self.hiddenDim = hiddenDim
        self.learningRate = learningRate
        self.batchSize = batchSize
        self.tourism_dataset = TourismDataset()
        self.tourism_dataset.loadData()
        # Lower-case terms; any place whose name contains one of them is excluded from training.
        self.stoplist = []

    def buildStoplist(self, trainset):
        """Flag every inner item ID whose place name contains a stoplist term."""
        self.stoplistLookup = {}
        for iiid in trainset.all_items():
            self.stoplistLookup[iiid] = False
            tourismID = trainset.to_raw_iid(iiid)
            title = self.tourism_dataset.getTravellingPlaceName(int(tourismID))
            if (title):
                title = title.lower()
                for term in self.stoplist:
                    if term in title:
                        print ("Blocked ", title)
                        self.stoplistLookup[iiid] = True

    def softmax(self, x):
        """Normalize ``x`` into probabilities along axis 0."""
        return np.exp(x) / np.sum(np.exp(x), axis=0)

    def fit(self, trainset):
        """Train the RBM on ``trainset`` and precompute all user/item predictions.

        Returns:
            ``self``, as Surprise expects from ``fit``.
        """
        AlgoBase.fit(self, trainset)

        self.buildStoplist(trainset)

        numUsers = trainset.n_users
        numItems = trainset.n_items

        trainingMatrix = np.zeros([numUsers, numItems, 10], dtype=np.float32)

        for (uid, iid, rating) in trainset.all_ratings():
            if not self.stoplistLookup[iid]:
                # Rating r lands in slot 2r - 1 (0.5 -> 0, 1.0 -> 1, ..., 5.0 -> 9).
                adjustedRating = int(float(rating)*2.0) - 1
                trainingMatrix[int(uid), int(iid), adjustedRating] = 1

        # Flatten to a 2D array, with nodes for each possible rating type on each possible item, for every user.
        trainingMatrix = np.reshape(trainingMatrix, [trainingMatrix.shape[0], -1])

        # Create an RBM with (num items * rating values) visible nodes
        rbm = RBM(trainingMatrix.shape[1], hiddenDimensions=self.hiddenDim, learningRate=self.learningRate, batchSize=self.batchSize, epochs=self.epochs)
        # Train on a copy so that row `uiid` of trainingMatrix is guaranteed to still hold
        # user `uiid`'s ratings in the loop below, whatever the trainer does to its input
        # (e.g. shuffling rows between epochs).
        rbm.Train(trainingMatrix.copy())

        slotIndices = np.arange(10)
        self.predictedRatings = np.zeros([numUsers, numItems], dtype=np.float32)
        for uiid in range(trainset.n_users):
            if (uiid % 50 == 0):
                print("Processing user ", uiid)
            recs = rbm.GetRecommendations([trainingMatrix[uiid]])
            recs = np.reshape(recs, [numItems, 10])

            for itemID, rec in enumerate(recs):
                # The obvious thing would be to just take the rating with the highest score:
                #rating = rec.argmax()
                # ... but this just leads to a huge multi-way tie for 5-star predictions.
                # The paper suggests performing normalization over K values to get probabilities
                # and take the expectation as your prediction, so we'll do that instead:
                normalized = self.softmax(rec)
                rating = np.average(slotIndices, weights=normalized)
                # Inverse of the slot encoding above: slot s corresponds to rating (s + 1) / 2.
                self.predictedRatings[uiid, itemID] = (rating + 1) * 0.5

        return self

    def estimate(self, u, i):
        """Return the precomputed rating for inner user ``u`` and inner item ``i``.

        Raises:
            PredictionImpossible: When the user or item is unknown to the
                trainset, or the stored prediction is below 0.001.
        """
        if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
            raise PredictionImpossible('User and/or item is unkown.')

        rating = self.predictedRatings[u, i]

        if (rating < 0.001):
            raise PredictionImpossible('No valid prediction exists.')

        return rating

    def GetName(self):
        """Return the display name of this algorithm."""
        return "RBM Algorithm"

In [None]:
class ContentKNNAlgorithm(AlgoBase):
    """Content-based KNN: predicts a rating from the user's ratings of similar places.

    Two places are similar when they are close in latitude, longitude and
    price. Each attribute contributes ``exp(-|difference| / scale)`` and the
    three factors are multiplied.

    Args:
        k: Number of most-similar rated places used for a prediction.
        sim_options: Accepted for interface compatibility; not used.
        priceScale: Decay scale of the price similarity (default keeps the
            original constant 10.0).
        geoScale: Decay scale of the latitude and longitude similarities
            (default keeps the original constant 2.0).
    """

    def __init__(self, k=40, sim_options=None, priceScale=10.0, geoScale=2.0):
        AlgoBase.__init__(self)
        self.k = k
        self.priceScale = priceScale
        self.geoScale = geoScale

    def GetName(self):
        """Return the display name of this algorithm."""
        return "Content KNN Algorithm"

    def fit(self, trainset):
        """Compute the symmetric item-item similarity matrix for ``trainset``.

        Returns:
            ``self``, as Surprise expects from ``fit``.
        """
        AlgoBase.fit(self, trainset)

        # Load the content attributes of every place, keyed by raw place ID
        tourism_dataset = TourismDataset()
        latitude = tourism_dataset.getLat()
        longitude = tourism_dataset.getLong()
        price = tourism_dataset.getPrice()

        print("Computing content-based similarity matrix...")

        numItems = self.trainset.n_items
        # Resolve inner -> raw place IDs once instead of inside the O(n^2) pair loop.
        rawIds = [int(self.trainset.to_raw_iid(innerId)) for innerId in range(numItems)]

        # Pairwise similarities as an n_items x n_items matrix; the diagonal stays 0.
        self.similarities = np.zeros((numItems, numItems))

        for thisRating in range(numItems):
            if (thisRating % 100 == 0):
                print(thisRating, " of ", numItems)
            thisPlaceID = rawIds[thisRating]
            for otherRating in range(thisRating+1, numItems):
                otherPlaceID = rawIds[otherRating]
                latitudeSimilarity = self.computeLatitudeSimilarity(thisPlaceID, otherPlaceID, latitude)
                longitudeSimilarity = self.computeLongitudeSimilarity(thisPlaceID, otherPlaceID, longitude)
                priceSimilarity = self.computePriceSimilarity(thisPlaceID, otherPlaceID, price)

                similarity = latitudeSimilarity * longitudeSimilarity * priceSimilarity
                self.similarities[thisRating, otherRating] = similarity
                self.similarities[otherRating, thisRating] = similarity

        print("...done.")

        return self

    def computePriceSimilarity(self, place_1, place_2, prices):
        """Return ``exp(-|price difference| / priceScale)``.

        NOTE(review): with the default scale of 10.0, price gaps above roughly
        7,450 make the exponential underflow to exactly 0.0, which zeroes the
        whole similarity product. Pass a larger ``priceScale`` if prices are
        expressed in large units.
        """
        diff = abs(prices[place_1] - prices[place_2])
        sim = math.exp(-diff / self.priceScale)
        return sim

    def computeLatitudeSimilarity(self, place_1, place_2, latitude):
        """Return ``exp(-|latitude difference| / geoScale)``."""
        diff = abs(latitude[place_1] - latitude[place_2])
        sim = math.exp(-diff / self.geoScale)
        return sim

    def computeLongitudeSimilarity(self, place_1, place_2, longitude):
        """Return ``exp(-|longitude difference| / geoScale)``."""
        diff = abs(longitude[place_1] - longitude[place_2])
        sim = math.exp(-diff / self.geoScale)
        return sim

    def estimate(self, u, i):
        """Predict the rating of inner user ``u`` for inner item ``i``.

        The prediction is the similarity-weighted mean of the user's ratings
        over the ``k`` rated places most similar to ``i``.

        Raises:
            PredictionImpossible: When the user or item is unknown, or none of
                the neighbors has a positive similarity.
        """
        if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
            raise PredictionImpossible('User and/or item is unkown.')

        # Build up similarity scores between this item and everything the user rated
        neighbors = []
        for rating in self.trainset.ur[u]:
            similarity = self.similarities[i,rating[0]]
            neighbors.append( (similarity, rating[1]) )

        # Extract the top-K most-similar ratings
        k_neighbors = heapq.nlargest(self.k, neighbors, key=lambda t: t[0])

        # Compute average sim score of K neighbors weighted by user ratings
        simTotal = weightedSum = 0
        for (simScore, rating) in k_neighbors:
            if (simScore > 0):
                simTotal += simScore
                weightedSum += simScore * rating

        if (simTotal == 0):
            raise PredictionImpossible('No neighbors')

        predictedRating = weightedSum / simTotal

        return predictedRating


In [None]:
class HybridAlgorithm(AlgoBase):
    """Blend several fitted algorithms into one weighted-average prediction.

    Args:
        algorithms: Sequence of algorithm objects providing ``fit(trainset)``
            and ``estimate(u, i)``.
        weights: Sequence of numeric weights, one per algorithm (extra
            trailing weights are ignored).
        sim_options: Accepted for interface compatibility; not used.

    Raises:
        ValueError: When fewer weights than algorithms are given.
    """

    def __init__(self, algorithms, weights, sim_options=None):
        AlgoBase.__init__(self)
        # Fail at construction rather than with an IndexError in the middle of prediction.
        if len(weights) < len(algorithms):
            raise ValueError('HybridAlgorithm needs one weight per algorithm.')
        self.algorithms = algorithms
        self.weights = weights

    def fit(self, trainset):
        """Fit every sub-algorithm on the same ``trainset`` and return ``self``."""
        AlgoBase.fit(self, trainset)

        for algorithm in self.algorithms:
            algorithm.fit(trainset)

        return self

    def estimate(self, u, i):
        """Return the weighted mean of the sub-algorithms' estimates for ``(u, i)``.

        Raises:
            PredictionImpossible: When the weights sum to zero. Exceptions
                raised by a sub-algorithm's ``estimate`` propagate unchanged.
        """
        sumScores = 0
        sumWeights = 0

        for algorithm, weight in zip(self.algorithms, self.weights):
            estimate = algorithm.estimate(u, i)
            # Surprise algorithms may return (estimate, details); only the number is blended.
            if isinstance(estimate, tuple):
                estimate = estimate[0]
            sumScores += estimate * weight
            sumWeights += weight

        if sumWeights == 0:
            raise PredictionImpossible('Hybrid weights sum to zero.')

        return sumScores / sumWeights

In [None]:
class EvaluationData:
    """Bundles every train/test view of a ratings dataset used for evaluation."""

    def __init__(self, data):
        # Training set built from all ratings, plus the anti-test set derived from it.
        fullTrainSet = data.build_full_trainset()
        self.fullTrainSet = fullTrainSet
        self.fullAntiTestSet = fullTrainSet.build_anti_testset()

        # 90/10 train/test split (test_size=.1) for measuring accuracy.
        self.trainSet, self.testSet = train_test_split(data, test_size=.1, random_state=100)

        # "Leave one out" split for evaluating top-N recommenders; with n_splits=1
        # the loop body runs for a single (train, test) pair.
        leaveOneOut = LeaveOneOut(n_splits=1, random_state=1)
        for split in leaveOneOut.split(data):
            self.LOOCVTrain, self.LOOCVTest = split

        # Anti-test set of the leave-one-out training set, used for building predictions.
        self.LOOCVAntiTestSet = self.LOOCVTrain.build_anti_testset()

        # Item-item cosine similarity model fitted on all ratings (for measuring diversity).
        self.simsAlgo = KNNBaseline(sim_options={'name': 'cosine', 'user_based': False})
        self.simsAlgo.fit(self.fullTrainSet)

    def GetFullTrainSet(self):
        """Return the training set built from all ratings."""
        return self.fullTrainSet

    def GetFullAntiTestSet(self):
        """Return the anti-test set of the full training set."""
        return self.fullAntiTestSet

    def GetAntiTestSetForUser(self, testSubject):
        """List ``(raw user, raw item, global mean)`` for every item the user has not rated.

        Args:
            testSubject: Raw user ID; it is converted with ``str()`` before lookup.
        """
        fullTrainSet = self.fullTrainSet
        fillValue = fullTrainSet.global_mean
        innerUser = fullTrainSet.to_inner_uid(str(testSubject))
        ratedItems = {item for (item, _) in fullTrainSet.ur[innerUser]}
        return [
            (fullTrainSet.to_raw_uid(innerUser), fullTrainSet.to_raw_iid(item), fillValue)
            for item in fullTrainSet.all_items()
            if item not in ratedItems
        ]

    def GetTrainSet(self):
        """Return the training part of the accuracy split."""
        return self.trainSet

    def GetTestSet(self):
        """Return the test part of the accuracy split."""
        return self.testSet

    # NOTE(review): neither `name` nor `algorithm` is assigned anywhere in this class,
    # so the two accessors below raise AttributeError unless a caller sets them.
    def GetName(self):
        return self.name

    def GetAlgorithm(self):
        return self.algorithm

In [None]:
# Module-level results table filled by Evaluator.Evaluate (one row per evaluated algorithm).
# Re-running this cell resets it to empty.
result_df = pd.DataFrame(
    columns = ["epochs", "k", "RMSE", "MAE"]
)

class Evaluator:
    """Evaluates a set of algorithms on one dataset and records their metrics.

    Args:
        dataset: Ratings dataset; it is wrapped in an ``EvaluationData``.
        params: Dict with at least the keys ``"epochs"`` and ``"k"``; the
            values are printed and stored next to each result row.
    """

    def __init__(self, dataset, params):
        ed = EvaluationData(dataset)
        self.dataset = ed
        self.params = params
        # Per-instance list: a class-level list would be shared by every Evaluator,
        # so each new evaluator would re-run all algorithms added to earlier ones.
        self.algorithms = []

    def AddAlgorithm(self, algorithm, name):
        """Register ``algorithm`` for evaluation under the display name ``name``."""
        alg = EvaluatedAlgorithm(algorithm, name)
        self.algorithms.append(alg)

    def Evaluate(self, doTopN):
        """Evaluate every registered algorithm, print its metrics and record them.

        One row per algorithm name is appended to the module-level
        ``result_df``. Algorithms sharing a name overwrite each other's result.

        Args:
            doTopN: Forwarded to each algorithm's ``Evaluate``.
        """
        global result_df
        results = {}
        for algorithm in self.algorithms:
            print("Evaluating ", algorithm.GetName(), "...")
            results[algorithm.GetName()] = algorithm.Evaluate(self.dataset, doTopN)

        # Print results and collect one record per algorithm
        newRows = []
        for (name, metrics) in results.items():
            print(f"{name}: ")
            print(f"""Param 1: {self.params["epochs"]}""")
            print(f"""Param 2: {self.params["k"]}""")
            print(f"""RMSE: {metrics["RMSE"]}""")
            print(f"""MAE: {metrics["MAE"]}""")
            newRows.append(
                {
                    "epochs": self.params["epochs"],
                    "k": self.params["k"],
                    "RMSE": metrics["RMSE"],
                    "MAE": metrics["MAE"],
                }
            )

        if newRows:
            # DataFrame.append was removed in pandas 2.0; concatenate once instead.
            # An empty result_df is left out so it cannot influence the column dtypes.
            frames = [result_df] if len(result_df) else []
            frames.append(pd.DataFrame(newRows))
            result_df = pd.concat(frames, ignore_index=True)

        print("Legend:")
        print(f"""RMSE: Root Mean Squared Error. Lower values mean better accuracy.""")
        print(f"""MAE: Mean Absolute Error. Lower values mean better accuracy.""")

    def SampleTopNRecs(self, tourism_data, testSubject=3, k=10):
        """Print the ``k`` best-predicted unrated places for one user, per algorithm.

        Args:
            tourism_data: Object providing ``getTravellingPlaceName(placeID)``.
            testSubject: Raw ID of the user to recommend for.
            k: Number of recommendations to print.
        """
        for algo in self.algorithms:
            print("\nUsing recommender ", algo.GetName())

            print("\nBuilding recommendation model...")
            trainSet = self.dataset.GetFullTrainSet()
            algo.GetAlgorithm().fit(trainSet)

            print("Computing recommendations...")
            testSet = self.dataset.GetAntiTestSetForUser(testSubject)

            predictions = algo.GetAlgorithm().test(testSet)

            recommendations = []

            print ("\nWe recommend:")
            for userID, movieID, actualRating, estimatedRating, _ in predictions:
                recommendations.append((int(movieID), estimatedRating))

            recommendations.sort(key=lambda x: x[1], reverse=True)

            for ratings in recommendations[:k]:
                print(tourism_data.getTravellingPlaceName(ratings[0]), ratings[1])

In [None]:
class RecommenderMetrics:
    """Namespace for the accuracy metrics computed on Surprise predictions."""

    # The methods take no `self`; @staticmethod makes them callable on the class
    # and on instances alike.
    @staticmethod
    def MAE(predictions):
        """Return the mean absolute error of ``predictions``."""
        return accuracy.mae(predictions, verbose=False)

    @staticmethod
    def RMSE(predictions):
        """Return the root mean squared error of ``predictions``."""
        return accuracy.rmse(predictions, verbose=False)

In [None]:
# Hyper-parameter grid: every RBM epoch count is combined with every content-KNN neighborhood size.
num_epochs = [10, 11, 12]
k_s = [9, 10, 11]

def LoadTourismData():
    """Create a ``TourismDataset`` and load its ratings.

    Returns:
        A ``(dataset, ratings)`` tuple: the ``TourismDataset`` instance and
        whatever its ``loadData()`` returned.
    """
    dataset = TourismDataset()
    print("Loading movie ratings...")
    return (dataset, dataset.loadData())

np.random.seed(0)
random.seed(0)

# The ratings are read from disk and do not depend on the grid point, so load them once.
(tourism_dataset, evaluationData) = LoadTourismData()

for epoch in num_epochs:
  for k in k_s:
    params = {
        "epochs": epoch,
        "k": k,
    }
    # Construct an Evaluator to evaluate them
    evaluator = Evaluator(evaluationData, params)
    # Start from an empty per-instance algorithm list so that algorithms added for
    # earlier grid points are never evaluated again.
    evaluator.algorithms = []

    #Simple RBM
    SimpleRBM = RBMAlgorithm(epochs = epoch)
    #Content
    ContentKNN = ContentKNNAlgorithm(k = k)

    #Combine them with equal weights
    Hybrid = HybridAlgorithm([SimpleRBM, ContentKNN], [0.5, 0.5])

    evaluator.AddAlgorithm(Hybrid, "Hybrid")

    # Fight!
    evaluator.Evaluate(False)

Loading movie ratings...
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating  Hybrid ...
Evaluating accuracy...
Trained epoch  0
Trained epoch  1
Trained epoch  2
Trained epoch  3
Trained epoch  4
Trained epoch  5
Trained epoch  6
Trained epoch  7
Trained epoch  8
Trained epoch  9
Processing user  0
Processing user  50
Processing user  100
Processing user  150
Processing user  200
Processing user  250
Computing content-based similarity matrix...
0  of  437
100  of  437
200  of  437
300  of  437
400  of  437
...done.
Hybrid: 
Param 1: 10
Param 2: 9
RMSE: 1.4264615093355715
MAE: 1.2078148984547787
Legend:
RMSE: Root Mean Squared Error. Lower values mean better accuracy.
MAE: (Mean Absolute Error. Lower values mean better accuracy.
Loading movie ratings...


  result_df_2 = result_df.append(result_df_new, ignore_index = True)


Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating  Hybrid ...
Evaluating accuracy...
Trained epoch  0
Trained epoch  1
Trained epoch  2
Trained epoch  3
Trained epoch  4
Trained epoch  5
Trained epoch  6
Trained epoch  7
Trained epoch  8
Trained epoch  9
Processing user  0
Processing user  50
Processing user  100
Processing user  150
Processing user  200
Processing user  250
Computing content-based similarity matrix...
0  of  437
100  of  437
200  of  437
300  of  437
400  of  437
...done.
Evaluating  Hybrid ...
Evaluating accuracy...
Trained epoch  0
Trained epoch  1
Trained epoch  2
Trained epoch  3
Trained epoch  4
Trained epoch  5
Trained epoch  6
Trained epoch  7
Trained epoch  8
Trained epoch  9
Processing user  0
Processing user  50
Processing user  100
Processing user  150
Processing user  200
Processing user  250
Computing content-based similarity matrix...
0  of  437
100  of  437
200  of  437
300  of  437
400

  result_df_2 = result_df.append(result_df_new, ignore_index = True)


Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating  Hybrid ...
Evaluating accuracy...
Trained epoch  0
Trained epoch  1
Trained epoch  2
Trained epoch  3
Trained epoch  4
Trained epoch  5
Trained epoch  6
Trained epoch  7
Trained epoch  8
Trained epoch  9
Processing user  0
Processing user  50
Processing user  100
Processing user  150
Processing user  200
Processing user  250
Computing content-based similarity matrix...
0  of  437
100  of  437
200  of  437
300  of  437
400  of  437
...done.
Evaluating  Hybrid ...
Evaluating accuracy...
Trained epoch  0
Trained epoch  1
Trained epoch  2
Trained epoch  3
Trained epoch  4
Trained epoch  5
Trained epoch  6
Trained epoch  7
Trained epoch  8
Trained epoch  9
Processing user  0
Processing user  50
Processing user  100
Processing user  150
Processing user  200
Processing user  250
Computing content-based similarity matrix...
0  of  437
100  of  437
200  of  437
300  of  437
400

  result_df_2 = result_df.append(result_df_new, ignore_index = True)


Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating  Hybrid ...
Evaluating accuracy...
Trained epoch  0
Trained epoch  1
Trained epoch  2
Trained epoch  3
Trained epoch  4
Trained epoch  5
Trained epoch  6
Trained epoch  7
Trained epoch  8
Trained epoch  9
Processing user  0
Processing user  50
Processing user  100
Processing user  150
Processing user  200
Processing user  250
Computing content-based similarity matrix...
0  of  437
100  of  437
200  of  437
300  of  437
400  of  437
...done.
Evaluating  Hybrid ...
Evaluating accuracy...
Trained epoch  0
Trained epoch  1
Trained epoch  2
Trained epoch  3
Trained epoch  4
Trained epoch  5
Trained epoch  6
Trained epoch  7
Trained epoch  8
Trained epoch  9
Processing user  0
Processing user  50
Processing user  100
Processing user  150
Processing user  200
Processing user  250
Computing content-based similarity matrix...
0  of  437
100  of  437
200  of  437
300  of  437
400

  result_df_2 = result_df.append(result_df_new, ignore_index = True)


Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating  Hybrid ...
Evaluating accuracy...
Trained epoch  0
Trained epoch  1
Trained epoch  2
Trained epoch  3
Trained epoch  4
Trained epoch  5
Trained epoch  6
Trained epoch  7
Trained epoch  8
Trained epoch  9
Processing user  0
Processing user  50
Processing user  100
Processing user  150
Processing user  200
Processing user  250
Computing content-based similarity matrix...
0  of  437
100  of  437
200  of  437
300  of  437
400  of  437
...done.
Evaluating  Hybrid ...
Evaluating accuracy...
Trained epoch  0
Trained epoch  1
Trained epoch  2
Trained epoch  3
Trained epoch  4
Trained epoch  5
Trained epoch  6
Trained epoch  7
Trained epoch  8
Trained epoch  9
Processing user  0
Processing user  50
Processing user  100
Processing user  150
Processing user  200
Processing user  250
Computing content-based similarity matrix...
0  of  437
100  of  437
200  of  437
300  of  437
400

  result_df_2 = result_df.append(result_df_new, ignore_index = True)


Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating  Hybrid ...
Evaluating accuracy...
Trained epoch  0
Trained epoch  1
Trained epoch  2
Trained epoch  3
Trained epoch  4
Trained epoch  5
Trained epoch  6
Trained epoch  7
Trained epoch  8
Trained epoch  9
Processing user  0
Processing user  50
Processing user  100
Processing user  150
Processing user  200
Processing user  250
Computing content-based similarity matrix...
0  of  437
100  of  437
200  of  437
300  of  437
400  of  437
...done.
Evaluating  Hybrid ...
Evaluating accuracy...
Trained epoch  0
Trained epoch  1
Trained epoch  2
Trained epoch  3
Trained epoch  4
Trained epoch  5
Trained epoch  6
Trained epoch  7
Trained epoch  8
Trained epoch  9
Processing user  0
Processing user  50
Processing user  100
Processing user  150
Processing user  200
Processing user  250
Computing content-based similarity matrix...
0  of  437
100  of  437
200  of  437
300  of  437
400

  result_df_2 = result_df.append(result_df_new, ignore_index = True)


Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating  Hybrid ...
Evaluating accuracy...
Trained epoch  0
Trained epoch  1
Trained epoch  2
Trained epoch  3
Trained epoch  4
Trained epoch  5
Trained epoch  6
Trained epoch  7
Trained epoch  8
Trained epoch  9
Processing user  0
Processing user  50
Processing user  100
Processing user  150
Processing user  200
Processing user  250
Computing content-based similarity matrix...
0  of  437
100  of  437
200  of  437
300  of  437
400  of  437
...done.
Evaluating  Hybrid ...
Evaluating accuracy...
Trained epoch  0
Trained epoch  1
Trained epoch  2
Trained epoch  3
Trained epoch  4
Trained epoch  5
Trained epoch  6
Trained epoch  7
Trained epoch  8
Trained epoch  9
Processing user  0
Processing user  50
Processing user  100
Processing user  150
Processing user  200
Processing user  250
Computing content-based similarity matrix...
0  of  437
100  of  437
200  of  437
300  of  437
400

  result_df_2 = result_df.append(result_df_new, ignore_index = True)


Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating  Hybrid ...
Evaluating accuracy...
Trained epoch  0
Trained epoch  1
Trained epoch  2
Trained epoch  3
Trained epoch  4
Trained epoch  5
Trained epoch  6
Trained epoch  7
Trained epoch  8
Trained epoch  9
Processing user  0
Processing user  50
Processing user  100
Processing user  150
Processing user  200
Processing user  250
Computing content-based similarity matrix...
0  of  437
100  of  437
200  of  437
300  of  437
400  of  437
...done.
Evaluating  Hybrid ...
Evaluating accuracy...
Trained epoch  0
Trained epoch  1
Trained epoch  2
Trained epoch  3
Trained epoch  4
Trained epoch  5
Trained epoch  6
Trained epoch  7
Trained epoch  8
Trained epoch  9
Processing user  0
Processing user  50
Processing user  100
Processing user  150
Processing user  200
Processing user  250
Computing content-based similarity matrix...
0  of  437
100  of  437
200  of  437
300  of  437
400

  result_df_2 = result_df.append(result_df_new, ignore_index = True)


Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating  Hybrid ...
Evaluating accuracy...
Trained epoch  0
Trained epoch  1
Trained epoch  2
Trained epoch  3
Trained epoch  4
Trained epoch  5
Trained epoch  6
Trained epoch  7
Trained epoch  8
Trained epoch  9
Processing user  0
Processing user  50
Processing user  100
Processing user  150
Processing user  200
Processing user  250
Computing content-based similarity matrix...
0  of  437
100  of  437
200  of  437
300  of  437
400  of  437
...done.
Evaluating  Hybrid ...
Evaluating accuracy...
Trained epoch  0
Trained epoch  1
Trained epoch  2
Trained epoch  3
Trained epoch  4
Trained epoch  5
Trained epoch  6
Trained epoch  7
Trained epoch  8
Trained epoch  9
Processing user  0
Processing user  50
Processing user  100
Processing user  150
Processing user  200
Processing user  250
Computing content-based similarity matrix...
0  of  437
100  of  437
200  of  437
300  of  437
400

  result_df_2 = result_df.append(result_df_new, ignore_index = True)


In [None]:
# Save the evaluation results table to disk: semicolon-separated fields,
# comma as the decimal mark, and no index column written to the file.
result_df.to_csv(
    "result_contentknn_and_rbm.csv",
    sep=";",
    decimal=",",
    index=False,
)
# Bare last expression so the notebook renders the table as the cell output.
result_df

Unnamed: 0,epochs,k,RMSE,MAE
0,10,9,1.426462,1.207815
1,10,10,1.425983,1.207446
2,10,11,1.426393,1.207722
3,11,9,1.425805,1.20718
4,11,10,1.425723,1.207089
5,11,11,1.426226,1.207379
6,12,9,1.425263,1.206543
7,12,10,1.425491,1.206659
8,12,11,1.425506,1.206736


In [None]:
# Print sample top-N recommendations from the evaluator's recommender(s).
# Per the output below, each recommender is announced ("Using recommender ..."),
# its model is rebuilt, and the recommended place names are printed with
# their scores.
# NOTE(review): which user is sampled and how many items are listed are
# decided inside SampleTopNRecs (not visible here) -- confirm its defaults.
evaluator.SampleTopNRecs(tourism_dataset)


Using recommender  Hybrid

Building recommendation model...
Trained epoch  0
Trained epoch  1
Trained epoch  2
Trained epoch  3
Trained epoch  4
Trained epoch  5
Trained epoch  6
Trained epoch  7
Trained epoch  8
Trained epoch  9
Processing user  0
Processing user  50
Processing user  100
Processing user  150
Processing user  200
Processing user  250
Computing content-based similarity matrix...
0  of  437
100  of  437
200  of  437
300  of  437
400  of  437
...done.
Computing recommendations...

We recommend:
Geoforest Watu Payung Turunan 3.6284748792043504
Situs Warungboto 3.6070450928997966
Ledok Sambi 3.597935162524959
Nol Kilometer Jl.Malioboro 3.595413692740678
Desa Wisata Sungai Code Jogja Kota 3.584263586757965
Alun-alun Utara Keraton Yogyakarta 3.584039574964732
Pasar Kebon Empring Bintaran 3.579497576294095
Kampung Wisata Kadipaten 3.578321360523313
Bentara Budaya Yogyakarta (BBY) 3.5766457493429433
Alun Alun Selatan Yogyakarta 3.5757464629960576

Using recommender  Hybrid

Bu