In [1]:
# Mount Google Drive into the Colab runtime; the CSV paths used later in this
# notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/154.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp310-cp310-linux_x86_64.whl size=2357284 sha256=87f84c114da7eee289007a6513d52cf5d05fa8f3bdd8f5813eec07d26f064831
  Stored in directory: /root/.cache/pip/wheels/4b/3f/df/6acbf0a40397d9bf3ff97f582cc22fb9ce66adde75bc71fd54
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Succe

In [3]:
import os
import csv
import sys

from surprise import Dataset
from surprise import Reader

from collections import defaultdict

class AmazonReview:
    """Loader for the Amazon Beauty ratings file and the product-title file.

    Builds a Surprise dataset from the ratings CSV and keeps two lookup tables
    (product ID -> title, title -> product ID) built from the products CSV.
    """

    # Lookup tables; replaced with fresh per-instance dicts by loadAmazonReviewDataset().
    productID_to_name = {}
    name_to_productID = {}

    # Ratings CSV: the original Beauty.csv reduced to the columns (userId, productId, score, time).
    ratingsPath = '/content/drive/MyDrive/Uit recommend/Beauty-rating.csv'

    # Products CSV: the original Beauty.csv reduced to the columns (productId, title).
    productsPath = '/content/drive/MyDrive/Uit recommend/Beauty-product.csv'

    def loadAmazonReviewDataset(self):
        """Load the ratings into a Surprise dataset and rebuild the ID <-> title maps.

        Returns:
            The Surprise dataset produced by ``Dataset.load_from_file`` for ``ratingsPath``.
        """
        # Reader layout for the ratings file; the first line is a header.
        reader = Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)

        # Load the ratings into a Surprise Dataset.
        ratingsDataset = Dataset.load_from_file(self.ratingsPath, reader=reader)

        # Read the products file to build the productID <-> title mappings.
        self.productID_to_name = {}
        self.name_to_productID = {}
        with open(self.productsPath, newline='', encoding='ISO-8859-1') as csvfile:
            productReader = csv.reader(csvfile)
            next(productReader, None)  # Skip the header; tolerate a completely empty file.
            for row in productReader:
                # Blank lines come back as [] from csv.reader; rows without a
                # title column cannot be mapped, so both are skipped.
                if len(row) < 2:
                    continue
                productID, productName = row[0], row[1]
                self.productID_to_name[productID] = productName
                # If several products share a title, the last one read wins.
                self.name_to_productID[productName] = productID

        return ratingsDataset

    def getPopularityRanks(self):
        """Rank products by how many rating rows they have in the ratings file.

        Returns:
            A ``defaultdict(int)`` mapping productID -> rank, where rank 1 is the
            product with the most ratings. Entries are inserted in rank order, and
            products with equal counts keep the order in which they first appear
            in the file (the sort is stable).
        """
        ratings = defaultdict(int)
        with open(self.ratingsPath, newline='', encoding='utf-8') as csvfile:
            reviewReader = csv.reader(csvfile)
            next(reviewReader, None)  # Skip the header; tolerate a completely empty file.
            for row in reviewReader:
                # Skip blank or truncated rows instead of raising IndexError.
                if len(row) < 2:
                    continue
                ratings[row[1]] += 1

        rankings = defaultdict(int)
        mostRatedFirst = sorted(ratings.items(), key=lambda x: x[1], reverse=True)
        for rank, (productID, _) in enumerate(mostRatedFirst, start=1):
            rankings[productID] = rank
        return rankings

    def getProductName(self, productID):
        """Return the title for ``productID``, or "" when the ID is unknown."""
        return self.productID_to_name.get(productID, "")

    def getProductID(self, productName):
        """Return the product ID for ``productName``, or 0 when the title is unknown."""
        return self.name_to_productID.get(productName, 0)

# Example usage: load the dataset, print the first five entries of the popularity
# ranking, and look one product up in both directions (ID -> title, title -> ID).
if __name__ == "__main__":
    amazonReview = AmazonReview()
    dataset = amazonReview.loadAmazonReviewDataset()
    print("Dataset loaded successfully.")
    print("Top 5 popular products:", list(amazonReview.getPopularityRanks().items())[:5])

    # Look up the product title from its ID
    product_name = amazonReview.getProductName("B00012U9F8")
    print(f"Product Name: {product_name}")

    # Look up the product ID from its title
    product_id = amazonReview.getProductID("Island Essence Lotion")
    print(f"Product ID: {product_id}")


Dataset loaded successfully.
Top 5 popular products: [('B000FS05VG', 1), ('B0002ZW5UQ', 2), ('B000ME2YWG', 3), ('B000KK53L6', 4), ('B000052ZTY', 5)]
Product Name: Island Essence Lotion
Product ID: B00012U9GC


In [4]:
from surprise import accuracy
from collections import defaultdict
import itertools


class RecommenderMetrics:
    """Static helpers that score rating predictions and top-N recommendation lists.

    Predictions are iterated as 5-tuples
    ``(userID, productID, actualRating, estimatedRating, details)``.
    Top-N structures map userID -> list of ``(productID, estimatedRating)``.
    Every ratio metric returns 0 when there is nothing to average over.
    """

    @staticmethod
    def MAE(predictions):
        """Mean absolute error of ``predictions`` (delegates to ``surprise.accuracy.mae``)."""
        return accuracy.mae(predictions, verbose=False)

    @staticmethod
    def RMSE(predictions):
        """Root mean squared error of ``predictions`` (delegates to ``surprise.accuracy.rmse``)."""
        return accuracy.rmse(predictions, verbose=False)

    @staticmethod
    def GetTopN(predictions, n=10, minimumRating=4.0):
        """Group predictions per user and keep each user's ``n`` highest estimates.

        Only predictions with ``estimatedRating >= minimumRating`` are considered,
        so users without any such prediction do not appear in the result.

        Returns:
            ``defaultdict(list)`` of userID -> [(productID, estimatedRating), ...],
            sorted by estimate, highest first.
        """
        topN = defaultdict(list)

        for userID, productID, actualRating, estimatedRating, _ in predictions:
            if estimatedRating >= minimumRating:
                topN[userID].append((productID, estimatedRating))

        # Re-binding existing keys while iterating is safe: the key set does not change.
        for userID in topN:
            topN[userID] = sorted(topN[userID], key=lambda x: x[1], reverse=True)[:n]

        return topN

    @staticmethod
    def HitRate(topNPredicted, leftOutPredictions):
        """Fraction of left-out predictions whose product is in that user's top-N list."""
        hits = 0
        total = 0

        for leftOut in leftOutPredictions:
            userID, leftOutProductID = leftOut[0], leftOut[1]
            # .get() keeps the lookup read-only (no empty lists inserted into a
            # defaultdict) and also works when topNPredicted is a plain dict.
            if any(leftOutProductID == productID for productID, _ in topNPredicted.get(userID, ())):
                hits += 1
            total += 1

        return hits / total if total > 0 else 0

    @staticmethod
    def CumulativeHitRate(topNPredicted, leftOutPredictions, ratingCutoff=0):
        """Hit rate restricted to left-out predictions with ``actualRating >= ratingCutoff``."""
        hits = 0
        total = 0

        for userID, leftOutProductID, actualRating, _, _ in leftOutPredictions:
            if actualRating >= ratingCutoff:
                if any(leftOutProductID == productID for productID, _ in topNPredicted.get(userID, ())):
                    hits += 1
                total += 1

        # total is 0 when the input is empty or nothing reaches the cutoff.
        return hits / total if total > 0 else 0

    @staticmethod
    def AverageReciprocalHitRank(topNPredicted, leftOutPredictions):
        """Mean of 1/rank of the left-out product in the user's top-N list (0 when absent)."""
        summation = 0
        total = 0

        for userID, leftOutProductID, _, _, _ in leftOutPredictions:
            for position, (productID, _) in enumerate(topNPredicted.get(userID, ()), start=1):
                if productID == leftOutProductID:
                    summation += 1.0 / position
                    break
            total += 1

        return summation / total if total > 0 else 0

    @staticmethod
    def Diversity(topNPredicted, simsAlgo):
        """Return 1 - S, where S is the mean similarity over all item pairs in each user's list.

        ``simsAlgo`` must provide ``compute_similarities()`` and a ``trainset`` with
        ``to_inner_iid``; the similarity matrix is recomputed on every call.
        """
        n = 0
        total = 0
        simsMatrix = simsAlgo.compute_similarities()

        for userID in topNPredicted.keys():
            pairs = itertools.combinations(topNPredicted[userID], 2)
            for product1, product2 in pairs:
                innerID1 = simsAlgo.trainset.to_inner_iid(product1[0])
                innerID2 = simsAlgo.trainset.to_inner_iid(product2[0])
                total += simsMatrix[innerID1][innerID2]
                n += 1

        return (1 - total / n) if n > 0 else 0

    @staticmethod
    def Novelty(topNPredicted, rankings):
        """Mean popularity rank (looked up in ``rankings``) over all recommended products."""
        n = 0
        total = 0

        for userID in topNPredicted.keys():
            for productID, _ in topNPredicted[userID]:
                total += rankings[productID]
                n += 1

        return total / n if n > 0 else 0

    @staticmethod
    def UserCoverage(topNPredicted, numUsers, ratingThreshold=0):
        """Fraction of ``numUsers`` having at least one recommendation rated >= ``ratingThreshold``."""
        hits = 0
        for userID in topNPredicted.keys():
            # One qualifying recommendation is enough for the user to count as covered.
            if any(predictedRating >= ratingThreshold for _, predictedRating in topNPredicted[userID]):
                hits += 1

        return hits / numUsers if numUsers > 0 else 0

In [5]:
from surprise.model_selection import train_test_split, LeaveOneOut
from surprise import KNNBaseline
import gc
from scipy.sparse import coo_matrix


class EvaluationData:
    """Holds every train/test split the evaluation needs, built once from one dataset.

    Built in the constructor: the full trainset and its anti-testset, a 75/25
    split, a single leave-one-out split with its anti-testset, and an item-based
    cosine KNNBaseline fitted on the full trainset (used for diversity).
    """

    def __init__(self, dataset, popularityRankings):
        print("Creating an evaluation data...")
        self.rankings = popularityRankings

        # Full training set, for evaluating overall properties
        fullTrain = dataset.build_full_trainset()
        self.fullTrainSet = fullTrain
        self.fullAntiTestSet = fullTrain.build_anti_testset()

        print("Number of users in the full trainset:", fullTrain.n_users)
        print("Number of items in the full trainset:", fullTrain.n_items)

        print("Full trainset: ", fullTrain)

        # 75/25 train/test split, for measuring accuracy
        print("Building train set and test set...")
        self.trainSet, self.testSet = train_test_split(dataset, test_size=.25, random_state=1)

        # "Leave one out" train/test split, for evaluating top-N recommenders
        print("Building LOOCV train set and test set...")
        looSplits = list(LeaveOneOut(n_splits=1, random_state=1).split(dataset))
        # Keep the last split produced (n_splits=1 is requested above).
        self.LOOCVTrain, self.LOOCVTest = looSplits[-1]

        self.LOOCVAntiTestSet = self.LOOCVTrain.build_anti_testset()

        # Item-item similarity model, for measuring diversity
        print("Building item similarity matrix...")
        self.simsAlgo = KNNBaseline(sim_options={'name': 'cosine', 'user_based': False})
        self.simsAlgo.fit(fullTrain)

    def GetFullTrainSet(self):
        """Trainset built from every rating in the dataset."""
        return self.fullTrainSet

    def GetFullAntiTestSet(self):
        """Anti-testset of the full trainset."""
        return self.fullAntiTestSet

    def GetAntiTestSetForUser(self, testSubject):
        """List ``(rawUserID, rawItemID, globalMean)`` for every item the user has no rating for.

        ``testSubject`` is converted with ``str()`` before the inner-id lookup on
        the full trainset.
        """
        fullTrain = self.fullTrainSet
        fillValue = fullTrain.global_mean
        innerUid = fullTrain.to_inner_uid(str(testSubject))
        ratedItems = {innerIid for (innerIid, _) in fullTrain.ur[innerUid]}

        unseen = []
        for innerIid in fullTrain.all_items():
            if innerIid in ratedItems:
                continue
            unseen.append((fullTrain.to_raw_uid(innerUid), fullTrain.to_raw_iid(innerIid), fillValue))
        return unseen

    def GetTrainSet(self):
        """Training part of the 75/25 split."""
        return self.trainSet

    def GetTestSet(self):
        """Test part of the 75/25 split."""
        return self.testSet

    def GetLOOCVTrainSet(self):
        """Training part of the leave-one-out split."""
        return self.LOOCVTrain

    def GetLOOCVTestSet(self):
        """Left-out ratings of the leave-one-out split."""
        return self.LOOCVTest

    def GetLOOCVAntiTestSet(self):
        """Anti-testset of the leave-one-out trainset."""
        return self.LOOCVAntiTestSet

    def GetSimilarities(self):
        """The fitted item-based KNNBaseline model."""
        return self.simsAlgo

    def GetPopularityRankings(self):
        """The popularity rankings passed to the constructor."""
        return self.rankings


In [7]:

class EvaluatedAlgorithm:
    """Couples a recommender algorithm with a display name and runs the metric suite on it."""

    def __init__(self, algorithm, name):
        self.name = name
        self.algorithm = algorithm

    def Evaluate(self, evaluationData, doTopN, n=10, verbose=True):
        """Fit the algorithm on the splits in ``evaluationData`` and collect metrics.

        Args:
            evaluationData: object exposing the Get*Set / GetSimilarities /
                GetPopularityRankings accessors used below.
            doTopN: when true, also compute the top-N metrics (HR, cHR, ARHR,
                Coverage, Diversity, Novelty); this refits the algorithm twice more.
            n: length of each user's recommendation list.
            verbose: print progress messages.

        Returns:
            dict of metric name -> value; always "RMSE" and "MAE", plus the
            top-N metrics when ``doTopN`` is true.
        """
        def report(message):
            if verbose:
                print(message)

        algo = self.algorithm
        metrics = {}

        # Rating accuracy on the train/test split
        report("Evaluating accuracy...")
        algo.fit(evaluationData.GetTrainSet())
        testPredictions = algo.test(evaluationData.GetTestSet())
        metrics["RMSE"] = RecommenderMetrics.RMSE(testPredictions)
        metrics["MAE"] = RecommenderMetrics.MAE(testPredictions)

        if doTopN:
            # Top-N quality with leave-one-out testing
            report("Evaluating top-N with leave-one-out...")
            algo.fit(evaluationData.GetLOOCVTrainSet())
            heldOut = algo.test(evaluationData.GetLOOCVTestSet())
            # Predict every rating that is not in the training set, then rank per user.
            # `candidates` and `topN` are deliberately re-bound further down so the
            # first (large) lists are released before the second pass builds its own.
            candidates = algo.test(evaluationData.GetLOOCVAntiTestSet())
            topN = RecommenderMetrics.GetTopN(candidates, n)
            report("Computing hit-rate and rank metrics...")
            # How often a left-out product shows up in the user's list
            metrics["HR"] = RecommenderMetrics.HitRate(topN, heldOut)
            # Same, via the cumulative variant (default cutoff)
            metrics["cHR"] = RecommenderMetrics.CumulativeHitRate(topN, heldOut)
            # Hit rate weighted by the position of the hit
            metrics["ARHR"] = RecommenderMetrics.AverageReciprocalHitRank(topN, heldOut)

            # Properties of the recommendations when trained on the full data set
            report("Computing recommendations with full data set...")
            algo.fit(evaluationData.GetFullTrainSet())
            candidates = algo.test(evaluationData.GetFullAntiTestSet())
            topN = RecommenderMetrics.GetTopN(candidates, n)
            report("Analyzing coverage, diversity, and novelty...")
            # User coverage with a minimum predicted rating of 4.0
            metrics["Coverage"] = RecommenderMetrics.UserCoverage(
                topN,
                evaluationData.GetFullTrainSet().n_users,
                ratingThreshold=4.0,
            )
            # Diversity of the recommendations
            metrics["Diversity"] = RecommenderMetrics.Diversity(topN, evaluationData.GetSimilarities())
            # Novelty (average popularity rank of the recommendations)
            metrics["Novelty"] = RecommenderMetrics.Novelty(topN, evaluationData.GetPopularityRankings())

        report("Analysis complete.")

        return metrics

    def GetName(self):
        """Display name given at construction."""
        return self.name

    def GetAlgorithm(self):
        """The wrapped algorithm object."""
        return self.algorithm



In [8]:


class Evaluator:
    """Runs a set of named algorithms against one EvaluationData and prints the results."""

    def __init__(self, dataset, rankings, amazonReview):
        """Build the evaluation splits and remember the product-title lookup.

        Args:
            dataset: Surprise dataset passed on to EvaluationData.
            rankings: popularity rankings passed on to EvaluationData.
            amazonReview: object providing ``getProductName(productID)``.
        """
        # Per-instance list. A class-level list would be shared by every Evaluator,
        # so algorithms added to one evaluator would show up in all the others.
        self.algorithms = []
        # Build the evaluation splits for the dataset.
        ed = EvaluationData(dataset, rankings)
        self.dataset = ed
        # Kept so SampleTopNRecs can turn product IDs into titles.
        self.amazonReview = amazonReview

    def AddAlgorithm(self, algorithm, name):
        """Register ``algorithm`` under ``name`` for later evaluation."""
        alg = EvaluatedAlgorithm(algorithm, name)
        self.algorithms.append(alg)

    def Evaluate(self, doTopN):
        """Evaluate every registered algorithm and print a results table plus a legend."""
        results = {}
        for algorithm in self.algorithms:
            print("Evaluating ", algorithm.GetName(), "...")
            results[algorithm.GetName()] = algorithm.Evaluate(self.dataset, doTopN)

        # Print the results
        print("\n")

        if doTopN:
            print("{:<15} {:<10} {:<10} {:<10} {:<10} {:<10} {:<10} {:<10} {:<10}".format(
                "Algorithm", "RMSE", "MAE", "HR", "cHR", "ARHR", "Coverage", "Diversity", "Novelty"))
            for name, metrics in results.items():
                print("{:<15} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f}".format(
                    name, metrics["RMSE"], metrics["MAE"], metrics["HR"], metrics["cHR"], metrics["ARHR"],
                    metrics["Coverage"], metrics["Diversity"], metrics["Novelty"]))
        else:
            print("{:<15} {:<10} {:<10}".format("Algorithm", "RMSE", "MAE"))
            for name, metrics in results.items():
                print("{:<15} {:<10.4f} {:<10.4f}".format(name, metrics["RMSE"], metrics["MAE"]))

        print("\nLegend:\n")
        print("RMSE:      Root Mean Squared Error. Lower values mean better accuracy.")
        print("MAE:       Mean Absolute Error. Lower values mean better accuracy.")
        if doTopN:
            print("HR:        Hit Rate; how often we are able to recommend a left-out rating. Higher is better.")
            print("cHR:       Cumulative Hit Rate; hit rate, confined to ratings above a certain threshold. Higher is better.")
            print("ARHR:      Average Reciprocal Hit Rank - Hit rate that takes the ranking into account. Higher is better.")
            print("Coverage:  Ratio of users for whom recommendations above a certain threshold exist. Higher is better.")
            print("Diversity: 1-S, where S is the average similarity score between every possible pair of recommendations")
            print("           for a given user. Higher means more diverse.")
            print("Novelty:   Average popularity rank of recommended items. Higher means more novel.")

    def SampleTopNRecs(self, testSubject, k=10):
        """Print the ``k`` highest-estimated unseen products for ``testSubject``, per algorithm.

        Each algorithm is refitted on the full trainset before predicting.
        """
        for algo in self.algorithms:
            print("\nUsing recommender ", algo.GetName())

            print("\nBuilding recommendation model...")
            trainSet = self.dataset.GetFullTrainSet()
            algo.GetAlgorithm().fit(trainSet)

            print("Computing recommendations...")
            testSet = self.dataset.GetAntiTestSetForUser(testSubject)

            predictions = algo.GetAlgorithm().test(testSet)

            print("\nWe recommend:")
            recommendations = [
                (productID, estimatedRating)
                for userID, productID, actualRating, estimatedRating, _ in predictions
            ]
            recommendations.sort(key=lambda x: x[1], reverse=True)

            for productID, rating in recommendations[:k]:
                # Resolve the product title from its ID via AmazonReview.
                productName = self.amazonReview.getProductName(productID)
                if productName:
                    print(f"{productName}: {rating:.2f}")
                else:
                    print(f"Unknown Product (ID: {productID}): {rating:.2f}")


In [10]:
import numpy as np
import tensorflow as tf
import os

class RBM(object):
    """Restricted Boltzmann Machine over one-hot rating vectors, trained eagerly with TensorFlow.

    The visible layer is a flattened array of per-item groups of ``ratingValues``
    binary units; ``MakeGraph`` reshapes it back into those groups.
    """

    def __init__(self, visibleDimensions, epochs=20, hiddenDimensions=50, ratingValues=10, learningRate=0.001, batchSize=100):
        """Store hyper-parameters only; variables are created by Train() or load_model()."""
        self.visibleDimensions = visibleDimensions
        self.epochs = epochs
        self.hiddenDimensions = hiddenDimensions
        self.ratingValues = ratingValues
        self.learningRate = learningRate
        self.batchSize = batchSize

    def Train(self, X):
        """Initialise weights and biases, then run ``epochs`` passes over ``X`` in mini-batches.

        Args:
            X: array-like with one row per user and ``visibleDimensions`` columns.
        """
        # Initialise weights uniformly in [-maxWeight, maxWeight). The bound is kept
        # positive so minval < maxval, as tf.random.uniform documents.
        maxWeight = 4.0 * np.sqrt(6.0 / (self.hiddenDimensions + self.visibleDimensions))
        # dtype is passed by keyword: tf.Variable's second positional parameter is `trainable`.
        self.weights = tf.Variable(
            tf.random.uniform([self.visibleDimensions, self.hiddenDimensions], minval=-maxWeight, maxval=maxWeight),
            dtype=tf.float32, name="weights")
        self.hiddenBias = tf.Variable(tf.zeros([self.hiddenDimensions], tf.float32), name="hiddenBias")
        self.visibleBias = tf.Variable(tf.zeros([self.visibleDimensions], tf.float32), name="visibleBias")

        # Converting X does not depend on the epoch, so do it once.
        trX = np.array(X)
        for epoch in range(self.epochs):

            for i in range(0, trX.shape[0], self.batchSize):
                epochX = trX[i:i+self.batchSize]
                self.MakeGraph(epochX)

            print("Trained epoch ", epoch)

    def GetRecommendations(self, inputUser):
        """Return the reconstructed visible activations for the first row of ``inputUser``."""
        feed = self.MakeHidden(inputUser)
        rec = self.MakeVisible(feed)
        return rec[0]

    def MakeGraph(self, inputUser):
        """Apply one contrastive-divergence (k=1) update to weights and biases for a batch.

        Despite the name, this executes immediately: the ``assign_add`` calls below
        modify the variables as soon as they run.
        """
        # Perform Gibbs Sampling for Contrastive Divergence, per the paper we assume k=1 instead of iterating over the
        # forward pass multiple times since it seems to work just fine

        # Forward pass
        # Sample hidden layer given visible...
        # Get tensor of hidden probabilities
        hProb0 = tf.nn.sigmoid(tf.matmul(inputUser, self.weights) + self.hiddenBias)
        # Sample from all of the distributions
        hSample = tf.nn.relu(tf.sign(hProb0 - tf.random.uniform(tf.shape(hProb0))))
        # Stitch it together
        forward = tf.matmul(tf.transpose(inputUser), hSample)

        # Backward pass
        # Reconstruct visible layer given hidden layer sample
        v = tf.matmul(hSample, tf.transpose(self.weights)) + self.visibleBias

        # Build up our mask for missing ratings
        vMask = tf.sign(inputUser) # Make sure everything is 0 or 1
        vMask3D = tf.reshape(vMask, [tf.shape(v)[0], -1, self.ratingValues]) # Reshape into arrays of individual ratings
        vMask3D = tf.reduce_max(vMask3D, axis=[2], keepdims=True) # Use reduce_max to either give us 1 for ratings that exist, and 0 for missing ratings

        # Extract rating vectors for each individual set of rating binary values
        v = tf.reshape(v, [tf.shape(v)[0], -1, self.ratingValues])
        # NOTE(review): masked groups become all-zero logits, so softmax yields a
        # uniform distribution for them rather than zeros - confirm this is intended.
        vProb = tf.nn.softmax(v * vMask3D) # Apply softmax activation function
        vProb = tf.reshape(vProb, [tf.shape(v)[0], -1]) # And shove them back into the flattened state. Reconstruction is done now.
        # Stitch it together to define the backward pass and updated hidden biases
        hProb1 = tf.nn.sigmoid(tf.matmul(vProb, self.weights) + self.hiddenBias)
        backward = tf.matmul(tf.transpose(vProb), hProb1)

        # Run the forward and backward passes, and update the weights
        weightUpdate = self.weights.assign_add(self.learningRate * (forward - backward))
        # Update hidden bias, minimizing the divergence in the hidden nodes
        hiddenBiasUpdate = self.hiddenBias.assign_add(self.learningRate * tf.reduce_mean(hProb0 - hProb1, 0))
        # Update the visible bias, minimizing divergence in the visible results
        visibleBiasUpdate = self.visibleBias.assign_add(self.learningRate * tf.reduce_mean(inputUser - vProb, 0))

        self.update = [weightUpdate, hiddenBiasUpdate, visibleBiasUpdate]

    def MakeHidden(self, inputUser):
        """Return hidden-unit probabilities for ``inputUser`` without changing the model.

        This is a pure forward pass. It must not call MakeGraph: that method applies
        a training update, which would alter the weights on every prediction and
        make results depend on the order in which users are scored.
        """
        hidden = tf.nn.sigmoid(tf.matmul(inputUser, self.weights) + self.hiddenBias)
        return hidden

    def MakeVisible(self, feed):
        """Return visible-unit activations reconstructed from hidden activations ``feed``."""
        visible = tf.nn.sigmoid(tf.matmul(feed, tf.transpose(self.weights)) + self.visibleBias)
        return visible

    def save_model(self, path):
        """
        Save the RBM state (weights and both biases) as .npy files under ``path``.
        """
        # exist_ok avoids the check-then-create race of a separate os.path.exists() test.
        os.makedirs(path, exist_ok=True)

        # The variables are stored as plain NumPy arrays.
        np.save(os.path.join(path, 'weights.npy'), self.weights.numpy())
        np.save(os.path.join(path, 'hiddenBias.npy'), self.hiddenBias.numpy())
        np.save(os.path.join(path, 'visibleBias.npy'), self.visibleBias.numpy())
        print(f"Model saved to {path}")

    def load_model(self, path):
        """
        Load the RBM state from the .npy files under ``path``.

        Raises:
            FileNotFoundError: if ``path`` does not exist.
        """
        if not os.path.exists(path):
            raise FileNotFoundError(f"Model path {path} does not exist.")

        # NOTE(review): the loaded shapes are not checked against visibleDimensions /
        # hiddenDimensions; the caller must load a model that matches.
        self.weights = tf.Variable(np.load(os.path.join(path, 'weights.npy')), dtype=tf.float32)
        self.hiddenBias = tf.Variable(np.load(os.path.join(path, 'hiddenBias.npy')), dtype=tf.float32)
        self.visibleBias = tf.Variable(np.load(os.path.join(path, 'visibleBias.npy')), dtype=tf.float32)
        print(f"Model loaded from {path}")

In [11]:
from surprise import AlgoBase
from surprise import PredictionImpossible
import numpy as np
import os

class RBMAlgorithm(AlgoBase):
    """Surprise algorithm that predicts ratings with an RBM over one-hot rating buckets.

    Each rating is mapped to one of 10 buckets (``int(rating * 2) - 1``), and a
    predicted rating is the softmax-weighted mean bucket mapped back with
    ``(bucket + 1) * 0.5``.
    """

    def __init__(self, epochs=20, hiddenDim=100, learningRate=0.001, batchSize=100, model_path='rbm_model', use_saved_model=False, sim_options=None):
        """Store hyper-parameters.

        ``sim_options`` is accepted but not used by this algorithm; its default is
        ``None`` rather than a shared mutable ``{}``.
        """
        AlgoBase.__init__(self)
        self.epochs = epochs
        self.hiddenDim = hiddenDim
        self.learningRate = learningRate
        self.batchSize = batchSize
        self.model_path = model_path
        self.rbm = None
        self.use_saved_model = use_saved_model
        # Filled by fit(predictForAllUsers=True) or GetRecommendations().
        self.predictedRatings = None

    def softmax(self, x):
        """Softmax of ``x`` along axis 0 (shifted by the maximum so large inputs cannot overflow)."""
        x = np.asarray(x)
        exps = np.exp(x - np.max(x, axis=0, keepdims=True))
        return exps / np.sum(exps, axis=0)

    def _buildTrainingMatrix(self, trainset):
        """Return a [n_users, n_items * 10] float32 array one-hot encoding every rating's bucket."""
        matrix = np.zeros([trainset.n_users, trainset.n_items, 10], dtype=np.float32)

        for (uid, iid, rating) in trainset.all_ratings():
            bucket = int(float(rating) * 2.0) - 1
            # Clamp to the valid buckets: a rating below 0.5 would otherwise give
            # index -1 and silently mark the highest bucket.
            bucket = min(max(bucket, 0), 9)
            matrix[int(uid), int(iid), bucket] = 1

        # Flatten to a 2D array
        return np.reshape(matrix, [matrix.shape[0], -1])

    def _predictRatingsForUser(self, userVector, numItems):
        """Return the expected rating for each of ``numItems`` items given one user's encoded row."""
        recs = self.rbm.GetRecommendations([userVector])
        scores = np.reshape(np.asarray(recs), [numItems, 10]).astype(np.float64)
        # Per-item softmax weights; the mean bucket is normalised by the weight sum,
        # so the weights do not need normalising first.
        weights = np.exp(scores - scores.max(axis=1, keepdims=True))
        expectedBucket = (weights @ np.arange(10)) / weights.sum(axis=1)
        return (expectedBucket + 1) * 0.5

    def fit(self, trainset, predictForAllUsers=True):
        """Train (or load) the RBM for ``trainset`` and optionally precompute all predictions.

        Args:
            trainset: Surprise trainset.
            predictForAllUsers: when true, fill ``predictedRatings`` for every user/item pair.

        Returns:
            self
        """
        AlgoBase.fit(self, trainset)

        numUsers = trainset.n_users
        numItems = trainset.n_items

        trainingMatrix = self._buildTrainingMatrix(trainset)

        # If a previously saved model exists and reuse is requested, load it.
        # NOTE(review): a saved model is only meaningful for the trainset it was
        # trained on; nothing here checks that the loaded weights match this one.
        if os.path.exists(self.model_path) and self.use_saved_model:
            print("Loading trained model...")
            self.rbm = RBM(trainingMatrix.shape[1], hiddenDimensions=self.hiddenDim, learningRate=self.learningRate, batchSize=self.batchSize, epochs=self.epochs)
            self.rbm.load_model(self.model_path)
        else:
            # Create a new RBM, train it and save it.
            print("Training new model...")
            self.rbm = RBM(trainingMatrix.shape[1], hiddenDimensions=self.hiddenDim, learningRate=self.learningRate, batchSize=self.batchSize, epochs=self.epochs)
            self.rbm.Train(trainingMatrix)
            self.rbm.save_model(self.model_path)

        # Precompute and store the predictions.
        if predictForAllUsers:
            self.predictedRatings = np.zeros([numUsers, numItems], dtype=np.float32)
            for uiid in range(numUsers):
                if uiid % 50 == 0:
                    print("Processing user ", uiid)
                self.predictedRatings[uiid, :] = self._predictRatingsForUser(trainingMatrix[uiid], numItems)

        return self

    def estimate(self, u, i):
        """Return the precomputed rating for inner user ``u`` and inner item ``i``.

        Raises:
            PredictionImpossible: if the user or item is unknown, if no predictions
                have been computed yet, or if the stored value is below 0.001.
        """
        if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
            raise PredictionImpossible('User and/or item is unkown.')

        # fit(predictForAllUsers=False) leaves no prediction table behind.
        if getattr(self, 'predictedRatings', None) is None:
            raise PredictionImpossible('No predictions have been computed.')

        rating = self.predictedRatings[u, i]

        if (rating < 0.001):
            raise PredictionImpossible('No valid prediction exists.')

        return rating

    def GetRecommendations(self, trainset, userId):
        """Recompute ``predictedRatings`` for a single user using the already-fitted RBM.

        The prediction table is reset to zeros and only the row of ``userId``
        (converted with ``str()`` for the inner-id lookup) is filled in.
        Requires that fit() has created ``self.rbm``.
        """
        AlgoBase.fit(self, trainset)

        numUsers = trainset.n_users
        numItems = trainset.n_items

        trainingMatrix = self._buildTrainingMatrix(trainset)

        uiid = trainset.to_inner_uid(str(userId))

        userRatings = self._predictRatingsForUser(trainingMatrix[uiid], numItems)

        self.predictedRatings = np.zeros([numUsers, numItems], dtype=np.float32)
        self.predictedRatings[uiid, :] = userRatings

In [12]:

from surprise import NormalPredictor
from surprise.model_selection import GridSearchCV
import gc
import time
import psutil
import os
import random
import numpy as np


def LoadAmazonReviewData():
    """Create an AmazonReview loader and return ``(loader, ratings dataset, popularity ranks)``."""
    loader = AmazonReview()

    print("Loading Amazon ratings...")
    ratingsData = loader.loadAmazonReviewDataset()

    # The popularity ranks are computed up front; they feed the novelty metric.
    print("\nComputing product popularity ranks so we can measure novelty later...")
    popularityRanks = loader.getPopularityRanks()

    return (loader, ratingsData, popularityRanks)

# Seed every random source used below. The RBM draws its initial weights and its
# Gibbs samples from tf.random, so NumPy/`random` seeds alone do not make runs repeatable.
import tensorflow as tf  # already imported by the RBM cell; repeated so this cell stands alone

np.random.seed(0)
random.seed(0)
tf.random.set_seed(0)

# Load up common data set for the recommender algorithms.
# Note: `evaluationData` here is the Surprise dataset, not an EvaluationData instance.
(amazonReview, evaluationData, rankings) = LoadAmazonReviewData()

# 3-fold grid search over the RBM's hidden size and learning rate.
print("Searching for best parameters...")
param_grid = {'hiddenDim': [120, 50], 'learningRate': [0.00001, 0.05]}
gs = GridSearchCV(RBMAlgorithm, param_grid, measures=['rmse', 'mae'], cv=3)

gs.fit(evaluationData)

# best RMSE score
print("Best RMSE score attained: ", gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

# Construct an Evaluator to, you know, evaluate them
evaluator = Evaluator(evaluationData, rankings, amazonReview)

params = gs.best_params['rmse']
RBMtuned = RBMAlgorithm(hiddenDim = params['hiddenDim'], learningRate = params['learningRate'])
# RBMtuned = RBMAlgorithm(hiddenDim = 5000, learningRate = 0.00001)

evaluator.AddAlgorithm(RBMtuned, "RBM - Tuned")

# RBMUntuned = RBMAlgorithm()
# evaluator.AddAlgorithm(RBMUntuned, "RBM - Untuned")

# Fight!
evaluator.Evaluate(True)

Loading Amazon ratings...

Computing product popularity ranks so we can measure novelty later...
Searching for best parameters...
Training new model...
Trained epoch  0
Trained epoch  1
Trained epoch  2
Trained epoch  3
Trained epoch  4
Trained epoch  5
Trained epoch  6
Trained epoch  7
Trained epoch  8
Trained epoch  9
Trained epoch  10
Trained epoch  11
Trained epoch  12
Trained epoch  13
Trained epoch  14
Trained epoch  15
Trained epoch  16
Trained epoch  17
Trained epoch  18
Trained epoch  19
Model saved to rbm_model
Processing user  0
Processing user  50
Processing user  100
Processing user  150
Processing user  200
Processing user  250
Processing user  300
Processing user  350
Processing user  400
Processing user  450
Processing user  500
Processing user  550
Processing user  600
Processing user  650
Processing user  700
Processing user  750
Processing user  800
Processing user  850
Processing user  900
Processing user  950
Processing user  1000
Processing user  1050
Processing u