In [2]:
from google.colab import drive
drive.mount('/content/drive')

ModuleNotFoundError: No module named 'google.colab'

In [None]:
!pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/154.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [1]:
import os
import csv
import sys

from surprise import Dataset
from surprise import Reader

from collections import defaultdict

class AmazonReview:
    productID_to_name = {}
    name_to_productID = {}

    # Tập này sẽ được xử lý như sau: file Beaty.csv gốc sẽ được lọc ra các cột cần thiết (userId, productId, score, time)
    ratingsPath = '/home/haphuthinh/Workplace/School_project/do-an-1/Recommender-system-UIT/AmazonRatingData/Beauty-rating.csv'

    # Tập này sẽ được xử lý như sau: file Beaty.csv gốc sẽ được lọc ra các cột cần thiết (productId, title)
    productsPath = '/home/haphuthinh/Workplace/School_project/do-an-1/Recommender-system-UIT/AmazonRatingData/Beauty-product.csv'

    def loadAmazonReviewDataset(self):
        # Định dạng reader cho tập ratings
        reader = Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)

        # Tải dữ liệu ratings vào Surprise Dataset
        ratingsDataset = Dataset.load_from_file(self.ratingsPath, reader=reader)

        # Đọc file products để xây dựng ánh xạ productID ↔ title
        self.productID_to_name = {}
        self.name_to_productID = {}
        with open(self.productsPath, newline='', encoding='ISO-8859-1') as csvfile:
            productReader = csv.reader(csvfile)
            next(productReader)  # Bỏ qua header
            for row in productReader:
                productID = row[0]
                productName = row[1]
                self.productID_to_name[productID] = productName
                self.name_to_productID[productName] = productID

        return ratingsDataset

    def getPopularityRanks(self):
        # Tính độ phổ biến của sản phẩm từ tập ratings
        ratings = defaultdict(int)
        rankings = defaultdict(int)
        with open(self.ratingsPath, newline='', encoding='utf-8') as csvfile:
            reviewReader = csv.reader(csvfile)
            next(reviewReader)
            for row in reviewReader:
                productID = row[1]
                ratings[productID] += 1
        rank = 1
        for productID, ratingCount in sorted(ratings.items(), key=lambda x: x[1], reverse=True):
            rankings[productID] = rank
            rank += 1
        return rankings

    def getProductName(self, productID):
        # Lấy tên sản phẩm từ productID
        return self.productID_to_name.get(productID, "")

    def getProductID(self, productName):
        # Lấy productID từ tên sản phẩm
        return self.name_to_productID.get(productName, 0)

# Example usage:
if __name__ == "__main__":
    amazonReview = AmazonReview()
    dataset = amazonReview.loadAmazonReviewDataset()
    print("Dataset loaded successfully.")
    print("Top 5 popular products:", list(amazonReview.getPopularityRanks().items())[:5])

    # Lấy tên sản phẩm từ ID
    product_name = amazonReview.getProductName("B00012U9F8")
    print(f"Product Name: {product_name}")

    # Lấy ID từ tên sản phẩm
    product_id = amazonReview.getProductID("Island Essence Lotion")
    print(f"Product ID: {product_id}")


Dataset loaded successfully.
Top 5 popular products: [('B000FS05VG', 1), ('B0002ZW5UQ', 2), ('B000ME2YWG', 3), ('B000KK53L6', 4), ('B000052ZTY', 5)]
Product Name: Island Essence Lotion
Product ID: B00012U9GC


In [2]:
import numpy as np
import tensorflow as tf

class AutoRec(object):

    def __init__(self, visibleDimensions, epochs=200, hiddenDimensions=50, learningRate=0.1, batchSize=100):

        self.visibleDimensions = visibleDimensions
        self.epochs = epochs
        self.hiddenDimensions = hiddenDimensions
        self.learningRate = learningRate
        self.batchSize = batchSize
        self.optimizer = tf.keras.optimizers.RMSprop(self.learningRate)
        
                
    def Train(self, X):
        
        self.initialize_weights_biases()
        for epoch in range(self.epochs):
            for i in range(0, X.shape[0], self.batchSize):
                epochX = X[i:i+self.batchSize]
                self.run_optimization(epochX)


            print("Trained epoch ", epoch)

    def GetRecommendations(self, inputUser):
                
        # Feed through a single user and return predictions from the output layer.
        rec = self.neural_net(inputUser)
        
        # It is being used as the return type is Eager Tensor.
        return rec[0]
    
    def initialize_weights_biases(self):
        # Create varaibles for weights for the encoding (visible->hidden) and decoding (hidden->output) stages, randomly initialized
        self.weights = {
            'h1': tf.Variable(tf.random.normal([self.visibleDimensions, self.hiddenDimensions])),
            'out': tf.Variable(tf.random.normal([self.hiddenDimensions, self.visibleDimensions]))
            }
        
        # Create biases
        self.biases = {
            'b1': tf.Variable(tf.random.normal([self.hiddenDimensions])),
            'out': tf.Variable(tf.random.normal([self.visibleDimensions]))
            }
    
    def neural_net(self, inputUser):

        #tf.set_random_seed(0)
        
        # Initialization of weights and biases was moved out to the initialize_weights_biases function above
        # This lets us avoid resetting them on every batch of training, which was a bug in earlier versions of
        # this script.
        
        # Create the input layer
        self.inputLayer = inputUser
        
        # hidden layer
        hidden = tf.nn.sigmoid(tf.add(tf.matmul(self.inputLayer, self.weights['h1']), self.biases['b1']))
        
        # output layer for our predictions.
        self.outputLayer = tf.nn.sigmoid(tf.add(tf.matmul(hidden, self.weights['out']), self.biases['out']))
        
        return self.outputLayer
    
    def run_optimization(self, inputUser):
        with tf.GradientTape() as g:
            pred = self.neural_net(inputUser)
            loss = tf.keras.losses.MSE(inputUser, pred)
            
        trainable_variables = list(self.weights.values()) + list(self.biases.values())
        
        gradients = g.gradient(loss, trainable_variables)
        
        self.optimizer.apply_gradients(zip(gradients, trainable_variables))

from surprise import AlgoBase
from surprise import PredictionImpossible
import numpy as np

class AutoRecAlgorithm(AlgoBase):

    def __init__(self, epochs=100, hiddenDim=100, learningRate=0.01, batchSize=100, sim_options={}):
        AlgoBase.__init__(self)
        self.epochs = epochs
        self.hiddenDim = hiddenDim
        self.learningRate = learningRate
        self.batchSize = batchSize

    def fit(self, trainset):
        AlgoBase.fit(self, trainset)

        numUsers = trainset.n_users
        numItems = trainset.n_items
        
        trainingMatrix = np.zeros([numUsers, numItems], dtype=np.float32)
        
        for (uid, iid, rating) in trainset.all_ratings():
            trainingMatrix[int(uid), int(iid)] = rating / 5.0
        
        # Create an RBM with (num items * rating values) visible nodes
        autoRec = AutoRec(trainingMatrix.shape[1], hiddenDimensions=self.hiddenDim, learningRate=self.learningRate, batchSize=self.batchSize, epochs=self.epochs)
        autoRec.Train(trainingMatrix)

        self.predictedRatings = np.zeros([numUsers, numItems], dtype=np.float32)
        
        for uiid in range(trainset.n_users):
            if (uiid % 50 == 0):
                print("Processing user ", uiid)
            recs = autoRec.GetRecommendations([trainingMatrix[uiid]])
            
            for itemID, rec in enumerate(recs):
                self.predictedRatings[uiid, itemID] = rec * 5.0
        
        return self


    def estimate(self, u, i):

        if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
            raise PredictionImpossible('User and/or item is unkown.')
        
        rating = self.predictedRatings[u, i]
        
        if (rating < 0.001):
            raise PredictionImpossible('No valid prediction exists.')
            
        return rating
    

In [5]:
from surprise import accuracy
from collections import defaultdict
import itertools


class RecommenderMetrics:

    @staticmethod
    def MAE(predictions):
        return accuracy.mae(predictions, verbose=False)

    @staticmethod
    def RMSE(predictions):
        return accuracy.rmse(predictions, verbose=False)

    @staticmethod
    def GetTopN(predictions, n=10, minimumRating=4.0):
        topN = defaultdict(list)

        for userID, productID, actualRating, estimatedRating, _ in predictions:
            if estimatedRating >= minimumRating:
                topN[userID].append((productID, estimatedRating))

        for userID, ratings in topN.items():
            ratings.sort(key=lambda x: x[1], reverse=True)
            topN[userID] = ratings[:n]

        return topN

    @staticmethod
    def HitRate(topNPredicted, leftOutPredictions):
        hits = 0
        total = 0

        for leftOut in leftOutPredictions:
            userID = leftOut[0]
            leftOutProductID = leftOut[1]
            hit = any(leftOutProductID == productID for productID, _ in topNPredicted[userID])
            if hit:
                hits += 1
            total += 1

        return hits / total

    @staticmethod
    def CumulativeHitRate(topNPredicted, leftOutPredictions, ratingCutoff=0):
        hits = 0
        total = 0

        for userID, leftOutProductID, actualRating, _, _ in leftOutPredictions:
            if actualRating >= ratingCutoff:
                hit = any(leftOutProductID == productID for productID, _ in topNPredicted[userID])
                if hit:
                    hits += 1
                total += 1

        return hits / total

    @staticmethod
    def AverageReciprocalHitRank(topNPredicted, leftOutPredictions):
        summation = 0
        total = 0

        for userID, leftOutProductID, _, _, _ in leftOutPredictions:
            rank = 0
            for idx, (productID, _) in enumerate(topNPredicted[userID]):
                if productID == leftOutProductID:
                    rank = idx + 1
                    break
            if rank > 0:
                summation += 1.0 / rank
            total += 1

        return summation / total

    @staticmethod
    def Diversity(topNPredicted, simsAlgo):
        n = 0
        total = 0
        simsMatrix = simsAlgo.compute_similarities()

        for userID in topNPredicted.keys():
            pairs = itertools.combinations(topNPredicted[userID], 2)
            for product1, product2 in pairs:
                innerID1 = simsAlgo.trainset.to_inner_iid(product1[0])
                innerID2 = simsAlgo.trainset.to_inner_iid(product2[0])
                similarity = simsMatrix[innerID1][innerID2]
                total += similarity
                n += 1

        return (1 - total / n) if n > 0 else 0

    @staticmethod
    def Novelty(topNPredicted, rankings):
        n = 0
        total = 0

        for userID in topNPredicted.keys():
            for productID, _ in topNPredicted[userID]:
                rank = rankings[productID]
                total += rank
                n += 1

        return total / n if n > 0 else 0

    @staticmethod
    def UserCoverage(topNPredicted, numUsers, ratingThreshold=0):
        hits = 0
        for userID in topNPredicted.keys():
            hit = False
            # Iterate over the recommended products for each user
            for productID, predictedRating in topNPredicted[userID]:
                if predictedRating >= ratingThreshold:
                    hit = True
                    break
            if hit:
                hits += 1

        return hits / numUsers

In [6]:
from surprise.model_selection import train_test_split, LeaveOneOut
from surprise import KNNBaseline
import gc
from scipy.sparse import coo_matrix


class EvaluationData:
    def __init__(self, dataset, popularityRankings):
        print("Creating an evaluation data...")
        self.rankings = popularityRankings

        # Build a full training set for evaluating overall properties
        self.fullTrainSet = dataset.build_full_trainset()
        self.fullAntiTestSet = self.fullTrainSet.build_anti_testset()

        print("Number of users in the full trainset:", self.fullTrainSet.n_users)
        print("Number of items in the full trainset:", self.fullTrainSet.n_items)

        print("Full trainset: ", self.fullTrainSet)

        # Build a 75/25 train/test split for measuring accuracy
        print("Building train set and test set...")
        self.trainSet, self.testSet = train_test_split(dataset, test_size=.25, random_state=1)

        # Build a "leave one out" train/test split for evaluating top-N recommenders
        print("Building LOOCV train set and test set...")
        LOOCV = LeaveOneOut(n_splits=1, random_state=1)
        for train, test in LOOCV.split(dataset):
            self.LOOCVTrain = train
            self.LOOCVTest = test

        self.LOOCVAntiTestSet = self.LOOCVTrain.build_anti_testset()

        # Compute similarity matrix between items for measuring diversity
        print("Building item similarity matrix...")
        sim_options = {'name': 'cosine', 'user_based': False}
        self.simsAlgo = KNNBaseline(sim_options=sim_options)
        self.simsAlgo.fit(self.fullTrainSet)


    def GetFullTrainSet(self):
        return self.fullTrainSet

    def GetFullAntiTestSet(self):
        return self.fullAntiTestSet

    def GetAntiTestSetForUser(self, testSubject):
        trainset = self.fullTrainSet
        fill = trainset.global_mean
        anti_testset = []
        u = trainset.to_inner_uid(str(testSubject))
        user_items = set([j for (j, _) in trainset.ur[u]])
        anti_testset += [(trainset.to_raw_uid(u), trainset.to_raw_iid(i), fill) for
                         i in trainset.all_items() if i not in user_items]
        return anti_testset

    def GetTrainSet(self):
        return self.trainSet

    def GetTestSet(self):
        return self.testSet

    def GetLOOCVTrainSet(self):
        return self.LOOCVTrain

    def GetLOOCVTestSet(self):
        return self.LOOCVTest

    def GetLOOCVAntiTestSet(self):
        return self.LOOCVAntiTestSet

    def GetSimilarities(self):
        return self.simsAlgo

    def GetPopularityRankings(self):
        return self.rankings


In [10]:
from surprise import NormalPredictor
import gc
import time
import psutil
import os
import random
import numpy as np

def LoadAmazonReviewData():
    amazonReview = AmazonReview()
    print("Loading Amazon ratings...")
    # Load dataset
    data = amazonReview.loadAmazonReviewDataset()
    print("\nComputing product popularity ranks so we can measure novelty later...")
    # Get product popularity ranks
    rankings = amazonReview.getPopularityRanks()
    return (amazonReview, data, rankings)

np.random.seed(0)
random.seed(0)

# Load up common data set for the recommender algorithms
(amazonReview, evaluationData, rankings) = LoadAmazonReviewData()

# Construct an Evaluator to, you know, evaluate them
evaluator = Evaluator(evaluationData, rankings, amazonReview)

#Autoencoder
AutoRec = AutoRecAlgorithm()
evaluator.AddAlgorithm(AutoRec, "AutoRec")

# Fight!
evaluator.Evaluate(True)


# Clean up memory
gc.collect()

# Start timing
start_time = time.time()

##########################################
evaluator.SampleTopNRecs("A3PB71Q63XF43G")
##########################################

# End timing
end_time = time.time()

# Calculate total time
total_time = end_time - start_time
print(f"Total time for recommending top N: {total_time} seconds")

# Get the process ID
pid = os.getpid()

# Get the process
process = psutil.Process(pid)

# Get the memory info
memory_info = process.memory_info()

# Print the memory usage
print(f"Memory usage: {memory_info.rss / (1024 * 1024)} MB")

# Clean up memory
gc.collect()

Loading Amazon ratings...

Computing product popularity ranks so we can measure novelty later...
Creating an evaluation data...
Number of users in the full trainset: 9238
Number of items in the full trainset: 1346
Full trainset:  <surprise.trainset.Trainset object at 0x7f211872cb38>
Building train set and test set...
Building LOOCV train set and test set...
Building item similarity matrix...
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating  AutoRec ...
Evaluating accuracy...


TypeError: 'AutoRecAlgorithm' object is not callable