Merge pull request #27 from guedes-joaofelipe/dev-guedesjoaofelipe

Including Non-negative Matrix Factorization and fixing RatingPredictionEvaluation
caserec · Jan 22, 2019 · 9ba8abf · 9ba8abf
2 parents 5606db9 + cfc0cf8
commit 9ba8abf
Show file tree

Hide file tree

Showing 24 changed files with 267 additions and 11 deletions.
diff --git a/.gitignore b/.gitignore
@@ -27,6 +27,7 @@ $RECYCLE.BIN/
 .DS_Store
 .AppleDouble
 .LSOverride
+*.ini
 
 # Thumbnails
 ._*
@@ -44,3 +45,12 @@ Temporary Items
 
 .idea/*
 .vscode/*
+*.pyc
+*.ipynb_checkpoints
+.ipynb_checkpoints
+
+
+# Ignoring wheel folders
+build/
+dist/
+CaseRecommender.egg-info/
diff --git a/README.md b/README.md
@@ -35,6 +35,8 @@ Rating Prediction:
 
 - Matrix Factorization (with and without baseline)
 
+- Non-negative Matrix Factorization
+
 - SVD
 
 - SVD++

diff --git a/README.rst b/README.rst
@@ -37,6 +37,8 @@ Rating Prediction:
 
 - SVD
 
+- Non-negative Matrix Factorization
+
 - SVD++
 
 - ItemKNN

diff --git a/caserec/__init__.py b/caserec/__init__.py
@@ -1 +0,0 @@
-__author__ = "Arthur Fortes"

diff --git a/caserec/evaluation/__pycache__/__init__.cpython-37.pyc b/caserec/evaluation/__pycache__/__init__.cpython-37.pyc
diff --git a/caserec/evaluation/__pycache__/base_evaluation.cpython-37.pyc b/caserec/evaluation/__pycache__/base_evaluation.cpython-37.pyc
diff --git a/caserec/evaluation/__pycache__/item_recomendation_functions.cpython-37.pyc b/caserec/evaluation/__pycache__/item_recomendation_functions.cpython-37.pyc
diff --git a/caserec/evaluation/__pycache__/item_recommendation.cpython-37.pyc b/caserec/evaluation/__pycache__/item_recommendation.cpython-37.pyc
diff --git a/caserec/evaluation/__pycache__/rating_prediction.cpython-37.pyc b/caserec/evaluation/__pycache__/rating_prediction.cpython-37.pyc
diff --git a/caserec/evaluation/base_evaluation.py b/caserec/evaluation/base_evaluation.py
@@ -18,7 +18,7 @@
 
 
 class BaseEvaluation(object):
-    def __init__(self, sep='\t', metrics=None, all_but_one_eval=False, verbose=True, as_table=False, table_sep='\t'):
+    def __init__(self, sep='\t', metrics=None, all_but_one_eval=False, verbose=True, as_table=False, table_sep='\t', save_eval_file = None):
         """
         Class to be base for evaluation strategies
 
@@ -143,7 +143,7 @@ def evaluate_folds(self, folds_dir, predictions_file_name, test_file_name, k_fol
 
         return folds_results
 
-    def print_results(self, evaluation_results):
+    def print_results(self, evaluation_results, save_eval_file = None):
         """
         Method to print the results
 

diff --git a/caserec/evaluation/item_recommendation.py b/caserec/evaluation/item_recommendation.py
@@ -125,6 +125,12 @@ def evaluate(self, predictions, test_set):
 
             })
 
+        # if (self.save_eval_file is not None):
+        #     # Saving evaluations to a file 
+        #     from caserec.utils.process_data import WriteFile
+
+        #     WriteFile(output_file=save_eval_file, data=)
+
         if self.verbose:
             self.print_results(eval_results)
 

diff --git a/caserec/evaluation/rating_prediction.py b/caserec/evaluation/rating_prediction.py
@@ -120,8 +120,8 @@ def evaluate(self, predictions, test_set):
             new_test_set['items_seen_by_user'] = new_test_set
             new_test_set['users'] = test_set['users']
 
-            ItemRecommendationEvaluation(n_ranks=self.n_rank,
-                                         all_but_one_eval=self.all_but_one_eval).evaluate_recommender(
-                new_predict_set, new_test_set)
+            eval_results = ItemRecommendationEvaluation(n_ranks=self.n_rank,
+                                         all_but_one_eval=self.all_but_one_eval,
+                                         metrics=self.metrics).evaluate_recommender(new_predict_set, new_test_set)
 
         return eval_results
diff --git a/caserec/recommenders/__pycache__/__init__.cpython-37.pyc b/caserec/recommenders/__pycache__/__init__.cpython-37.pyc
diff --git a/caserec/recommenders/rating_prediction/__pycache__/__init__.cpython-37.pyc b/caserec/recommenders/rating_prediction/__pycache__/__init__.cpython-37.pyc
diff --git a/caserec/recommenders/rating_prediction/__pycache__/base_rating_prediction.cpython-37.pyc b/caserec/recommenders/rating_prediction/__pycache__/base_rating_prediction.cpython-37.pyc
diff --git a/caserec/recommenders/rating_prediction/__pycache__/nnmf.cpython-37.pyc b/caserec/recommenders/rating_prediction/__pycache__/nnmf.cpython-37.pyc
diff --git a/caserec/recommenders/rating_prediction/base_rating_prediction.py b/caserec/recommenders/rating_prediction/base_rating_prediction.py
@@ -74,10 +74,15 @@ def read_files(self):
 
         """
 
-        self.train_set = ReadFile(self.train_file, sep=self.sep).read()
+        # Getting train_set as a dict_file = {'feedback': dict_feedback, 'users': list_users, 'items': list_items, 
+        #               'sparsity': sparsity, 'number_interactions': number_interactions, 'users_viewed_item': users_viewed_item, 'items_unobserved': items_unobserved,
+        #               'items_seen_by_user': items_seen_by_user, 'mean_value': mean_value, 'max_value': max(list_feedback), 'min_value': min(list_feedback)}
+        self.train_set = ReadFile(self.train_file, sep=self.sep).read() 
 
         if self.test_file is not None:
             self.test_set = ReadFile(self.test_file, sep=self.sep).read()
+
+            # Combining users/items from train and test set
             self.users = sorted(set(list(self.train_set['users']) + list(self.test_set['users'])))
             self.items = sorted(set(list(self.train_set['items']) + list(self.test_set['items'])))
         else:
@@ -93,7 +98,7 @@ def read_files(self):
 
     def create_matrix(self):
         """
-        Method to create a feedback matrix
+        Method to create a feedback matrix having users as rows and items as columns
 
         """
 

diff --git a/caserec/recommenders/rating_prediction/nnmf.py b/caserec/recommenders/rating_prediction/nnmf.py
@@ -0,0 +1,192 @@
+# coding=utf-8
+"""
+    Non-negative Matrix Factorization
+    [Rating Prediction]
+
+    Literature:
+        Badrul Sarwar , George Karypis , Joseph Konstan , John Riedl:
+        Incremental Singular Value Decomposition Algorithms for Highly Scalable Recommender Systems
+        Fifth International Conference on Computer and Information Science 2002.
+        http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.3.7894
+
+"""
+
+# © 2018. Case Recommender (MIT License)
+
+import numpy as np
+from sklearn.decomposition import NMF 
+
+from caserec.recommenders.rating_prediction.base_rating_prediction import BaseRatingPrediction
+from caserec.utils.extra_functions import timed
+
+__author__ = 'Joao Felipe Guedes <guedes.joaofelipe@poli.ufrj.br>'
+
+
+class NNMF(BaseRatingPrediction):
+    def __init__(self, train_file=None, test_file=None, output_file=None, factors=10, sep='\t', output_sep='\t',
+                 random_seed=None):
+        """
+        Non-negative Matrix Factorization for rating prediction
+
+        The user x item feedback matrix is approximated by the product of two non-negative factor matrices
+        with `factors` latent dimensions each; that product is used as the prediction matrix.
+
+        Usage::
+
+            >> NNMF(train, test).compute()
+
+        :param train_file: File which contains the train set. This file needs to have at least 3 columns
+        (user item feedback_value).
+        :type train_file: str
+
+        :param test_file: File which contains the test set. This file needs to have at least 3 columns
+        (user item feedback_value).
+        :type test_file: str, default None
+
+        :param output_file: File with dir to write the final predictions
+        :type output_file: str, default None
+
+        :param factors: Number of latent factors per user/item
+        :type factors: int, default 10
+
+        :param sep: Delimiter for input files
+        :type sep: str, default '\t'
+
+        :param output_sep: Delimiter for output file
+        :type output_sep: str, default '\t'
+
+        :param random_seed: Number of seed. Seeds numpy's global generator and the NMF initialization in fit().
+        :type random_seed: int, default None
+
+        """
+        super(NNMF, self).__init__(train_file=train_file, test_file=test_file, output_file=output_file, sep=sep,
+                                  output_sep=output_sep)
+
+        self.recommender_name = 'NNMF'
+        self.factors = factors
+        self.random_seed = random_seed  # kept so fit() can forward it to the NMF solver
+
+        if random_seed is not None:
+            np.random.seed(random_seed)
+
+        # internal vars
+        self.feedback_triples = None
+        self.prediction_matrix = None
+
+    def init_model(self):
+        """
+        Method to treat and initialize the model (id-mapped feedback triples and feedback matrix)
+
+        """
+
+        self.feedback_triples = []
+
+        # Map interaction with ids
+        for user in self.train_set['feedback']:
+            for item in self.train_set['feedback'][user]:
+                self.feedback_triples.append((self.user_to_user_id[user], self.item_to_item_id[item],
+                                              self.train_set['feedback'][user][item]))
+
+        self.create_matrix()
+
+    def fit(self):
+        """
+        This method performs Non-negative matrix factorization over the training data.
+
+        """
+
+        # Honour the user-supplied seed; fall back to 0 (the previously hard-coded value) when none was given
+        seed = 0 if self.random_seed is None else self.random_seed
+        model = NMF(n_components=self.factors, init='random', random_state=seed)
+
+        P = model.fit_transform(self.matrix)
+        Q = model.components_
+        self.prediction_matrix = np.dot(P, Q)
+
+    def predict_score(self, u, i, cond=True):
+        """
+        Method to predict a single score for a pair (user, item)
+
+        :param u: User ID
+        :type u: int
+
+        :param i: Item ID
+        :type i: int
+
+        :param cond: Use max and min values of train set to limit score
+        :type cond: bool, default True
+
+        :return: Score generated for pair (user, item)
+        :rtype: float
+
+        """
+        # NOTE(review): confirm create_matrix() mean-centres self.matrix; otherwise the mean is added twice here
+        rui = self.train_set["mean_value"] + self.prediction_matrix[u][i]
+
+        if cond:
+            if rui > self.train_set["max_value"]:
+                rui = self.train_set["max_value"]
+            elif rui < self.train_set["min_value"]:
+                rui = self.train_set["min_value"]
+
+        return rui
+
+    def predict(self):
+        """
+        This method computes a final rating for unknown pairs (user, item)
+
+        :raises NotImplementedError: when no test file was given
+
+        """
+
+        if self.test_file is None:
+            raise NotImplementedError("NNMF.predict requires a test file")
+
+        for user in self.test_set['users']:
+            for item in self.test_set['feedback'][user]:
+                self.predictions.append((user, item, self.predict_score(self.user_to_user_id[user],
+                                                                        self.item_to_item_id[item], True)))
+
+    def compute(self, verbose=True, metrics=None, verbose_evaluation=True, as_table=False, table_sep='\t'):
+        """
+        Extends compute method from BaseRatingPrediction. Method to run recommender algorithm
+
+        :param verbose: Print recommender and database information
+        :type verbose: bool, default True
+
+        :param metrics: List of evaluation measures
+        :type metrics: list, default None
+
+        :param verbose_evaluation: Print the evaluation results
+        :type verbose_evaluation: bool, default True
+
+        :param as_table: Print the evaluation results as table
+        :type as_table: bool, default False
+
+        :param table_sep: Delimiter for print results (only work with verbose=True and as_table=True)
+        :type table_sep: str, default '\t'
+
+        """
+
+        super(NNMF, self).compute(verbose=verbose)
+        # Both modes need the model initialized before fitting, so do it once up front
+        self.init_model()
+
+        if verbose:
+            print("training_time:: %4f sec" % timed(self.fit))
+            if self.extra_info_header is not None:
+                print(self.extra_info_header)
+            print("prediction_time:: %4f sec" % timed(self.predict))
+            print('\n')
+        else:
+            # Execute all in silence without prints
+            self.fit()
+            self.predict()
+
+        self.write_predictions()
+
+        if self.test_file is not None:
+            self.evaluate(metrics, verbose_evaluation, as_table=as_table, table_sep=table_sep)
+
+
+
diff --git a/caserec/utils/__pycache__/__init__.cpython-37.pyc b/caserec/utils/__pycache__/__init__.cpython-37.pyc
diff --git a/caserec/utils/__pycache__/extra_functions.cpython-37.pyc b/caserec/utils/__pycache__/extra_functions.cpython-37.pyc
diff --git a/caserec/utils/__pycache__/process_data.cpython-37.pyc b/caserec/utils/__pycache__/process_data.cpython-37.pyc
diff --git a/caserec/utils/process_data.py b/caserec/utils/process_data.py
@@ -63,7 +63,7 @@ def read(self):
 
         list_feedback = []
 
-        dict_feedback = {}
+        dict_feedback = {}  # To be filled as: {user_id: {item_id: feedback_value, ...}}
         items_unobserved = {}
         items_seen_by_user = {}
         users_viewed_item = {}

diff --git a/examples/ranking_rating_based_algorithm.py b/examples/ranking_rating_based_algorithm.py
@@ -0,0 +1,37 @@
+"""
+    Running Precision and Recall metrics on rating-based algorithms
+
+"""
+
+from caserec.recommenders.rating_prediction.matrixfactorization import MatrixFactorization
+from caserec.recommenders.rating_prediction.nnmf import NNMF
+from caserec.utils.process_data import ReadFile
+from caserec.evaluation.rating_prediction import RatingPredictionEvaluation
+
+tr = '../../datasets/ml-100k/folds/0/train.dat'
+te = '../../datasets/ml-100k/folds/0/test.dat'
+
+# File where the model's predictions are saved
+predictions_output_filepath = './predictions_output.dat'
+
+# Creating model and computing train / test sets
+# model = MatrixFactorization(tr, te, output_file=predictions_output_filepath)
+model = NNMF(tr, te, output_file=predictions_output_filepath)
+
+model.compute(verbose=False)
+
+# Using ReadFile class to read predictions from file
+reader = ReadFile(input_file=predictions_output_filepath)
+predictions = reader.read()
+
+# Creating evaluator with item-recommendation parameters
+evaluator = RatingPredictionEvaluation(sep='\t', n_rank=[10], as_rank=True, metrics=['PREC'])
+
+# Getting evaluation
+item_rec_metrics = evaluator.evaluate(predictions['feedback'], model.test_set)
+
+print('\nItem Recommendation Metrics:\n', item_rec_metrics)
+
+# compute() above already ran predict() and the evaluation; calling predict() again would only
+# append every test pair to model.predictions a second time.
+print('\nOriginal Rating Prediction Metrics:\n', model.evaluation_results)
diff --git a/examples/rating_prediction_mf.py b/examples/rating_prediction_mf.py
@@ -7,6 +7,7 @@
 """
 
 from caserec.recommenders.rating_prediction.svdplusplus import SVDPlusPlus
+from caserec.recommenders.rating_prediction.nnmf import NNMF
 from caserec.recommenders.rating_prediction.matrixfactorization import MatrixFactorization
 from caserec.utils.cross_validation import CrossValidation
 
@@ -28,10 +29,12 @@
 """
 
 # Cross Validation
-recommender = MatrixFactorization()
+# recommender = MatrixFactorization()
 
-CrossValidation(input_file=db, recommender=recommender, dir_folds=folds_path, header=1, k_folds=5).compute()
+# CrossValidation(input_file=db, recommender=recommender, dir_folds=folds_path, header=1, k_folds=5).compute()
 
 # # Simple
 # MatrixFactorization(tr, te).compute()
 # SVDPlusPlus(tr, te).compute()
+
+NNMF(tr, te, factors = 20).compute()