Skip to content

Commit

Permalink
Fix bugs; Add PaCo Recommender and Group-based item recommendation alg…
Browse files Browse the repository at this point in the history
…orithms, change base item recommendation scenario to new metrics. Update ReadMe files
  • Loading branch information
arthurfortes committed Jun 25, 2018
1 parent 5f6da88 commit 4b9b64b
Show file tree
Hide file tree
Showing 9 changed files with 403 additions and 18 deletions.
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,10 @@ Item Recommendation:

- User Attribute KNN

- Group-based (Clustering-based algorithm)

- PaCo Recommender (Co-Clustering-based algorithm)

- Most Popular

- Random
Expand Down
4 changes: 4 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,10 @@ Item Recommendation:

- User Attribute KNN

- Group-based (Clustering-based algorithm)

- PaCo Recommender (Co-Clustering-based algorithm)

- Most Popular

- Random
Expand Down
8 changes: 5 additions & 3 deletions caserec/clustering/paco.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,18 +14,20 @@
# © 2018. Case Recommender (MIT License)

import itertools
import random

import numpy as np
from scipy.spatial.distance import squareform, pdist
from sklearn.cluster import KMeans

from caserec_demo.utils.process_data import ReadFile
from caserec.utils.process_data import ReadFile

__author__ = 'Arthur Fortes <fortes.arthur@gmail.com>'


class PaCo(object):
def __init__(self, train_file, k_row=None, l_col=None, density_low=0.008, as_binary=True,
sep='\t', randon_seed=None):
sep='\t', random_seed=None):
"""
PaCo: EntroPy Anomalies in Co-Clustering
Expand Down Expand Up @@ -57,7 +59,7 @@ def __init__(self, train_file, k_row=None, l_col=None, density_low=0.008, as_bin
"""

self.train_set = ReadFile(train_file, as_binary=as_binary).read()
self.train_set = ReadFile(train_file, as_binary=as_binary, sep=sep).read()
self.density_low = density_low
self.users = self.train_set['users']
self.items = self.train_set['items']
Expand Down
3 changes: 2 additions & 1 deletion caserec/evaluation/rating_prediction.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,12 @@
- All-but-one Protocol: Considers only one pair (u, i) from the test set to evaluate the predictions
"""
from caserec.evaluation.item_recommendation import ItemRecommendationEvaluation

from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np
import random

from caserec.evaluation.item_recommendation import ItemRecommendationEvaluation
from caserec.evaluation.base_evaluation import BaseEvaluation

__author__ = 'Arthur Fortes <fortes.arthur@gmail.com>'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import numpy as np

from caserec.evaluation.item_recommendation import ItemRecommendationEvaluation
from caserec.utils.extra_functions import print_header, check_error_file
from caserec.utils.extra_functions import print_header
from caserec.utils.process_data import ReadFile, WriteFile

__author__ = 'Arthur Fortes <fortes.arthur@gmail.com>'
Expand Down Expand Up @@ -137,7 +137,7 @@ def evaluate(self, metrics, verbose=True, as_table=False, table_sep='\t'):
Method to evaluate the final ranking
:param metrics: List of evaluation metrics
:type metrics: list, default ('Prec@5', 'Prec@10', 'NDCG@5', 'NDCG@10', 'MAP@5', 'MAP@10')
:type metrics: list, default ('Prec', 'Recall', 'MAP', 'NDCG')
:param verbose: Print the evaluation results
:type verbose: bool, default True
Expand All @@ -153,13 +153,11 @@ def evaluate(self, metrics, verbose=True, as_table=False, table_sep='\t'):
self.evaluation_results = {}

if metrics is None:
metrics = list(['PREC@5', 'PREC@10', 'NDCG@5', 'NDCG@10', 'MAP@5', 'MAP@10'])
metrics = list(['PREC', 'RECALL', 'MAP', 'NDCG'])

results = ItemRecommendationEvaluation(verbose=verbose, as_table=as_table, table_sep=table_sep,
metrics=metrics).evaluate_recommender(predictions=self.ranking,
test_set=self.test_set)
for metric in metrics:
self.evaluation_results[metric.upper()] = results[metric.upper()]
metrics=metrics)
results.evaluate_recommender(predictions=self.ranking, test_set=self.test_set)

def write_ranking(self):
"""
Expand Down
240 changes: 236 additions & 4 deletions caserec/recommenders/item_recommendation/group_based_recommender.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,246 @@

# © 2018. Case Recommender (MIT License)

from scipy.spatial.distance import squareform, pdist
import numpy as np
import os

from caserec.clustering.kmedoids import kmedoids
from caserec.recommenders.item_recommendation.base_item_recommendation import BaseItemRecommendation
from caserec.recommenders.item_recommendation.itemknn import ItemKNN
from caserec.recommenders.item_recommendation.most_popular import MostPopular
from caserec.recommenders.item_recommendation.userknn import UserKNN
from caserec.utils.process_data import ReadFile, WriteFile
from caserec.recommenders.item_recommendation.bprmf import BprMF

__author__ = 'Arthur Fortes <fortes.arthur@gmail.com>'


class GroupBasedRecommender(BaseItemRecommendation):
    """
    Group-Based for Item Recommendation

    Clusters users with k-medoids over an averaged user-user distance matrix
    built from one or more train files, writes one train file per cluster,
    runs a base recommender on each group and merges the per-group rankings
    into a single final ranking.

    Usage::

        >> GroupBasedRecommender([train_history], test).compute()
        >> GroupBasedRecommender([train_history, train_rating], test, as_binary=True).compute()

    """

    def __init__(self, train_files=None, test_file=None, output_file=None, similarity_metric="cosine",
                 rank_length=10, k_groups=3, recommender='UserKNN', as_binary=False, sep='\t',
                 output_sep='\t', max_int_kmedoids=1000, parser='', user_weights=False):
        """
        :param train_files: List of train files
        :type train_files: list

        :param test_file: File which contains the test set. This file needs to have at least 3 columns
        (user item feedback_value).
        :type test_file: str, default None

        :param output_file: File with dir to write the final predictions
        :type output_file: str, default None

        :param similarity_metric: Pairwise metric to compute the similarity between the users. Reference about
        distances: http://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.spatial.distance.pdist.html
        :type similarity_metric: str, default cosine

        :param rank_length: Size of the rank generated for each user
        :type rank_length: int, default 10

        :param k_groups: Number of user groups (clusters) for k-medoids
        :type k_groups: int, default 3

        :param recommender: Base recommender run on each group. One of: 'UserKNN', 'ItemKNN',
        'MostPopular' or 'BPRMF'
        :type recommender: str, default 'UserKNN'

        :param as_binary: If True, the explicit feedback will be transform to binary
        :type as_binary: bool, default False

        :param sep: Delimiter for input files
        :type sep: str, default '\t'

        :param output_sep: Delimiter for output file
        :type output_sep: str, default '\t'

        :param max_int_kmedoids: Maximum number of k-medoids iterations
        :type max_int_kmedoids: int, default 1000

        :param parser: Suffix used to name the directory that stores the generated group train files
        :type parser: str, default ''

        :param user_weights: If True, divide the distance matrix by per-user-pair interaction weights
        :type user_weights: bool, default False
        """

        super(GroupBasedRecommender, self).__init__(train_file='', test_file=test_file,
                                                    output_file=output_file, as_binary=as_binary,
                                                    rank_length=rank_length, similarity_metric=similarity_metric,
                                                    sep=sep, output_sep=output_sep)

        self.recommender_name = 'Group Based Recommender Algorithm'
        self.train_files = train_files
        self.k_groups = k_groups
        self.recommender = recommender
        self.max_int_kmedoids = max_int_kmedoids
        self.parser = parser
        self.user_weights = user_weights

        # internal vars
        self.n_files = 0
        self.train_set_list = []
        self.distance_matrix = None
        self.dir_name = None
        self.gb_train_files = []
        self.weighted_matrices = []
        self.k_users_in_cluster = []

    def read_files(self):
        """
        Read all train files (and the test file, if any) and build the global
        user/item id mappings used by the distance matrix and the clusters.
        """

        self.n_files = len(self.train_files)

        self.users = []
        self.items = []

        for train_file in self.train_files:
            train_set = ReadFile(train_file, sep=self.sep, as_binary=self.as_binary).read()
            self.users += train_set['users']
            self.items += train_set['items']
            self.train_set_list.append(train_set)
            # dir of the last train file hosts the generated group files
            self.dir_name = os.path.dirname(train_file)

        if self.test_file is not None:
            self.test_set = ReadFile(self.test_file).read()
            self.users = sorted(set(list(self.users) + list(self.test_set['users'])))
            self.items = sorted(set(list(self.items) + list(self.test_set['items'])))
        else:
            # sort so id assignment is deterministic across runs (sets have no stable order)
            self.users = sorted(set(self.users))
            self.items = sorted(set(self.items))

        for i, item in enumerate(self.items):
            self.item_to_item_id.update({item: i})
            self.item_id_to_item.update({i: item})
        for u, user in enumerate(self.users):
            self.user_to_user_id.update({user: u})
            self.user_id_to_user.update({u: user})

    def compute_distance(self):
        """
        Compute a user-user distance matrix from the current self.matrix
        using self.similarity_metric. NaN entries (e.g. cosine with an
        all-zero row) are replaced by the maximum distance 1.0.

        :return: Square distance matrix (n_users x n_users)
        :rtype: numpy.ndarray
        """

        # Calculate distance matrix
        distance_matrix = np.float32(squareform(pdist(self.matrix, self.similarity_metric)))
        # Remove NaNs
        distance_matrix[np.isnan(distance_matrix)] = 1.0

        return distance_matrix

    def create_weighted_matrix(self):
        """
        Build one symmetric weight matrix per train set, based on the number
        of items seen by each user pair, used to rescale the distances.
        """

        for ts in self.train_set_list:
            weighted_matrix = np.ones((len(self.users), len(self.users)))

            for u in range(len(self.users)):
                user_u = self.user_id_to_user[u]
                nu = ts['items_seen_by_user'].get(user_u, [])
                if nu:
                    for v in range(u, len(self.users)):
                        if u == v:
                            nuv = len(nu)
                        else:
                            user_v = self.user_id_to_user[v]
                            nv = ts['items_seen_by_user'].get(user_v, [])

                            # nuv = len(set(nu).intersection(nv)) / (len(nu) + len(nv))
                            nuv = 1 / (len(nu) + len(nv))

                        alpha = nuv if nuv != 0 else 1
                        weighted_matrix[u][v] = alpha
                        weighted_matrix[v][u] = alpha
            self.weighted_matrices.append(weighted_matrix)

    def build_distance_matrix(self):
        """
        Average the per-train-file distance matrices into self.distance_matrix,
        optionally dividing each by its user weight matrix first.
        """

        if self.user_weights:
            self.create_weighted_matrix()

        self.distance_matrix = np.zeros((len(self.users), len(self.users)))

        for n, ts in enumerate(self.train_set_list):
            self.train_set = ts

            self.create_matrix()
            # Missing: Treat distance matrix with feedback
            self.distance_matrix += self.compute_distance()
            if self.user_weights:
                self.distance_matrix /= self.weighted_matrices[n]

        del self.train_set

        self.distance_matrix /= self.n_files

    def run_kmedoids(self):
        """
        Cluster users with k-medoids on self.distance_matrix and collect,
        per cluster, the (user, item, 1) tuples from every train set.

        :return: One sorted list of (user, item, 1) tuples per non-empty cluster
        :rtype: list
        """

        set_train_tuple = []
        support_matrix, clusters = kmedoids(self.distance_matrix, self.k_groups,
                                            max_interactions=self.max_int_kmedoids, random_seed=123)

        for c, cluster in enumerate(clusters.values()):
            self.k_users_in_cluster.append(len(cluster))
            train_tuple = set()
            for user_id in cluster:
                user = self.user_id_to_user[user_id]
                for tr in self.train_set_list:
                    for item in tr['feedback'].get(user, []):
                        train_tuple.add((user, item, 1))
            train_tuple = sorted(list(train_tuple), key=lambda x: (x[0], x[1]))
            if len(train_tuple) != 0:
                set_train_tuple.append(train_tuple)

        return set_train_tuple

    def generate_groups(self):
        """
        Write one train file per non-empty cluster into a 'gb_train_<parser>'
        directory next to the input train files, updating self.k_groups to
        the number of non-empty clusters actually produced.
        """

        fold_for_sets = self.dir_name + '/gb_train_' + str(self.parser) + '/'
        # makedirs + exist_ok: tolerates missing parents and repeated runs
        os.makedirs(fold_for_sets, exist_ok=True)

        train_tuple = self.run_kmedoids()
        self.k_groups = len(train_tuple)
        for f in range(len(train_tuple)):
            train_file_name = fold_for_sets + 'train_%d.dat' % f
            WriteFile(train_file_name, data=train_tuple[f], sep=self.sep).write()
            self.gb_train_files.append(train_file_name)
        del self.train_set_list

    def generate_recommendation(self):
        """
        Run the chosen base recommender on each group train file and merge
        the per-group rankings, sorted by user and descending score.

        :raises ValueError: If self.recommender is not a supported algorithm
        """

        self.ranking = []
        for n, train_file in enumerate(self.gb_train_files):
            if self.recommender == 'UserKNN':
                # NOTE(review): test_file added for consistency with the other
                # branches, which all rank against the same test set
                rec = UserKNN(train_file=train_file, test_file=self.test_file,
                              similarity_metric=self.similarity_metric,
                              as_binary=True, as_similar_first=False)
                rec.compute(verbose=False, verbose_evaluation=False)
                self.ranking += rec.ranking

            elif self.recommender == 'ItemKNN':
                rec = ItemKNN(train_file=train_file, test_file=self.test_file,
                              similarity_metric=self.similarity_metric, as_binary=True)
                rec.compute(verbose=False, verbose_evaluation=False)
                self.ranking += rec.ranking

            elif self.recommender == 'MostPopular':
                rec = MostPopular(train_file=train_file, test_file=self.test_file, as_binary=True)
                rec.compute(verbose=False, verbose_evaluation=False)
                self.ranking += rec.ranking

            elif self.recommender == 'BPRMF':
                rec = BprMF(train_file=train_file, test_file=self.test_file, batch_size=4)
                rec.compute(verbose=False, verbose_evaluation=False)
                self.ranking += rec.ranking
            else:
                raise ValueError('Error: Recommender not implemented or does not exist!')

        self.ranking = sorted(self.ranking, key=lambda x: (x[0], -x[2]))

    def compute(self, verbose=True, metrics=None, verbose_evaluation=True, as_table=False, table_sep='\t'):
        """
        Extends the compute method from BaseItemRecommendation. Runs the full
        pipeline: read files, build distances, cluster, recommend, write and
        (if a test set was given) evaluate the final ranking.

        :param verbose: Print recommender header and final K
        :type verbose: bool, default True

        :param metrics: List of evaluation metrics
        :type metrics: list, default None

        :param verbose_evaluation: Print the evaluation results
        :type verbose_evaluation: bool, default True

        :param as_table: Print the evaluation results as table
        :type as_table: bool, default False

        :param table_sep: Delimiter for print results
        :type table_sep: str, default '\t'
        """

        if verbose:
            print("[Case Recommender: Item Recommendation > %s]\n" % self.recommender_name)

        self.read_files()
        self.build_distance_matrix()
        self.generate_groups()
        self.generate_recommendation()

        if verbose:
            print('GroupBased:: Final K value for kmedoids: %d' % self.k_groups)

        self.write_ranking()

        if self.test_file is not None:
            self.evaluate(metrics, verbose_evaluation, as_table=as_table, table_sep=table_sep)
3 changes: 2 additions & 1 deletion caserec/recommenders/item_recommendation/itemknn.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,8 @@ def __init__(self, train_file=None, test_file=None, output_file=None, similarity
:param output_file: File with dir to write the final predictions
:type output_file: str, default None
:param similarity_metric:
:param similarity_metric: Pairwise metric to compute the similarity between the items. Reference about
distances: http://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.spatial.distance.pdist.html
:type similarity_metric: str, default cosine
:param k_neighbors: Number of neighbors to use. If None, k_neighbor = int(sqrt(n_items))
Expand Down

0 comments on commit 4b9b64b

Please sign in to comment.