# Imports

In [8]:
# Load (or reload, if it is already loaded) the Cython IPython extension.
# NOTE(review): presumably needed for the compiled similarity routines used below — confirm.
%reload_ext Cython

In [9]:
import sys
sys.path.append("..")

from KNN.ItemKNNCBFRecommender import ItemKNNCBFRecommender

from Base.Evaluation.Evaluator import EvaluatorHoldout

from Data_manager.Movielens_20m.Movielens20MReader import Movielens20MReader
from Data_manager.DataSplitter_k_fold import DataSplitter_Warm_k_fold

# Dataset selection and splitting (TODO: Switch to cold item dataset)

### Dataset is downloaded and stored locally if absent

In [10]:
# Select the dataset: the reader object is handed to the splitter below.
dataReader = Movielens20MReader()

# Split the dataset into k folds. This splitter produces a WARM item split.
# To replicate the original experiments, use the dataset with a COLD item split
# that is available here:
# https://mmprj.github.io/mtrm_dataset/index
dataSplitter = DataSplitter_Warm_k_fold(dataReader)
# Load the data into the splitter; it prints per-fold and ICM statistics when done.
dataSplitter.load_data()

DataSplitter_k_fold for DataReader: Movielens_20m
	 Num items: 27278
	 Num users: 138493

	 Statistics for fold 0: n_interactions 3947336 ( 19.74%), n_items 27278 ( 100.00%), density: 1.04E-03
	 Statistics for fold 1: n_interactions 3999236 ( 20.00%), n_items 27278 ( 100.00%), density: 1.06E-03
	 Statistics for fold 2: n_interactions 4000004 ( 20.00%), n_items 27278 ( 100.00%), density: 1.06E-03
	 Statistics for fold 3: n_interactions 3999236 ( 20.00%), n_items 27278 ( 100.00%), density: 1.06E-03
	 Statistics for fold 4: n_interactions 4054451 ( 20.27%), n_items 27278 ( 100.00%), density: 1.07E-03


	 Statistics for ICM_genre: n_features 20, feature occurrences 54406, density: 9.97E-02


DataSplitter_k_fold: Done.


# Setup URM and ICM (TODO: Ignore warm items in cold items setting)

In [11]:
# Each URM is a scipy.sparse matrix of shape |users|x|items|.
# The holdout split is unpacked as (train, validation, test), in this order.
URM_train, URM_validation, URM_test = dataSplitter.get_holdout_split()

# The ICM is a scipy.sparse matrix of shape |items|x|features|.
# "ICM_genre" is the only ICM listed in the splitter statistics printed above.
ICM = dataSplitter.get_ICM_from_name("ICM_genre")

# Items to be ignored during the evaluation step.
# In a cold items setting this should contain the indices of the warm items;
# it is left empty here, so no item is excluded from the evaluation.
ignore_items = []

DataSplitter: Generating holdout split... 
DataSplitter: Generating holdout split... done!


# Setup evaluators

In [12]:
# One holdout evaluator per split; both use a single cutoff of 5 and the same
# list of items to ignore.
evaluator_validation = EvaluatorHoldout(
    URM_validation,
    cutoff_list=[5],
    ignore_items=ignore_items,
)
evaluator_test = EvaluatorHoldout(
    URM_test,
    cutoff_list=[5],
    ignore_items=ignore_items,
)

Ignoring 0 Items
Ignoring 0 Items


# CBF Setup parameters (TODO: Parameter tuning)

In [13]:
# Hyperparameters used to compute the similarity matrix of the Item KNN CBF
# Recommender. These are fixed values: the parameter tuning code is not
# included in this notebook, and tuning should still be done.
cbf_parameters = dict(
    topK=500,
    shrink=100,
    similarity='cosine',
    normalize=True,
    feature_weighting='none',  # Other options are BM25 and TF-IDF
)

# CBF Recommender training and evaluation

In [15]:
# Build the item-based KNN content-based recommender from the ICM and the
# training URM, then fit it with the hyperparameters in cbf_parameters.
recommender_content_based = ItemKNNCBFRecommender(ICM,URM_train)
recommender_content_based.fit(**cbf_parameters)

# Evaluate on the test split. The evaluator returns a pair: result_dict and
# result_string, the latter being the printable summary of the metrics.
# The recorded run below took about 2 minutes for this step.
result_dict, result_string = evaluator_test.evaluateRecommender(recommender_content_based)

Unable to load Cython Compute_Similarity, reverting to Python
Similarity column 27278 ( 100 % ), 1933.66 column/sec, elapsed time 0.24 min
EvaluatorHoldout: Processed 35001 ( 25.27% ) in 30.10 sec. Users per second: 1163
EvaluatorHoldout: Processed 71001 ( 51.27% ) in 1.01 min. Users per second: 1176
EvaluatorHoldout: Processed 107894 ( 77.91% ) in 1.51 min. Users per second: 1194
EvaluatorHoldout: Processed 138493 ( 100.00% ) in 1.92 min. Users per second: 1200


In [31]:
import json

print("CBF recommendation quality is: {}".format(result_string))

# Take the first entry of result_dict for serialization.
# NOTE(review): presumably result_dict is keyed by cutoff and this is the
# cutoff-5 entry, the only cutoff that was evaluated — confirm.
cutoff_metrics = list(result_dict.values())[0]

# Keep the serialized text under its own name. Binding it to `json` would
# replace the module with a string, so re-running this cell (or any later use
# of the json module) would fail with AttributeError.
results_json = json.dumps(cutoff_metrics, indent=4)

# The context manager closes the file even if the write raises.
with open("CBF_results.json", "w") as results_file:
    results_file.write(results_json)

CBF recommendation quality is: CUTOFF: 5 - ROC_AUC: 0.0226997, PRECISION: 0.0112063, PRECISION_RECALL_MIN_DEN: 0.0112439, RECALL: 0.0019956, MAP: 0.0050649, MRR: 0.0217916, NDCG: 0.0026370, F1: 0.0033879, HIT_RATE: 0.0560317, ARHR: 0.0234195, RMSE: 3.4841707, NOVELTY: 0.0028122, DIVERSITY_MEAN_INTER_LIST: 0.9761945, DIVERSITY_HERFINDAHL: 0.9952375, COVERAGE_ITEM: 0.3306327, COVERAGE_USER: 1.0000000, DIVERSITY_GINI: 0.0944173, SHANNON_ENTROPY: 9.4980216, 

