# Imports

In [1]:
import sys
sys.path.append("..")

from KNN.ItemKNNCBFRecommender import ItemKNNCBFRecommender

from Base.Evaluation.Evaluator import EvaluatorHoldout

from Data_manager.MMTF14K.URM5Fold_WarmCold_Reader import URM5Fold_WarmCold_Reader
from Data_manager.MMTF14K.URM5Fold_WarmCold_Splitter import URM5Fold_WarmCold_Splitter

# Dataset selection and splitting

### Dataset is downloaded and stored locally if absent

In [2]:
# Selecting a dataset
dataReader = URM5Fold_WarmCold_Reader()

# Splitting the dataset. This split will produce a warm item split.
# To replicate the original experiments, use the dataset accessible here with a cold item split:
# https://mmprj.github.io/mtrm_dataset/index
# The splitter wraps the reader; load_data() must be called before the URM/ICM getters are used below.
dataSplitter = URM5Fold_WarmCold_Splitter(dataReader)
dataSplitter.load_data()

URM5Fold_WarmCold_Splitter for DataReader: MMTF14K/Final_MMTF14K_Web/Data
	 Num items: 13623
	 Num users: 138492

	 Statistics for fold 0: n_interactions 12471739 ( 20.00%), n_items 13623 ( 100.00%), density: 6.61E-03
	 Statistics for fold 1: n_interactions 12471739 ( 20.00%), n_items 13623 ( 100.00%), density: 6.61E-03
	 Statistics for fold 2: n_interactions 12471739 ( 20.00%), n_items 13623 ( 100.00%), density: 6.61E-03
	 Statistics for fold 3: n_interactions 12471739 ( 20.00%), n_items 13623 ( 100.00%), density: 6.61E-03
	 Statistics for fold 4: n_interactions 12471739 ( 20.00%), n_items 13623 ( 100.00%), density: 6.61E-03


	 Statistics for ICM_1: n_features 18, feature occurrences 245214, density: 1.00E+00
	 Statistics for ICM_2: n_features 18, feature occurrences 245214, density: 1.00E+00
	 Statistics for ICM_3: n_features 18, feature occurrences 245214, density: 1.00E+00
	 Statistics for ICM_4: n_features 18, feature occurrences 245214, density: 1.00E+00
	 Statistics for ICM_5: 

# Setup URM and ICM

In [3]:
# TODO: iterate over the 5 folds to perform 5-fold cross validation,
# then evaluate and report the performance metrics averaged over all folds.
# As written, this cell requests a single train/test pair (n_test_fold=1) only.

# Each URM is a scipy.sparse matrix of shape |users|x|items|
# NOTE(review): presumably fold 1 is held out as the test set and the remaining
# folds form the training set -- confirm against the splitter implementation.
URM_train, URM_test = dataSplitter.get_URM_train_for_test_fold(n_test_fold=1)

# The ICM is a scipy.sparse matrix of shape |items|x|features|
# Only the ICM named "ICM_1" is used in this notebook.
ICM = dataSplitter.get_ICM_from_name("ICM_1")

# This contains the items to be ignored during the evaluation step.
# In a cold items setting this should contain the indices of the warm items.
# It is left empty here, so no item is excluded from the evaluation.
ignore_items = []

# Setup evaluators

In [4]:
# Holdout evaluator built on URM_test, with a single cutoff of 5.
# ignore_items is passed through unchanged (it is an empty list in this notebook).
evaluator_test = EvaluatorHoldout(URM_test, cutoff_list=[5], ignore_items=ignore_items)

Ignoring 0 Items


# CBF Setup parameters (TODO: Parameter tuning)

In [5]:
# Hyperparameters for the Item KNN CBF Recommender, which computes an item
# similarity matrix. The dict is unpacked into fit() as keyword arguments.
# Note: no parameter-tuning code is included here; these values are fixed
# and should be tuned.
cbf_parameters = dict(
    topK=500,
    shrink=100,
    similarity='cosine',
    normalize=True,
    feature_weighting='none',  # Other options are BM25 and TF-IDF
)

# CBF Recommender training and evaluation

In [6]:
# Build the content-based recommender from the item content matrix and the training URM.
recommender_content_based = ItemKNNCBFRecommender(ICM,URM_train)
# Fit using the hyperparameters in cbf_parameters, unpacked as keyword arguments.
recommender_content_based.fit(**cbf_parameters)

# Evaluate on the test data; the call returns a pair that is unpacked into a
# results dictionary and a printable summary string, both used in the next cell.
result_dict, result_string = evaluator_test.evaluateRecommender(recommender_content_based)

Compute_Similarity: detected dense matrix
Similarity column 13623 ( 100 % ), 2056.44 column/sec, elapsed time 0.11 min
EvaluatorHoldout: Processed 45001 ( 32.69% ) in 30.34 sec. Users per second: 1483
EvaluatorHoldout: Processed 99001 ( 71.91% ) in 1.01 min. Users per second: 1637
EvaluatorHoldout: Processed 137677 ( 100.00% ) in 1.33 min. Users per second: 1725


In [7]:
print("CBF recommendation quality is: {}".format(result_string))

import json
import os

# Only the first value stored in result_dict is saved.
# NOTE(review): presumably result_dict maps each cutoff to its metrics, so with a
# single cutoff this is the whole result -- confirm against the evaluator.
results_first_entry = list(result_dict.values())[0]

# Serialize under a distinct name. Rebinding `json` to the dumped string would
# shadow the module and make this cell raise AttributeError when run a second time.
results_json = json.dumps(results_first_entry, indent=4)

# NOTE(review): the file name says "ColdItemSplit", while the splitting cell's
# comment states that a warm item split is produced -- confirm the intended name.
results_path = "Results/CBF - ColdItemSplit - results.json"

# Create the output folder if it is missing, so the write does not fail with
# FileNotFoundError after the (long) evaluation has already completed.
os.makedirs(os.path.dirname(results_path), exist_ok=True)

# The context manager closes the file even if the write raises.
with open(results_path, "w") as results_file:
    results_file.write(results_json)

CBF recommendation quality is: CUTOFF: 5 - ROC_AUC: 0.0005865, PRECISION: 0.0003123, PRECISION_RECALL_MIN_DEN: 0.0003177, RECALL: 0.0000795, MAP: 0.0001137, MRR: 0.0005592, NDCG: 0.0000954, F1: 0.0001268, HIT_RATE: 0.0015616, ARHR: 0.0005592, RMSE: 0.9998754, NOVELTY: 0.0057565, DIVERSITY_MEAN_INTER_LIST: 0.9304599, DIVERSITY_HERFINDAHL: 0.9860906, COVERAGE_ITEM: 0.1601703, COVERAGE_USER: 1.0000000, DIVERSITY_GINI: 0.0744415, SHANNON_ENTROPY: 7.4190806, 

