# Imports

In [10]:
import sys
sys.path.append("..")

from KNN.ItemKNNCBFRecommender import ItemKNNCBFRecommender

from Base.Evaluation.Evaluator import EvaluatorHoldout

from Data_manager.MMTF14K.URM5Fold_WarmCold_Reader import URM5Fold_WarmCold_Reader
from Data_manager.MMTF14K.URM5Fold_WarmCold_Splitter import URM5Fold_WarmCold_Splitter

# Dataset selection and splitting

### Dataset is downloaded and stored locally if absent

In [11]:
# Selecting a dataset
# NOTE(review): presumably the reader fetches the data on first use and caches it locally — confirm against URM5Fold_WarmCold_Reader
dataReader = URM5Fold_WarmCold_Reader()

# Splitting the dataset. This split will produce a warm item split.
# To replicate the original experiments, use the dataset accessible here with a cold item split:
# https://mmprj.github.io/mtrm_dataset/index
dataSplitter = URM5Fold_WarmCold_Splitter(dataReader)
# Later cells read the URM folds and the ICMs from dataSplitter.
dataSplitter.load_data()

URM5Fold_WarmCold_Splitter for DataReader: MMTF14K/Final_MMTF14K_Web/Data
	 Num items: 13623
	 Num users: 138492

	 Statistics for fold 0: n_interactions 12471739 ( 20.00%), n_items 13623 ( 100.00%), density: 6.61E-03
	 Statistics for fold 1: n_interactions 12471739 ( 20.00%), n_items 13623 ( 100.00%), density: 6.61E-03
	 Statistics for fold 2: n_interactions 12471739 ( 20.00%), n_items 13623 ( 100.00%), density: 6.61E-03
	 Statistics for fold 3: n_interactions 12471739 ( 20.00%), n_items 13623 ( 100.00%), density: 6.61E-03
	 Statistics for fold 4: n_interactions 12471739 ( 20.00%), n_items 13623 ( 100.00%), density: 6.61E-03


	 Statistics for ICM_1: n_features 100, feature occurrences 1362300, density: 1.00E+00
	 Statistics for ICM_2: n_features 100, feature occurrences 1362300, density: 1.00E+00
	 Statistics for ICM_3: n_features 100, feature occurrences 1362300, density: 1.00E+00
	 Statistics for ICM_4: n_features 100, feature occurrences 1362300, density: 1.00E+00
	 Statistics for

# Setup URM and ICM

In [12]:
# Fold identifiers requested from the splitter, kept exactly as in the original cell.
# NOTE(review): the splitter log lists folds 0-4 while 1-5 are requested here —
# confirm the indexing expected by get_URM_train_for_test_fold.
TEST_FOLDS = [1, 2, 3, 4, 5]

# Names of the item-content matrices; entry i is paired with fold i during training.
ICM_NAMES = ["ICM_1", "ICM_2", "ICM_3", "ICM_4", "ICM_5"]

# Split every fold exactly once and unpack both halves from the same call.
# Element [0] of each result is used as the training URM and element [1] as the test URM.
fold_splits = [
    dataSplitter.get_URM_train_for_test_fold(n_test_fold=fold)
    for fold in TEST_FOLDS
]

# Each URM is a scipy.sparse matrix of shape |users|x|items|
URM_train = [split[0] for split in fold_splits]
URM_test = [split[1] for split in fold_splits]

# The ICM is a scipy.sparse matrix of shape |items|x|features|
ICM = [dataSplitter.get_ICM_from_name(icm_name) for icm_name in ICM_NAMES]

# This contains the items to be ignored during the evaluation step
# In a cold items setting this should contain the indices of the warm items
ignore_items = []

# CBF Setup parameters (TODO: Parameter tuning)

In [13]:
# Hyper-parameters for the Item KNN CBF Recommender whose similarity matrix is computed below.
# These values are fixed by hand: the parameter-tuning code is not included here
# and should still be carried out.

cbf_parameters = dict(
    topK=500,
    shrink=100,
    similarity='cosine',
    normalize=True,
    feature_weighting='none',  # Other options are BM25 and TF-IDF
)

# CBF Recommender training and evaluation

In [27]:
# Train and evaluate one content-based recommender per fold, then average every
# metric over the folds. Fold i uses ICM[i], URM_train[i] and URM_test[i].
n_folds = len(ICM)

# Running per-metric sums across folds; turned into means after the loop.
overall_result_dict = {}
for fold in range(n_folds):

    evaluator_test = EvaluatorHoldout(URM_test[fold], cutoff_list=[5], ignore_items=ignore_items)
    recommender_content_based = ItemKNNCBFRecommender(ICM[fold], URM_train[fold])
    recommender_content_based.fit(**cbf_parameters)

    result_dict, result_string = evaluator_test.evaluateRecommender(recommender_content_based)

    # Only the first entry of result_dict is accumulated.
    # NOTE(review): presumably result_dict is keyed by cutoff, so with cutoff_list=[5]
    # this is the only entry — confirm against EvaluatorHoldout.
    for metric, value in list(result_dict.values())[0].items():
        overall_result_dict[metric] = overall_result_dict.get(metric, 0) + value

# Convert the sums into per-fold averages. The original loop iterated over the
# misspelled name `overall_result_dic`, which raised NameError and left the sums in place.
overall_result_dict = {
    metric: total / n_folds
    for metric, total in overall_result_dict.items()
}

Ignoring 0 Items
Compute_Similarity: detected dense matrix
Similarity column 13623 ( 100 % ), 2025.73 column/sec, elapsed time 0.11 min
EvaluatorHoldout: Processed 43821 ( 32.04% ) in 30.00 sec. Users per second: 1461
EvaluatorHoldout: Processed 96001 ( 70.19% ) in 1.00 min. Users per second: 1599
EvaluatorHoldout: Processed 136768 ( 100.00% ) in 1.33 min. Users per second: 1712


AttributeError: 'dict' object has no attribute 'append'

In [24]:
# result_string is reassigned on every iteration of the evaluation loop, so this
# prints the report of the last evaluated fold only, not the cross-fold average.
print("CBF recommendation quality is: {}".format(result_string))

import json

# Serialize the whole metric -> value dictionary. The previous code dumped only
# list(overall_result_dict.values())[0], i.e. a single number, and rebound the
# name `json` to a string, which broke any later use of the json module.
results_json = json.dumps(overall_result_dict, indent=4)

# NOTE(review): the file name says "ColdItemSplit" while the splitting cell states it
# produces a warm item split — confirm the intended name. The path is left unchanged.
# The context manager closes the file even if the write fails.
with open("Results/CBF - ColdItemSplit - results.json", "w") as results_file:
    results_file.write(results_json)

CBF recommendation quality is: CUTOFF: 5 - ROC_AUC: 0.0059390, PRECISION: 0.0031038, PRECISION_RECALL_MIN_DEN: 0.0032148, RECALL: 0.0008297, MAP: 0.0012302, MRR: 0.0055383, NDCG: 0.0009911, F1: 0.0013094, HIT_RATE: 0.0155191, ARHR: 0.0057532, RMSE: 0.9999483, NOVELTY: 0.0043637, DIVERSITY_MEAN_INTER_LIST: 0.8822034, DIVERSITY_HERFINDAHL: 0.9764394, COVERAGE_ITEM: 0.3933788, COVERAGE_USER: 1.0000000, DIVERSITY_GINI: 0.0650325, SHANNON_ENTROPY: 7.4979640, 

ROC_AUC
0.005938997566054643
PRECISION
0.003103829618942189
PRECISION_RECALL_MIN_DEN
0.0032147853025830037
RECALL
0.0008297023586782257
MAP
0.0012302430120704373
MRR
0.005538263232289247
NDCG
0.000991077010376108
F1
0.0013093854431200376
HIT_RATE
0.015519148094711478
ARHR
0.005753216982444889
RMSE
0.9999483381817624
NOVELTY
0.0043636783536005475
DIVERSITY_MEAN_INTER_LIST
0.8822033589581597
DIVERSITY_HERFINDAHL
0.9764393795741164
COVERAGE_ITEM
0.39337884460104233
COVERAGE_USER
1.0
DIVERSITY_GINI
0.06503254612605015
SHANNON_ENTROPY
7.49