# Imports

In [4]:
import sys
sys.path.append("..")

from KNN.ItemKNNCBFRecommender import ItemKNNCBFRecommender

from Base.Evaluation.Evaluator import EvaluatorHoldout

from Data_manager.MMTF14K.URM5Fold_WarmCold_Reader import URM5Fold_WarmCold_Reader
from Data_manager.MMTF14K.URM5Fold_WarmCold_Splitter import URM5Fold_WarmCold_Splitter

# Dataset selection and splitting

### Dataset is downloaded and stored locally if absent

In [5]:
# Selecting a dataset
dataReader = URM5Fold_WarmCold_Reader()

# Splitting the dataset. This split will produce a warm item split
# To replicate the original experimens use the dataset accessible here with a cold item split:
# https://mmprj.github.io/mtrm_dataset/index
dataSplitter = URM5Fold_WarmCold_Splitter(dataReader)
dataSplitter.load_data()

URM5Fold_WarmCold_Splitter for DataReader: MMTF14K/Final_MMTF14K_Web/Data
	 Num items: 13623
	 Num users: 138492

	 Statistics for fold 0: n_interactions 12471739 ( 20.00%), n_items 13623 ( 100.00%), density: 6.61E-03
	 Statistics for fold 1: n_interactions 12471739 ( 20.00%), n_items 13623 ( 100.00%), density: 6.61E-03
	 Statistics for fold 2: n_interactions 12471739 ( 20.00%), n_items 13623 ( 100.00%), density: 6.61E-03
	 Statistics for fold 3: n_interactions 12471739 ( 20.00%), n_items 13623 ( 100.00%), density: 6.61E-03
	 Statistics for fold 4: n_interactions 12471739 ( 20.00%), n_items 13623 ( 100.00%), density: 6.61E-03


	 Statistics for ICM_genre: n_features 18, feature occurrences 27099, density: 1.11E-01
	 Statistics for ICM_year: n_features 1, feature occurrences 13623, density: 1.00E+00
	 Statistics for ICM_AVF: n_features 107, feature occurrences 1401318, density: 9.61E-01
	 Statistics for ICM_deep: n_features 4096, feature occurrences 55799808, density: 1.00E+00
	 Statist

# Setup URM and ICM

In [9]:
# Each URM is a scipy.sparse matrix of shape |users|x|items|
URM_train = [
    dataSplitter.get_URM_train_for_test_fold(n_test_fold=1)[0],
    dataSplitter.get_URM_train_for_test_fold(n_test_fold=2)[0],
    dataSplitter.get_URM_train_for_test_fold(n_test_fold=3)[0],
    dataSplitter.get_URM_train_for_test_fold(n_test_fold=4)[0],
    dataSplitter.get_URM_train_for_test_fold(n_test_fold=5)[0]
]

URM_test = [
    dataSplitter.get_URM_train_for_test_fold(n_test_fold=1)[1],
    dataSplitter.get_URM_train_for_test_fold(n_test_fold=2)[1],
    dataSplitter.get_URM_train_for_test_fold(n_test_fold=3)[1],
    dataSplitter.get_URM_train_for_test_fold(n_test_fold=4)[1],
    dataSplitter.get_URM_train_for_test_fold(n_test_fold=5)[1]
]

# The ICM is a scipy.sparse matrix of shape |items|x|features|
ICM = [
    ("ICM_genre", dataSplitter.get_ICM_from_name("ICM_genre")),
    ("ICM_year", dataSplitter.get_ICM_from_name("ICM_year")),
    # ("ICM_BLF", dataSplitter.get_ICM_from_name("ICM_BLF")),
    ("ICM_ivector_1", dataSplitter.get_ICM_from_name("ICM_ivector_1")),
    ("ICM_AVF", dataSplitter.get_ICM_from_name("ICM_AVF")),
    ("ICM_deep", dataSplitter.get_ICM_from_name("ICM_deep"))
]

# This contains the items to be ignored during the evaluation step
# In a cold items setting this should contain the indices of the warm items
ignore_items = []

# CBF Recommender training and evaluation

In [None]:
# Iterate over the different unimodals ICMs
for ICM_item in ICM:
    
    # Iterate over HYPER-PARAMS values
    topKValues = [100, 500, 1000]
    shrinkValues = [10, 100, 200]

    for topKValue in topKValues:
        for shrinkValue in shrinkValues:
            cbf_parameters = {
                        'topK': topKValue,
                        'shrink': shrinkValue,
                        'similarity': 'cosine',
                        'normalize': True,
                        'feature_weighting': 'none' # Other options are BM25 and TF-IDF
                     }

            overall_result_dict = {}
            for fold in range(0,len(URM_train)):

                evaluator_test = EvaluatorHoldout(URM_test[fold], cutoff_list=[5], ignore_items=ignore_items)
                recommender_content_based = ItemKNNCBFRecommender(ICM_item[1],URM_train[fold])
                recommender_content_based.fit(**cbf_parameters)

                result_dict, result_string = evaluator_test.evaluateRecommender(recommender_content_based)
                for metric,value in list(result_dict.values())[0].items():
                    currentValue = overall_result_dict.get(metric, 0)
                    overall_result_dict[metric] = currentValue + value

            for metric, value in overall_result_dict.items():
                overall_result_dict[metric] = value/len(URM_train)

            print("CBF recommendation quality is: {}".format(result_string))

            import json
            json = json.dumps(overall_result_dict, indent=4)
            filename = "Results/CBF - ColdItemSplit/ICMtype_{}-topK_{}-shrink_{} - results.json".format(ICM_item[0],topKValue,shrinkValue)
            f = open(filename,"w")
            f.write(json)
            f.close()
        

Ignoring 0 Items
Unable to load Cython Compute_Similarity, reverting to Python
Similarity column 13623 ( 100 % ), 4411.43 column/sec, elapsed time 0.05 min
EvaluatorHoldout: Processed 71074 ( 51.97% ) in 30.00 sec. Users per second: 2369
EvaluatorHoldout: Processed 136768 ( 100.00% ) in 53.46 sec. Users per second: 2558
Ignoring 0 Items
Unable to load Cython Compute_Similarity, reverting to Python
Similarity column 13623 ( 100 % ), 4858.36 column/sec, elapsed time 0.05 min
EvaluatorHoldout: Processed 72986 ( 53.01% ) in 30.00 sec. Users per second: 2433
EvaluatorHoldout: Processed 137677 ( 100.00% ) in 53.40 sec. Users per second: 2578
Ignoring 0 Items
Unable to load Cython Compute_Similarity, reverting to Python
Similarity column 13623 ( 100 % ), 4956.77 column/sec, elapsed time 0.05 min
EvaluatorHoldout: Processed 72955 ( 53.04% ) in 30.00 sec. Users per second: 2432
EvaluatorHoldout: Processed 137539 ( 100.00% ) in 53.09 sec. Users per second: 2591
Ignoring 0 Items
Unable to load Cy

CBF recommendation quality is: CUTOFF: 5 - ROC_AUC: 0.0014440, PRECISION: 0.0006006, PRECISION_RECALL_MIN_DEN: 0.0006237, RECALL: 0.0001734, MAP: 0.0002756, MRR: 0.0013254, NDCG: 0.0002225, F1: 0.0002691, HIT_RATE: 0.0030028, ARHR: 0.0013332, RMSE: 0.8565388, NOVELTY: 0.0052738, DIVERSITY_MEAN_INTER_LIST: 0.9891563, DIVERSITY_HERFINDAHL: 0.9978298, COVERAGE_ITEM: 0.5335829, COVERAGE_USER: 1.0000000, DIVERSITY_GINI: 0.1474860, SHANNON_ENTROPY: 10.1771711, 

Ignoring 0 Items
Unable to load Cython Compute_Similarity, reverting to Python
Similarity column 13623 ( 100 % ), 3730.54 column/sec, elapsed time 0.06 min
EvaluatorHoldout: Processed 52001 ( 38.02% ) in 30.20 sec. Users per second: 1722
EvaluatorHoldout: Processed 114995 ( 84.08% ) in 1.00 min. Users per second: 1910
EvaluatorHoldout: Processed 136768 ( 100.00% ) in 1.16 min. Users per second: 1965
Ignoring 0 Items
Unable to load Cython Compute_Similarity, reverting to Python
Similarity column 13623 ( 100 % ), 3784.40 column/sec, el

EvaluatorHoldout: Processed 136768 ( 100.00% ) in 1.35 min. Users per second: 1690
Ignoring 0 Items
Unable to load Cython Compute_Similarity, reverting to Python
Similarity column 13623 ( 100 % ), 3022.29 column/sec, elapsed time 0.08 min
EvaluatorHoldout: Processed 44835 ( 32.57% ) in 30.00 sec. Users per second: 1494
EvaluatorHoldout: Processed 100001 ( 72.63% ) in 1.00 min. Users per second: 1659
EvaluatorHoldout: Processed 137677 ( 100.00% ) in 1.31 min. Users per second: 1757
Ignoring 0 Items
Unable to load Cython Compute_Similarity, reverting to Python
Similarity column 13623 ( 100 % ), 3022.13 column/sec, elapsed time 0.08 min
EvaluatorHoldout: Processed 42425 ( 30.85% ) in 30.00 sec. Users per second: 1414
EvaluatorHoldout: Processed 94878 ( 68.98% ) in 1.00 min. Users per second: 1581
EvaluatorHoldout: Processed 137539 ( 100.00% ) in 1.36 min. Users per second: 1688
Ignoring 0 Items
Unable to load Cython Compute_Similarity, reverting to Python
Similarity column 13623 ( 100 % )

Compute_Similarity: detected dense matrix
Similarity column 13623 ( 100 % ), 5522.76 column/sec, elapsed time 0.04 min
EvaluatorHoldout: Processed 99034 ( 72.41% ) in 30.00 sec. Users per second: 3301
EvaluatorHoldout: Processed 136768 ( 100.00% ) in 40.93 sec. Users per second: 3342
Ignoring 0 Items
Compute_Similarity: detected dense matrix
Similarity column 13623 ( 100 % ), 5642.88 column/sec, elapsed time 0.04 min
EvaluatorHoldout: Processed 100001 ( 72.63% ) in 30.01 sec. Users per second: 3333
EvaluatorHoldout: Processed 137677 ( 100.00% ) in 40.91 sec. Users per second: 3366
Ignoring 0 Items
Compute_Similarity: detected dense matrix
Similarity column 13623 ( 100 % ), 5680.64 column/sec, elapsed time 0.04 min
EvaluatorHoldout: Processed 100064 ( 72.75% ) in 30.00 sec. Users per second: 3335
EvaluatorHoldout: Processed 137539 ( 100.00% ) in 40.87 sec. Users per second: 3365
Ignoring 0 Items
Compute_Similarity: detected dense matrix
Similarity column 13623 ( 100 % ), 5641.08 column/

Similarity column 13623 ( 100 % ), 3082.50 column/sec, elapsed time 0.07 min
EvaluatorHoldout: Processed 73001 ( 53.02% ) in 30.10 sec. Users per second: 2425
EvaluatorHoldout: Processed 137677 ( 100.00% ) in 54.34 sec. Users per second: 2534
Ignoring 0 Items
Compute_Similarity: detected dense matrix
Similarity column 13623 ( 100 % ), 3061.07 column/sec, elapsed time 0.07 min
EvaluatorHoldout: Processed 72394 ( 52.64% ) in 30.00 sec. Users per second: 2413
EvaluatorHoldout: Processed 137539 ( 100.00% ) in 54.01 sec. Users per second: 2547
Ignoring 0 Items
Compute_Similarity: detected dense matrix
Similarity column 13623 ( 100 % ), 3056.80 column/sec, elapsed time 0.07 min
EvaluatorHoldout: Processed 72106 ( 52.37% ) in 30.00 sec. Users per second: 2404
EvaluatorHoldout: Processed 137693 ( 100.00% ) in 54.50 sec. Users per second: 2526
Ignoring 0 Items
Compute_Similarity: detected dense matrix
Similarity column 13623 ( 100 % ), 3070.33 column/sec, elapsed time 0.07 min
EvaluatorHoldout:

EvaluatorHoldout: Processed 63357 ( 46.06% ) in 30.00 sec. Users per second: 2112
EvaluatorHoldout: Processed 137539 ( 100.00% ) in 58.19 sec. Users per second: 2364
Ignoring 0 Items
Compute_Similarity: detected dense matrix
Similarity column 13623 ( 100 % ), 2417.30 column/sec, elapsed time 0.09 min
EvaluatorHoldout: Processed 62671 ( 45.52% ) in 30.00 sec. Users per second: 2089
EvaluatorHoldout: Processed 137693 ( 100.00% ) in 58.55 sec. Users per second: 2352
Ignoring 0 Items
Compute_Similarity: detected dense matrix
Similarity column 13623 ( 100 % ), 2395.45 column/sec, elapsed time 0.09 min
EvaluatorHoldout: Processed 63001 ( 46.14% ) in 30.04 sec. Users per second: 2097
EvaluatorHoldout: Processed 136541 ( 100.00% ) in 58.42 sec. Users per second: 2337
CBF recommendation quality is: CUTOFF: 5 - ROC_AUC: 0.0014477, PRECISION: 0.0005903, PRECISION_RECALL_MIN_DEN: 0.0006092, RECALL: 0.0001556, MAP: 0.0002866, MRR: 0.0013687, NDCG: 0.0002205, F1: 0.0002463, HIT_RATE: 0.0029515, ARHR

EvaluatorHoldout: Processed 136768 ( 100.00% ) in 1.28 min. Users per second: 1783
Ignoring 0 Items
Compute_Similarity: detected dense matrix
Similarity column 13623 ( 100 % ), 2025.84 column/sec, elapsed time 0.11 min
EvaluatorHoldout: Processed 43001 ( 31.23% ) in 30.36 sec. Users per second: 1416
EvaluatorHoldout: Processed 95001 ( 69.00% ) in 1.01 min. Users per second: 1574
EvaluatorHoldout: Processed 137677 ( 100.00% ) in 1.36 min. Users per second: 1686
Ignoring 0 Items
Compute_Similarity: detected dense matrix
Similarity column 13623 ( 100 % ), 2023.93 column/sec, elapsed time 0.11 min
EvaluatorHoldout: Processed 43001 ( 31.26% ) in 30.37 sec. Users per second: 1416
EvaluatorHoldout: Processed 95001 ( 69.07% ) in 1.01 min. Users per second: 1566
EvaluatorHoldout: Processed 137539 ( 100.00% ) in 1.37 min. Users per second: 1676
Ignoring 0 Items
Compute_Similarity: detected dense matrix
Similarity column 13623 ( 100 % ), 2033.78 column/sec, elapsed time 0.11 min
EvaluatorHoldout:

EvaluatorHoldout: Processed 35001 ( 25.42% ) in 30.07 sec. Users per second: 1164
EvaluatorHoldout: Processed 79001 ( 57.38% ) in 1.01 min. Users per second: 1307
EvaluatorHoldout: Processed 127001 ( 92.25% ) in 1.51 min. Users per second: 1403
EvaluatorHoldout: Processed 137677 ( 100.00% ) in 1.60 min. Users per second: 1433
Ignoring 0 Items
Compute_Similarity: detected dense matrix
Similarity column 13623 ( 100 % ), 1682.59 column/sec, elapsed time 0.13 min
EvaluatorHoldout: Processed 34001 ( 24.72% ) in 30.50 sec. Users per second: 1115
EvaluatorHoldout: Processed 75001 ( 54.53% ) in 1.01 min. Users per second: 1239
EvaluatorHoldout: Processed 121536 ( 88.36% ) in 1.51 min. Users per second: 1342
EvaluatorHoldout: Processed 137539 ( 100.00% ) in 1.66 min. Users per second: 1381
Ignoring 0 Items
Compute_Similarity: detected dense matrix
Similarity column 13623 ( 100 % ), 1654.79 column/sec, elapsed time 0.14 min
EvaluatorHoldout: Processed 36001 ( 26.15% ) in 30.38 sec. Users per sec

  this_column_weights = np.multiply(this_column_weights, 1 / denominator)


Similarity column 13623 ( 100 % ), 2404.05 column/sec, elapsed time 0.09 min
EvaluatorHoldout: Processed 64001 ( 46.80% ) in 30.14 sec. Users per second: 2123
EvaluatorHoldout: Processed 136768 ( 100.00% ) in 57.47 sec. Users per second: 2380
Ignoring 0 Items
Compute_Similarity: detected dense matrix
Similarity column 13623 ( 100 % ), 2412.02 column/sec, elapsed time 0.09 min
EvaluatorHoldout: Processed 65001 ( 47.21% ) in 30.10 sec. Users per second: 2159
EvaluatorHoldout: Processed 137677 ( 100.00% ) in 57.25 sec. Users per second: 2405
Ignoring 0 Items
Compute_Similarity: detected dense matrix
Similarity column 13623 ( 100 % ), 2406.45 column/sec, elapsed time 0.09 min
EvaluatorHoldout: Processed 63993 ( 46.53% ) in 30.00 sec. Users per second: 2133
EvaluatorHoldout: Processed 137539 ( 100.00% ) in 57.93 sec. Users per second: 2374
Ignoring 0 Items
Compute_Similarity: detected dense matrix
Similarity column 13623 ( 100 % ), 2413.87 column/sec, elapsed time 0.09 min
EvaluatorHoldout:

Similarity column 13623 ( 100 % ), 2011.24 column/sec, elapsed time 0.11 min
EvaluatorHoldout: Processed 44001 ( 32.17% ) in 30.35 sec. Users per second: 1450
EvaluatorHoldout: Processed 97883 ( 71.57% ) in 1.01 min. Users per second: 1622
EvaluatorHoldout: Processed 136768 ( 100.00% ) in 1.32 min. Users per second: 1730
Ignoring 0 Items
Compute_Similarity: detected dense matrix
Similarity column 13623 ( 100 % ), 2076.32 column/sec, elapsed time 0.11 min
EvaluatorHoldout: Processed 45001 ( 32.69% ) in 30.31 sec. Users per second: 1485
EvaluatorHoldout: Processed 99351 ( 72.16% ) in 1.01 min. Users per second: 1647
EvaluatorHoldout: Processed 137677 ( 100.00% ) in 1.31 min. Users per second: 1755
Ignoring 0 Items
Compute_Similarity: detected dense matrix
Similarity column 13623 ( 100 % ), 2004.61 column/sec, elapsed time 0.11 min
EvaluatorHoldout: Processed 45001 ( 32.72% ) in 30.10 sec. Users per second: 1495
EvaluatorHoldout: Processed 100001 ( 72.71% ) in 1.01 min. Users per second: 

Similarity column 13623 ( 100 % ), 1733.06 column/sec, elapsed time 0.13 min
EvaluatorHoldout: Processed 37001 ( 26.90% ) in 30.52 sec. Users per second: 1212
EvaluatorHoldout: Processed 83001 ( 60.35% ) in 1.01 min. Users per second: 1367
EvaluatorHoldout: Processed 136001 ( 98.88% ) in 1.52 min. Users per second: 1496
EvaluatorHoldout: Processed 137539 ( 100.00% ) in 1.52 min. Users per second: 1504
Ignoring 0 Items
Compute_Similarity: detected dense matrix
Similarity column 13623 ( 100 % ), 1727.96 column/sec, elapsed time 0.13 min
EvaluatorHoldout: Processed 37001 ( 26.87% ) in 30.15 sec. Users per second: 1227
EvaluatorHoldout: Processed 85001 ( 61.73% ) in 1.01 min. Users per second: 1404
EvaluatorHoldout: Processed 137693 ( 100.00% ) in 1.50 min. Users per second: 1530
Ignoring 0 Items
Compute_Similarity: detected dense matrix
Similarity column 13623 ( 100 % ), 1744.99 column/sec, elapsed time 0.13 min
EvaluatorHoldout: Processed 37001 ( 27.10% ) in 30.47 sec. Users per second: 

Similarity column 6700 ( 49 % ), 109.53 column/sec, elapsed time 1.02 min
Similarity column 10100 ( 74 % ), 109.95 column/sec, elapsed time 1.53 min
Similarity column 13600 ( 100 % ), 110.86 column/sec, elapsed time 2.04 min
Similarity column 13623 ( 100 % ), 110.70 column/sec, elapsed time 2.05 min
EvaluatorHoldout: Processed 65371 ( 47.48% ) in 30.00 sec. Users per second: 2179
EvaluatorHoldout: Processed 137677 ( 100.00% ) in 56.81 sec. Users per second: 2423
Ignoring 0 Items
Compute_Similarity: detected dense matrix
Similarity column 3400 ( 25 % ), 111.41 column/sec, elapsed time 0.51 min
Similarity column 6900 ( 51 % ), 112.71 column/sec, elapsed time 1.02 min
Similarity column 10400 ( 76 % ), 113.24 column/sec, elapsed time 1.53 min
Similarity column 13623 ( 100 % ), 113.48 column/sec, elapsed time 2.00 min
EvaluatorHoldout: Processed 64001 ( 46.53% ) in 30.02 sec. Users per second: 2132
EvaluatorHoldout: Processed 137539 ( 100.00% ) in 57.49 sec. Users per second: 2392
Ignoring 