In [1]:
import pandas as pd
import utils
import numpy as np
import scipy.sparse as sps


In [2]:
tracks = pd.read_csv('../input/tracks.csv')
train = pd.read_csv('../input/train.csv')
target = pd.read_csv('../input/target_playlists.csv')

In [3]:
icm_csr = utils.build_icm_csr(tracks)
urm_csr = utils.build_urm_csr(train)



In [4]:

from Notebooks_utils.data_splitter import train_test_holdout

URM_train, URM_test = train_test_holdout(urm_csr, train_perc = 0.8)
URM_train, URM_validation = train_test_holdout(URM_train, train_perc = 0.9)

In [5]:
from ParameterTuning.AbstractClassSearch import EvaluatorWrapper
from Base.Evaluation.Evaluator import SequentialEvaluator

evaluator_validation = SequentialEvaluator(URM_validation, cutoff_list=[5])
evaluator_test = SequentialEvaluator(URM_test, cutoff_list=[5, 10])

evaluator_validation = EvaluatorWrapper(evaluator_validation)
evaluator_test = EvaluatorWrapper(evaluator_test)


In [6]:
from KNN.ItemKNNCFRecommender import ItemKNNCFRecommender
from ParameterTuning.BayesianSearch import BayesianSearch


recommender_class = ItemKNNCFRecommender

parameterSearch = BayesianSearch(recommender_class,
                                 evaluator_validation=evaluator_validation,
                                 evaluator_test=evaluator_test)



In [7]:
from ParameterTuning.AbstractClassSearch import DictionaryKeys

hyperparamethers_range_dictionary = {}
hyperparamethers_range_dictionary["topK"] = [5, 10, 20, 50, 100, 150, 200, 300, 400, 500, 600, 700, 800]
hyperparamethers_range_dictionary["shrink"] = [0, 10, 50, 100, 200, 300, 500, 1000]
hyperparamethers_range_dictionary["similarity"] = ["cosine"]
hyperparamethers_range_dictionary["normalize"] = [True, False]


recommenderDictionary = {DictionaryKeys.CONSTRUCTOR_POSITIONAL_ARGS: [URM_train],
                         DictionaryKeys.CONSTRUCTOR_KEYWORD_ARGS: {},
                         DictionaryKeys.FIT_POSITIONAL_ARGS: dict(),
                         DictionaryKeys.FIT_KEYWORD_ARGS: dict(),
                         DictionaryKeys.FIT_RANGE_KEYWORD_ARGS: hyperparamethers_range_dictionary}

output_root_path = "result_experiments/"

import os

# If directory does not exist, create
if not os.path.exists(output_root_path):
    os.makedirs(output_root_path)
    

output_root_path += recommender_class.RECOMMENDER_NAME



In [9]:
n_cases = 5
metric_to_optimize = "MAP"

best_parameters = parameterSearch.search(recommenderDictionary,
                                         n_cases = n_cases,
                                         output_root_path = output_root_path,
                                         metric=metric_to_optimize)

[31mInitialization[0m
[94m----------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   normalize |    shrink |   similarity |      topK | 
BayesianSearch: Testing config: {'topK': 5, 'shrink': 500, 'similarity': 'cosine', 'normalize': False}
Similarity column 20635 ( 100 % ), 9571.31 column/sec, elapsed time 0.04 min
SequentialEvaluator: Processed 38269 ( 100.00% ) in 14.39 seconds. Users per second: 2660
BayesianSearch: New best config found. Config: {'topK': 5, 'shrink': 500, 'similarity': 'cosine', 'normalize': False} - results: {'ROC_AUC': 0.10182741470467831, 'PRECISION': 0.084814514794393628, 'RECALL': 0.08312515555323069, 'RECALL_TEST_LEN': 0.084814514794393628, 'MAP': 0.047570023314489472, 'MRR': 0.095719337671048288, 'NDCG': 0.067464221160742147, 'F1': 0.083961338269418484, 'HIT_RATE': 0.19153884345031227, 'ARHR': 0.10176513627217869, 'NOVELTY': 0.0027682672604753913, 'DIVERSITY_MEAN_INTER_LIST': 0.99081648262717747,

In [10]:
best_parameters

{'topK': 800, 'shrink': 0, 'similarity': 'cosine', 'normalize': True}

In [13]:
itemKNNCF = ItemKNNCFRecommender(urm_csr)
itemKNNCF.fit(**best_parameters)

Similarity column 20635 ( 100 % ), 5955.92 column/sec, elapsed time 0.06 min


In [12]:
evaluator_test.evaluateRecommender(itemKNNCF)

SequentialEvaluator: Processed 34877 ( 73.42% ) in 30.00 seconds. Users per second: 1163
SequentialEvaluator: Processed 47505 ( 100.00% ) in 40.32 seconds. Users per second: 1178


({5: {'ROC_AUC': 0.23139494088341492,
   'PRECISION': 0.14576570887272647,
   'RECALL': 0.11832730370407955,
   'RECALL_TEST_LEN': 0.14576570887272647,
   'MAP': 0.092221158006757106,
   'MRR': 0.23678454899485046,
   'NDCG': 0.12011304590762847,
   'F1': 0.13062112575513374,
   'HIT_RATE': 0.54284812125039472,
   'ARHR': 0.28731712451323527,
   'NOVELTY': 0.0030106579326326767,
   'DIVERSITY_MEAN_INTER_LIST': 0.99378614876262861,
   'DIVERSITY_HERFINDAHL': 0.99875304583020696,
   'COVERAGE_ITEM': 0.60314998788466201,
   'COVERAGE_USER': 0.94170003568171912,
   'DIVERSITY_GINI': 0.19444261346247232,
   'SHANNON_ENTROPY': 11.187396920219596},
  10: {'ROC_AUC': 0.31036617198188654,
   'PRECISION': 0.18312969562101325,
   'RECALL': 0.17873479378393692,
   'RECALL_TEST_LEN': 0.18312969562101325,
   'MAP': 0.090390184137175061,
   'MRR': 0.25242103170427643,
   'NDCG': 0.15160492511658075,
   'F1': 0.18090555631120836,
   'HIT_RATE': 0.84784759499000106,
   'ARHR': 0.32748355645105287,
   '

In [14]:
target = target.get_values()[:,0]

In [15]:
n = 500
  
# using list comprehension 
divided_target = [target[i * n:(i + 1) * n] for i in range((len(target) + n - 1) // n )]  

In [17]:
result = []
for items in target:
    partial_recomendations = itemKNNCF.recommend(items,cutoff=10)
    result.append(partial_recomendations)

In [18]:
result = np.array(result)

In [19]:
result.reshape(-1,10)

array([[17154,  5924,  8749, ...,  4189, 10100, 11233],
       [ 4202,  7545, 19134, ...,  1422,  2018,  1111],
       [ 8001,  2159, 12075, ...,  3903,  8138, 18361],
       ..., 
       [12466, 18956,  4986, ...,  3102,   932,  4552],
       [14907, 10521,  4774, ..., 12075,  8880, 18404],
       [ 6492, 19152, 20488, ...,  1593, 15865, 18254]])

In [20]:
result = result.reshape(-1,10)

In [21]:
result.shape

(10000, 10)

In [22]:
i = 0
results = []
for item in result:
    recommended_items = " ".join(str(i) for i in item)
    temp = [target[i],recommended_items]
    results.append(temp)
    i += 1

In [23]:
rec = pd.DataFrame(results)
rec.to_csv("itemKNNCF_submission.csv", index = False, header = ["playlist_id", "track_ids"])