In [1]:
# Copy the RecSys_Course_AT_PoliMi sources from the input folder into the working
# directory so that its packages (Data_manager, Evaluation, Recommenders,
# HyperparameterTuning) can be imported by the cells below.
!cp -r ../input/recsys-repo/RecSys_Course_AT_PoliMi-master/* ./

# Loading Data

In [2]:
%config Completer.use_jedi = False
import pandas as pd
import numpy as np
import scipy.sparse as sps
import matplotlib.pyplot as pyplot
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample

In [3]:
# Location of the training interactions (one row per user-item interaction)
URM_path = "../input/recommender-system-2021-challenge-polimi/data_train.csv"

# The first CSV row is the header; dtypes are keyed by column position because
# the columns are renamed right after loading.
URM_all_dataframe = pd.read_csv(
    URM_path,
    sep=",",
    header=0,
    dtype={0: int, 1: int, 2: float},
)
URM_all_dataframe.columns = ["UserID", "ItemID", "Interaction"]

In [4]:
# Preview the first rows to check the renamed columns
URM_all_dataframe.head()

Unnamed: 0,UserID,ItemID,Interaction
0,0,53,1.0
1,0,209,1.0
2,0,223,1.0
3,0,249,1.0
4,0,435,1.0


In [5]:
# Distinct identifiers that occur in the interaction log
userID_unique = URM_all_dataframe["UserID"].unique()
itemID_unique = URM_all_dataframe["ItemID"].unique()

# Basic dataset sizes
n_users = userID_unique.size
n_items = itemID_unique.size
n_interactions = URM_all_dataframe.shape[0]

print("Number of items\t {}, Number of users\t {}".format(n_items, n_users))
# Comparing the max IDs with the counts above shows whether the IDs are contiguous from 0
print("Max ID items\t {}, Max Id users\t {}\n".format(itemID_unique.max(), userID_unique.max()))
print("Average interactions per user {:.2f}".format(n_interactions / n_users))
print("Average interactions per item {:.2f}\n".format(n_interactions / n_items))

# Sparsity = 1 - interactions / (items x users), reported as a percentage
filled_fraction = float(n_interactions) / (n_items * n_users)
print("Sparsity {:.2f} %".format((1 - filled_fraction) * 100))

Number of items	 18059, Number of users	 13650
Max ID items	 18058, Max Id users	 13649

Average interactions per user 387.23
Average interactions per item 292.69

Sparsity 97.86 %


In [6]:
# Triplets (value, row, column) describing the user-item interaction matrix:
# rows are indexed by UserID, columns by ItemID.
interaction_values = URM_all_dataframe["Interaction"].values
user_rows = URM_all_dataframe["UserID"].values
item_cols = URM_all_dataframe["ItemID"].values

# Build in COO (triplet) form, then convert to CSR for fast access to rows (users).
# No explicit shape is given, so it is inferred from the largest user/item ID.
URM_all = sps.coo_matrix((interaction_values, (user_rows, item_cols))).tocsr()
URM_all

<13650x18059 sparse matrix of type '<class 'numpy.float64'>'
	with 5285664 stored elements in Compressed Sparse Row format>

# Data processing and basic tuning setup

In [7]:
# Fix NumPy's global seed so the random hold-out split (and every validation
# score derived from it) is identical on each Restart & Run All.
# NOTE(review): assumes the split helper samples through NumPy's global RNG — confirm.
np.random.seed(42)

# Split the interactions 85/15 into training and validation matrices.
# split_train_in_two_percentage_global_sample is already imported in the
# imports cell at the top of the notebook, so it is not re-imported here.
URM_train, URM_valid = split_train_in_two_percentage_global_sample(URM_all, train_percentage=0.85)



In [8]:
from Evaluation.Evaluator import EvaluatorHoldout

# Evaluator over the held-out validation matrix (URM_valid), configured with a
# single cutoff of 10. It is handed to the hyperparameter search as its
# validation evaluator.
evaluator_valid = EvaluatorHoldout(URM_valid, cutoff_list=[10])

EvaluatorHoldout: Ignoring 13629 ( 0.2%) Users that have less than 1 test interactions


In [9]:
from Recommenders.KNN.ItemKNNCFRecommender import ItemKNNCFRecommender

# Start with a simple collaborative-filtering model based on item-item similarity.
# Only the class is selected here; the search instantiates and fits it for each
# hyperparameter configuration.
recommender_class = ItemKNNCFRecommender

In [10]:
from skopt.space import Real, Integer, Categorical

# Search space for ItemKNNCF:
#   topK              - K parameter (neighbourhood size), integer in [5, 1000]
#   shrink            - shrink term (accounts for the support of the similarity), integer in [0, 1000]
#   similarity        - similarity type, fixed to cosine
#   normalize         - whether to normalise the data (True / False)
#   feature_weighting - fixed to TF-IDF
hyperparameters_range_dictionary = {
    "topK": Integer(5, 1000),
    "shrink": Integer(0, 1000),
    "similarity": Categorical(["cosine"]),
    "normalize": Categorical([True, False]),
    "feature_weighting": Categorical(["TF-IDF"])
}

In [11]:
from HyperparameterTuning.SearchBayesianSkopt import SearchBayesianSkopt

# Bayesian optimiser for the selected recommender class; candidate
# configurations are scored with the validation evaluator.
hyperparameterSearch = SearchBayesianSkopt(recommender_class, evaluator_validation=evaluator_valid)

In [12]:
from HyperparameterTuning.SearchAbstractClass import SearchInputRecommenderArgs

# Constructor / fit arguments used to build each candidate model during the
# search: candidates are built on URM_train only.
# (For a CBF model the positional args would be [URM_train, ICM_train].)
recommender_input_args = SearchInputRecommenderArgs(
    CONSTRUCTOR_POSITIONAL_ARGS=[URM_train],
    CONSTRUCTOR_KEYWORD_ARGS={},
    FIT_POSITIONAL_ARGS=[],
    FIT_KEYWORD_ARGS={},
)

In [13]:
# Same argument bundle as for the search candidates, but built on the full
# interaction matrix (URM_all); passed to the search as
# recommender_input_args_last_test.
recommender_input_args_last_test = SearchInputRecommenderArgs(
    CONSTRUCTOR_POSITIONAL_ARGS=[URM_all],
    CONSTRUCTOR_KEYWORD_ARGS={},
    FIT_POSITIONAL_ARGS=[],
    FIT_KEYWORD_ARGS={},
)

In [14]:
import os

# Folder that receives the files produced by the hyperparameter search
output_folder_path = "result_experiments/"

# exist_ok=True creates the folder only when it is missing and avoids the
# separate check-then-create step.
os.makedirs(output_folder_path, exist_ok=True)

# Settings passed to the search below
n_cases = 50                            # value given to the search as n_cases
n_random_starts = int(n_cases * 0.3)    # 30% of n_cases, given as n_random_starts
metric_to_optimize = "MAP"
cutoff_to_optimize = 10                 # same cutoff the validation evaluator is configured with

In [15]:
# Load the Cython IPython extension.
# NOTE(review): the search log below still prints "Unable to load Cython
# Compute_Similarity, reverting to Python", so loading this extension does not by
# itself make the compiled similarity available — presumably the repository's
# Cython sources must be compiled first; confirm.
%load_ext Cython

In [16]:
# NOTE(review): wildcard import kept as in the original; no name it provides is
# visibly used in this notebook — confirm whether it can be removed.
from Recommenders.Similarity import *

# Run the Bayesian search over the hyperparameter space defined above
hyperparameterSearch.search(
    recommender_input_args,
    recommender_input_args_last_test=recommender_input_args_last_test,
    hyperparameter_search_space=hyperparameters_range_dictionary,
    n_cases=n_cases,
    n_random_starts=n_random_starts,
    save_model="last",
    output_folder_path=output_folder_path,  # where to save the results
    output_file_name_root=recommender_class.RECOMMENDER_NAME,  # how to name the files
    metric_to_optimize=metric_to_optimize,
    cutoff_to_optimize=cutoff_to_optimize,
)

Iteration No: 1 started. Evaluating function at random point.
SearchBayesianSkopt: Testing config: {'topK': 964, 'shrink': 848, 'similarity': 'cosine', 'normalize': False, 'feature_weighting': 'TF-IDF'}
Unable to load Cython Compute_Similarity, reverting to Python
Similarity column 18059 (100.0%), 366.49 column/sec. Elapsed time 49.28 sec
EvaluatorHoldout: Processed 13629 (100.0%) in 2.01 min. Users per second: 113
SearchBayesianSkopt: New best config found. Config 0: {'topK': 964, 'shrink': 848, 'similarity': 'cosine', 'normalize': False, 'feature_weighting': 'TF-IDF'} - results: PRECISION: 0.2394600, PRECISION_RECALL_MIN_DEN: 0.2413865, RECALL: 0.0513334, MAP: 0.1341396, MAP_MIN_DEN: 0.1351546, MRR: 0.5197211, NDCG: 0.2625782, F1: 0.0845431, HIT_RATE: 0.8744589, ARHR_ALL_HITS: 0.8249404, NOVELTY: 0.0051627, AVERAGE_POPULARITY: 0.7418732, DIVERSITY_MEAN_INTER_LIST: 0.6454123, DIVERSITY_HERFINDAHL: 0.9645365, COVERAGE_ITEM: 0.0073094, COVERAGE_ITEM_CORRECT: 0.0035993, COVERAGE_USER: 0.

In [17]:
from Recommenders.DataIO import DataIO

# Load the search metadata archive from the results folder; its file name is the
# recommender name (the file-name root given to the search) plus "_metadata.zip".
metadata_file_name = recommender_class.RECOMMENDER_NAME + "_metadata.zip"
data_loader = DataIO(folder_path=output_folder_path)
search_metadata = data_loader.load_data(metadata_file_name)

# List the entries available in the metadata
search_metadata.keys()

dict_keys(['time_on_train_avg', 'hyperparameters_df', 'time_df', 'cutoff_to_optimize', 'time_on_test_avg', 'hyperparameters_best_index', 'result_on_last', 'metric_to_optimize', 'result_on_validation_best', 'hyperparameters_best', 'time_on_validation_total', 'time_on_test_total', 'exception_list', 'algorithm_name_search', 'result_on_test_df', 'time_on_train_total', 'result_on_validation_df', 'algorithm_name_recommender', 'time_on_validation_avg', 'result_on_test_best', 'time_on_last_df'])

In [18]:
# One row per evaluated configuration, one column per hyperparameter
hyperparameters_df = search_metadata["hyperparameters_df"]
hyperparameters_df

Unnamed: 0,topK,shrink,similarity,normalize,feature_weighting
0,964,848,cosine,False,TF-IDF
1,399,760,cosine,True,TF-IDF
2,634,23,cosine,True,TF-IDF
3,865,43,cosine,True,TF-IDF
4,127,529,cosine,True,TF-IDF
5,132,926,cosine,False,TF-IDF
6,881,725,cosine,True,TF-IDF
7,145,459,cosine,False,TF-IDF
8,638,463,cosine,False,TF-IDF
9,943,294,cosine,False,TF-IDF


In [19]:
# Validation metrics per evaluated configuration (row order matches hyperparameters_df)
result_on_validation_df = search_metadata["result_on_validation_df"]
result_on_validation_df

Unnamed: 0_level_0,Unnamed: 1_level_0,PRECISION,PRECISION_RECALL_MIN_DEN,RECALL,MAP,MAP_MIN_DEN,MRR,NDCG,F1,HIT_RATE,ARHR_ALL_HITS,...,COVERAGE_ITEM_CORRECT,COVERAGE_USER,COVERAGE_USER_CORRECT,DIVERSITY_GINI,SHANNON_ENTROPY,RATIO_DIVERSITY_HERFINDAHL,RATIO_DIVERSITY_GINI,RATIO_SHANNON_ENTROPY,RATIO_AVERAGE_POPULARITY,RATIO_NOVELTY
Unnamed: 0_level_1,cutoff,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,10,0.23946,0.241386,0.051333,0.13414,0.135155,0.519721,0.262578,0.084543,0.874459,0.82494,...,0.003599,0.998462,0.873114,0.001467,5.000558,0.964909,0.005915,0.403592,3.645285,0.022862
1,10,0.280079,0.282337,0.062781,0.158245,0.159302,0.552311,0.299897,0.10257,0.920097,0.924336,...,0.02071,0.998462,0.918681,0.003171,6.142963,0.979515,0.012786,0.495795,3.318015,0.023381
2,10,0.277922,0.280175,0.062044,0.156814,0.157887,0.551518,0.298013,0.101442,0.917015,0.919468,...,0.021596,0.998462,0.915604,0.003012,6.052846,0.978338,0.012147,0.488522,3.346807,0.023361
3,10,0.275317,0.277514,0.061173,0.155886,0.156836,0.550359,0.29601,0.100104,0.915328,0.915584,...,0.019713,0.998462,0.913919,0.002835,5.981647,0.977705,0.011434,0.482775,3.368687,0.023328
4,10,0.287673,0.290087,0.065128,0.161486,0.162548,0.549494,0.304938,0.106211,0.925086,0.931352,...,0.029514,0.998462,0.923663,0.004463,6.585724,0.983635,0.017998,0.53153,3.194767,0.023568
5,10,0.243987,0.245936,0.052663,0.136818,0.137889,0.524986,0.267036,0.086628,0.880842,0.837509,...,0.004984,0.998462,0.879487,0.001564,5.097607,0.966579,0.006308,0.411425,3.624102,0.022894
6,10,0.27465,0.276851,0.060907,0.155219,0.15619,0.548125,0.295062,0.099703,0.915328,0.911897,...,0.018495,0.998462,0.913919,0.002664,5.912532,0.977079,0.010745,0.477197,3.393144,0.023272
7,10,0.244119,0.246125,0.05273,0.137193,0.138241,0.52646,0.267538,0.086727,0.879888,0.840026,...,0.004762,0.998462,0.878535,0.001562,5.097562,0.966558,0.006298,0.411421,3.624368,0.022894
8,10,0.240018,0.241924,0.051531,0.134527,0.135568,0.520848,0.263243,0.084846,0.875046,0.82712,...,0.003876,0.998462,0.8737,0.001477,5.01117,0.965121,0.005956,0.404449,3.642831,0.022865
9,10,0.239526,0.241445,0.051337,0.1342,0.135213,0.520228,0.262686,0.084552,0.874679,0.825424,...,0.003599,0.998462,0.873333,0.001467,5.001221,0.964921,0.005918,0.403645,3.645086,0.022862


In [20]:
# Configuration stored by the search under "hyperparameters_best"
best_hyperparameters = search_metadata["hyperparameters_best"]
best_hyperparameters

{'topK': 157,
 'shrink': 463,
 'similarity': 'cosine',
 'normalize': True,
 'feature_weighting': 'TF-IDF'}

These are the best hyperparameters found by the Bayesian search; we will use them in our final model.

In [21]:
# Fit the final model on the full interaction matrix (URM_all) with the best
# configuration returned by the Bayesian search. Unpacking best_hyperparameters
# keeps this cell in sync with the search result, instead of relying on
# hand-copied values that can drift from it.
recommender = ItemKNNCFRecommender(URM_all)
recommender.fit(**best_hyperparameters)

# No validation score is computed here: URM_all already contains the
# interactions held out in URM_valid, so evaluating this model on the
# validation set would give an inflated result.

Unable to load Cython Compute_Similarity, reverting to Python
Similarity column 18059 (100.0%), 363.32 column/sec. Elapsed time 49.71 sec


# Create final recommendations

In [22]:
# Load the target users for which recommendations have to be produced
test_users = pd.read_csv('../input/recommender-system-2021-challenge-polimi/data_target_users_test.csv')
test_users

Unnamed: 0,user_id
0,0
1,1
2,2
3,3
4,4
...,...
13645,13645
13646,13646
13647,13647
13648,13648


In [23]:
# Ask the fitted model for the top-10 items of every target user,
# keeping the same order as the rows of test_users.
user_id = test_users['user_id']
recommendations = [recommender.recommend(user, cutoff=10) for user in user_id]

In [24]:
# Serialise each user's recommendations as one space-separated string of item IDs.
# Joining the IDs explicitly avoids depending on NumPy's array repr, which pads
# elements with extra spaces and can wrap long arrays over several lines.
# Assumes recommender.recommend returned an iterable of item IDs per user — TODO confirm.
test_users['item_list'] = [
    " ".join(str(item_id) for item_id in user_recommendations)
    for user_recommendations in recommendations
]

# Write the submission file with the columns user_id,item_list (no index column)
test_users.to_csv('submission.csv', index=False)