In [1]:
# Copy the contents of the RecSys course repository into the working directory.
# Presumably this is what makes the packages imported in later cells
# (Data_manager, Evaluation, Recommenders, HyperparameterTuning) importable - confirm.
!cp -r ../input/recsys-repo/RecSys_Course_AT_PoliMi-master/* ./

# Loading Data

In [2]:
%config Completer.use_jedi = False
import pandas as pd
import numpy as np
import scipy.sparse as sps
import matplotlib.pyplot as pyplot
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample

In [3]:
# Read the user-item interaction table (data_train.csv).
URM_path = "../input/recommender-system-2021-challenge-polimi/data_train.csv"

# The first row of the file is treated as a header; the parsed column labels
# are then replaced by the names used throughout the rest of the notebook.
URM_all_dataframe = pd.read_csv(
    URM_path,
    sep=",",
    header=0,
    dtype={0: int, 1: int, 2: float},
)
URM_all_dataframe.columns = ["UserID", "ItemID", "Interaction"]

In [4]:
# Preview the first rows to check that the three columns were parsed and renamed as intended.
URM_all_dataframe.head()

Unnamed: 0,UserID,ItemID,Interaction
0,0,53,1.0
1,0,209,1.0
2,0,223,1.0
3,0,249,1.0
4,0,435,1.0


In [5]:
# Distinct user and item identifiers that appear in the interaction table.
userID_unique = URM_all_dataframe["UserID"].unique()
itemID_unique = URM_all_dataframe["ItemID"].unique()

n_users, n_items = len(userID_unique), len(itemID_unique)
n_interactions = len(URM_all_dataframe)

# Counts next to the largest ids: if max id == count - 1 the ids are contiguous from 0.
print("Number of items\t {}, Number of users\t {}".format(n_items, n_users))
print("Max ID items\t {}, Max Id users\t {}\n".format(itemID_unique.max(), userID_unique.max()))

print("Average interactions per user {:.2f}".format(n_interactions / n_users))
print("Average interactions per item {:.2f}\n".format(n_interactions / n_items))

# Sparsity = percentage of (user, item) pairs with no recorded interaction.
print("Sparsity {:.2f} %".format((1 - n_interactions / (n_items * n_users)) * 100))

Number of items	 18059, Number of users	 13650
Max ID items	 18058, Max Id users	 13649

Average interactions per user 387.23
Average interactions per item 292.69

Sparsity 97.86 %


In [6]:
# Assemble the interaction matrix: users index the rows, items index the columns.
interaction_values = URM_all_dataframe["Interaction"].values
user_rows = URM_all_dataframe["UserID"].values
item_cols = URM_all_dataframe["ItemID"].values

# Build in COO (triplet) form, then convert to CSR to obtain fast access to
# rows (users). No shape is passed, so it is derived from the largest ids.
URM_all = sps.coo_matrix((interaction_values, (user_rows, item_cols))).tocsr()
URM_all

<13650x18059 sparse matrix of type '<class 'numpy.float64'>'
	with 5285664 stored elements in Compressed Sparse Row format>

# Data processing and basic tuning setup

In [7]:
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample

# Fix the seed before the random hold-out so the train/validation split (and
# every validation score computed from it) is the same on each Restart & Run All.
# NOTE(review): assumes the split helper samples through NumPy's global RNG - confirm.
np.random.seed(1234)

# Split the interactions into train and validation data, 80/20.
URM_train, URM_valid = split_train_in_two_percentage_global_sample(URM_all, train_percentage = 0.80)



In [8]:
from Evaluation.Evaluator import EvaluatorHoldout

# Create an evaluator over the validation interactions (URM_valid).
# It is handed to the hyperparameter search below to score each candidate
# configuration; cutoff_list=[10] requests metrics at cutoff 10 only.
evaluator_valid = EvaluatorHoldout(URM_valid, cutoff_list=[10])

EvaluatorHoldout: Ignoring 13643 ( 0.1%) Users that have less than 1 test interactions


In [9]:
from Recommenders.GraphBased.P3alphaRecommender import P3alphaRecommender

# Model to tune: P3alpha, imported from the GraphBased recommenders package.
recommender_class = P3alphaRecommender

In [10]:
import os

# Folder that receives the files written by the hyperparameter search.
output_folder_path = "result_experiments/"

# Create the folder if it is missing. exist_ok=True makes this a no-op when the
# folder already exists and avoids the check-then-create race of
# `if not os.path.exists(...): os.makedirs(...)`.
os.makedirs(output_folder_path, exist_ok=True)

# Search budget: total number of configurations to try, of which the first 30%
# are passed to the search as random starts.
n_cases = 200
n_random_starts = int(n_cases*0.3)

# Metric and cutoff passed to the search as the optimisation target.
metric_to_optimize = "MAP"
cutoff_to_optimize = 10

In [11]:
from skopt.space import Real, Integer, Categorical
# Search space handed to the optimizer for the two hyperparameters tuned here:
#   alpha - real value, uniform prior, between 0 and 1
#   topK  - integer between 1 and 800

hyperparameters_range_dictionary = {
    "alpha": Real(low=0, high=1, prior='uniform'),
    "topK": Integer(1, 800)
}

In [12]:
from HyperparameterTuning.SearchBayesianSkopt import SearchBayesianSkopt

# Create the Bayesian optimizer object, giving it the recommender class to tune
# and the evaluator that scores each candidate on the validation data.
hyperparameterSearch = SearchBayesianSkopt(recommender_class,
                                         evaluator_validation=evaluator_valid)

In [13]:
from HyperparameterTuning.SearchAbstractClass import SearchInputRecommenderArgs
  
# Arguments used to construct and fit the model during the search:
# the constructor receives only URM_train, and no extra constructor keywords
# or fit arguments are supplied here.
recommender_input_args = SearchInputRecommenderArgs(
    CONSTRUCTOR_POSITIONAL_ARGS = [URM_train],     # For a CBF model simply put [URM_train, ICM_train]
    CONSTRUCTOR_KEYWORD_ARGS = {},
    FIT_POSITIONAL_ARGS = [],
    FIT_KEYWORD_ARGS = {}
)

In [14]:
# Same argument bundle as above, but the constructor receives URM_all (the full
# interaction matrix, before the train/validation split). It is passed to the
# search as `recommender_input_args_last_test`.
recommender_input_args_last_test = SearchInputRecommenderArgs(
    CONSTRUCTOR_POSITIONAL_ARGS = [URM_all],
    CONSTRUCTOR_KEYWORD_ARGS = {},
    FIT_POSITIONAL_ARGS = [],
    FIT_KEYWORD_ARGS = {}
)

In [15]:
# Run the Bayesian search over the search space defined above.
# Results are written to output_folder_path in files named after the recommender.
# NOTE(review): save_model = "last" presumably saves only the model built from
# recommender_input_args_last_test - confirm against SearchBayesianSkopt.
hyperparameterSearch.search(recommender_input_args,
                       recommender_input_args_last_test = recommender_input_args_last_test,
                       hyperparameter_search_space = hyperparameters_range_dictionary,
                       n_cases = n_cases,
                       n_random_starts = n_random_starts,
                       save_model = "last",
                       output_folder_path = output_folder_path, # Where to save the results
                       output_file_name_root = recommender_class.RECOMMENDER_NAME, # How to call the files
                       metric_to_optimize = metric_to_optimize,
                       cutoff_to_optimize = cutoff_to_optimize,
                      )

Iteration No: 1 started. Evaluating function at random point.
SearchBayesianSkopt: Testing config: {'alpha': 0.5210082558849216, 'topK': 483}
EvaluatorHoldout: Processed 13643 (100.0%) in 17.18 sec. Users per second: 794
SearchBayesianSkopt: New best config found. Config 0: {'alpha': 0.5210082558849216, 'topK': 483} - results: PRECISION: 0.3111193, PRECISION_RECALL_MIN_DEN: 0.3123458, RECALL: 0.0514232, MAP: 0.1879813, MAP_MIN_DEN: 0.1887132, MRR: 0.6030181, NDCG: 0.3358749, F1: 0.0882586, HIT_RATE: 0.9313934, ARHR_ALL_HITS: 1.0463322, NOVELTY: 0.0051661, AVERAGE_POPULARITY: 0.7445005, DIVERSITY_MEAN_INTER_LIST: 0.6476149, DIVERSITY_HERFINDAHL: 0.9647567, COVERAGE_ITEM: 0.0130683, COVERAGE_ITEM_CORRECT: 0.0069771, COVERAGE_USER: 0.9994872, COVERAGE_USER_CORRECT: 0.9309158, DIVERSITY_GINI: 0.0015185, SHANNON_ENTROPY: 5.0768473, RATIO_DIVERSITY_HERFINDAHL: 0.9651289, RATIO_DIVERSITY_GINI: 0.0061224, RATIO_SHANNON_ENTROPY: 0.4097341, RATIO_AVERAGE_POPULARITY: 3.6473926, RATIO_NOVELTY: 0.0

In [16]:
from Recommenders.DataIO import DataIO

# Load the metadata archive written by the search and list the entries it holds.
data_loader = DataIO(folder_path = output_folder_path)
metadata_file_name = recommender_class.RECOMMENDER_NAME + "_metadata.zip"
search_metadata = data_loader.load_data(metadata_file_name)

search_metadata.keys()

dict_keys(['time_on_test_total', 'time_on_validation_total', 'result_on_validation_best', 'metric_to_optimize', 'result_on_last', 'algorithm_name_search', 'time_df', 'result_on_validation_df', 'time_on_test_avg', 'result_on_test_best', 'time_on_validation_avg', 'exception_list', 'hyperparameters_df', 'result_on_test_df', 'hyperparameters_best', 'cutoff_to_optimize', 'hyperparameters_best_index', 'time_on_train_avg', 'time_on_train_total', 'time_on_last_df', 'algorithm_name_recommender'])

In [17]:
# Table stored by the search under "hyperparameters_df" (alpha / topK values that were tried).
hyperparameters_df = search_metadata["hyperparameters_df"]
hyperparameters_df

Unnamed: 0,alpha,topK
0,0.521008,483
1,0.191127,708
2,0.718658,179
3,0.650604,649
4,0.086896,250
...,...,...
195,0.792541,82
196,0.213892,660
197,0.813899,117
198,0.871302,574


In [18]:
# Table stored by the search under "result_on_validation_df" (validation metrics of the tried configurations).
result_on_validation_df = search_metadata["result_on_validation_df"]
result_on_validation_df

Unnamed: 0_level_0,Unnamed: 1_level_0,PRECISION,PRECISION_RECALL_MIN_DEN,RECALL,MAP,MAP_MIN_DEN,MRR,NDCG,F1,HIT_RATE,ARHR_ALL_HITS,...,COVERAGE_ITEM_CORRECT,COVERAGE_USER,COVERAGE_USER_CORRECT,DIVERSITY_GINI,SHANNON_ENTROPY,RATIO_DIVERSITY_HERFINDAHL,RATIO_DIVERSITY_GINI,RATIO_SHANNON_ENTROPY,RATIO_AVERAGE_POPULARITY,RATIO_NOVELTY
Unnamed: 0_level_1,cutoff,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,10,0.311119,0.312346,0.051423,0.187981,0.188713,0.603018,0.335875,0.088259,0.931393,1.046332,...,0.006977,0.999487,0.930916,0.001518,5.076847,0.965129,0.006122,0.409734,3.647393,0.024306
1,10,0.301114,0.302331,0.049371,0.181928,0.182683,0.59872,0.32726,0.084833,0.921205,1.025146,...,0.003876,0.999487,0.920733,0.001391,4.91458,0.962672,0.005608,0.396638,3.677415,0.024262
2,10,0.334318,0.335627,0.056635,0.203577,0.204296,0.619774,0.357378,0.096861,0.94532,1.104522,...,0.013567,0.999487,0.944835,0.002041,5.556926,0.972306,0.008229,0.44848,3.537932,0.024461
3,10,0.315546,0.316746,0.052149,0.190698,0.191416,0.606892,0.339753,0.089506,0.933812,1.05654,...,0.007863,0.999487,0.933333,0.001579,5.152205,0.966364,0.006367,0.415816,3.63516,0.024322
4,10,0.30129,0.302549,0.049679,0.181524,0.182272,0.596536,0.326939,0.085295,0.923037,1.022772,...,0.004762,0.999487,0.922564,0.001421,4.946089,0.962983,0.00573,0.399181,3.668801,0.024275
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,10,0.34584,0.347187,0.059197,0.211406,0.212058,0.622402,0.367238,0.10109,0.950304,1.128486,...,0.014619,0.999487,0.949817,0.002444,5.844865,0.97664,0.009856,0.471718,3.457342,0.024571
196,10,0.301415,0.302635,0.049463,0.181959,0.182709,0.597553,0.327327,0.08498,0.921132,1.024661,...,0.004098,0.999487,0.920659,0.001396,4.92099,0.962743,0.005627,0.397155,3.676309,0.024264
197,10,0.345745,0.347116,0.059103,0.210805,0.211541,0.620517,0.366523,0.100949,0.948398,1.124482,...,0.015892,0.999487,0.947912,0.002441,5.829151,0.97611,0.009841,0.47045,3.462068,0.024569
198,10,0.332815,0.333959,0.05542,0.200565,0.201115,0.606777,0.353101,0.095018,0.943781,1.084973,...,0.013179,0.999487,0.943297,0.002196,5.681569,0.974785,0.008855,0.458539,3.470918,0.024534


In [19]:
# Configuration that the search recorded as best ("hyperparameters_best").
best_hyperparameters = search_metadata["hyperparameters_best"]
best_hyperparameters

{'alpha': 0.7973878520528982, 'topK': 73}

These are the best hyperparameters found by the Bayesian search; we will use them to fit our final model.

In [20]:
# Train the final model on all available interactions (URM_all) with the best
# hyperparameters found by the search. Calling fit() with no arguments would
# use the recommender's default values and silently discard the tuning result.
#
# The model is deliberately not evaluated on URM_valid here: those interactions
# are part of URM_all, so the resulting score would be inflated.
recommender = P3alphaRecommender(URM_all)
recommender.fit(**best_hyperparameters)

In [21]:
# Save the fitted model in the results folder under a custom file name
# derived from the recommender's name.
recommender.save_model(output_folder_path, file_name = recommender.RECOMMENDER_NAME + "_my_own_save.zip" )

P3alphaRecommender: Saving model in file 'result_experiments/P3alphaRecommender_my_own_save.zip'
P3alphaRecommender: Saving complete


# Create final recommendations

In [22]:
# Load the target users file; its `user_id` column lists the users that need recommendations.
test_users = pd.read_csv('../input/recommender-system-2021-challenge-polimi/data_target_users_test.csv')
test_users

Unnamed: 0,user_id
0,0
1,1
2,2
3,3
4,4
...,...
13645,13645
13646,13646
13647,13647
13648,13648


In [23]:
# Ask the fitted model for a recommendation list (cutoff 10) for every target
# user, keeping the same order as the rows of test_users.
user_id = test_users['user_id']
recommendations = [recommender.recommend(user, cutoff = 10) for user in user_id]

In [24]:
# Serialise each recommendation list as a string of space-separated item ids.
# The ids are joined explicitly instead of going through str(np.ndarray):
# NumPy pads every element to a common width (producing leading and repeated
# spaces) and its output depends on the global print options.
test_users['item_list'] = [
    " ".join(str(item_id) for item_id in item_list)
    for item_list in recommendations
]

# Write the submission file with the columns user_id,item_list (no index column).
test_users.to_csv('submission.csv', index=False)