In [1]:
# Copy the contents of the RecSys_Course_AT_PoliMi-master folder into the current
# working directory (shell command executed through IPython's `!` syntax).
# NOTE(review): presumably this is what makes the Data_manager / Evaluation /
# Recommenders / HyperparameterTuning packages importable below — confirm.
!cp -r ../input/recsys-repo/RecSys_Course_AT_PoliMi-master/* ./

# Loading Data

In [2]:
%config Completer.use_jedi = False
import pandas as pd
import numpy as np
import scipy.sparse as sps
import matplotlib.pyplot as pyplot
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample

In [3]:
# Read the training interactions from the competition CSV file.
URM_path = "../input/recommender-system-2021-challenge-polimi/data_train.csv"

# The first row of the file is its header; columns are typed by position
# (first two as int, third as float).
URM_all_dataframe = pd.read_csv(
    URM_path,
    sep=",",
    header=0,
    dtype={0: int, 1: int, 2: float},
)

# Use the column names the rest of the notebook relies on.
URM_all_dataframe.columns = ["UserID", "ItemID", "Interaction"]

In [4]:
# Preview the first rows to check the renamed columns (UserID, ItemID, Interaction).
URM_all_dataframe.head()

Unnamed: 0,UserID,ItemID,Interaction
0,0,53,1.0
1,0,209,1.0
2,0,223,1.0
3,0,249,1.0
4,0,435,1.0


In [5]:
# Distinct user and item identifiers that appear in the interaction table.
userID_unique = URM_all_dataframe["UserID"].unique()
itemID_unique = URM_all_dataframe["ItemID"].unique()

# Basic dataset dimensions.
n_users, n_items = len(userID_unique), len(itemID_unique)
n_interactions = len(URM_all_dataframe)

# Counts and largest identifiers: when max ID == count - 1 the IDs are contiguous from 0.
print("Number of items\t {}, Number of users\t {}".format(n_items, n_users))
print("Max ID items\t {}, Max Id users\t {}\n".format(itemID_unique.max(), userID_unique.max()))

# Average profile length per user and average popularity per item.
print("Average interactions per user {:.2f}".format(n_interactions / n_users))
print("Average interactions per item {:.2f}\n".format(n_interactions / n_items))

# Sparsity: percentage of the user x item grid that has no recorded interaction.
print("Sparsity {:.2f} %".format(100 * (1 - n_interactions / (n_users * n_items))))

Number of items	 18059, Number of users	 13650
Max ID items	 18058, Max Id users	 13649

Average interactions per user 387.23
Average interactions per item 292.69

Sparsity 97.86 %


In [6]:
# Assemble the user x item interaction matrix from the (value, (row, col)) triplets:
# rows are UserID, columns are ItemID. The matrix is built in COO format and then
# converted to CSR to obtain fast access to rows (users).
URM_all = sps.coo_matrix(
    (
        URM_all_dataframe["Interaction"].values,
        (URM_all_dataframe["UserID"].values, URM_all_dataframe["ItemID"].values),
    )
).tocsr()

# Display the shape, dtype and number of stored elements.
URM_all

<13650x18059 sparse matrix of type '<class 'numpy.float64'>'
	with 5285664 stored elements in Compressed Sparse Row format>

# Data processing and basic tuning setup

In [7]:
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample

# Seed NumPy's global random generator so that a fresh "Restart & Run All"
# produces the same train/validation split (and therefore comparable scores).
# NOTE(review): presumably the split helper samples through NumPy's global RNG —
# confirm in Data_manager.split_functions; if it does not, this line is harmless.
np.random.seed(42)

# Random holdout over all interactions: train_percentage = 0.80 keeps 80% of them
# in URM_train, the remaining ones go to URM_valid.
URM_train, URM_valid = split_train_in_two_percentage_global_sample(URM_all, train_percentage = 0.80)



In [8]:
from Evaluation.Evaluator import EvaluatorHoldout

# Build a holdout evaluator on the validation interactions (URM_valid), with a
# single recommendation-list cutoff of 10.
# It is passed to the hyperparameter search as its validation evaluator.
evaluator_valid = EvaluatorHoldout(URM_valid, cutoff_list=[10])

EvaluatorHoldout: Ignoring 13646 ( 0.0%) Users that have less than 1 test interactions


In [9]:
from Recommenders.GraphBased.RP3betaRecommender import RP3betaRecommender

# Model to tune: the graph-based RP3beta recommender (not SLIM BPR).
recommender_class = RP3betaRecommender

In [10]:
import os

# Folder where the hyperparameter search writes its result files.
output_folder_path = "result_experiments/"

# Create the folder if it does not exist yet. exist_ok=True makes the call
# idempotent and avoids the race between checking for the folder and creating it.
os.makedirs(output_folder_path, exist_ok=True)

# Search budget: total number of configurations to evaluate, of which the
# first 30% are used as random starts.
n_cases = 50
n_random_starts = int(n_cases*0.3)

# Metric and recommendation-list cutoff the search optimizes.
metric_to_optimize = "MAP"
cutoff_to_optimize = 10

In [11]:
from skopt.space import Real, Integer, Categorical

# Search space for the RP3beta hyperparameters explored by the tuner:
#   alpha, beta : real values sampled uniformly in [0, 1]
#   topK        : integer between 1 and 800
#   implicit    : boolean flag, either True or False
hyperparameters_range_dictionary = dict(
    alpha=Real(0, 1, prior="uniform"),
    beta=Real(0, 1, prior="uniform"),
    topK=Integer(low=1, high=800),
    implicit=Categorical([True, False]),
)

In [12]:
from HyperparameterTuning.SearchBayesianSkopt import SearchBayesianSkopt

# Create the Bayesian search object for the chosen recommender class.
# Every candidate configuration is scored with evaluator_valid, i.e. on the
# validation interactions.
hyperparameterSearch = SearchBayesianSkopt(recommender_class,
                                         evaluator_validation=evaluator_valid)

In [13]:
from HyperparameterTuning.SearchAbstractClass import SearchInputRecommenderArgs
  
# Arguments the search uses to instantiate and fit the model during tuning.
# Only the training matrix (URM_train) is given to the constructor; no extra
# constructor keywords or fit arguments are supplied here.
recommender_input_args = SearchInputRecommenderArgs(
    CONSTRUCTOR_POSITIONAL_ARGS = [URM_train],     # For a CBF model simply put [URM_train, ICM_train]
    CONSTRUCTOR_KEYWORD_ARGS = {},
    FIT_POSITIONAL_ARGS = [],
    FIT_KEYWORD_ARGS = {}
)

In [14]:
# Same argument bundle as recommender_input_args, but built on the full
# interaction matrix (URM_all) instead of URM_train.
# NOTE(review): presumably the search uses it to refit the best configuration on
# all the data once tuning is over — confirm in SearchBayesianSkopt.
recommender_input_args_last_test = SearchInputRecommenderArgs(
    CONSTRUCTOR_POSITIONAL_ARGS = [URM_all],
    CONSTRUCTOR_KEYWORD_ARGS = {},
    FIT_POSITIONAL_ARGS = [],
    FIT_KEYWORD_ARGS = {}
)

In [15]:
# Run the Bayesian search: n_cases configurations are drawn from
# hyperparameters_range_dictionary (the first n_random_starts at random) and the
# search optimizes metric_to_optimize at cutoff_to_optimize.
# Result files are written to output_folder_path, named after the recommender.
# NOTE(review): save_model = "last" presumably stores only the model fitted with
# recommender_input_args_last_test — confirm against the search class.
hyperparameterSearch.search(recommender_input_args,
                       recommender_input_args_last_test = recommender_input_args_last_test,
                       hyperparameter_search_space = hyperparameters_range_dictionary,
                       n_cases = n_cases,
                       n_random_starts = n_random_starts,
                       save_model = "last",
                       output_folder_path = output_folder_path, # Where to save the results
                       output_file_name_root = recommender_class.RECOMMENDER_NAME, # How to call the files
                       metric_to_optimize = metric_to_optimize,
                       cutoff_to_optimize = cutoff_to_optimize,
                      )

Iteration No: 1 started. Evaluating function at random point.
SearchBayesianSkopt: Testing config: {'alpha': 0.07760028622506233, 'beta': 0.7069366387954393, 'topK': 5, 'implicit': True}
EvaluatorHoldout: Processed 13646 (100.0%) in 12.96 sec. Users per second: 1053
SearchBayesianSkopt: New best config found. Config 0: {'alpha': 0.07760028622506233, 'beta': 0.7069366387954393, 'topK': 5, 'implicit': True} - results: PRECISION: 0.2909277, PRECISION_RECALL_MIN_DEN: 0.2919922, RECALL: 0.0513795, MAP: 0.1587552, MAP_MIN_DEN: 0.1592560, MRR: 0.5420087, NDCG: 0.3035347, F1: 0.0873351, HIT_RATE: 0.9366115, ARHR_ALL_HITS: 0.9191331, NOVELTY: 0.0061853, AVERAGE_POPULARITY: 0.3392088, DIVERSITY_MEAN_INTER_LIST: 0.9716486, DIVERSITY_HERFINDAHL: 0.9971577, COVERAGE_ITEM: 0.2783100, COVERAGE_ITEM_CORRECT: 0.1256437, COVERAGE_USER: 0.9997070, COVERAGE_USER_CORRECT: 0.9363370, DIVERSITY_GINI: 0.0444406, SHANNON_ENTROPY: 9.7270648, RATIO_DIVERSITY_HERFINDAHL: 0.9975423, RATIO_DIVERSITY_GINI: 0.1791504

In [16]:
from Recommenders.DataIO import DataIO

# The search stores its metadata in "<RECOMMENDER_NAME>_metadata.zip" inside the
# output folder; load it back to inspect what was explored.
metadata_file_name = recommender_class.RECOMMENDER_NAME + "_metadata.zip"
data_loader = DataIO(folder_path = output_folder_path)
search_metadata = data_loader.load_data(metadata_file_name)

# List the entries available in the loaded metadata.
search_metadata.keys()

dict_keys(['result_on_test_best', 'hyperparameters_df', 'time_on_train_total', 'time_on_train_avg', 'result_on_validation_best', 'algorithm_name_search', 'time_on_test_total', 'result_on_validation_df', 'result_on_test_df', 'metric_to_optimize', 'hyperparameters_best', 'time_on_validation_total', 'time_on_last_df', 'time_on_test_avg', 'algorithm_name_recommender', 'result_on_last', 'time_df', 'time_on_validation_avg', 'exception_list', 'hyperparameters_best_index', 'cutoff_to_optimize'])

In [17]:
# Table of the hyperparameter configurations stored by the search, one row per
# configuration (columns: alpha, beta, topK, implicit).
hyperparameters_df = search_metadata["hyperparameters_df"]
hyperparameters_df

Unnamed: 0,alpha,beta,topK,implicit
0,0.0776,0.706937,5,True
1,0.769261,0.364859,40,True
2,0.373423,0.855044,410,False
3,0.930559,0.092149,724,False
4,0.439472,0.536118,285,False
5,0.704152,0.462163,386,False
6,0.357213,0.069901,451,True
7,0.98231,0.395649,57,False
8,0.348092,0.014607,261,True
9,0.965961,0.53767,277,False


In [18]:
# Validation metrics stored by the search, indexed by configuration number and cutoff.
result_on_validation_df = search_metadata["result_on_validation_df"]
result_on_validation_df

Unnamed: 0_level_0,Unnamed: 1_level_0,PRECISION,PRECISION_RECALL_MIN_DEN,RECALL,MAP,MAP_MIN_DEN,MRR,NDCG,F1,HIT_RATE,ARHR_ALL_HITS,...,COVERAGE_ITEM_CORRECT,COVERAGE_USER,COVERAGE_USER_CORRECT,DIVERSITY_GINI,SHANNON_ENTROPY,RATIO_DIVERSITY_HERFINDAHL,RATIO_DIVERSITY_GINI,RATIO_SHANNON_ENTROPY,RATIO_AVERAGE_POPULARITY,RATIO_NOVELTY
Unnamed: 0_level_1,cutoff,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,10,0.290928,0.291992,0.051379,0.158755,0.159256,0.542009,0.303535,0.087335,0.936611,0.919133,...,0.125644,0.999707,0.936337,0.044441,9.727065,0.997542,0.17915,0.785014,1.65567,0.0291
1,10,0.352931,0.354113,0.061477,0.213518,0.214085,0.6221,0.371551,0.104714,0.956178,1.134433,...,0.092862,0.999707,0.955897,0.026248,8.882861,0.994045,0.10581,0.716884,1.8252,0.028781
2,10,0.249231,0.249895,0.040383,0.130715,0.130974,0.46213,0.256139,0.069505,0.871464,0.766488,...,0.085719,0.999707,0.871209,0.018888,8.184685,0.992523,0.076141,0.660538,1.886089,0.028325
3,10,0.318247,0.319374,0.054717,0.186503,0.187004,0.583874,0.335661,0.093379,0.936685,1.026684,...,0.048563,0.999707,0.93641,0.012201,8.162744,0.994641,0.049184,0.658767,1.804965,0.027824
4,10,0.354375,0.355557,0.060448,0.213672,0.214235,0.612805,0.370558,0.103279,0.955225,1.124844,...,0.055263,0.999707,0.954945,0.007753,7.05217,0.984393,0.031254,0.569139,2.890336,0.025954
5,10,0.353305,0.354546,0.060528,0.214351,0.214959,0.618854,0.371432,0.10335,0.955518,1.132479,...,0.042804,0.999707,0.955238,0.006094,6.807236,0.982961,0.024567,0.549372,2.982838,0.025745
6,10,0.2791,0.279809,0.046542,0.161515,0.161819,0.552124,0.299446,0.07978,0.904587,0.930257,...,0.064511,0.999707,0.904322,0.018018,8.710861,0.996344,0.072633,0.703003,1.168149,0.029378
7,10,0.364392,0.365671,0.063409,0.223114,0.223699,0.630867,0.382958,0.108021,0.961014,1.167209,...,0.083559,0.999707,0.960733,0.021887,8.518529,0.991844,0.088233,0.687481,2.096408,0.028148
8,10,0.250528,0.250957,0.040808,0.143293,0.143503,0.522528,0.271633,0.070184,0.869486,0.85195,...,0.065784,0.999707,0.869231,0.020344,8.860367,0.996532,0.082012,0.715068,0.908201,0.030208
9,10,0.359255,0.360474,0.061587,0.217932,0.218488,0.62085,0.376099,0.105148,0.956617,1.143245,...,0.051996,0.999707,0.956337,0.008184,7.083586,0.984522,0.032993,0.571675,2.888713,0.026044


In [19]:
# Best configuration recorded by the search, as a dict of hyperparameter name -> value.
best_hyperparameters = search_metadata["hyperparameters_best"]
best_hyperparameters

{'alpha': 0.9902904484569759,
 'beta': 0.5159273869398134,
 'topK': 61,
 'implicit': False}

These are the best hyperparameters found by the Bayesian search; we will use them in our final model.

In [20]:
# Fit the final model on all the available interactions (URM_all), using the
# best hyperparameters found by the Bayesian search.
# Calling fit() with no arguments would silently fall back to the recommender's
# default hyperparameters and discard the result of the search.
recommender = RP3betaRecommender(URM_all)
recommender.fit(**best_hyperparameters)

# This model is intentionally not scored with evaluator_valid: URM_all is the
# matrix URM_valid was split from, so it already contains the validation
# interactions and the resulting score would be optimistically biased.

In [21]:
#recommender.save_model(output_folder_path, file_name = recommender.RECOMMENDER_NAME + "_my_own_save.zip" )

# Create final recommendations

In [22]:
# Load the list of target users for which recommendations must be produced
# (a single 'user_id' column), and display it.
test_users = pd.read_csv('../input/recommender-system-2021-challenge-polimi/data_target_users_test.csv')
test_users

Unnamed: 0,user_id
0,0
1,1
2,2
3,3
4,4
...,...
13645,13645
13646,13646
13647,13647
13648,13648


In [23]:
# Ask the fitted model for its 10 recommended items for every target user,
# keeping the same order as the 'user_id' column.
user_id = test_users['user_id']
recommendations = [recommender.recommend(user, cutoff = 10) for user in user_id]

In [24]:
# Keep each per-user recommendation list as a NumPy array.
recommendations = [np.array(items) for items in recommendations]

# Serialize every list as item ids separated by exactly one space.
# Building the string explicitly avoids relying on str(ndarray), whose output
# pads numbers with extra spaces and depends on NumPy's print options
# (line width, truncation threshold).
test_users['item_list'] = [" ".join(str(item) for item in items) for items in recommendations]

# Write the submission file: one row per target user, without the DataFrame index.
test_users.to_csv('submission.csv', index=False)