In [2]:
# Copy the contents of the RecSys course repository folder into the working directory.
# The Data_manager, Evaluation, Recommenders and HyperparameterTuning packages imported
# in later cells are presumably provided by this copy.
!cp -r ../input/recsys-repo/RecSys_Course_AT_PoliMi-master/* ./

# Loading Data

In [3]:
%config Completer.use_jedi = False
import pandas as pd
import numpy as np
import scipy.sparse as sps
import matplotlib.pyplot as pyplot
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample

In [4]:
# Read the training interactions: one (user, item, interaction) triplet per row.
# header=0 together with names= discards the header row found in the file and
# applies our own column labels in a single step.
URM_path = "../input/recommender-system-2021-challenge-polimi/data_train.csv"
URM_all_dataframe = pd.read_csv(
    URM_path,
    sep=",",
    header=0,
    names=["UserID", "ItemID", "Interaction"],
    dtype={"UserID": int, "ItemID": int, "Interaction": float},
)

In [5]:
# Preview the first rows to check the renamed columns and the parsed values.
URM_all_dataframe.head()

Unnamed: 0,UserID,ItemID,Interaction
0,0,53,1.0
1,0,209,1.0
2,0,223,1.0
3,0,249,1.0
4,0,435,1.0


In [6]:
# Summary statistics of the interaction data: distinct users/items, largest IDs,
# average profile length and sparsity of the user-item grid.
userID_unique = URM_all_dataframe["UserID"].unique()
itemID_unique = URM_all_dataframe["ItemID"].unique()

n_users, n_items = len(userID_unique), len(itemID_unique)
n_interactions = len(URM_all_dataframe)

print(f"Number of items\t {n_items}, Number of users\t {n_users}")
print(f"Max ID items\t {max(itemID_unique)}, Max Id users\t {max(userID_unique)}\n")
print(f"Average interactions per user {n_interactions/n_users:.2f}")
print(f"Average interactions per item {n_interactions/n_items:.2f}\n")

# Sparsity is the share of (user, item) pairs with no recorded interaction.
density = float(n_interactions)/(n_items*n_users)
print(f"Sparsity {(1-density)*100:.2f} %")

Number of items	 18059, Number of users	 13650
Max ID items	 18058, Max Id users	 13649

Average interactions per user 387.23
Average interactions per item 292.69

Sparsity 97.86 %


In [7]:
# Assemble the user-item interaction matrix from the (user, item, value) triplets.
interaction_values = URM_all_dataframe["Interaction"].values
user_rows = URM_all_dataframe["UserID"].values
item_cols = URM_all_dataframe["ItemID"].values

# Build in COO format, then convert to CSR for fast access to rows (users).
URM_all = sps.coo_matrix((interaction_values, (user_rows, item_cols))).tocsr()
URM_all

<13650x18059 sparse matrix of type '<class 'numpy.float64'>'
	with 5285664 stored elements in Compressed Sparse Row format>

# Data processing and basic tuning setup

In [8]:
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample

# The hold-out split is random, so seed NumPy's global RNG first: without a seed
# every run produces a different train/validation partition and the tuning
# results are not comparable between runs.
# NOTE(review): this assumes the split helper draws from NumPy's global RNG -
# confirm against Data_manager/split_functions.
SPLIT_SEED = 42
np.random.seed(SPLIT_SEED)

# Split the interactions into train and validation data, 80/20.
URM_train, URM_valid = split_train_in_two_percentage_global_sample(URM_all, train_percentage = 0.80)



In [9]:
from Evaluation.Evaluator import EvaluatorHoldout

# Evaluator built on the held-out validation interactions, with a single cutoff of 10.
# Later cells pass it to the hyperparameter search and to the early-stopping settings.
evaluator_valid = EvaluatorHoldout(URM_valid, cutoff_list=[10])

EvaluatorHoldout: Ignoring 13645 ( 0.0%) Users that have less than 1 test interactions


In [10]:
from Recommenders.MatrixFactorization.IALSRecommender import IALSRecommender

# Model class to tune: the IALS matrix-factorization recommender.
# The search below instantiates it through this variable.
recommender_class = IALSRecommender

In [None]:
import os

# Folder handed to the search as output_folder_path (where it saves its results).
output_folder_path = "result_experiments/"

# exist_ok=True makes the creation idempotent and removes the check-then-create
# race of the "if not exists: makedirs" pattern.
os.makedirs(output_folder_path, exist_ok=True)

# Search budget and objective, passed unchanged to hyperparameterSearch.search().
n_cases = 20
n_random_starts = int(n_cases*0.3)   # 30% of the cases, truncated to an integer
metric_to_optimize = "MAP"
cutoff_to_optimize = 10

In [None]:
from skopt.space import Real, Integer, Categorical
# Search space for the IALS hyperparameters, expressed as skopt dimensions.
# All Real ranges are declared with a log-uniform prior.
# NOTE(review): the values hard-coded in the final fit cell (alpha = 1.0, epsilon ~ 0.107)
# sit at or near the limits of these ranges - widening them may be worth a try.

hyperparameters_range_dictionary = {
    "num_factors": Integer(20, 70),
    "confidence_scaling": Categorical(["linear", "log"]),
    "alpha": Real(low=1e-2, high=1.0, prior= "log-uniform"),
    "epsilon": Real(low=1e-1, high=10.0, prior= "log-uniform"),
    "reg": Real(low = 1e-5, high = 1e-3, prior = 'log-uniform')
}

In [None]:
#We also setup the early stopping 
earlystopping_keywargs = {"validation_every_n": 15,
                          "stop_on_validation": True,
                          "evaluator_object": evaluator_valid,
                          "lower_validations_allowed": 5,
                          "validation_metric": metric_to_optimize,
                          }

In [None]:
from HyperparameterTuning.SearchBayesianSkopt import SearchBayesianSkopt

# Bayesian search object for the chosen recommender class.
# The validation evaluator is passed in as evaluator_validation.
hyperparameterSearch = SearchBayesianSkopt(recommender_class,
                                         evaluator_validation=evaluator_valid)

In [None]:
from HyperparameterTuning.SearchAbstractClass import SearchInputRecommenderArgs
  
# Arguments used during the search: the model is constructed on URM_train and
# its fit() call receives the early-stopping settings as keyword arguments.
recommender_input_args = SearchInputRecommenderArgs(
    CONSTRUCTOR_POSITIONAL_ARGS = [URM_train],     # For a CBF model simply put [URM_train, ICM_train]
    CONSTRUCTOR_KEYWORD_ARGS = {},
    FIT_POSITIONAL_ARGS = [],
    FIT_KEYWORD_ARGS = earlystopping_keywargs
)

In [None]:
# Arguments for the final model: same as above, but constructed on the full URM_all.
# NOTE(review): URM_all includes the validation interactions, while the early-stopping
# settings score on evaluator_valid. If the search applies these FIT_KEYWORD_ARGS to the
# final fit, early stopping is driven by data the model has already seen - confirm how
# SearchBayesianSkopt handles this, and consider passing {} here instead.
recommender_input_args_last_test = SearchInputRecommenderArgs(
    CONSTRUCTOR_POSITIONAL_ARGS = [URM_all],
    CONSTRUCTOR_KEYWORD_ARGS = {},
    FIT_POSITIONAL_ARGS = [],
    FIT_KEYWORD_ARGS = earlystopping_keywargs
)

In [None]:
#let's run the bayesian search
hyperparameterSearch.search(recommender_input_args,
                       recommender_input_args_last_test = recommender_input_args_last_test,
                       hyperparameter_search_space = hyperparameters_range_dictionary,
                       n_cases = n_cases,
                       n_random_starts = n_random_starts,
                       save_model = "last",
                       output_folder_path = output_folder_path, # Where to save the results
                       output_file_name_root = recommender_class.RECOMMENDER_NAME, # How to call the files
                       metric_to_optimize = metric_to_optimize,
                       cutoff_to_optimize = cutoff_to_optimize,
                      )

# Fit the model with the best parameters found

In [11]:
# Train the final model on all available interactions (URM_all) with the
# hyperparameters obtained from the previous search. No evaluation happens in
# this cell: the model is only fitted.
best_hyperparameters = {
    "num_factors": 48,
    "confidence_scaling": 'linear',
    "alpha": 1.0,
    "epsilon": 0.10715471885641545,
    "reg": 0.0005294631576919714,
    "epochs": 30,
}

recommender = IALSRecommender(URM_all)
recommender.fit(**best_hyperparameters)

IALSRecommender: Epoch 1 of 30. Elapsed time 16.54 sec
IALSRecommender: Epoch 2 of 30. Elapsed time 31.88 sec
IALSRecommender: Epoch 3 of 30. Elapsed time 50.92 sec
IALSRecommender: Epoch 4 of 30. Elapsed time 1.15 min
IALSRecommender: Epoch 5 of 30. Elapsed time 1.42 min
IALSRecommender: Epoch 6 of 30. Elapsed time 1.71 min
IALSRecommender: Epoch 7 of 30. Elapsed time 1.97 min
IALSRecommender: Epoch 8 of 30. Elapsed time 2.27 min
IALSRecommender: Epoch 9 of 30. Elapsed time 2.55 min
IALSRecommender: Epoch 10 of 30. Elapsed time 2.84 min
IALSRecommender: Epoch 11 of 30. Elapsed time 3.10 min
IALSRecommender: Epoch 12 of 30. Elapsed time 3.40 min
IALSRecommender: Epoch 13 of 30. Elapsed time 3.66 min
IALSRecommender: Epoch 14 of 30. Elapsed time 3.94 min
IALSRecommender: Epoch 15 of 30. Elapsed time 4.22 min
IALSRecommender: Epoch 16 of 30. Elapsed time 4.53 min
IALSRecommender: Epoch 17 of 30. Elapsed time 4.82 min
IALSRecommender: Epoch 18 of 30. Elapsed time 5.12 min
IALSRecommender:

In [12]:
# Load the target users for the submission; the file provides a 'user_id' column.
test_users = pd.read_csv('../input/recommender-system-2021-challenge-polimi/data_target_users_test.csv')
test_users

Unnamed: 0,user_id
0,0
1,1
2,2
3,3
4,4
...,...
13645,13645
13646,13646
13647,13647
13648,13648


In [13]:
# One top-10 recommendation list per target user, in the order of the target file.
user_id = test_users['user_id']
recommendations = [recommender.recommend(target_user, cutoff = 10) for target_user in user_id]

In [14]:
def _format_item_list(items):
    """Render one recommendation list as item IDs separated by single spaces.

    Joining the IDs explicitly avoids relying on the string form of a NumPy
    array, which pads entries to a common width (leaving runs of spaces and a
    possible leading space once the brackets are stripped) and wraps long
    arrays over several lines.
    """
    return " ".join(str(item) for item in items)


# Kept from the original cell so that any later code still finds NumPy arrays here.
recommendations = [np.asarray(items) for items in recommendations]

# Assign a plain list of strings (one per target user) instead of a one-column
# DataFrame, then write the two-column submission file without the index.
test_users['item_list'] = [_format_item_list(items) for items in recommendations]
test_users.to_csv('submission.csv', index=False)