In [1]:
# Copy the contents of the RecSys_Course_AT_PoliMi repository into the working directory.
# The Data_manager, Evaluation, Recommenders and Utils imports used later in this
# notebook are presumably resolved from this copy.
!cp -r ../input/d/romanofrancesco/recsys-repo/RecSys_Course_AT_PoliMi-master/* ./

In [2]:
# Turn off Jedi-based tab completion in IPython's completer.
%config Completer.use_jedi = False
import pandas as pd
import numpy as np
import scipy.sparse as sps
import matplotlib.pyplot as pyplot
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample

In [3]:
# Read the training interactions (one row per user-item interaction).
# The first line of the file is a header; the columns are renamed right after loading.
URM_path = "../input/recommender-system-2021-challenge-polimi/data_train.csv"
URM_all_dataframe = pd.read_csv(
    URM_path,
    sep=",",
    header=0,
    dtype={0: int, 1: int, 2: float},
)
URM_all_dataframe.columns = ["UserID", "ItemID", "Interaction"]

In [4]:
# Preview the first five interactions.
URM_all_dataframe.head(n=5)

Unnamed: 0,UserID,ItemID,Interaction
0,0,53,1.0
1,0,209,1.0
2,0,223,1.0
3,0,249,1.0
4,0,435,1.0


In [5]:
# Basic statistics of the interaction data: cardinalities, id ranges,
# average profile lengths and sparsity of the user-item matrix.
userID_unique = URM_all_dataframe["UserID"].unique()
itemID_unique = URM_all_dataframe["ItemID"].unique()

n_users, n_items = len(userID_unique), len(itemID_unique)
n_interactions = len(URM_all_dataframe)

print ("Number of items\t {}, Number of users\t {}".format(n_items, n_users))
print ("Max ID items\t {}, Max Id users\t {}\n".format(itemID_unique.max(), userID_unique.max()))

# Average number of interactions per user and per item
interactions_per_user = n_interactions/n_users
interactions_per_item = n_interactions/n_items
print ("Average interactions per user {:.2f}".format(interactions_per_user))
print ("Average interactions per item {:.2f}\n".format(interactions_per_item))

# Share of empty cells in the (users x items) matrix, as a percentage
sparsity_percent = (1-float(n_interactions)/(n_items*n_users))*100
print ("Sparsity {:.2f} %".format(sparsity_percent))

Number of items	 18059, Number of users	 13650
Max ID items	 18058, Max Id users	 13649

Average interactions per user 387.23
Average interactions per item 292.69

Sparsity 97.86 %


In [6]:
# Build the user-rating matrix (URM): rows are users, columns are items.
interaction_values = URM_all_dataframe["Interaction"].values
user_indices = URM_all_dataframe["UserID"].values
item_indices = URM_all_dataframe["ItemID"].values

# Assemble in COO format, then convert to CSR for fast access to rows (users)
URM_all = sps.coo_matrix((interaction_values, (user_indices, item_indices))).tocsr()
URM_all

<13650x18059 sparse matrix of type '<class 'numpy.float64'>'
	with 5285664 stored elements in Compressed Sparse Row format>

In [7]:
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample

# Split the interactions into train and validation data (80/20) three times, so that
# every hyper-parameter set can be compared on three different random holdouts.
#
# The global NumPy RNG is seeded before each split so that "Restart & Run All"
# reproduces the same three holdouts (and therefore comparable scores).
# NOTE(review): assumes split_train_in_two_percentage_global_sample draws its random
# sample through np.random's global state -- confirm against the helper's source.
SPLIT_SEEDS = (0, 1, 2)

np.random.seed(SPLIT_SEEDS[0])
URM_train_1, URM_valid_1 = split_train_in_two_percentage_global_sample(URM_all, train_percentage = 0.80)

np.random.seed(SPLIT_SEEDS[1])
URM_train_2, URM_valid_2 = split_train_in_two_percentage_global_sample(URM_all, train_percentage = 0.80)

np.random.seed(SPLIT_SEEDS[2])
URM_train_3, URM_valid_3 = split_train_in_two_percentage_global_sample(URM_all, train_percentage = 0.80)



In [8]:
from Evaluation.Evaluator import EvaluatorHoldout

# One evaluator per validation split, each scoring recommendation lists at cutoff 10.
# They are used to compare hyper-parameter sets.
evaluator_valid_1, evaluator_valid_2, evaluator_valid_3 = [
    EvaluatorHoldout(URM_valid, cutoff_list=[10])
    for URM_valid in (URM_valid_1, URM_valid_2, URM_valid_3)
]

EvaluatorHoldout: Ignoring 13644 ( 0.0%) Users that have less than 1 test interactions
EvaluatorHoldout: Ignoring 13642 ( 0.1%) Users that have less than 1 test interactions
EvaluatorHoldout: Ignoring 13643 ( 0.1%) Users that have less than 1 test interactions


In [9]:
import numpy as np
import scipy.sparse as sps
from Recommenders.Recommender_utils import check_matrix
from sklearn.linear_model import ElasticNet
from Recommenders.BaseSimilarityMatrixRecommender import BaseItemSimilarityMatrixRecommender
from Utils.seconds_to_biggest_unit import seconds_to_biggest_unit
import time, sys
from tqdm import tqdm
from sklearn.utils._testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning

# os.environ["PYTHONWARNINGS"] = ('ignore::exceptions.ConvergenceWarning:sklearn.linear_model')
# os.environ["PYTHONWARNINGS"] = ('ignore:Objective did not converge:ConvergenceWarning:')

class SLIMElasticNetRecommender(BaseItemSimilarityMatrixRecommender):
    """
    Train a Sparse Linear Methods (SLIM) item similarity model.

    One ElasticNet regression is fitted per item: the item's column of the URM is
    the target and the URM with that column zeroed out is the input. The learned
    coefficients become the item's column in the item-item weight matrix W_sparse.

    NOTE: this implementation fits the items one after the other in a plain loop.

    See:
        Efficient Top-N Recommendation by Linear Regression,
        M. Levy and K. Jack, LSRS workshop at RecSys 2013.
        SLIM: Sparse linear methods for top-n recommender systems,
        X. Ning and G. Karypis, ICDM 2011.
        http://glaros.dtc.umn.edu/gkhome/fetch/papers/SLIM2011icdm.pdf
    """

    RECOMMENDER_NAME = "SLIMElasticNetRecommender"

    def __init__(self, URM_train, verbose = True):
        """Store the training URM; all the work is delegated to the base class."""
        super(SLIMElasticNetRecommender, self).__init__(URM_train, verbose = verbose)

    @ignore_warnings(category=ConvergenceWarning)
    def fit(self, l1_ratio=0.1, alpha = 1.0, positive_only=True, topK = 100,**earlystopping_kwargs):
        """
        Learn the item-item weight matrix and store it in self.W_sparse.

        :param l1_ratio: ElasticNet mixing parameter, must be in [0, 1].
        :param alpha: ElasticNet regularization strength.
        :param positive_only: if True the coefficients are constrained to be non-negative.
        :param topK: maximum number of non-zero coefficients kept for each item
                     (the ones with the largest value).
        :param earlystopping_kwargs: accepted for signature compatibility, not used here.
        :raises AssertionError: if l1_ratio is outside [0, 1].
        """

        assert l1_ratio>= 0 and l1_ratio<=1, "{}: l1_ratio must be between 0 and 1, provided value was {}".format(self.RECOMMENDER_NAME, l1_ratio)

        self.l1_ratio = l1_ratio
        self.alpha = alpha
        self.positive_only = positive_only
        self.topK = topK

        # initialize the ElasticNet model
        self.model = ElasticNet(alpha=alpha,
                                l1_ratio=self.l1_ratio,
                                positive=self.positive_only,
                                fit_intercept=False,
                                copy_X=False,
                                precompute=True,
                                selection='random',
                                max_iter=100,
                                tol=1e-4)

        # CSC gives direct access to each item's column through indptr/data
        URM_train = check_matrix(self.URM_train, 'csc', dtype=np.float32)

        n_items = URM_train.shape[1]

        # Use pre-allocated arrays as they reduce memory requirements compared to lists.
        # They are extended by dataBlock cells whenever they fill up.
        dataBlock = 10000000

        rows = np.zeros(dataBlock, dtype=np.int32)
        cols = np.zeros(dataBlock, dtype=np.int32)
        values = np.zeros(dataBlock, dtype=np.float32)

        numCells = 0

        start_time = time.time()
        start_time_printBatch = start_time

        # fit each item's factors sequentially (not in parallel)
        for currentItem in range(n_items):

            # get the target column
            y = URM_train[:, currentItem].toarray()

            # set the j-th column of X to zero, so the item cannot be used to predict itself
            start_pos = URM_train.indptr[currentItem]
            end_pos = URM_train.indptr[currentItem + 1]

            current_item_data_backup = URM_train.data[start_pos: end_pos].copy()
            URM_train.data[start_pos: end_pos] = 0.0

            try:
                # fit one ElasticNet model per column
                self.model.fit(URM_train, y)
            finally:
                # always put the original values of the j-th column back,
                # even if the solver raises
                URM_train.data[start_pos:end_pos] = current_item_data_backup

            # self.model.sparse_coef_ contains only the non-zero coefficients
            nonzero_model_coef_index = self.model.sparse_coef_.indices
            nonzero_model_coef_value = self.model.sparse_coef_.data

            # Select the topK largest coefficients.
            # When there are more than topK non-zero coefficients, partition the data to
            # extract the relevant ones (cheaper than a full sort); otherwise keep all of
            # them. The previous "len - 1" bound always discarded one coefficient in the
            # latter case.
            n_nonzero = len(nonzero_model_coef_value)
            local_topK = min(n_nonzero, self.topK)

            if local_topK < n_nonzero:
                relevant_items_partition = (-nonzero_model_coef_value).argpartition(local_topK)[0:local_topK]
            else:
                relevant_items_partition = np.arange(n_nonzero)

            # Sort only the relevant items (descending value) and map back to their position
            relevant_items_partition_sorting = np.argsort(-nonzero_model_coef_value[relevant_items_partition])
            ranking = relevant_items_partition[relevant_items_partition_sorting]

            # Make room for the new cells, then write them in one vectorized step
            n_new_cells = len(ranking)

            while numCells + n_new_cells > len(rows):
                rows = np.concatenate((rows, np.zeros(dataBlock, dtype=np.int32)))
                cols = np.concatenate((cols, np.zeros(dataBlock, dtype=np.int32)))
                values = np.concatenate((values, np.zeros(dataBlock, dtype=np.float32)))

            rows[numCells:numCells + n_new_cells] = nonzero_model_coef_index[ranking]
            cols[numCells:numCells + n_new_cells] = currentItem
            values[numCells:numCells + n_new_cells] = nonzero_model_coef_value[ranking]

            numCells += n_new_cells

            elapsed_time = time.time() - start_time
            new_time_value, new_time_unit = seconds_to_biggest_unit(elapsed_time)

            # Report progress every 300 seconds and once more after the last item
            if time.time() - start_time_printBatch > 300 or currentItem == n_items-1:
                # currentItem is 0-based, so currentItem+1 items have been processed
                if elapsed_time > 0:
                    items_per_second = float(currentItem+1)/elapsed_time
                else:
                    items_per_second = float("nan")

                self._print("Processed {} ({:4.1f}%) in {:.2f} {}. Items per second: {:.2f}".format(
                    currentItem+1,
                    100.0* float(currentItem+1)/n_items,
                    new_time_value,
                    new_time_unit,
                    items_per_second))

                sys.stdout.flush()
                sys.stderr.flush()

                start_time_printBatch = time.time()

        # generate the sparse weight matrix: column j holds the coefficients learned for item j
        self.W_sparse = sps.csr_matrix((values[:numCells], (rows[:numCells], cols[:numCells])),
                                       shape=(n_items, n_items), dtype=np.float32)




In [10]:
# Split 1: compare candidate hyper-parameter sets on the first holdout.
# Four recommenders are created on the same training data, one per candidate.
old_best_1, new_best_1, second_new_1, third_new_1 = [
    SLIMElasticNetRecommender(URM_train_1) for _ in range(4)
]

# Only two candidates are trained in this run.
old_best_1.fit(
    l1_ratio = 0.001404017101088145,
    alpha = 0.06305313484275951,
    positive_only = True,
    topK = 2594,
)
second_new_1.fit(
    l1_ratio = 0.002423126519631184,
    alpha = 0.04666719634930648,
    topK = 2289,
)

# Candidates not trained in this run (kept for reference):
#   new_best_1:  l1_ratio = 0.001906305898268216,  alpha = 0.03398463666329234, topK = 2491
#   third_new_1: l1_ratio = 0.0005542281941680019, alpha = 0.06726797924429088, topK = 2957

# Score the trained candidates on the validation data of split 1
for trained_candidate in (old_best_1, second_new_1):
    print(evaluator_valid_1.evaluateRecommender(trained_candidate))

SLIMElasticNetRecommender: Processed 1451 ( 8.0%) in 5.00 min. Items per second: 4.83
SLIMElasticNetRecommender: Processed 2896 (16.0%) in 10.01 min. Items per second: 4.82
SLIMElasticNetRecommender: Processed 4355 (24.1%) in 15.01 min. Items per second: 4.84
SLIMElasticNetRecommender: Processed 5806 (32.2%) in 20.01 min. Items per second: 4.84
SLIMElasticNetRecommender: Processed 7277 (40.3%) in 25.01 min. Items per second: 4.85
SLIMElasticNetRecommender: Processed 8710 (48.2%) in 30.01 min. Items per second: 4.84
SLIMElasticNetRecommender: Processed 10086 (55.9%) in 35.01 min. Items per second: 4.80
SLIMElasticNetRecommender: Processed 11495 (63.7%) in 40.02 min. Items per second: 4.79
SLIMElasticNetRecommender: Processed 12914 (71.5%) in 45.02 min. Items per second: 4.78
SLIMElasticNetRecommender: Processed 14340 (79.4%) in 50.02 min. Items per second: 4.78
SLIMElasticNetRecommender: Processed 15787 (87.4%) in 55.02 min. Items per second: 4.78
SLIMElasticNetRecommender: Processed 17

In [11]:
# Split 2: compare candidate hyper-parameter sets on the second holdout.
# Four recommenders are created on the same training data, one per candidate.
old_best_2, new_best_2, second_new_2, third_new_2 = [
    SLIMElasticNetRecommender(URM_train_2) for _ in range(4)
]

# Only two candidates are trained in this run.
old_best_2.fit(
    l1_ratio = 0.001404017101088145,
    alpha = 0.06305313484275951,
    positive_only = True,
    topK = 2594,
)
second_new_2.fit(
    l1_ratio = 0.002423126519631184,
    alpha = 0.04666719634930648,
    topK = 2289,
)

# Candidates not trained in this run (kept for reference):
#   new_best_2:  l1_ratio = 0.001906305898268216,  alpha = 0.03398463666329234, topK = 2491
#   third_new_2: l1_ratio = 0.0005542281941680019, alpha = 0.06726797924429088, topK = 2957

# Score the trained candidates on the validation data of split 2
for trained_candidate in (old_best_2, second_new_2):
    print(evaluator_valid_2.evaluateRecommender(trained_candidate))

SLIMElasticNetRecommender: Processed 1412 ( 7.8%) in 5.00 min. Items per second: 4.70
SLIMElasticNetRecommender: Processed 2859 (15.8%) in 10.00 min. Items per second: 4.76
SLIMElasticNetRecommender: Processed 4314 (23.9%) in 15.00 min. Items per second: 4.79
SLIMElasticNetRecommender: Processed 5742 (31.8%) in 20.01 min. Items per second: 4.78
SLIMElasticNetRecommender: Processed 7186 (39.8%) in 25.01 min. Items per second: 4.79
SLIMElasticNetRecommender: Processed 8635 (47.8%) in 30.01 min. Items per second: 4.79
SLIMElasticNetRecommender: Processed 10076 (55.8%) in 35.01 min. Items per second: 4.80
SLIMElasticNetRecommender: Processed 11514 (63.8%) in 40.01 min. Items per second: 4.80
SLIMElasticNetRecommender: Processed 12998 (72.0%) in 45.02 min. Items per second: 4.81
SLIMElasticNetRecommender: Processed 14484 (80.2%) in 50.02 min. Items per second: 4.83
SLIMElasticNetRecommender: Processed 15951 (88.3%) in 55.02 min. Items per second: 4.83
SLIMElasticNetRecommender: Processed 17

In [12]:
# Split 3: compare candidate hyper-parameter sets on the third holdout.
# Three recommenders are created on the same training data, one per candidate.
old_best_3, new_best_3, second_new_3 = [
    SLIMElasticNetRecommender(URM_train_3) for _ in range(3)
]

# Only two candidates are trained in this run.
old_best_3.fit(
    l1_ratio = 0.001404017101088145,
    alpha = 0.06305313484275951,
    positive_only = True,
    topK = 2594,
)
second_new_3.fit(
    l1_ratio = 0.002423126519631184,
    alpha = 0.04666719634930648,
    topK = 2289,
)

# Candidate not trained in this run (kept for reference):
#   new_best_3: l1_ratio = 0.001650183611841036, alpha = 0.04590266290018709, topK = 3853

# Score the trained candidates on the validation data of split 3
for trained_candidate in (old_best_3, second_new_3):
    print(evaluator_valid_3.evaluateRecommender(trained_candidate))

SLIMElasticNetRecommender: Processed 1389 ( 7.7%) in 5.00 min. Items per second: 4.63
SLIMElasticNetRecommender: Processed 2802 (15.5%) in 10.00 min. Items per second: 4.67
SLIMElasticNetRecommender: Processed 4248 (23.5%) in 15.00 min. Items per second: 4.72
SLIMElasticNetRecommender: Processed 5702 (31.6%) in 20.01 min. Items per second: 4.75
SLIMElasticNetRecommender: Processed 7153 (39.6%) in 25.01 min. Items per second: 4.77
SLIMElasticNetRecommender: Processed 8528 (47.2%) in 30.01 min. Items per second: 4.74
SLIMElasticNetRecommender: Processed 9864 (54.6%) in 35.01 min. Items per second: 4.70
SLIMElasticNetRecommender: Processed 11272 (62.4%) in 40.01 min. Items per second: 4.69
SLIMElasticNetRecommender: Processed 12694 (70.3%) in 45.02 min. Items per second: 4.70
SLIMElasticNetRecommender: Processed 14074 (77.9%) in 50.02 min. Items per second: 4.69
SLIMElasticNetRecommender: Processed 15476 (85.7%) in 55.02 min. Items per second: 4.69
SLIMElasticNetRecommender: Processed 169

The `new_best` hyperparameters seem to perform better across the 3 different splits.

# Create final recommendations

In [13]:
# Fit the final model with the hyperparameters obtained from the previous search.
# The model is trained on URM_all (all the available interactions), and no
# validation score is computed in this cell.
recommender = SLIMElasticNetRecommender(URM_all)
recommender.fit(
    l1_ratio = 0.001650183611841036,
    alpha = 0.04590266290018709,
    topK = 3853,
)

SLIMElasticNetRecommender: Processed 1103 ( 6.1%) in 5.00 min. Items per second: 3.67
SLIMElasticNetRecommender: Processed 2266 (12.5%) in 10.00 min. Items per second: 3.77
SLIMElasticNetRecommender: Processed 3431 (19.0%) in 15.00 min. Items per second: 3.81
SLIMElasticNetRecommender: Processed 4588 (25.4%) in 20.01 min. Items per second: 3.82
SLIMElasticNetRecommender: Processed 5737 (31.8%) in 25.01 min. Items per second: 3.82
SLIMElasticNetRecommender: Processed 6910 (38.3%) in 30.01 min. Items per second: 3.84
SLIMElasticNetRecommender: Processed 8065 (44.7%) in 35.02 min. Items per second: 3.84
SLIMElasticNetRecommender: Processed 9188 (50.9%) in 40.02 min. Items per second: 3.83
SLIMElasticNetRecommender: Processed 10286 (57.0%) in 45.02 min. Items per second: 3.81
SLIMElasticNetRecommender: Processed 11423 (63.3%) in 50.02 min. Items per second: 3.81
SLIMElasticNetRecommender: Processed 12547 (69.5%) in 55.02 min. Items per second: 3.80
SLIMElasticNetRecommender: Processed 1363

In [14]:
# Load the list of users for which recommendations must be produced, then display it.
test_users = pd.read_csv(
    '../input/recommender-system-2021-challenge-polimi/data_target_users_test.csv'
)
test_users

Unnamed: 0,user_id
0,0
1,1
2,2
3,3
4,4
...,...
13645,13645
13646,13646
13647,13647
13648,13648


In [15]:
# Ask the final model for the 10 recommended items of each target user.
user_id = test_users['user_id']
recommendations = [recommender.recommend(user, cutoff = 10) for user in user_id]

In [16]:
# Build the submission file: one row per target user, with the recommended
# item ids written as a single space-separated string.
recommendations = [np.array(recommended_items) for recommended_items in recommendations]

# Join the ids explicitly instead of parsing str(ndarray): NumPy's printed form pads
# the numbers to a common width, which leaves leading and repeated spaces in the
# string once the brackets are stripped.
test_users['item_list'] = [
    " ".join(str(item) for item in recommended_items)
    for recommended_items in recommendations
]
test_users.to_csv('submission.csv', index=False)