In [4]:
from SLIM_BPR.Cython.SLIM_BPR_Cython import SLIM_BPR_Cython
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import sparse
import random
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import normalize
from sklearn.preprocessing import MaxAbsScaler
import gc
import utils
from GraphBased.P3alphaRecommender import P3alphaRecommender
from CollaborativeRecommender import CollaborativeItemBasedRecommender
from Content_based import Content_based_recommender

TEST_SET_THRESHOLD = 10
TEST_SET_HOLDOUT = 0.35
BEST_ALPHA = 0.55
BEST_BETA = 0.86
BEST_GAMMA = 0.35
BEST_BATCH = 10
BEST_K = 50

In [15]:
tracks = pd.read_csv('../input/tracks.csv')
train = pd.read_csv('../input/train.csv')
target = pd.read_csv('../input/target_playlists.csv')

In [6]:
icm_csr = utils.build_icm_csr(tracks)
urm_csr = utils.build_urm_csr(train)



In [4]:
from utils import split

URM_train, URM_test = split(urm_csr,TEST_SET_HOLDOUT = TEST_SET_HOLDOUT,TEST_SET_THRESHOLD = TEST_SET_THRESHOLD)
URM_train, URM_validation = split(URM_train,TEST_SET_HOLDOUT = TEST_SET_HOLDOUT,TEST_SET_THRESHOLD = TEST_SET_THRESHOLD)

In [7]:
nnz_per_row = urm_csr.getnnz(axis=1)

In [8]:
mask_low_15 = np.where(nnz_per_row < 15)[0]
mask_low_30 = np.where((nnz_per_row < 30) & (nnz_per_row >= 15))[0]
mask_high = np.where(nnz_per_row >= 30)[0]

In [9]:
data_under_15 = urm_csr[mask_low_15]
data_under_30 = urm_csr[mask_low_30]
data_high = urm_csr[mask_high]

In [15]:
data_under_15

<18072x20635 sparse matrix of type '<class 'numpy.float32'>'
	with 167893 stored elements in Compressed Sparse Row format>

In [9]:
data_under_30

<17046x20635 sparse matrix of type '<class 'numpy.float32'>'
	with 362712 stored elements in Compressed Sparse Row format>

In [10]:
data_high

<15328x20635 sparse matrix of type '<class 'numpy.float32'>'
	with 681186 stored elements in Compressed Sparse Row format>

In [10]:
class EnsembleRecommender(object):
    
    
    def fit(self,URM_csr,slim_recommender,W_P3_recommender, CF_recommender, CB_recommender, alpha, beta, gamma):
        
        self.alpha = alpha
        self.beta = beta
        self.gamma = gamma
        self.slim_recommender = slim_recommender
        self.W_P3_recommender = W_P3_recommender
        self.CF_recommender = CF_recommender
        self.CB_recommender = CB_recommender
        self.URM_csr = URM_csr
        
    def recommend(self, user_id, at=10, remove_seen_flag=True):
        slim_recomendation = self.slim_recommender.compute_item_score(user_id)
        W_P3_recommendation = self.W_P3_recommender.compute_item_score(user_id)
        CF_recomandation = self.CF_recommender.compute_item_score(user_id)
        CB_recomandation = self.CB_recommender.compute_item_score(user_id)
        CB_CF_rec = self.alpha *CB_recomandation + (1- self.alpha) * CF_recomandation
        slim_CBCF_rec = self.beta * slim_recomendation + (1-self.beta) * CB_CF_rec
        WP3_others_rec = self.gamma * W_P3_recommendation + (1 - self.gamma) *  slim_CBCF_rec
        popularItems = np.argsort(np.array(WP3_others_rec)[0])
        popularItems = np.flip(popularItems, axis = 0)

        if remove_seen_flag:
            unseen_items_mask = np.in1d(popularItems, self.URM_csr[user_id].indices,
                                        assume_unique=True, invert = True)

            unseen_items = popularItems[unseen_items_mask]
            
            recommended_items = unseen_items[0:at]

        else:
            recommended_items = popularItems[0:at]
            
        #recommended_items = " ".join(str(i) for i in recommended_items)
        return recommended_items
    
    

In [11]:
slim_recommender = SLIM_BPR_Cython(urm_csr,recompile_cython=False,positive_threshold=1) 

SLIM_BPR_Cython: Estimated memory required for similarity matrix of 20635 items is 1703.21 MB


In [12]:
slim_recommender.fit(epochs=100,sgd_mode='adam',batch_size=BEST_BATCH,topK=BEST_K)

Processed 1211791 ( 100.00% ) in 4.72 seconds. BPR loss is 3.16E-02. Sample per second: 256890
SLIM_BPR_Recommender: Epoch 1 of 100. Elapsed time 0.14 min
Processed 1211791 ( 100.00% ) in 5.41 seconds. BPR loss is 1.16E-01. Sample per second: 223882
SLIM_BPR_Recommender: Epoch 2 of 100. Elapsed time 0.22 min
Processed 1211791 ( 100.00% ) in 5.17 seconds. BPR loss is 2.13E-01. Sample per second: 234568
SLIM_BPR_Recommender: Epoch 3 of 100. Elapsed time 0.30 min
Processed 1211791 ( 100.00% ) in 4.92 seconds. BPR loss is 3.21E-01. Sample per second: 246270
SLIM_BPR_Recommender: Epoch 4 of 100. Elapsed time 0.37 min
Processed 1211791 ( 100.00% ) in 5.84 seconds. BPR loss is 4.43E-01. Sample per second: 207538
SLIM_BPR_Recommender: Epoch 5 of 100. Elapsed time 0.46 min
Processed 1211791 ( 100.00% ) in 5.53 seconds. BPR loss is 5.74E-01. Sample per second: 219026
SLIM_BPR_Recommender: Epoch 6 of 100. Elapsed time 0.53 min
Processed 1211791 ( 100.00% ) in 5.20 seconds. BPR loss is 7.15E-01. S

In [18]:
nnz_in_validation = URM_validation.getnnz(axis=1) 
nnz_indexes = np.where(nnz_in_validation > 0)[0]

In [16]:
from operator import itemgetter

target = target.get_values()[:,0]
users_listened = []
for item in target:
    nnz = len(urm_csr.getrow(item).data)
    users_listened.append([item,nnz])
sorted_list = sorted(users_listened,key=itemgetter(1))
target_low = []
target_middle = []
target_high = []

for item in sorted_list:
    if item[1] < 15:
        target_low.append(item[0])
    elif item[1] < 30:
        target_middle.append(item[0])
    else:
        target_high.append(item[0])

In [28]:
tf_id_flag = True

if tf_id_flag:
    transformer = TfidfTransformer()
    transformer.fit(urm_csr)
    URM_train = transformer.transform(urm_csr)
    gc.collect()
    transformer = TfidfTransformer()
    transformer.fit(icm_csr)
    tf_idf_csr = transformer.transform(icm_csr)
    icm_csr = sparse.csr_matrix(tf_idf_csr)
    gc.collect()
icm_transpose = sparse.csr_matrix(icm_csr.transpose())
shrink = 5
topK = 250
CF_recommender = CollaborativeItemBasedRecommender()
CF_recommender.fit(urm_csr,block_size=1000,topK=topK,shrink=shrink)
gc.collect()
shrink2 = 5
topK2 = 10
CB_recommender = Content_based_recommender(URM_csr=urm_csr,shrink=shrink2,topK=topK2,dataMatrix=icm_transpose)
CB_recommender.fit()
gc.collect()
alpha_wp3 = 0.8
P3alpha = P3alphaRecommender(urm_csr)
P3alpha.fit(alpha=alpha_wp3)
gc.collect()
alpha = 0.45
beta = 0.001
gamma = 0.45

ensamble_recommender = EnsembleRecommender()
ensamble_recommender.fit(urm_csr,slim_recommender,P3alpha,CF_recommender,CB_recommender,alpha,beta,gamma)
                                    

Similarity column 20000 ( 97 % ), 1047.79 column/sec, elapsed time 0.32 min
Similarity column 20600 ( 100 % ), 3007.49 column/sec, elapsed time 0.11 min


In [18]:
result_low = []
for items in target_low:
    partial_recomendations = ensamble_recommender.recommend(items,at=10)
    result_low.append(partial_recomendations)

In [19]:
result_low = np.array(result_low)

In [20]:
result_low = result_low.reshape(-1,10)

In [21]:
i = 0
results = []
for item in result_low:
    recommended_items = " ".join(str(i) for i in item)
    temp = [target_low[i],recommended_items]
    results.append(temp)
    i += 1

In [24]:
result_middle = []
for items in target_middle:
    partial_recomendations = ensamble_recommender.recommend(items,at=10)
    result_middle.append(partial_recomendations)

In [25]:
result_middle = np.array(result_middle)

In [26]:
result_middle = result_middle.reshape(-1,10)

In [27]:
i = 0
for item in result_middle:
    recommended_items = " ".join(str(i) for i in item)
    temp = [target_middle[i],recommended_items]
    results.append(temp)
    i += 1

In [29]:
result_high = []
for items in target_high:
    partial_recomendations = ensamble_recommender.recommend(items,at=10)
    result_high.append(partial_recomendations)

In [30]:
result_high = np.array(result_high)

In [31]:
result_high = result_high.reshape(-1,10)

In [32]:
i = 0
for item in result_high:
    recommended_items = " ".join(str(i) for i in item)
    temp = [target_high[i],recommended_items]
    results.append(temp)
    i += 1

In [33]:
len(results)

10000

In [42]:
sorted(target_low)[0]

3

In [34]:
rec = pd.DataFrame(results)
rec.to_csv("ensemble_tuned.csv", index = False, header = ["playlist_id", "track_ids"])

In [20]:
len(nnz_indexes)

11784

In [21]:
targets_low_15 = np.intersect1d(nnz_indexes,mask_low_15,assume_unique=False)

In [22]:
targets_low_30 = np.intersect1d(nnz_indexes,mask_low_30,assume_unique=False)

In [23]:
targets_high = np.intersect1d(nnz_indexes,mask_high,assume_unique=False)

In [24]:
len(targets_high) 

5324

In [56]:
quarters = [0,0.1,0.01,0.001,0.25,0.5,0.75,0.9,1]
alphas_wp3 = [0.8]
integers = [1,2,10,50,250,500,1000,20000]
flags = [True]
MAPS_low = []
MAPS_middle = []
MAPS_high = []

for tf_id_flag in flags:
    if tf_id_flag:
        transformer = TfidfTransformer()
        transformer.fit(URM_train)
        URM_train = transformer.transform(URM_train)
        gc.collect()
        transformer = TfidfTransformer()
        transformer.fit(icm_csr)
        tf_idf_csr = transformer.transform(icm_csr)
        icm_csr = sparse.csr_matrix(tf_idf_csr)
        gc.collect()
    icm_transpose = sparse.csr_matrix(icm_csr.transpose())
    for shrink in [5,10]:
        for topK in [1000,1500]:
            CF_recommender = CollaborativeItemBasedRecommender()
            CF_recommender.fit(URM_train,block_size=1000,topK=topK,shrink=shrink)
            gc.collect()
            for shrink2 in [5,400,500,550]:
                for topK2 in [750]:
                    CB_recommender = Content_based_recommender(URM_csr=URM_train,shrink=shrink2,topK=topK2,dataMatrix=icm_transpose)
                    CB_recommender.fit()
                    gc.collect()
                    for alpha_wp3 in alphas_wp3:
                        P3alpha = P3alphaRecommender(URM_train)
                        P3alpha.fit(alpha=alpha_wp3)
                        gc.collect()
                        for alpha in [0.25,0.50]:
                            for beta in quarters:
                                for gamma in quarters:
                                    print("Testing with {} tf_idf, shrink1 = {},topK1 = {}, shrink2 ={}, topk2={},alpha_wp3={},alpha = {},beta = {},gamma ={}"
                                          .format(tf_id_flag,shrink,topK,shrink2,topK2,alpha_wp3,alpha,beta,gamma))
                                    ensamble_recommender = EnsembleRecommender()
                                    ensamble_recommender.fit(URM_train,slim_recommender,P3alpha,CF_recommender,CB_recommender,alpha,beta,gamma)
                                    gc.collect()
                                    #map1 = utils.evaluate_algorithm(URM_validation,at=10,recommender_object=ensamble_recommender,target_playlists=targets_low_15)
                                    #print("low map is ", map1)
                                    #MAPS_low.append([tf_id_flag,shrink,topK,shrink2,topK2,alpha_wp3,alpha,beta,gamma,map1])
                                    map2 = utils.evaluate_algorithm(URM_validation,at=10,recommender_object=ensamble_recommender,target_playlists=targets_low_30)
                                    print("middle map is ", map2)
                                    MAPS_middle.append([tf_id_flag,shrink,topK,shrink2,topK2,alpha_wp3,alpha,beta,gamma,map2])
                                    """map3 = utils.evaluate_algorithm(URM_validation,at=10,recommender_object=ensamble_recommender,target_playlists=targets_high)
                                    print("high map is ", map3)
                                    MAPS_high.append([tf_id_flag,shrink,topK,shrink2,topK2,alpha_wp3,alpha,beta,gamma,map3])
                                    gc.collect()
                                    map_media = (map1*(len(targets_low_15)) + map2*len(targets_low_30)+ map3*len(targets_high))/(len(targets_low_15)+len(targets_low_30)+len(targets_high))
                                    print("Media map = ", map_media)"""

Similarity column 20000 ( 97 % ), 1007.73 column/sec, elapsed time 0.33 min
Similarity column 20600 ( 100 % ), 1864.39 column/sec, elapsed time 0.18 min
Testing with True tf_idf, shrink1 = 5,topK1 = 1000, shrink2 =5, topk2=750,alpha_wp3=0.8,alpha = 0.25,beta = 0,gamma =0
middle map is  0.0866777822464
Testing with True tf_idf, shrink1 = 5,topK1 = 1000, shrink2 =5, topk2=750,alpha_wp3=0.8,alpha = 0.25,beta = 0,gamma =0.1
middle map is  0.0858515188366
Testing with True tf_idf, shrink1 = 5,topK1 = 1000, shrink2 =5, topk2=750,alpha_wp3=0.8,alpha = 0.25,beta = 0,gamma =0.01
middle map is  0.0866382832374
Testing with True tf_idf, shrink1 = 5,topK1 = 1000, shrink2 =5, topk2=750,alpha_wp3=0.8,alpha = 0.25,beta = 0,gamma =0.001


KeyboardInterrupt: 

In [29]:
np.save("map_low2",np.array(MAPS_low))
np.save("map_middle2",np.array(MAPS_middle))
np.save("map_high2",np.array(MAPS_high))

In [32]:
from operator import itemgetter

In [48]:
Best_map_low = sorted(MAPS_low,key = itemgetter(9),reverse=True)[0:5]

In [53]:
best_map_middle = sorted(MAPS_middle,key = itemgetter(9),reverse=True)[0:5]

In [39]:
best_map_high = sorted(MAPS_high,key = itemgetter(9),reverse=True)[0:5]

In [39]:
gc.collect()

0

In [43]:
Best_map_low

[[False, 5, 550, 5, 750, 0.8, 0.5, 0.01, 0.5, 0.08722646534516075],
 [False, 5, 550, 5, 1000, 0.8, 0.5, 0.01, 0.5, 0.087221982045452173],
 [False, 5, 550, 5, 750, 0.8, 0.45, 0.01, 0.45, 0.08721357585849851],
 [False, 5, 550, 5, 1000, 0.8, 0.45, 0.01, 0.45, 0.08721357585849851],
 [False, 5, 450, 5, 1000, 0.8, 0.45, 0.01, 0.51, 0.087209225990328959]]

In [54]:
best_map_middle

[[False, 5, 1000, 5, 750, 0.8, 0.25, 0.01, 0.1, 0.10126176591690399],
 [False, 5, 1000, 5, 750, 0.8, 0.25, 0.01, 0.01, 0.10122854685141069],
 [False, 5, 1000, 5, 750, 0.8, 0.25, 0.001, 0.25, 0.10122411019505352],
 [False, 5, 1000, 5, 750, 0.8, 0.25, 0.01, 0.25, 0.10120002776900054],
 [False, 5, 1000, 5, 750, 0.8, 0.25, 0.01, 0.001, 0.10119587837816311]]

In [40]:
best_map_high

[[True, 5, 250, 5, 10, 0.8, 0.45, 0.001, 0.45, 0.094174150000596493],
 [True, 5, 250, 5, 10, 0.8, 0.45, 0.001, 0.49, 0.093999275517870809],
 [True, 5, 250, 5, 10, 0.8, 0.45, 0.001, 0.5, 0.093945803966465333],
 [True, 5, 250, 5, 10, 0.8, 0.45, 0.001, 0.51, 0.093925575113591869],
 [True, 5, 250, 5, 10, 0.8, 0.45, 0.0001, 0.45, 0.093881398101439764]]

In [49]:
Best_map_low

[[False, 5, 550, 5, 750, 0.8, 0.5, 0.1, 0.5, 0.088682870592756396],
 [False, 5, 550, 5, 800, 0.8, 0.5, 0.1, 0.5, 0.08866685880808288],
 [False, 5, 550, 5, 1000, 0.8, 0.5, 0.1, 0.5, 0.08866685880808288],
 [False, 5, 550, 5, 750, 0.8, 0.5, 0.01, 0.5, 0.08722646534516075],
 [False, 5, 550, 5, 800, 0.8, 0.5, 0.01, 0.5, 0.087221982045452173]]