In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import sparse
import random
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfTransformer
import utils
 
# Blend weight handed to New_Splitted_Ensemble.fit as `alfa`: the share given to
# the interaction-based item similarity; the content-based similarity receives
# the remaining 1 - BEST_ALFA.
BEST_ALFA = 0.9

In [2]:
class New_Splitted_Ensemble(object):
    """Item-based recommender blending collaborative and content similarities.

    ``fit`` builds two item-item cosine-similarity matrices -- one from the
    TF-IDF weighted interaction matrix, one from the TF-IDF weighted item
    content matrix -- and stores their weighted sum. ``recommend`` scores the
    items for a user by multiplying that user's interaction row with the
    stored similarity matrix.
    """

    def fit(self, URM_csr, URM_train, ICM_csr, alfa):
        """Compute and store the blended item-item similarity matrix.

        Parameters
        ----------
        URM_csr : scipy.sparse matrix
            Interaction matrix with one row per user and one column per item.
            It is TF-IDF weighted to build the collaborative similarity and is
            kept on the instance for ``recommend``.
        URM_train : scipy.sparse matrix
            Interactions the TF-IDF weights are learned from. Must have the
            same number of columns as ``URM_csr``.
        ICM_csr : scipy.sparse matrix
            Item content matrix with one row per item. The rows are assumed
            to follow the column order of ``URM_csr`` (the two similarity
            matrices are summed element-wise) -- verify against the code that
            builds both matrices.
        alfa : float
            Weight of the collaborative similarity; the content similarity is
            weighted by ``1 - alfa``.
        """
        # Collaborative part: IDF weights come from URM_train only, but they
        # are applied to the full URM.
        urm_weighter = TfidfTransformer()
        urm_weighter.fit(URM_train)
        weighted_urm = urm_weighter.transform(URM_csr)

        # Transpose so that rows are items; cosine similarity between rows is
        # then an item-item matrix.
        item_profiles = sparse.csr_matrix(weighted_urm.transpose())
        csr_similarities = sparse.csr_matrix(
            cosine_similarity(item_profiles, dense_output=False))

        # Content part: TF-IDF weights are learned from and applied to the ICM.
        icm_weighter = TfidfTransformer()
        icm_weighter.fit(ICM_csr)
        weighted_icm = icm_weighter.transform(ICM_csr)
        icm_similarities = sparse.csr_matrix(
            cosine_similarity(weighted_icm, dense_output=False))

        print("COMPUTING ENSEMBLE SIMILARITIES")
        self.item_similarities = alfa*csr_similarities + (1-alfa)*icm_similarities
        self.URM_csr = URM_csr

    def recommend(self, user_id, at=10, remove_seen=True):
        """Return the best-scoring items for one user.

        Parameters
        ----------
        user_id : int
            Row index of the user in the matrix passed to ``fit`` as
            ``URM_csr``.
        at : int, optional
            Maximum number of items to return (default 10).
        remove_seen : bool, optional
            When true (default), items already stored in the user's row are
            excluded from the result.

        Returns
        -------
        str
            Item indices ordered from highest to lowest score, separated by
            single spaces.
        """
        user_profile = self.URM_csr.getrow(user_id)

        # (1 x n_items) sparse product, flattened to a plain 1-D score vector.
        scores = user_profile.dot(self.item_similarities).toarray().ravel()

        # argsort is ascending, so reverse it to rank the best items first.
        ranked_items = np.argsort(scores)[::-1]

        if remove_seen:
            # np.isin replaces np.in1d, which is deprecated since NumPy 2.0.
            # Both inputs hold unique indices, so assume_unique is safe.
            unseen_mask = np.isin(ranked_items, user_profile.indices,
                                  assume_unique=True, invert=True)
            ranked_items = ranked_items[unseen_mask]

        return " ".join(str(item) for item in ranked_items[0:at])

In [3]:
# Load the three input tables in one pass, in the order tracks, train, target.
# NOTE(review): `target` is not referenced again in the visible cells; the
# playlist ids used later are re-read from the same file with np.genfromtxt.
tracks, train, target = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tracks.csv',
        '../input/train.csv',
        '../input/target_playlists.csv',
    )
)

In [4]:
# Build both sparse matrices through the project helpers in `utils`.
# NOTE(review): presumably the URM has one row per playlist and one column per
# track, and the ICM has one row per track -- confirm against utils.
training_set_ns, icm_csr = (
    utils.build_urm_csr(train),
    utils.build_icm_csr(tracks),
)



In [5]:
# Parse the target file as integers and drop the first parsed entry
# (presumably the header line, which is not a valid integer -- TODO confirm).
target_playlists = np.genfromtxt(
    fname='../input/target_playlists.csv',
    dtype=int,
    delimiter=',',
)[1:]

In [6]:
# Number of stored entries in each row of the URM (axis=1 gives per-row counts).
# The counts drive the three-way row split in the following cells.
# NOTE(review): presumably this is the number of tracks per playlist -- confirm
# against how utils.build_urm_csr lays out the matrix.
occurrencies = training_set_ns.getnnz(axis = 1)

In [7]:
# Row indices of the three buckets, split by stored entries per row:
#   mask1 -> fewer than 15, mask2 -> 15 to 29, mask3 -> 30 or more.
# Each mask is a 1-tuple holding an index array (the np.nonzero return shape).
mask1 = np.nonzero(occurrencies < 15)
mask2 = np.nonzero((15 <= occurrencies) & (occurrencies < 30))
mask3 = np.nonzero(30 <= occurrencies)

In [8]:
# Row-slice the URM into the three buckets selected by the masks.
# The empty float32 placeholder matrices that used to be created first were
# dropped: they were overwritten immediately by these assignments, so they had
# no effect on the result (the slices keep the dtype of training_set_ns).
below_train = training_set_ns[mask1]
middle_train = training_set_ns[mask2]
above_train = training_set_ns[mask3]

In [10]:
# One ensemble per bucket. All three score against the full URM, but each one
# learns its TF-IDF weights from the rows of its own bucket only.
ensemble1 = New_Splitted_Ensemble()
ensemble2 = New_Splitted_Ensemble()
ensemble3 = New_Splitted_Ensemble()

print("FITTING...")
ensemble1.fit(training_set_ns, below_train, icm_csr, alfa = BEST_ALFA)
ensemble2.fit(training_set_ns, middle_train, icm_csr, alfa = BEST_ALFA)
ensemble3.fit(training_set_ns, above_train, icm_csr, alfa = BEST_ALFA)

# Resolve playlist id -> ensemble once, so each target needs a single dict
# lookup instead of a linear scan over the mask arrays.
ensemble_by_playlist = {}
for bucket_ensemble, bucket_mask in ((ensemble1, mask1),
                                     (ensemble2, mask2),
                                     (ensemble3, mask3)):
    ensemble_by_playlist.update(
        dict.fromkeys(bucket_mask[0].tolist(), bucket_ensemble))

result = []

print("RECOMMENDING...")
for elem in target_playlists:
    ensemble = ensemble_by_playlist.get(int(elem))
    if ensemble is None:
        print("ERROR: CAN'T FIND PLAYLIST NUMBER " + str(elem) + " IN ANY OF THE 3 SETS")
        # Keep one row per target, but with no tracks. Previously this branch
        # fell through and reused the recommendation of the previous playlist
        # (or raised NameError when it happened on the first iteration).
        recommendation = ""
    else:
        recommendation = ensemble.recommend(elem)
    result.append([elem, recommendation])

print("SAVING RESULT TO 'sample_submission.csv'")
rec = pd.DataFrame(result)
rec.to_csv("sample_submission.csv", index = False, header = ["playlist_id", "track_ids"])

FITTING...
COMPUTING ENSEMBLE SIMILARITIES
COMPUTING ENSEMBLE SIMILARITIES
COMPUTING ENSEMBLE SIMILARITIES
RECOMMENDING...
SAVING RESULT TO 'sample_submission.csv'
