In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import sparse
import random
from scipy.sparse import linalg
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfTransformer
from FW_Similarity.CFW_D_Similarity_Linalg import CFW_D_Similarity_Linalg
from Base.Evaluation.Evaluator import SequentialEvaluator

TEST_SET_THRESHOLD = 10
TEST_SET_HOLDOUT = 0.2
BEST_ALFA = 0.92

In [2]:
tracks = pd.read_csv('../input/tracks.csv')
train = pd.read_csv('../input/train.csv')
target = pd.read_csv('../input/target_playlists.csv')

# Defining methods to create csr matrices

In [3]:
def classify_durations(data):
    data.loc[tracks['duration_sec'].isin(range(60)),'duration_sec'] = 1
    data.loc[tracks['duration_sec'].isin(range(60,120)), 'duration_sec'] = 2
    data.loc[tracks['duration_sec'].isin(range(120,180)), 'duration_sec'] = 3
    data.loc[tracks['duration_sec'].isin(range(180,240)), 'duration_sec'] = 4
    data.loc[tracks['duration_sec'].isin(range(240,300)), 'duration_sec'] = 5
    data.loc[tracks['duration_sec'].isin(range(300,200000)), 'duration_sec'] = 6

def build_urm_csr(data):
    fill_data = np.ones(data.shape[0])
    #posso usare gli id direttamente solo perchè come già detto sono consistenti
    row = data['playlist_id'].values
    col = data['track_id'].values
    n_pl = 20635
    n_tr = np.amax(data['track_id']) + 1
    
    return sparse.csr_matrix((fill_data, (row, col)), dtype=float, shape=(n_pl, n_tr))

def build_icm_csr(data):
    
    classify_durations(data)
    
    albums_id = data['album_id']
    artists_id = data['artist_id']
    tracks = data['track_id']
    
    albums_max = np.amax(albums_id)
    artists_max = np.amax(artists_id)
    number_of_songs = data.shape[0]
    
    icm_csr_matrix = sparse.csr_matrix((number_of_songs, albums_max + artists_max + 2), dtype=np.float32)
   
    icm_csr_matrix[tracks, albums_id] = 1
    icm_csr_matrix[tracks, albums_max + artists_id] = 1

    return icm_csr_matrix

# SPLITTING DATASET

In [5]:

#Raggruppiamo per playlist_id, le celle conterranno il count() del gruppo, quindi il numero di canzoni per playlist
grouped = train.groupby('playlist_id')['track_id'].nunique()

#Prendiamo le playlist che superano il numero di elementi del TEST_SET_THRESHOLD
clipped = grouped.index[grouped>TEST_SET_THRESHOLD].tolist()

#Adesso prendiamo a caso degli indici di playlist in percentuale di TEST_SET_HOLDOUT
#ATTENZIONE, la percentuale viene calcolata sulla lunghezza di clipped, che avrà un numero di elementi inferiore a train
#Questo significa che il 20% di clipped sarà circa il 14% del train, la percentuale è da aggiustare tenendo conto di sto fatto
test_set_indices = [ clipped[i] for i in sorted(random.sample(range(len(clipped)), int(TEST_SET_HOLDOUT*len(clipped)))) ]

#Andiamo a estrarre dal train TUTTE le canzoni delle playlist estratte a sorte nella riga prima
test_groups = train.loc[train['playlist_id'].isin(test_set_indices)]

#Andiamo a creare un dataframe vuoto, a cui appenderemo tutte le canzoni da ficcare nel test_set con una .append()
test_set = pd.DataFrame(columns=["playlist_id","track_id"])

#Per ogni gruppo prendiamo le ultime 10 canzoni e le appendiamo al test_set
for name, group in test_groups.groupby('playlist_id'):
    test_set = test_set.append(group.tail(10))

#Togliamo le canzoni del test set al train, salvandolo in una nuova variabile 
#Questo è solo un trick per fare la differenza insiemistica
training_set = pd.concat([train, test_set, test_set]).drop_duplicates(keep=False)


In [6]:
#Ora passiamo training_set e test_set a csr_matrix
test_set_csr = build_urm_csr(test_set)
icm_csr = build_icm_csr(tracks)
urm_csr = build_urm_csr(training_set)
test_set_playlists = np.unique(test_set_csr.nonzero()[0])

ValueError: row index exceeds matrix dimensions

In [8]:
test_set_csr.shape

(50442, 20633)

# EVALUATION METHODS

In [6]:
def precision(recommended_items, relevant_items):
    
    is_relevant = np.in1d(recommended_items, relevant_items, assume_unique=True)
    
    precision_score = np.sum(is_relevant, dtype=np.float32) / len(is_relevant)
    
    return precision_score

def recall(recommended_items, relevant_items):
    
    is_relevant = np.in1d(recommended_items, relevant_items, assume_unique=True)
    
    recall_score = np.sum(is_relevant, dtype=np.float32) / relevant_items.shape[0]
    
    return recall_score

def MAP(recommended_items, relevant_items):
       
    is_relevant = np.in1d(recommended_items, relevant_items, assume_unique=True)
    
    # Cumulative sum: precision at 1, at 2, at 3 ...
    p_at_k = is_relevant * np.cumsum(is_relevant, dtype=np.float32) / (1 + np.arange(is_relevant.shape[0]))
    
    map_score = np.sum(p_at_k) / np.min([relevant_items.shape[0], is_relevant.shape[0]])

    return map_score


def evaluate_algorithm(URM_test, recommender_object, target_playlists, at=10, alfa = 0.9):
    
    
    cumulative_precision = 0.0
    cumulative_recall = 0.0
    cumulative_MAP = 0.0
    
    num_eval = 0


    result = []
    
    for user_id in target_playlists:
    
        target_items = URM_test.getrow(user_id).indices
        
        recommended_items = recommender_object.recommend(user_id, at=at, alfa = alfa)
        num_eval+=1
        
        cumulative_precision += precision(recommended_items, target_items)
        cumulative_recall += recall(recommended_items, target_items)
        cumulative_MAP += MAP(recommended_items, target_items)
        
        recommendation_string = " ".join(str(i) for i in recommended_items)
        temp = [user_id,recommendation_string]
        result.append(temp)


    cumulative_precision /= num_eval
    cumulative_recall /= num_eval
    cumulative_MAP /= num_eval
    
    rec = pd.DataFrame(result)
    rec.to_csv("sample_submission.csv", index = False, header = ["playlist_id", "track_ids"])
    print("Recommender performance is: Precision = {:.6f}, Recall = {:.6f}, MAP = {:.6f}".format(cumulative_precision, cumulative_recall, cumulative_MAP))
    
    

# ALGORITHM

In [7]:
class EnsembleRecommender(object):
    def get_URM_train(self):
        return self.URM_csr
    def fit(self, URM_csr, ICM_csr, min_common_features = 5):
        self.min_common_features = min_common_features
        transformer = TfidfTransformer()
        transformer.fit(URM_csr)
        tf_idf_csr = transformer.transform(URM_csr)

        IRM = sparse.csr_matrix(tf_idf_csr.transpose())
        
        csr_similarities = sparse.csr_matrix(cosine_similarity(IRM, dense_output=False))
        

        transformer.fit(ICM_csr)
        tf_idf_icm = transformer.transform(ICM_csr)
        icm_similarities = sparse.csr_matrix(cosine_similarity(tf_idf_icm, dense_output=False))
        
        print("COMPUTING ENSEMBLED CONTENT SIMILARITIES")
        #self.item_similarities = alfa*csr_similarities + (1-alfa)*icm_similarities  
        
        # Get common structure
        W_sparse_CF_structure = icm_similarities.copy()
        W_sparse_CF_structure.data = np.ones_like(W_sparse_CF_structure.data)

        W_sparse_CBF_structure = csr_similarities.copy()
        W_sparse_CBF_structure.data = np.ones_like(W_sparse_CBF_structure.data)

        W_sparse_common = W_sparse_CF_structure.multiply(W_sparse_CBF_structure)

        # Get values of both in common structure of CF
        W_sparse_delta = icm_similarities.multiply(W_sparse_common)
        W_sparse_delta -= csr_similarities.multiply(W_sparse_common)
        
        W_sparse_delta_sorted = np.sort(W_sparse_delta.data.copy())
        
        print("CREATING CFW...")
        self.CFW_weithing = CFW_D_Similarity_Linalg(URM_csr, ICM_csr, csr_similarities)
        print("FITTING CFW...")
        self.CFW_weithing.fit()
        self.URM_csr = URM_csr
        
    
    def recommend(self, user_id, at=10, remove_seen_flag=True, alfa = 0.9):
        
        user = self.URM_csr.getrow(user_id)
        itemPopularity = user.dot(self.icm_similarities) + user.dot(self.csr_similarities)
        popularItems = np.argsort(np.array(itemPopularity.todense())[0])
        popularItems = np.flip(popularItems, axis = 0)

        if remove_seen_flag:
            unseen_items_mask = np.in1d(popularItems, self.URM_csr[user_id].indices,
                                        assume_unique=True, invert = True)

            unseen_items = popularItems[unseen_items_mask]
            
            recommended_items = unseen_items[0:at]

        else:
            recommended_items = popularItems[0:at]
            
        #recommended_items = " ".join(str(i) for i in recommended_items)
        return recommended_items

# Testing algorithm

In [11]:
ensemble = EnsembleRecommender()
test_cf = [1,2,3,4,5,6,7,8,9,10]
print("FITTING...")
ensemble.fit(urm_csr,icm_csr)

#evaluate_algorithm(test_set_csr, ensemble, test_set_playlists, alfa = test)
    

FITTING...
COMPUTING ENSEMBLED CONTENT SIMILARITIES
CREATING CFW...
FITTING CFW...
CFW_D_Similarity_Linalg: Generating train data
Unable to load Cython Compute_Similarity, reverting to Python
Similarity column 20600 ( 100 % ), 2067.99 column/sec, elapsed time 0.17 min
CFW_D_Similarity_Linalg: Collaborative S density: 3.16E-02, nonzero cells 13437460
CFW_D_Similarity_Linalg: Content S density: 5.43E-04, nonzero cells 231018
CFW_D_Similarity_Linalg: Content S structure has 166784 out of 231018 ( 72.20%) nonzero collaborative cells
CFW_D_Similarity_Linalg: Nonzero collaborative cell sum is: 2.45E+04, average is: 1.47E-01, average over all collaborative data is 2.24E-02
Unable to load Cython Compute_Similarity, reverting to Python
Similarity column 20600 ( 100 % ), 1512.19 column/sec, elapsed time 0.23 min


In [13]:
for test in test_cf:
    evaluator_test = SequentialEvaluator(test_set_csr, cutoff_list=[test])
    print("EVALUATING...")
    results_dict, _ = evaluator_test.evaluateRecommender(ensemble.CFW_weithing)
    print(results_dict)

EVALUATING...
SequentialEvaluator: Processed 7757 ( 100.00% ) in 3.72 seconds. Users per second: 2085
{1: {'ROC_AUC': 0.11602423617377852, 'PRECISION': 0.11602423617377852, 'RECALL': 0.011602423617377745, 'RECALL_TEST_LEN': 0.11602423617377852, 'MAP': 0.11602423617377852, 'MRR': 0.11602423617377852, 'NDCG': 0.025535978360536928, 'F1': 0.021095315667959556, 'HIT_RATE': 0.11602423617377852, 'ARHR': 0.11602423617377852, 'NOVELTY': 0.000695790676895218, 'DIVERSITY_MEAN_INTER_LIST': 0.998419135708199, 'DIVERSITY_HERFINDAHL': 0.9982904236886414, 'COVERAGE_ITEM': 0.12077739543449813, 'COVERAGE_USER': 0.153792775288473, 'DIVERSITY_GINI': 0.45549149065106936, 'SHANNON_ENTROPY': 10.31636456473925}}
EVALUATING...
SequentialEvaluator: Processed 7757 ( 100.00% ) in 3.95 seconds. Users per second: 1964
{2: {'ROC_AUC': 0.11602423617377852, 'PRECISION': 0.11028748227407503, 'RECALL': 0.022057496454814522, 'RECALL_TEST_LEN': 0.11028748227407503, 'MAP': 0.09507541575351296, 'MRR': 0.14644836921490267, '

IndexError: index 20634 is out of bounds for axis 0 with size 20633

# TESTING SINGLE ITERATION