In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import scipy
import scipy.stats
from scipy.sparse import *
from scipy.sparse.linalg import svds
import math

from recsys.preprocess import *

import functools

from recsys.utility import *

#RANDOM_STATE = 666

#np.random.seed(RANDOM_STATE)

%matplotlib inline

In [None]:
def train_test_split(train, test_size=0.3, min_playlist_tracks=7, n_test_tracks=5):
    """
    Hold out a few random tracks from a random subset of playlists.

    Parameters
    ----------
    train : DataFrame with (at least) 'playlist_id' and 'track_id' columns,
        one row per (playlist, track) interaction.
    test_size : fraction of the eligible playlists that become test playlists.
    min_playlist_tracks : a playlist is eligible only if it has at least this
        many rows in ``train``.
    n_test_tracks : number of rows held out from every test playlist
        (default 5, the value that used to be hard-coded). If a chosen playlist
        has fewer rows than this, ``DataFrame.sample`` raises ValueError.

    Returns
    -------
    (train, test, target_playlists, target_tracks) where ``train`` is the input
    without the held-out rows, ``test`` contains the held-out rows,
    ``target_playlists`` is a one-column DataFrame ('playlist_id') and
    ``target_tracks`` is a one-column DataFrame ('track_id') with the sorted
    unique held-out track ids.

    The random draws use the global ``np.random`` state.
    """
    grouped = train.groupby('playlist_id')
    playlists = grouped.count()

    # Only playlists with at least "min_playlist_tracks" tracks are considered.
    to_choose_playlists = playlists[playlists['track_id'] >= min_playlist_tracks].index.values

    # "test_size * len(to_choose_playlists)" distinct playlists are chosen for testing.
    target_playlists = np.random.choice(to_choose_playlists, replace=False,
                                        size=int(test_size * len(to_choose_playlists)))

    # Sample the held-out rows of every chosen playlist. The groupby object is
    # reused so the frame is not rescanned once per playlist.
    held_out = [grouped.get_group(p).sample(n_test_tracks) for p in target_playlists]

    if held_out:
        test = pd.concat(held_out).sort_index()
    else:
        # No playlist selected: return an empty frame with the same columns.
        test = train.iloc[0:0].copy()

    # np.unique keeps the integer dtype of the ids (np.union1d with an empty
    # float array silently converted them to float64).
    target_tracks = np.unique(test['track_id'].values)
    train = train.drop(test.index)

    return train, test, pd.DataFrame(target_playlists, columns=['playlist_id']), pd.DataFrame(target_tracks, columns=['track_id'])


In [None]:
import numpy as np
import scipy
from scipy.sparse import *
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize

def dot_with_top(m1, m2, def_rows_g, top=-1, row_group=1, similarity="dot", shrinkage=0):
    """
        Produces the product between sparse matrices m1 and m2, optionally keeping
        only the "top" largest stored values of every result row.

        m1, m2: sparse matrices such that m1.dot(m2) is defined.
        def_rows_g: sparse matrix of default rows, used to fill result rows that
            would otherwise be all zeros (its first row is used for single rows,
            its first k rows for an all-zero group of k rows).
        top: number of values kept per row; if <= 0 the plain m1.dot(m2) is returned.
        row_group: number of rows of m1 multiplied at once (speed/memory trade-off).
        similarity: "dot" or "cosine". Anything different from "cosine" means "dot".
        shrinkage: if > 0, apply_shrinkage(...) is called on each group result.
            NB: apply_shrinkage is not defined in this notebook; it must come from
            one of the star imports, otherwise shrinkage > 0 raises NameError.

        Progress (the index of the next row to process) is printed after every group.

        Code adapted from
            https://stackoverflow.com/questions/29647326/sparse-matrix-dot-product-keeping-only-n-max-values-per-result-row
    """
    if top <= 0:
        return m1.dot(m2)

    m2_transposed = m2.transpose()
    n_rows = m1.shape[0]
    default_row = def_rows_g[0]

    final_rows = []
    row_id = 0
    while row_id < n_rows:
        last_row = min(row_id + row_group, n_rows)
        rows = m1[row_id:last_row]
        if rows.count_nonzero() > 0:
            if similarity == "cosine":
                res_rows = cosine_similarity(rows, m2_transposed, dense_output=False)
            else:
                res_rows = rows.dot(m2)
            if shrinkage > 0:
                res_rows = apply_shrinkage(rows, res_rows, shrinkage)
            if res_rows.count_nonzero() > 0:
                for res_row in res_rows:
                    if res_row.nnz > top:
                        # Keep only the "top" largest stored values of the row.
                        args_ids = np.argsort(res_row.data)[-top:]
                        data = res_row.data[args_ids]
                        cols = res_row.indices[args_ids]
                        final_rows.append(csr_matrix((data, (np.zeros(top, dtype=int), cols)), shape=res_row.shape))
                    elif res_row.nnz > 0:
                        # The row already has at most "top" values: keep it as is
                        # (it used to be overwritten with the default row).
                        final_rows.append(res_row)
                    else:
                        final_rows.append(default_row)
            else:
                for res_row in res_rows:
                    final_rows.append(default_row)
        else:
            # All-zero group: use as many default rows as the group has rows, so
            # that a shorter last group does not add extra rows to the result.
            final_rows.append(def_rows_g[:last_row - row_id])
        row_id += row_group
        print(row_id)
    return scipy.sparse.vstack(final_rows, 'csr')

In [None]:
def _normalized_scores(similarity, urm_rows_t, row_sums):
    """
        Dense (n_playlists, n_tracks) scores: similarity.dot(urm_rows_t), transposed,
        with column j divided by row_sums[j]. Columns whose row sum is zero get a
        score of 0 instead of NaN/inf (NaN would end up first after the sort).
    """
    raw = np.asarray(similarity.dot(urm_rows_t).transpose().todense(), dtype=float)
    scores = np.zeros_like(raw)
    np.divide(raw, row_sums, out=scores, where=row_sums != 0)
    return scores


def make_predictions(test=None, target_playlists=None, compute_MAP=False, row_group=100):
    """
        Produces a prediction dataframe where each row corresponds to a playlist in "target_playlists"
        and the 'track_ids' column holds up to 5 recommended (original) track ids.

        If compute_MAP is true, "test" is used as ground truth and the running MAP is printed
        every "row_group" playlists.

        Dot products are computed for "row_group" playlists at once: the higher row_group is,
        the faster the predictions are, but the more memory is used.

        Relies on notebook globals: URM_sqrt, TTM, SYM_ALBUM, SYM_ARTIST, playlist_params,
        playlist_tracks, target_tracks, num_to_tracks and (for MAP) playlist_to_num and
        get_playlist_track_list2.
    """
    if row_group < 1:
        raise ValueError("row_group must be >= 1")

    # Work on a copy so that the caller's dataframe (possibly a slice) is not modified.
    predictions = target_playlists.copy()
    predictions.index = target_playlists['playlist_id']
    n_targets = len(predictions)

    # One array of recommended track ids per target playlist (same order as "target_playlists").
    track_lists = np.empty(n_targets, dtype=object)
    for k in range(n_targets):
        track_lists[k] = np.array([])

    ttracks = set(target_tracks['track_id'].values)
    if compute_MAP:
        test_good = get_playlist_track_list2(test)
        test_good.index = test_good.playlist_id.apply(lambda pl_id: playlist_to_num[pl_id])
        print(len(test_good))

    # Row sums of the similarity matrices do not depend on the playlist group: compute them once.
    ii_sums = np.asarray(TTM.sum(axis=1)).ravel()
    album_sums = np.asarray(SYM_ALBUM.sum(axis=1)).ravel()
    artist_sums = np.asarray(SYM_ARTIST.sum(axis=1)).ravel()

    # This is the sum of all the AP of the playlists.
    # When we print the MAP, we divide "sum_ap" by the number of considered playlists.
    sum_ap = 0

    row_start = 0
    while row_start < n_targets:
        # We'll do dot products for all playlists in "target_playlists" from "row_start" to "row_end"
        row_end = min(row_start + row_group, n_targets)
        pl_group = target_playlists[row_start:row_end]

        # Stack the URM rows of the playlists of the group
        composed_URM_sqrt = scipy.sparse.vstack([URM_sqrt[pl_id, :] for pl_id in pl_group.playlist_id], 'csr')
        urm_rows_t = composed_URM_sqrt.transpose()

        # "simil_ii" are the scores for playlists in common (its computation was commented
        # out although the value is needed below), "simil_album" and "simil_artist" are the
        # scores for albums and artists.
        simil_ii = _normalized_scores(TTM, urm_rows_t, ii_sums)
        simil_album = _normalized_scores(SYM_ALBUM, urm_rows_t, album_sums)
        simil_artist = _normalized_scores(SYM_ARTIST, urm_rows_t, artist_sums)

        # Now consider one playlist at a time, take its own personalized parameters and make the prediction
        for offset, pl_id in enumerate(pl_group.playlist_id):
            params = playlist_params.loc[pl_id]
            ii_param = params.ii_param_norm
            album_param = params.album_param_norm
            artist_param = params.artist_param_norm

            # Tracks that we know are in the playlist (so we shouldn't recommend them)
            pl_tracks = set(playlist_tracks.loc[pl_id]['track_ids'])

            # Combine all the predictions and sort them from best to worst
            simil = ii_param * simil_ii[offset] + album_param * simil_album[offset] + artist_param * simil_artist[offset]
            sorted_ind = simil.argsort()[::-1]

            # Take the 5 best target tracks that are not already in the playlist
            pred = []
            for tr in sorted_ind:
                if len(pred) >= 5:
                    break
                if (tr in ttracks) and (tr not in pl_tracks):
                    pred.append(num_to_tracks[tr])
            track_lists[row_start + offset] = np.array(pred)

            # Update MAP (a playlist without predictions contributes an AP of 0)
            if compute_MAP and pred:
                tr_ids = test_good.loc[pl_id]['track_ids']
                correct = 0
                ap = 0
                for rank, t in enumerate(pred, start=1):
                    if t in tr_ids:
                        correct += 1
                        ap += correct / rank
                sum_ap += ap / len(pred)

        # Update "row_start" to "row_end" and proceed to next pl_group
        row_start = row_end

        print(row_start)
        if compute_MAP:
            print(sum_ap / row_start)

    predictions['track_ids'] = track_lists
    return predictions

In [None]:
def from_num_to_id(df, row_num, column = 'track_id'):
    """ Return the value stored in `column` at positional row `row_num` of df. """
    selected_row = df.iloc[row_num]
    return selected_row[column]

def from_id_to_num(df, tr_id, column='track_id'):
    """ Return the position of the first row of df whose `column` equals tr_id (IndexError if absent). """
    matches = df[column].values == tr_id
    positions = np.flatnonzero(matches)
    return positions[0]

# Read data

In [None]:
# Load the raw tables (tab-separated files in the data/ folder).
train = pd.read_csv('data/train_final.csv', sep='\t')
playlists = pd.read_csv('data/playlists_final.csv', sep='\t')
target_playlists = pd.read_csv('data/target_playlists.csv', sep='\t')
target_tracks = pd.read_csv('data/target_tracks.csv', sep='\t')
tracks = pd.read_csv('data/tracks_final.csv', sep='\t')

In [None]:
# Untouched copies of the original train set and target playlists, kept only to
# compare the local test split with them.
# NB: they must not be used for training!
train_original = pd.read_csv('data/train_final.csv', sep='\t')
target_playlists_original = pd.read_csv('data/target_playlists.csv', sep='\t')

In [None]:
# Sizes of the tables as loaded from disk (before the local train/test split).
len(train), len(target_playlists), len(target_tracks)

In [None]:
# Build a local test set. This overwrites train, target_playlists and target_tracks,
# so re-running the cell splits the already reduced train again.
# NOTE(review): the np.random seed is commented out in the first cell, so every run
# produces a different split.
train, test, target_playlists, target_tracks = train_test_split(train, test_size=0.4, min_playlist_tracks=10)

In [None]:
# Sizes after the split.
len(train), len(test), len(target_playlists), len(target_tracks)

In [None]:
# Count how many of the locally chosen test playlists are also official target playlists.
# (The same file is also loaded as target_playlists_original in an earlier cell.)
full_target_playlists = pd.read_csv('data/target_playlists.csv', delimiter='\t')
print("Number of playlists in the new target_playlists that are also in the original target_playlists")
print(len(target_playlists[target_playlists.playlist_id.isin(full_target_playlists.playlist_id)]))

# Process data

In [None]:
# Replace the original track/playlist ids with their row positions in "tracks" and
# "playlists", keeping the original ids in the "*_tmp" columns.
# NOTE: this cell is not idempotent; after one run the id columns no longer hold the
# original ids that track_to_num / playlist_to_num are keyed by.
# NOTE: "test" is not remapped here, it keeps the original ids.
tracks['track_id_tmp'] = tracks['track_id']

tracks['track_id'] = tracks.index

playlists['playlist_id_tmp'] = playlists['playlist_id']
playlists['playlist_id'] = playlists.index

train['playlist_id_tmp'] = train['playlist_id']
train['track_id_tmp'] = train['track_id']

# original track id -> row number
track_to_num = pd.Series(tracks.index)
track_to_num.index = tracks['track_id_tmp']

# original playlist id -> row number
playlist_to_num = pd.Series(playlists.index)
playlist_to_num.index = playlists['playlist_id_tmp']

# row number -> original track id
num_to_tracks = pd.Series(tracks['track_id_tmp'])


train['track_id'] = train['track_id'].apply(lambda x : track_to_num[x])
train['playlist_id'] = train['playlist_id'].apply(lambda x : playlist_to_num[x])

# Tags and titles are stored as stringified lists of integers.
# NOTE(review): eval executes whatever the file contains; ast.literal_eval would be safer.
tracks.tags = tracks.tags.apply(lambda s: np.array(eval(s), dtype=int))

playlists.title = playlists.title.apply(lambda s: np.array(eval(s), dtype=int))

target_playlists['playlist_id_tmp'] = target_playlists['playlist_id']
target_playlists['playlist_id'] = target_playlists['playlist_id'].apply(lambda x : playlist_to_num[x])

target_tracks['track_id_tmp'] = target_tracks['track_id']
target_tracks['track_id'] = target_tracks['track_id'].apply(lambda x : track_to_num[x])

# Create a dataframe that maps a playlist to the array of its tracks (indexed by playlist id)
playlist_tracks = pd.DataFrame(train['playlist_id'].drop_duplicates())
playlist_tracks.index = train['playlist_id'].unique()
playlist_tracks['track_ids'] = train.groupby('playlist_id').apply(lambda x : x['track_id'].values)
playlist_tracks = playlist_tracks.sort_values('playlist_id')

# Create a dataframe that maps a track to the array of the playlists it appears into (indexed by track id)
track_playlists = pd.DataFrame(train['track_id'].drop_duplicates())
track_playlists.index = train['track_id'].unique()
track_playlists['playlist_ids'] = train.groupby('track_id').apply(lambda x : x['playlist_id'].values)
track_playlists = track_playlists.sort_values('track_id')

# Albums are stored as stringified lists. A "bad" album (empty list, [None] or [-1])
# is first mapped to -1 and later replaced by a brand new album id.
bad_albums = 0
def transform_album_1(alb):
    """ Parse the album string and return its first element, or -1 (counting it in bad_albums) if it is malformed. """
    global bad_albums
    # NOTE(review): eval executes whatever the file contains; ast.literal_eval would be safer.
    parsed = eval(alb)
    if len(parsed) > 0 and parsed[0] not in (None, -1):
        return parsed[0]
    bad_albums += 1
    return -1
def transform_album_2(alb):
    """ Return alb unchanged, unless it is -1: then return the next unused album id (global next_album_id) and advance it. """
    global next_album_id
    if alb != -1:
        return alb
    fresh_id = next_album_id
    next_album_id += 1
    return fresh_id
# First pass: parse the album strings, marking malformed ones with -1.
tracks.album = tracks.album.apply(lambda alb: transform_album_1(alb))
# Second pass: give every -1 its own new album id, starting right after the largest real one.
last_album = tracks.album.max()
next_album_id = last_album + 1
tracks.album = tracks.album.apply(lambda alb: transform_album_2(alb))

# Target playlists analysis

# Training

## II
"II" means Item-Item collaborative filtering, i.e. playlists in common...

Steps:
1 - Create a URM (URM_sqrt) normalized with a modified IDF which has a sqrt.
2 - Compute TTM as URM_sqrt.dot(URM_sqrt.transpose()). Keep the K best for each row.
3 - Compute personalized parameters for each playlist. Here we compute the ii_parameter, which indicates how well suited a playlist is to being predicted with the TTM. For each playlist we do the following:
    - compute an np.array as the sum of all the rows of the TTM that correspond to a track in the considered playlist
    - compute the ii_parameter of the playlist as 1/(entropy_of_the_computed_array + 0.05). The "0.05" is needed because the entropy may be zero, in which case the ratio would go to infinity.

In [None]:
# User Rating Matrix URM
def get_URM(tracks, playlists, playlist_tracks, track_playlists, norm="no"):
    """
        Build the playlists x tracks rating matrix (lil_matrix).

        Every (playlist, track) pair listed in "track_playlists" gets a weight that
        depends on "norm":
            "idf":       log((N - nq + 0.5) / (nq + 0.5))
            "sqrt":      sqrt((N - nq + 0.5) / (nq + 0.5))
            "magnitude": 1, then every row is divided by sqrt(number of stored entries)
            anything else ("no"): 1
        where N = len(playlist_tracks) and nq = number of playlists of the track.
        Prints a progress counter every 1000 tracks.
    """
    URM = lil_matrix((len(playlists), len(tracks)))
    num_playlists = len(playlist_tracks)

    for i, row in enumerate(track_playlists.itertuples()):
        track_id = row.track_id
        nq = len(row.playlist_ids)
        ratio = (num_playlists - nq + 0.5)/(nq + 0.5)
        for pl_id in row.playlist_ids:
            if norm == "idf":
                weight = math.log(ratio)
            elif norm == "sqrt":
                weight = math.sqrt(ratio)
            else:
                weight = 1
            URM[pl_id,track_id] = weight
        if i % 1000 == 0:
            print(i)

    if norm != "magnitude":
        return URM

    # Scale every playlist row by the square root of its number of stored entries.
    for pl_id in playlists.playlist_id:
        magnitude = math.sqrt(len(URM.data[pl_id]))
        for col in URM.rows[pl_id]:
            URM[pl_id,col] /= magnitude

    return URM

#
# URM:
# 
#              tracks
#            _________
#           \         \
# playlists \         \
#           \_________\
#

In [None]:
#URM_no_norm = get_URM(tracks, playlists, playlist_tracks, track_playlists, norm="no")

In [None]:
# Step 1: playlists x tracks matrix weighted with the "sqrt" normalization.
URM_sqrt = get_URM(tracks, playlists, playlist_tracks, track_playlists, norm="sqrt")

In [None]:
# Step 2: produce the item-item (tracks x tracks) matrix with cosine similarity,
# keeping the 100 best values of every row.
row_group = 10000
def_rows_i = URM_sqrt.transpose()[0:row_group].dot(URM_sqrt) # default rows, needed to fill some rows that would be all zeros otherwise...
TTM = dot_with_top(URM_sqrt.transpose(), URM_sqrt, def_rows_i, top=100, row_group=row_group, similarity="cosine", shrinkage=0)

In [None]:
# Table of the per-playlist parameters; it shares the index of playlist_tracks.
playlist_params = pd.DataFrame(playlist_tracks.playlist_id)

In [None]:
# Step 3: compute how well suited each target playlist is to the item-item similarity.
# ii_param = 1 / (entropy + 0.05), where the entropy is computed on the sum of the TTM
# rows of the tracks of the playlist. The 0.05 avoids a division by zero when the
# entropy is zero (it used to be added to the array instead, which is not the
# formula described above and not what the album/artist parameters do).
playlist_params['ii_param'] = 0.0

counter = 0
for pl_id in playlist_tracks[playlist_tracks.playlist_id.isin(target_playlists.playlist_id)].playlist_id:
    trks = playlist_tracks.loc[pl_id].track_ids
    tot = np.zeros(TTM.shape[0])
    for tr_id in trks:
        tot += TTM[tr_id].toarray()[0]
    if tot.sum() > 0:
        v = 1 / (scipy.stats.entropy(tot) + 0.05)
    else:
        # No similarity information at all: the entropy is undefined, do not rely on II.
        v = 0.0
    # DataFrame.set_value was removed in pandas 1.0; .at is the label-based scalar setter.
    playlist_params.at[pl_id, "ii_param"] = v
    counter += 1
    if counter % 500 == 0:
        print(counter)

In [None]:
# Inspect the parameters after the ii_param computation.
playlist_params

## Album

<div style="white-space: pre-wrap;">
Steps:
1 - Compute the playlists_x_albums (i.e. the UAM_album matrix, where U stands for User) sparse matrix. I do this before computing the tracks_x_albums (i.e. the IAM_album matrix, where I stands for Item) sparse matrix because here I compute also the "album_to_val" dictionary, which contains the IDF value of each album obtained considering the playlists as document (and not the tracks). However at the moment I don't use this because I compute the IAM_album matrix without any normalization, so you may skip it...
2 - Compute the tracks_x_albums IAM_album sparse matrix.
3 - Compute the SYM_ALBUM tracks_x_tracks matrix by doing IAM_album.dot(IAM_album.transpose()). It's not big, so I don't need to keep the K best values...
4 - Compute the album_parameter, which means "how much each playlist is affine to album similarity". I do this by computing the entropy of the numpy array containing the occurrences of the albums in the playlist, and then doing 1/(entropy_of_array + 0.05).
</div>

In [None]:
def get_UAM_album(tracks, playlist_tracks, target_playlists, norm="no", OKAPI_K=1.7, OKAPI_B=0.75):
    """
        Build the playlists x albums matrix.

        tracks: dataframe with an "album" column, indexed by track id.
        playlist_tracks: dataframe with "playlist_id" and "track_ids" columns.
        target_playlists: unused.
        norm: "no" (raw counts, default), "idf" or "okapi".
        OKAPI_K, OKAPI_B: parameters of the "okapi" weighting.

        The number of rows is taken from the global "playlists" dataframe.

        Returns (UAM_album, UAM_album_no_norm, album_to_val):
            UAM_album: lil_matrix weighted according to "norm"
            UAM_album_no_norm: lil_matrix with the raw occurrence counts
            album_to_val: dict album -> idf value (empty when norm is "no"),
                where playlists are the documents.
        Prints a progress counter every 1000 playlists.
    """
    shape = (max(playlists.playlist_id)+1, max(tracks.album.unique())+1)
    UAM_album = lil_matrix(shape)
    UAM_album_no_norm = lil_matrix(shape)

    album_of_track = tracks.album
    # album -> set of the playlists containing it. A set, so that a playlist with
    # several tracks of the same album is counted once in the document frequency.
    album_to_playlists = {}

    for i, row in enumerate(playlist_tracks.itertuples(), start=1):
        pl_id = row.playlist_id
        for tr_id in row.track_ids:
            alb = album_of_track.loc[tr_id]
            UAM_album[pl_id,alb] += 1
            UAM_album_no_norm[pl_id,alb] += 1
            album_to_playlists.setdefault(alb, set()).add(pl_id)
        if i % 1000 == 0:
            print(i)

    album_to_val = {}
    if norm == "okapi" or norm == "idf":
        N = len(playlist_tracks)
        avg_document_length = sum(len(tr_ids) for tr_ids in playlist_tracks.track_ids) / N

        for i, row in enumerate(playlist_tracks.itertuples(), start=1):
            pl_id = row.playlist_id
            # Snapshots taken before the row is overwritten: the lil row lists are
            # live, so the document length and the list of albums would otherwise
            # change while the weights are being written.
            albums = list(UAM_album.rows[pl_id])
            document_length = sum(UAM_album.data[pl_id])
            for album in albums:
                fq = UAM_album[pl_id,album]
                nq = len(album_to_playlists[album])
                idf = math.log((N - nq + 0.5)/(nq + 0.5))

                if album not in album_to_val:
                    album_to_val[album] = idf

                if norm == "idf":
                    UAM_album[pl_id,album] = idf
                else:
                    UAM_album[pl_id,album] = idf*(fq*(OKAPI_K+1))/(fq + OKAPI_K*(1 - OKAPI_B + OKAPI_B * document_length / avg_document_length))
            if i % 1000 == 0:
                print(i)

    return UAM_album, UAM_album_no_norm, album_to_val

In [None]:
# Step 1: playlists x albums matrices (no normalization, so album_to_val stays empty).
UAM_album, UAM_album_no_norm, album_to_val = get_UAM_album(tracks, playlist_tracks, target_playlists, norm="no")

In [None]:
# Distinct album ids after the clean-up of the malformed albums.
unique_albums = tracks.album.unique()
unique_albums

In [None]:
def get_IAM_album(tracks, target_tracks, norm="no"):
    """
        Build the tracks x albums matrix (lil_matrix), one stored entry per track.

        With norm "idf" the entry is the value of the album in the global
        "album_to_val" dict (0 if the album is not in any playlist);
        with any other norm ("no" is the default) the entry is 1.
        "target_tracks" is unused. Prints a progress counter every 1000 tracks.
    """
    n_albums = max(tracks.album.unique())+1
    IAM_album = lil_matrix((len(tracks), n_albums))
    use_idf = norm == "idf"

    for i, row in enumerate(tracks.itertuples()):
        if use_idf:
            # Give zero if the album is not in any playlist!
            weight = album_to_val[row.album] if row.album in album_to_val else 0
        else:
            weight = 1
        IAM_album[row.track_id,row.album] = weight
        if i % 1000 == 0:
            print(i)

    return IAM_album

In [None]:
# Step 2: tracks x albums matrix, without normalization.
IAM_album = get_IAM_album(tracks, target_tracks, norm="no")

In [None]:
# Step 3: tracks x tracks album similarity (non-zero when two tracks share the album).
SYM_ALBUM = IAM_album.dot(IAM_album.transpose())

In [None]:
# Step 4: compute how well suited each target playlist is to album similarity:
# album_param = 1 / (entropy of the album occurrence counts of the playlist + 0.05)
playlist_params['album_param'] = 0.0

# NB: despite the "_csc" suffix this is a CSR matrix (row extraction is done below).
UAM_album_no_norm_csc = UAM_album_no_norm.tocsr()
counter = 0
for pl_id in playlist_tracks[playlist_tracks.playlist_id.isin(target_playlists.playlist_id)].playlist_id:
    v = 1 / (scipy.stats.entropy(UAM_album_no_norm_csc.getrow(pl_id).data) + 0.05)
    # DataFrame.set_value was removed in pandas 1.0; .at is the label-based scalar setter.
    playlist_params.at[pl_id, "album_param"] = v
    counter += 1
    if counter % 5000 == 0:
        print(counter)

In [None]:
# Inspect the parameters after the album_param computation.
playlist_params

## Artist
Same steps as for Album

In [None]:
# User Artist Matrix UAM
def get_UAM(tracks, playlist_tracks, target_playlists, norm="no", OKAPI_K=1.7, OKAPI_B=0.75):
    """
        Build the playlists x artists matrix.

        tracks: dataframe with an "artist_id" column, indexed by track id.
        playlist_tracks: dataframe with "playlist_id" and "track_ids" columns.
        target_playlists: unused.
        norm: "no" (raw counts, default), "idf" or "okapi".
        OKAPI_K, OKAPI_B: parameters of the "okapi" weighting.

        The number of rows is taken from the global "playlists" dataframe.

        Returns (UAM, UAM_no_norm, artist_to_val):
            UAM: lil_matrix weighted according to "norm"
            UAM_no_norm: lil_matrix with the raw occurrence counts
            artist_to_val: dict artist -> idf value (empty when norm is "no"),
                where playlists are the documents.
        Prints a progress counter every 1000 playlists.
    """
    shape = (max(playlists.playlist_id)+1, max(tracks.artist_id.unique())+1)
    UAM = lil_matrix(shape)
    UAM_no_norm = lil_matrix(shape)

    artist_of_track = tracks.artist_id
    # artist -> set of the playlists containing it. A set, so that a playlist with
    # several tracks of the same artist is counted once in the document frequency.
    artist_to_playlists = {}

    for i, row in enumerate(playlist_tracks.itertuples(), start=1):
        pl_id = row.playlist_id
        for tr_id in row.track_ids:
            art = artist_of_track.loc[tr_id]
            UAM[pl_id,art] += 1
            UAM_no_norm[pl_id,art] += 1
            artist_to_playlists.setdefault(art, set()).add(pl_id)
        if i % 1000 == 0:
            print(i)

    artist_to_val = {}
    if norm == "okapi" or norm == "idf":
        N = len(playlist_tracks)
        avg_document_length = sum(len(tr_ids) for tr_ids in playlist_tracks.track_ids) / N

        for i, row in enumerate(playlist_tracks.itertuples(), start=1):
            pl_id = row.playlist_id
            # Snapshots taken before the row is overwritten: the lil row lists are
            # live, so the document length and the list of artists would otherwise
            # change while the weights are being written.
            artists = list(UAM.rows[pl_id])
            document_length = sum(UAM.data[pl_id])
            for artist in artists:
                fq = UAM[pl_id,artist]
                nq = len(artist_to_playlists[artist])
                idf = math.log((N - nq + 0.5)/(nq + 0.5))

                if artist not in artist_to_val:
                    artist_to_val[artist] = idf

                if norm == "idf":
                    UAM[pl_id,artist] = idf
                else:
                    UAM[pl_id,artist] = idf*(fq*(OKAPI_K+1))/(fq + OKAPI_K*(1 - OKAPI_B + OKAPI_B * document_length / avg_document_length))
            if i % 1000 == 0:
                print(i)

    return UAM, UAM_no_norm, artist_to_val

In [None]:
# Step 1: playlists x artists matrices (no normalization, so artist_to_val stays empty).
UAM, UAM_no_norm, artist_to_val = get_UAM(tracks, playlist_tracks, target_playlists, norm="no")

In [None]:
# Distinct artist ids found in "tracks".
unique_artists = tracks.artist_id.unique()

In [None]:
# Item Artist Matrix
def get_IAM(tracks, target_tracks, norm="no"):
    """
        Build the tracks x artists matrix (lil_matrix), one stored entry per track.

        With norm "idf" the entry is the value of the artist in the global
        "artist_to_val" dict (0 if the artist is not in any playlist);
        with any other norm ("no" is the default) the entry is 1.
        "target_tracks" is unused. Prints a progress counter every 1000 tracks.
    """
    n_artists = max(tracks.artist_id.unique())+1
    IAM = lil_matrix((len(tracks), n_artists))
    use_idf = norm == "idf"

    for i, row in enumerate(tracks.itertuples()):
        if use_idf:
            # Give zero if the artist is not in any playlist!
            weight = artist_to_val[row.artist_id] if row.artist_id in artist_to_val else 0
        else:
            weight = 1
        IAM[row.track_id,row.artist_id] = weight
        if i % 1000 == 0:
            print(i)

    return IAM

In [None]:
# Step 2: tracks x artists matrix, without normalization.
IAM = get_IAM(tracks, target_tracks, norm="no")

In [None]:
# Step 3: tracks x tracks artist similarity (non-zero when two tracks share the artist).
SYM_ARTIST = IAM.dot(IAM.transpose())

In [None]:
# Compressed copies of the artist matrices. UAM_no_norm_csc is used for the artist
# parameter below.
# NOTE(review): UAM_csc and IAM_csr_transpose are not referenced again in the visible
# part of the notebook.
UAM_csc = UAM.tocsc()
UAM_no_norm_csc = UAM_no_norm.tocsc()
IAM_csr_transpose = IAM.tocsr().transpose()

In [None]:
# Step 4: compute how well suited each target playlist is to artist similarity:
# artist_param = 1 / (entropy of the artist occurrence counts of the playlist + 0.05)
playlist_params['artist_param'] = 0.0

counter = 0
for pl_id in playlist_tracks[playlist_tracks.playlist_id.isin(target_playlists.playlist_id)].playlist_id:
    v = 1 / (scipy.stats.entropy(UAM_no_norm_csc.getrow(pl_id).data) + 0.05)
    # DataFrame.set_value was removed in pandas 1.0; .at is the label-based scalar setter.
    playlist_params.at[pl_id, "artist_param"] = v
    counter += 1
    if counter % 5000 == 0:
        print(counter)

In [None]:
# Inspect the parameters after the artist_param computation.
playlist_params

## Adjust params

Here I just want to adjust the parameters so that they can be compared... I z-score them and then shift their means to 1. We should do some more experiments here.

In [None]:
# Execute only once: backup of the raw parameters, taken before the adjustments below.
playlist_params_copy = playlist_params.copy(deep=True)

In [None]:
# Restore playlist_params from the backup (discards the adjusted "*_norm" columns).
playlist_params = playlist_params_copy.copy(deep=True)

In [None]:
# Inspect the restored parameters.
playlist_params

In [None]:
# Start the "*_norm" columns from the raw parameters, with negative values raised to 0.
playlist_params["ii_param_norm"] = playlist_params["ii_param"].clip(lower=0)
playlist_params["album_param_norm"] = playlist_params["album_param"].clip(lower=0)
playlist_params["artist_param_norm"] = playlist_params["artist_param"].clip(lower=0)

In [None]:
#playlist_params["ii_param_norm"] = np.sqrt(playlist_params.ii_param_norm)
#playlist_params["album_param_norm"] = np.sqrt(playlist_params.album_param_norm)
#playlist_params["artist_param_norm"] = np.sqrt(playlist_params.artist_param_norm)

In [None]:
# Summary statistics of the parameters before the z-scoring.
playlist_params.describe()

In [None]:
# z-score every "*_norm" column and shift it so that its mean becomes 1.
playlist_params["ii_param_norm"] = 1 + (playlist_params["ii_param_norm"] - playlist_params["ii_param_norm"].mean()) / playlist_params["ii_param_norm"].std()
playlist_params["album_param_norm"] = 1 + (playlist_params["album_param_norm"] - playlist_params["album_param_norm"].mean()) / playlist_params["album_param_norm"].std()
playlist_params["artist_param_norm"] = 1 + (playlist_params["artist_param_norm"] - playlist_params["artist_param_norm"].mean()) / playlist_params["artist_param_norm"].std()

In [None]:
#playlist_params.album_param_norm -= 0.2
#playlist_params.artist_param_norm -= 0.4

In [None]:
# Inspect the adjusted parameters.
playlist_params

In [None]:
# Number of playlists for which ii_param is the largest adjusted parameter (ties included)
len(playlist_params[(playlist_params.ii_param_norm >= playlist_params.album_param_norm) & (playlist_params.ii_param_norm >= playlist_params.artist_param_norm)])

In [None]:
# Number of playlists for which album_param is the largest adjusted parameter (ties included)
len(playlist_params[(playlist_params.album_param_norm >= playlist_params.ii_param_norm) & (playlist_params.album_param_norm >= playlist_params.artist_param_norm)])

In [None]:
# Number of playlists for which artist_param is the largest adjusted parameter (ties included)
len(playlist_params[(playlist_params.artist_param_norm >= playlist_params.album_param_norm) & (playlist_params.artist_param_norm >= playlist_params.ii_param_norm)])

# Predictions

In [None]:
# Number of held-out (playlist, track) rows.
len(test)

In [None]:
# Held-out rows that belong to playlists of the original target_playlists file.
len(test[test.playlist_id.isin(target_playlists_original.playlist_id)])

In [None]:
# Predictions for all the playlists in test (the returned dataframe is displayed, not stored)
make_predictions(test=test, target_playlists=target_playlists, compute_MAP=True, row_group=1000)

In [None]:
# Predictions for playlists in test that were also in the original target_playlists.
# "test" holds the original ids, so it is filtered on playlist_id, while the remapped
# target_playlists is filtered on playlist_id_tmp (the original ids).
predictions = make_predictions(test=test[test.playlist_id.isin(target_playlists_original.playlist_id)],
                               target_playlists=target_playlists[target_playlists.playlist_id_tmp.isin(target_playlists_original.playlist_id)],
                               compute_MAP=True, row_group=1000)

In [None]:
# Predictions for playlists in test not in the original target_playlists.
# This overwrites the "predictions" computed in the previous cell.
predictions = make_predictions(test=test[~test.playlist_id.isin(target_playlists_original.playlist_id)],
                               target_playlists=target_playlists[~target_playlists.playlist_id_tmp.isin(target_playlists_original.playlist_id)],
                               compute_MAP=True, row_group=1000)

In [None]:
# First rows of the predictions (playlist ids are still the remapped ones).
predictions.head()

In [None]:
# Put the original playlist ids back into the playlist_id column.
predictions['playlist_id'] = predictions['playlist_id_tmp']

In [None]:
# Score the last predictions against the matching part of test.
# NOTE(review): evaluate is not defined in this notebook; presumably it comes from
# one of the recsys star imports. Confirm its expected input format.
evaluate(test[~test.playlist_id.isin(target_playlists_original.playlist_id)], predictions)

In [None]:
# First rows of the predictions, now with the original playlist ids.
predictions.head()

In [None]:
# The helper column is no longer needed. Re-running this cell raises a KeyError,
# because the column is already gone.
predictions = predictions.drop("playlist_id_tmp", axis=1)

In [None]:
# Make the dataframe friendly for output: every array of track ids becomes a
# space-separated string, then the table is written without its index.
predictions['track_ids'] = predictions['track_ids'].apply(lambda ids : ' '.join(str(track) for track in ids))
predictions.to_csv('results.csv', index=False)