In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import scipy.stats
from scipy.sparse import *
from scipy.sparse.linalg import svds

from recsys.preprocess import *

# Standard-library imports are kept after the wildcard imports so that a
# wildcard can never shadow them.
import ast
import functools
import math

RANDOM_STATE = 123

np.random.seed(RANDOM_STATE)

%matplotlib inline

In [2]:
def train_test_split(train, test_size=0.3, min_playlist_tracks=7, pessimistic=False, test_tracks_per_playlist=5):
    """
    Split the interaction table ``train`` into a training part and a held-out test part.

    Parameters
    ----------
    train : DataFrame with (at least) the columns ``playlist_id`` and ``track_id``.
    test_size : in pessimistic mode, the probability that a row is moved to the test set;
        otherwise the fraction of eligible playlists that become target playlists.
    min_playlist_tracks : a playlist needs at least this many rows to be eligible as a
        target (only used when ``pessimistic`` is False).
    pessimistic : if True, rows are held out independently at random.
    test_tracks_per_playlist : number of rows held out from every target playlist
        (only used when ``pessimistic`` is False). Sampling raises ``ValueError`` if a
        chosen playlist has fewer rows than this.

    Returns
    -------
    (new_train, new_test, target_playlists, target_tracks) where the last two are
    DataFrames with a single ``playlist_id`` / ``track_id`` column.

    Randomness comes from the global ``np.random`` state.
    """
    if pessimistic:
        train_mask = np.random.choice([True, False], len(train), p=[1 - test_size, test_size])
        new_train = train[train_mask]
        new_test = train[~train_mask]
        pl_ids = new_test.playlist_id.unique()
        pl_ids.sort()
        new_target_playlist = pd.DataFrame({"playlist_id": pl_ids}, index=pl_ids)
        tr_ids = new_test.track_id.unique()
        tr_ids.sort()
        new_target_tracks = pd.DataFrame({"track_id": tr_ids}, index=tr_ids)
        return new_train, new_test, new_target_playlist, new_target_tracks

    # Only playlists with at least "min_playlist_tracks" rows can be chosen as targets.
    tracks_per_playlist = train.groupby('playlist_id')['track_id'].count()
    to_choose_playlists = tracks_per_playlist[tracks_per_playlist >= min_playlist_tracks].index.values

    # Among these, "test_size * len(to_choose_playlists)" distinct playlists are drawn.
    # The result is a numpy array of playlist ids.
    target_playlists = np.random.choice(
        to_choose_playlists, replace=False, size=int(test_size * len(to_choose_playlists))
    )

    # Row positions of every playlist, computed once instead of re-scanning the
    # whole frame for each target playlist.
    positions_by_playlist = train.groupby('playlist_id').indices

    held_out_labels = []
    for p in target_playlists:
        selected_df = train.iloc[positions_by_playlist[p]].sample(test_tracks_per_playlist)
        held_out_labels.append(selected_df.index.values)

    if held_out_labels:
        # Concatenating the label arrays keeps their dtype; seeding a union with an
        # empty float array would silently turn the labels into floats.
        indexes = np.unique(np.concatenate(held_out_labels))
        test = train.loc[indexes].copy()
        train = train.drop(indexes)
    else:
        test = train.iloc[0:0].copy()

    # Sorted unique held-out track ids, in the dtype of the track_id column.
    target_tracks = np.unique(test['track_id'].values)

    return (
        train,
        test,
        pd.DataFrame(target_playlists, columns=['playlist_id']),
        pd.DataFrame(target_tracks, columns=['track_id']),
    )


In [3]:
import numpy as np
import scipy
from scipy.sparse import *
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize

def dot_with_top(m1, m2, def_rows_g, top=-1, row_group=1, similarity="dot", shrinkage=0):
    """
    Multiply ``m1`` by ``m2`` in groups of ``row_group`` rows, keeping only the ``top``
    largest stored entries of every result row.

    Possible similarities: "dot", "cosine". Any other value falls back to "dot".

    Parameters
    ----------
    m1, m2 : sparse matrices; the result row ``i`` is computed from ``m1[i]`` and ``m2``.
    def_rows_g : sparse block of fallback rows. Its first row replaces every result row
        that has at most ``top`` stored entries, and its leading rows replace a whole
        group of ``m1`` rows that contains no non-zero entry.
    top : number of entries kept per row; if ``top <= 0`` the plain ``m1.dot(m2)`` is returned.
    row_group : number of ``m1`` rows processed per iteration.
    similarity : "cosine" uses ``cosine_similarity(rows, m2.T)``; otherwise ``rows.dot(m2)``.
    shrinkage : if > 0 the group result is passed through ``apply_shrinkage``.

    Returns
    -------
    CSR matrix with one row per row of ``m1`` (when ``top > 0``).

    Progress (the index of the next row to process) is printed after each group.
    """
    if top <= 0:
        return m1.dot(m2)

    m2_transposed = m2.transpose()
    n_rows = m1.shape[0]

    final_rows = []
    row_id = 0
    while row_id < n_rows:
        last_row = min(row_id + row_group, n_rows)
        rows = m1[row_id:last_row]
        if rows.count_nonzero() > 0:
            if similarity == "cosine":
                res_rows = cosine_similarity(rows, m2_transposed, dense_output=False)
            else:
                res_rows = rows.dot(m2)
            if shrinkage > 0:
                res_rows = apply_shrinkage(rows, res_rows, shrinkage)
            if res_rows.count_nonzero() > 0:
                for res_row in res_rows:
                    if res_row.nnz > top:
                        keep = np.argsort(res_row.data)[-top:]
                        # Integer row indices: every kept entry belongs to row 0 of
                        # the single-row matrix being built.
                        final_rows.append(csr_matrix(
                            (res_row.data[keep], (np.zeros(top, dtype=int), res_row.indices[keep])),
                            shape=res_row.shape,
                        ))
                    else:
                        # NOTE(review): rows with at most `top` entries are replaced by
                        # the first fallback row rather than kept as they are - confirm
                        # this is the intended contract of `def_rows_g`.
                        final_rows.append(def_rows_g[0])
            else:
                final_rows.extend(def_rows_g[0] for _ in range(res_rows.shape[0]))
        else:
            # Take only as many fallback rows as the group holds; the last group can be
            # shorter than `row_group`, and appending the whole block would give the
            # result more rows than `m1`.
            final_rows.append(def_rows_g[:last_row - row_id])
        row_id += row_group
        print(row_id)
    return scipy.sparse.vstack(final_rows, 'csr')

In [229]:
def make_predictions(test=None, compute_MAP=False, row_group=100):
    """
    Recommend up to 5 tracks for every playlist in the global ``target_playlists``.

    For each playlist three score vectors are combined: item-item (``TTM`` with
    ``URM_idf``), album (``SYM_ALBUM`` with ``URM_sqrt``) and artist (``SYM_ARTIST`` with
    ``URM_sqrt``), weighted by that playlist's ``*_param_norm`` values from
    ``playlist_params``. Candidates are visited by decreasing score; a track is kept if
    it is in ``target_tracks`` and not already in the playlist.

    Reads the notebook globals ``target_playlists``, ``target_tracks``, ``URM_sqrt``,
    ``URM_idf``, ``TTM``, ``SYM_ALBUM``, ``SYM_ARTIST``, ``playlist_params``,
    ``playlist_tracks``, ``num_to_tracks`` and, when ``compute_MAP`` is True,
    ``playlist_to_num`` and ``get_playlist_track_list2``.

    Parameters
    ----------
    test : held-out interactions, only used when ``compute_MAP`` is True.
    compute_MAP : if True, print the running mean average precision after each group.
    row_group : number of playlists scored per batch.

    Returns
    -------
    DataFrame indexed by the numeric playlist id, with ``playlist_id`` restored from
    ``playlist_id_tmp`` and a ``track_ids`` column of numpy arrays.
    """
    predictions = pd.DataFrame(target_playlists)
    predictions.index = target_playlists['playlist_id']
    ttracks = set(target_tracks['track_id'].values)
    if compute_MAP:
        test_good = get_playlist_track_list2(test)
        test_good.index = test_good.playlist_id.apply(lambda pl_id: playlist_to_num[pl_id])
        print(len(test_good))

    # The normalisers do not depend on the playlist group, so compute them once.
    # NOTE(review): a zero row sum gives NaN/inf scores, and NaN sorts to the end in
    # argsort, so it would come first after the reversal below - confirm none occur.
    ttm_norm = TTM.sum(axis=1).transpose()
    album_norm = SYM_ALBUM.sum(axis=1).transpose()
    artist_norm = SYM_ARTIST.sum(axis=1).transpose()

    mean_ap = 0
    all_preds = []  # one array per target playlist, in target_playlists order

    row_start = 0
    while row_start < len(target_playlists):
        row_end = min(row_start + row_group, len(target_playlists))

        pl_group = target_playlists[row_start:row_end]
        rows_URM_sqrt = [URM_sqrt[pl_id, :] for pl_id in pl_group.playlist_id]
        rows_URM_idf = [URM_idf[pl_id, :] for pl_id in pl_group.playlist_id]
        composed_URM_sqrt = scipy.sparse.vstack(rows_URM_sqrt, 'csr')
        composed_URM_idf = scipy.sparse.vstack(rows_URM_idf, 'csr')

        # Compute predictions for current playlist group
        simil_ii = np.array(np.divide(TTM.dot(composed_URM_idf.transpose()).transpose().todense(), ttm_norm))
        simil_album = np.array(np.divide(SYM_ALBUM.dot(composed_URM_sqrt.transpose()).transpose().todense(), album_norm))
        simil_artist = np.array(np.divide(SYM_ARTIST.dot(composed_URM_sqrt.transpose()).transpose().todense(), artist_norm))

        for group_pos, pl_id in enumerate(pl_group.playlist_id):
            # Per-playlist blending weights
            params = playlist_params.loc[pl_id]
            ii_param = params.ii_param_norm
            album_param = params.album_param_norm
            artist_param = params.artist_param_norm

            pl_tracks = set(playlist_tracks.loc[pl_id]['track_ids'])

            simil = (ii_param * simil_ii[group_pos]
                     + album_param * simil_album[group_pos]
                     + artist_param * simil_artist[group_pos])
            sorted_ind = simil.argsort()[::-1]

            pred = []
            for tr in sorted_ind:
                if len(pred) >= 5:
                    break
                if (tr in ttracks) and (tr not in pl_tracks):
                    pred.append(num_to_tracks[tr])
            all_preds.append(np.array(pred))

            # Update MAP (an empty prediction list contributes 0 instead of
            # raising ZeroDivisionError)
            if compute_MAP and pred:
                tr_ids = test_good.loc[pl_id]['track_ids']
                correct = 0
                ap = 0
                for it, t in enumerate(pred):
                    if t in tr_ids:
                        correct += 1
                        ap += correct / (it + 1)
                mean_ap += ap / len(pred)

        row_start = row_end
        print(row_start)
        if compute_MAP:
            print(mean_ap / row_start)

    # Fill a 1-D object array element by element so pandas stores each prediction
    # array as a single cell, whatever the array lengths are.
    track_ids_col = np.empty(len(all_preds), dtype=object)
    for pos, pred_arr in enumerate(all_preds):
        track_ids_col[pos] = pred_arr
    predictions['track_ids'] = track_ids_col

    predictions['playlist_id'] = predictions['playlist_id_tmp']
    return predictions

In [5]:
def from_num_to_id(df, row_num, column = 'track_id'):
    """ Return the value stored in `column` for the row at position `row_num` of df. """
    selected_row = df.iloc[row_num]
    return selected_row[column]

def from_id_to_num(df, tr_id, column='track_id'):
    """ Return the position of the first row of df whose `column` equals `tr_id`. """
    is_match = df[column].values == tr_id
    matching_positions = np.where(is_match)[0]
    return matching_positions[0]

# Read data

In [6]:
# All input files are tab-separated.
train = pd.read_csv('data/train_final.csv', sep='\t')
playlists = pd.read_csv('data/playlists_final.csv', sep='\t')
target_playlists = pd.read_csv('data/target_playlists.csv', sep='\t')
target_tracks = pd.read_csv('data/target_tracks.csv', sep='\t')
tracks = pd.read_csv('data/tracks_final.csv', sep='\t')

In [7]:
# Replace the official targets with a local hold-out split so MAP can be measured.
train, test, target_playlists, target_tracks = train_test_split(
    train,
    test_size=0.30,
    min_playlist_tracks=10,
    pessimistic=False,
)

# Process data

In [8]:
# NOTE: this cell is not idempotent - it overwrites the id columns in place, so it
# must be run exactly once after the data has been (re)loaded.

# Keep the original ids in *_tmp columns and replace the ids with row positions.
tracks['track_id_tmp'] = tracks['track_id']

tracks['track_id'] = tracks.index

playlists['playlist_id_tmp'] = playlists['playlist_id']
playlists['playlist_id'] = playlists.index

train['playlist_id_tmp'] = train['playlist_id']
train['track_id_tmp'] = train['track_id']

# Lookup tables: original id -> row position, and row position -> original track id.
track_to_num = pd.Series(tracks.index)
track_to_num.index = tracks['track_id_tmp']

playlist_to_num = pd.Series(playlists.index)
playlist_to_num.index = playlists['playlist_id_tmp']

num_to_tracks = pd.Series(tracks['track_id_tmp'])


train['track_id'] = train['track_id'].apply(lambda x : track_to_num[x])
train['playlist_id'] = train['playlist_id'].apply(lambda x : playlist_to_num[x])

# The list-valued columns are stored as text in the CSV files. They are parsed with
# ast.literal_eval, which only accepts Python literals, instead of eval, which would
# execute arbitrary expressions found in the data.
tracks.tags = tracks.tags.apply(lambda s: np.array(ast.literal_eval(s), dtype=int))

playlists.title = playlists.title.apply(lambda s: np.array(ast.literal_eval(s), dtype=int))

target_playlists['playlist_id_tmp'] = target_playlists['playlist_id']
target_playlists['playlist_id'] = target_playlists['playlist_id'].apply(lambda x : playlist_to_num[x])

target_tracks['track_id_tmp'] = target_tracks['track_id']
target_tracks['track_id'] = target_tracks['track_id'].apply(lambda x : track_to_num[x])

# playlist_tracks: one row per playlist seen in train, with the array of its track ids.
playlist_tracks = pd.DataFrame(train['playlist_id'].drop_duplicates())
playlist_tracks.index = train['playlist_id'].unique()
playlist_tracks['track_ids'] = train.groupby('playlist_id').apply(lambda x : x['track_id'].values)
playlist_tracks = playlist_tracks.sort_values('playlist_id')

# track_playlists: one row per track seen in train, with the array of its playlist ids.
track_playlists = pd.DataFrame(train['track_id'].drop_duplicates())
track_playlists.index = train['track_id'].unique()
track_playlists['playlist_ids'] = train.groupby('track_id').apply(lambda x : x['playlist_id'].values)
track_playlists = track_playlists.sort_values('track_id')

# Number of tracks whose album field was empty, None or -1 (updated by transform_album_1).
bad_albums = 0
def transform_album_1(alb):
    """
    Parse the textual album field (a list literal) and return its first element.

    Returns -1, and increments the global ``bad_albums`` counter, when the list is
    empty or its first element is None or -1.

    The text is parsed with ``ast.literal_eval``, which accepts only Python literals;
    anything else raises ``ValueError``/``SyntaxError`` instead of being executed.
    """
    global bad_albums
    ar = ast.literal_eval(alb)
    if len(ar) == 0 or ar[0] is None or ar[0] == -1:
        bad_albums += 1
        return -1
    return ar[0]

def transform_album_2(alb):
    """
    Give every missing album (-1) its own fresh id.

    Known album ids are returned unchanged; for -1 the current value of the global
    ``next_album_id`` is returned and the global is advanced by one.
    """
    global next_album_id
    if alb != -1:
        return alb
    fresh_id = next_album_id
    next_album_id = fresh_id + 1
    return fresh_id
    
# First pass: parse the album field, marking missing albums with -1.
tracks.album = tracks.album.apply(transform_album_1)

# Second pass: replace every -1 with a fresh id above the largest known album id.
last_album = tracks.album.max()
next_album_id = last_album + 1
tracks.album = tracks.album.apply(transform_album_2)

# Training

## II

In [167]:
# User Rating Matrix URM
def get_URM(tracks, playlists, playlist_tracks, track_playlists, norm="no"):
    """
        Build the playlists x tracks matrix as a lil_matrix.

        Possible normalizations: "no", "magnitude", "idf", "sqrt". Default "no".
        With "idf" / "sqrt" a stored entry is the log / square root of
        (N - nq + 0.5) / (nq + 0.5), where N = len(playlist_tracks) and nq is the
        number of playlists listed for the track. With "magnitude" every row of the
        0/1 matrix is divided by the square root of its number of entries.

        The number of processed tracks is printed every 1000 tracks.
    """
    total_playlists = len(playlist_tracks)
    URM = lil_matrix((len(playlists), len(tracks)))

    for position, entry in enumerate(track_playlists.itertuples()):
        column = entry.track_id
        containing_playlists = entry.playlist_ids
        nq = len(containing_playlists)
        for pl_id in containing_playlists:
            if norm == "idf":
                weight = math.log((total_playlists - nq + 0.5)/(nq + 0.5))
            elif norm == "sqrt":
                weight = math.sqrt((total_playlists - nq + 0.5)/(nq + 0.5))
            else:
                weight = 1
            URM[pl_id,column] = weight
        if position % 1000 == 0:
            print(position)

    if norm == "magnitude":
        for pl_id in playlists.playlist_id:
            row_length = math.sqrt(len(URM.data[pl_id]))
            for col in URM.rows[pl_id]:
                URM[pl_id,col] = URM[pl_id,col] / row_length

    return URM

#
# URM:
# 
#              tracks
#            _________
#           \         \
# playlists \         \
#           \_________\
#

In [173]:
# Two weightings of the same playlists x tracks matrix.
URM_sqrt = get_URM(
    tracks, playlists, playlist_tracks, track_playlists,
    norm="sqrt",
)
URM_idf = get_URM(
    tracks, playlists, playlist_tracks, track_playlists,
    norm="idf",
)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
57000
58000
59000
60000
61000
62000
63000
64000
65000
66000
67000
68000
69000
70000
71000
72000
73000
74000
75000
76000
77000
78000
79000
80000
81000
82000
83000
84000
85000
86000
87000
88000
89000
90000
91000
92000
93000
94000
95000
96000
97000
98000
99000
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
57000
58000
59000
60000
61000
62000
63000
64000
65000
66000
67000
68000
69000
70000


In [174]:
# Step 2: build the item-item matrix (cosine similarity, 100 largest entries per track).
# def_rows_i is the fallback block handed to dot_with_top.
row_group = 10000
def_rows_i = URM_idf.T[:row_group].dot(URM_idf)
TTM = dot_with_top(
    URM_idf.T,
    URM_idf,
    def_rows_i,
    top=100,
    row_group=row_group,
    similarity="cosine",
    shrinkage=0,
)

10000
20000
30000
40000
50000
60000
70000
80000
90000
100000


In [180]:
# Step 3: estimate, for each playlist, how much weight this similarity should get.
# Start from a frame holding one row per playlist in playlist_tracks.
playlist_params = pd.DataFrame(playlist_tracks['playlist_id'])

In [213]:
# ii_param: inverse entropy of the summed (smoothed) item-item rows of the playlist's tracks.
playlist_params['ii_param'] = 0.0

counter = 0
for pl_id in playlist_tracks.playlist_id:
    trks = playlist_tracks.loc[pl_id].track_ids
    # One accumulator slot per TTM column, matching the length of a dense TTM row.
    tot = np.zeros(TTM.shape[1])
    for tr_id in trks:
        tot += TTM[tr_id].toarray()[0]
    # NOTE(review): here 0.05 is added to the vector before taking the entropy, while
    # the album/artist cells add 0.05 to the entropy itself - confirm this is intended.
    v = 1 / (scipy.stats.entropy(tot + 0.05))
    # .at is the label-based scalar setter; DataFrame.set_value was removed in pandas 1.0.
    playlist_params.at[pl_id, "ii_param"] = v
    counter += 1
    if counter % 500 == 0:
        print(counter)

500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
8000
8500
9000
9500
10000
10500
11000
11500
12000
12500
13000
13500
14000
14500
15000
15500
16000
16500
17000
17500
18000
18500
19000
19500
20000
20500
21000
21500
22000
22500
23000
23500
24000
24500
25000
25500
26000
26500
27000
27500
28000
28500
29000
29500
30000
30500
31000
31500
32000
32500
33000
33500
34000
34500
35000
35500
36000
36500
37000
37500
38000
38500
39000
39500
40000
40500
41000
41500
42000
42500
43000
43500
44000
44500
45000
45500


In [None]:
# Inspect the per-playlist parameters computed so far.
playlist_params

## Album

In [31]:
def get_UAM_album(tracks, playlist_tracks, target_playlists, norm="no", OKAPI_K=1.7, OKAPI_B=0.75):
    """
        Build the playlists x albums matrix.

        Possible norms are "no", "idf", "okapi". Default to "no".

        Parameters
        ----------
        tracks : DataFrame indexed by track id with an ``album`` column.
        playlist_tracks : DataFrame with ``playlist_id`` and ``track_ids`` columns.
        target_playlists : not used by this function.
        norm : weighting applied to the first returned matrix.
        OKAPI_K, OKAPI_B : constants of the "okapi" weighting.

        Returns
        -------
        (UAM_album, UAM_album_no_norm, album_to_val): the weighted lil_matrix, the raw
        count lil_matrix, and a dict album -> idf value (empty when norm is "no").

        The number of rows is taken from the notebook-level ``playlists`` frame, not
        from an argument. Progress is printed every 1000 playlists.
    """

    unique_albums = tracks.album.unique()
    n_rows = max(playlists.playlist_id)+1
    n_cols = max(unique_albums)+1

    UAM_album = lil_matrix((n_rows, n_cols))
    UAM_album_no_norm = lil_matrix((n_rows, n_cols))
    album_to_playlists = {}

    # Looking the album up in a single column avoids materialising a whole row
    # of `tracks` for every track of every playlist.
    album_of_track = tracks['album']

    i = 0
    for row in playlist_tracks.itertuples():
        pl_id = row.playlist_id
        for tr_id in row.track_ids:
            alb = album_of_track.loc[tr_id]
            UAM_album[pl_id,alb] += 1
            UAM_album_no_norm[pl_id,alb] += 1
            # NOTE(review): a playlist is appended once per track, so the length of
            # this list counts track occurrences, not distinct playlists - confirm
            # this is the document frequency intended for the idf below.
            album_to_playlists.setdefault(alb, []).append(pl_id)

        i += 1
        if i % 1000 == 0:
            print(i)

    album_to_val = {}
    if norm == "okapi" or norm == "idf":
        avg_document_length = functools.reduce(lambda acc,tr_ids: acc + len(tr_ids), playlist_tracks.track_ids, 0) / len(playlist_tracks)
        N = len(playlist_tracks)

        i = 0

        for row in playlist_tracks.itertuples():
            pl_id = row.playlist_id
            # Read the counts from the untouched raw matrix (as copies). The weighted
            # matrix is overwritten inside this loop, so summing its live row would mix
            # already-weighted values into the document length.
            albums = list(UAM_album_no_norm.rows[pl_id])
            counts = list(UAM_album_no_norm.data[pl_id])
            document_length = sum(counts)
            for album, fq in zip(albums, counts):
                nq = len(album_to_playlists[album])
                idf = math.log((N - nq + 0.5)/(nq + 0.5))

                if album not in album_to_val:
                    album_to_val[album] = idf

                if norm == "idf":
                    UAM_album[pl_id,album] = idf
                elif norm == "okapi":
                    UAM_album[pl_id,album] = idf*(fq*(OKAPI_K+1))/(fq + OKAPI_K*(1 - OKAPI_B + OKAPI_B * document_length / avg_document_length))
            i += 1
            if i % 1000 == 0:
                print(i)

    return UAM_album, UAM_album_no_norm, album_to_val

In [32]:
# Playlist-album matrices: idf-weighted, raw counts, and the idf value of each album.
UAM_album, UAM_album_no_norm, album_to_val = get_UAM_album(
    tracks,
    playlist_tracks,
    target_playlists,
    norm="idf",
)

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000


In [33]:
# Distinct album ids, including the ids generated for tracks without an album.
unique_albums = tracks['album'].unique()
unique_albums

array([     7,      8,      9, ..., 244079, 244080, 244081])

In [36]:
def get_IAM_album(tracks, target_tracks, norm="no"):
    """
        Build the tracks x albums matrix as a lil_matrix.

        Possible norms are "no", "idf". Default "no".
        With "idf" the entry of a track is the value stored for its album in the
        notebook-level dict album_to_val (0 if the album is not in it); otherwise 1.
        target_tracks is not used. Progress is printed every 1000 tracks.
    """
    album_ids = tracks.album.unique()
    IAM_album = lil_matrix((len(tracks), max(album_ids)+1))

    for position, track in enumerate(tracks.itertuples()):
        if norm == "idf":
            # Zero when the album does not appear in any playlist.
            weight = album_to_val[track.album] if track.album in album_to_val else 0
        else:
            weight = 1
        IAM_album[track.track_id,track.album] = weight
        if position % 1000 == 0:
            print(position)

    return IAM_album

In [83]:
# Unweighted track-album incidence matrix.
IAM_album = get_IAM_album(
    tracks,
    target_tracks,
    norm="no",
)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
57000
58000
59000
60000
61000
62000
63000
64000
65000
66000
67000
68000
69000
70000
71000
72000
73000
74000
75000
76000
77000
78000
79000
80000
81000
82000
83000
84000
85000
86000
87000
88000
89000
90000
91000
92000
93000
94000
95000
96000
97000
98000
99000


In [143]:
# Track-track matrix: entry (i, j) is the product of the album rows of tracks i and j.
SYM_ALBUM = IAM_album.dot(IAM_album.T)

In [214]:
# Step 3: estimate, for each playlist, how much weight album similarity should get:
# the inverse of (entropy of its album counts + 0.05).
playlist_params['album_param'] = 0.0

# Despite its name this is a CSR matrix, which is the efficient layout for getrow().
UAM_album_no_norm_csc = UAM_album_no_norm.tocsr()
counter = 0
for pl_id in playlist_tracks.playlist_id:
    v = 1 / (scipy.stats.entropy(UAM_album_no_norm_csc.getrow(pl_id).data) + 0.05)
    # .at is the label-based scalar setter; DataFrame.set_value was removed in pandas 1.0.
    playlist_params.at[pl_id, "album_param"] = v
    counter += 1
    if counter % 5000 == 0:
        print(counter)

5000
10000
15000
20000
25000
30000
35000
40000
45000


In [215]:
# Inspect the parameters after adding album_param.
playlist_params

Unnamed: 0,playlist_id,ii_param,album_param
0,0,0.087245,0.328328
1,1,0.140688,0.542959
2,2,0.140624,0.501025
3,3,0.087040,0.444993
4,4,0.138141,0.253117
5,5,0.087247,0.418695
6,6,0.087648,0.296516
7,7,0.088451,0.252396
8,8,0.087418,0.401443
9,9,0.089527,0.257517


## Artist

In [46]:
# User Artist Matrix UAM
def get_UAM(tracks, playlist_tracks, target_playlists, norm="no", OKAPI_K=1.7, OKAPI_B=0.75):
    """
        Build the playlists x artists matrix.

        Possible norms are "no", "idf", "okapi". Default to "no".

        Parameters
        ----------
        tracks : DataFrame indexed by track id with an ``artist_id`` column.
        playlist_tracks : DataFrame with ``playlist_id`` and ``track_ids`` columns.
        target_playlists : not used by this function.
        norm : weighting applied to the first returned matrix.
        OKAPI_K, OKAPI_B : constants of the "okapi" weighting.

        Returns
        -------
        (UAM, UAM_no_norm, artist_to_val): the weighted lil_matrix, the raw count
        lil_matrix, and a dict artist -> idf value (empty when norm is "no").

        The number of rows is taken from the notebook-level ``playlists`` frame, not
        from an argument. Progress is printed every 1000 playlists.
    """

    unique_artists = tracks.artist_id.unique()
    n_rows = max(playlists.playlist_id)+1
    n_cols = max(unique_artists)+1

    UAM = lil_matrix((n_rows, n_cols))
    UAM_no_norm = lil_matrix((n_rows, n_cols))
    artist_to_playlists = {}

    # Looking the artist up in a single column avoids materialising a whole row
    # of `tracks` for every track of every playlist.
    artist_of_track = tracks['artist_id']

    i = 0
    for row in playlist_tracks.itertuples():
        pl_id = row.playlist_id
        for tr_id in row.track_ids:
            art = artist_of_track.loc[tr_id]
            UAM[pl_id,art] += 1
            UAM_no_norm[pl_id,art] += 1
            # NOTE(review): a playlist is appended once per track, so the length of
            # this list counts track occurrences, not distinct playlists - confirm
            # this is the document frequency intended for the idf below.
            artist_to_playlists.setdefault(art, []).append(pl_id)

        i += 1
        if i % 1000 == 0:
            print(i)

    artist_to_val = {}
    if norm == "okapi" or norm == "idf":
        avg_document_length = functools.reduce(lambda acc,tr_ids: acc + len(tr_ids), playlist_tracks.track_ids, 0) / len(playlist_tracks)
        N = len(playlist_tracks)

        i = 0

        for row in playlist_tracks.itertuples():
            pl_id = row.playlist_id
            # Read the counts from the untouched raw matrix (as copies). The weighted
            # matrix is overwritten inside this loop, so summing its live row would mix
            # already-weighted values into the document length.
            artists = list(UAM_no_norm.rows[pl_id])
            counts = list(UAM_no_norm.data[pl_id])
            document_length = sum(counts)
            for artist, fq in zip(artists, counts):
                nq = len(artist_to_playlists[artist])
                idf = math.log((N - nq + 0.5)/(nq + 0.5))

                if artist not in artist_to_val:
                    artist_to_val[artist] = idf

                if norm == "idf":
                    UAM[pl_id,artist] = idf
                else:
                    UAM[pl_id,artist] = idf*(fq*(OKAPI_K+1))/(fq + OKAPI_K*(1 - OKAPI_B + OKAPI_B * document_length / avg_document_length))
            i += 1
            if i % 1000 == 0:
                print(i)

    return UAM, UAM_no_norm, artist_to_val

In [47]:
# Playlist-artist matrices: idf-weighted, raw counts, and the idf value of each artist.
UAM, UAM_no_norm, artist_to_val = get_UAM(
    tracks,
    playlist_tracks,
    target_playlists,
    norm="idf",
)

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000


In [48]:
# Distinct artist ids found in the tracks table.
unique_artists = tracks['artist_id'].unique()

In [49]:
# Item Artist Matrix
def get_IAM(tracks, target_tracks, norm="no"):
    """
        Build the tracks x artists matrix as a lil_matrix.

        Possible norms are "no", "idf". Default to "no".
        With "idf" the entry of a track is the value stored for its artist in the
        notebook-level dict artist_to_val (0 if the artist is not in it); otherwise 1.
        target_tracks is not used. Progress is printed every 1000 tracks.
    """
    artist_ids = tracks.artist_id.unique()
    IAM = lil_matrix((len(tracks), max(artist_ids)+1))

    for position, track in enumerate(tracks.itertuples()):
        if norm == "idf":
            # Zero when the artist does not appear in any playlist.
            weight = artist_to_val[track.artist_id] if track.artist_id in artist_to_val else 0
        else:
            weight = 1
        IAM[track.track_id,track.artist_id] = weight

        if position % 1000 == 0:
            print(position)

    return IAM

In [79]:
# Unweighted track-artist incidence matrix.
IAM = get_IAM(
    tracks,
    target_tracks,
    norm="no",
)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
57000
58000
59000
60000
61000
62000
63000
64000
65000
66000
67000
68000
69000
70000
71000
72000
73000
74000
75000
76000
77000
78000
79000
80000
81000
82000
83000
84000
85000
86000
87000
88000
89000
90000
91000
92000
93000
94000
95000
96000
97000
98000
99000


In [147]:
# Track-track matrix: entry (i, j) is the product of the artist rows of tracks i and j.
SYM_ARTIST = IAM.dot(IAM.T)

In [216]:
# Compressed copies of the artist matrices for the cells below.
UAM_no_norm_csc = UAM_no_norm.tocsc()
UAM_csc = UAM.tocsc()
IAM_csr_transpose = IAM.tocsr().T

In [217]:
# Step 3: estimate, for each playlist, how much weight artist similarity should get:
# the inverse of (entropy of its artist counts + 0.05).
playlist_params['artist_param'] = 0.0

counter = 0
for pl_id in playlist_tracks.playlist_id:
    v = 1 / (scipy.stats.entropy(UAM_no_norm_csc.getrow(pl_id).data) + 0.05)
    # .at is the label-based scalar setter; DataFrame.set_value was removed in pandas 1.0.
    playlist_params.at[pl_id, "artist_param"] = v
    counter += 1
    if counter % 5000 == 0:
        print(counter)

5000
10000
15000
20000
25000
30000
35000
40000
45000


In [218]:
# Inspect the parameters after adding artist_param.
playlist_params

Unnamed: 0,playlist_id,ii_param,album_param,artist_param
0,0,0.087245,0.328328,0.328328
1,1,0.140688,0.542959,0.620844
2,2,0.140624,0.501025,0.501025
3,3,0.087040,0.444993,0.444993
4,4,0.138141,0.253117,0.288358
5,5,0.087247,0.418695,0.528712
6,6,0.087648,0.296516,0.319708
7,7,0.088451,0.252396,0.254175
8,8,0.087418,0.401443,0.471876
9,9,0.089527,0.257517,0.332402


## Adjust params

In [219]:
# Execute only once: keep an untouched backup of the raw parameters
# (DataFrame.copy is deep by default).
playlist_params_copy = playlist_params.copy()

In [231]:
# Restore playlist_params from the backup before re-running the adjustments below.
playlist_params = playlist_params_copy.copy()

In [232]:
# Check that the restored frame holds the raw parameters only.
playlist_params

Unnamed: 0,playlist_id,ii_param,album_param,artist_param
0,0,0.087245,0.328328,0.328328
1,1,0.140688,0.542959,0.620844
2,2,0.140624,0.501025,0.501025
3,3,0.087040,0.444993,0.444993
4,4,0.138141,0.253117,0.288358
5,5,0.087247,0.418695,0.528712
6,6,0.087648,0.296516,0.319708
7,7,0.088451,0.252396,0.254175
8,8,0.087418,0.401443,0.471876
9,9,0.089527,0.257517,0.332402


In [233]:
# Working copies of the raw parameters, with negative values raised to 0.
playlist_params["ii_param_norm"] = playlist_params["ii_param"].clip(lower=0)
playlist_params["album_param_norm"] = playlist_params["album_param"].clip(lower=0)
playlist_params["artist_param_norm"] = playlist_params["artist_param"].clip(lower=0)

In [234]:
#playlist_params["ii_param_norm"] = np.sqrt(playlist_params.ii_param_norm)
#playlist_params["album_param_norm"] = np.sqrt(playlist_params.album_param_norm)
#playlist_params["artist_param_norm"] = np.sqrt(playlist_params.artist_param_norm)

In [235]:
# Summary statistics of the raw and working parameters.
playlist_params.describe()

Unnamed: 0,playlist_id,ii_param,album_param,artist_param,ii_param_norm,album_param_norm,artist_param_norm
count,45649.0,45649.0,45649.0,45649.0,45649.0,45649.0,45649.0
mean,28495.440382,0.10628,2.222896,3.575682,0.323542,1.039441,1.312165
std,16694.122188,0.027173,5.454396,7.01418,0.040012,1.06887,1.361597
min,0.0,0.086867,0.190819,0.190935,0.294733,0.436828,0.436961
25%,14066.0,0.086996,0.327357,0.362575,0.29495,0.572151,0.602142
50%,28229.0,0.087468,0.483135,0.542959,0.29575,0.695079,0.736858
75%,43446.0,0.14022,0.773722,0.99973,0.374459,0.879615,0.999865
max,57559.0,0.168749,20.0,20.0,0.410791,4.472136,4.472136


In [236]:
# Standardise each working parameter (subtract its mean, divide by its standard
# deviation) and shift the result by +1.
playlist_params["ii_param_norm"] = (
    playlist_params.ii_param_norm
    .sub(playlist_params.ii_param_norm.mean())
    .div(playlist_params.ii_param_norm.std())
    .add(1)
)
playlist_params["album_param_norm"] = (
    playlist_params.album_param_norm
    .sub(playlist_params.album_param_norm.mean())
    .div(playlist_params.album_param_norm.std())
    .add(1)
)
playlist_params["artist_param_norm"] = (
    playlist_params.artist_param_norm
    .sub(playlist_params.artist_param_norm.mean())
    .div(playlist_params.artist_param_norm.std())
    .add(1)
)

In [62]:
#playlist_params.album_param_norm -= 0.2
#playlist_params.artist_param_norm -= 0.4

In [237]:
# Inspect the standardised parameters.
playlist_params

Unnamed: 0,playlist_id,ii_param,album_param,artist_param,ii_param_norm,album_param_norm,artist_param_norm
0,0,0.087245,0.328328,0.328328,0.295999,0.563613,0.457134
1,1,0.140688,0.542959,0.620844,2.288148,0.716913,0.614990
2,2,0.140624,0.501025,0.501025,2.286015,0.689757,0.556158
3,3,0.087040,0.444993,0.444993,0.287306,0.651630,0.526228
4,4,0.138141,0.253117,0.288358,2.202906,0.498224,0.430687
5,5,0.087247,0.418695,0.528712,0.296094,0.632908,0.570329
6,6,0.087648,0.296516,0.319708,0.313032,0.536981,0.451572
7,7,0.088451,0.252396,0.254175,0.346853,0.497553,0.406574
8,8,0.087418,0.401443,0.471876,0.303290,0.620304,0.540809
9,9,0.089527,0.257517,0.332402,0.391901,0.502297,0.459736


In [238]:
# ii_param wins: playlists whose item-item weight is at least as large as both others.
int((
    (playlist_params.ii_param_norm >= playlist_params.album_param_norm)
    & (playlist_params.ii_param_norm >= playlist_params.artist_param_norm)
).sum())

14290

In [239]:
# album_param wins: playlists whose album weight is at least as large as both others.
int((
    (playlist_params.album_param_norm >= playlist_params.ii_param_norm)
    & (playlist_params.album_param_norm >= playlist_params.artist_param_norm)
).sum())

26968

In [240]:
# artist_param wins: playlists whose artist weight is at least as large as both others.
int((
    (playlist_params.artist_param_norm >= playlist_params.album_param_norm)
    & (playlist_params.artist_param_norm >= playlist_params.ii_param_norm)
).sum())

4391

# Predictions

In [241]:
# Score the held-out split in batches of 1000 playlists, printing the running MAP.
predictions = make_predictions(
    test=test,
    compute_MAP=True,
    row_group=1000,
)

6630
1000
0.09732000000000003
2000
0.09495999999999975


KeyboardInterrupt: 