In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from scipy.sparse import *
from scipy.sparse.linalg import svds
import math

from recsys.preprocess import *

import functools

#from recsys.utility import *

#RANDOM_STATE = 666

#np.random.seed(RANDOM_STATE)

%matplotlib inline

In [2]:
def train_test_split(train, test_size=0.3, min_playlist_tracks=7):
    """
        Standard train_test_split, no modifications.
    """
    playlists = train[train.playlist_id.isin(target_playlists_original.playlist_id)].groupby('playlist_id').count()

    # Only playlists with at least "min_playlist_tracks" tracks are considered.
    # If "min_playlists_tracks" = 7, then 28311 out of 45649 playlists in "train" are considered.
    to_choose_playlists = playlists[playlists['track_id'] >= min_playlist_tracks].index.values


    # Among these playlists, "test_size * len(to_choose_playlists)" distinct playlists are chosen for testing.
    # If "test_size" = 0.3, then 8493 playlists are chosen for testing.
    # It's a numpy array that contains playlis_ids.
    target_playlists = np.random.choice(to_choose_playlists, replace=False, size=int(test_size * len(to_choose_playlists)))

    target_tracks = np.array([])
    indexes = np.array([])
    for p in target_playlists:
        # Choose 5 random tracks of such playlist: since we selected playlists with at least "min_playlist_tracks"
        # tracks, if "min_playlist_tracks" is at least 5, we are sure to find them.
        selected_df = train[train['playlist_id'] == p].sample(5)

        selected_tracks = selected_df['track_id'].values
        target_tracks = np.union1d(target_tracks, selected_tracks)
        indexes = np.union1d(indexes, selected_df.index.values)

    test = train.loc[indexes].copy()
    train = train.drop(indexes)

    return train, test, pd.DataFrame(target_playlists, columns=['playlist_id']), pd.DataFrame(target_tracks, columns=['track_id'])


In [3]:
import numpy as np
import scipy
from scipy.sparse import *
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize

def dot_with_top(m1, m2, def_rows_g, top=-1, row_group=1, similarity="dot", shrinkage=0):
    """
        Produces the product between matrices m1 and m2.
        Possible similarities: "dot", "cosine". By default it goes on "dot".
        NB: Shrinkage is not implemented...
        Code taken from
            https://stackoverflow.com/questions/29647326/sparse-matrix-dot-product-keeping-only-n-max-values-per-result-row
            and optimized for smart dot products.
    """
    m2_transposed = m2.transpose()
    
    if top > 0:
        final_rows = []
        row_id = 0
        while row_id < m1.shape[0]:
            last_row = row_id + row_group if row_id + row_group <= m1.shape[0] else m1.shape[0]
            rows = m1[row_id:last_row]
            if rows.count_nonzero() > 0:
                if similarity == "cosine":
                    res_rows = cosine_similarity(rows, m2_transposed, dense_output=False)
                else:
                    res_rows = rows.dot(m2)
                if shrinkage > 0:
                    res_rows = apply_shrinkage(rows, res_rows, shrinkage)
                if res_rows.count_nonzero() > 0:
                    for res_row in res_rows:
                        if res_row.nnz > top:
                            args_ids = np.argsort(res_row.data)[-top:]
                            data = res_row.data[args_ids]
                            cols = res_row.indices[args_ids]
                            final_rows.append(csr_matrix((data, (np.zeros(top), cols)), shape=res_row.shape))
                        else:
                            final_rows.append(def_rows_g[0])
                else:
                    for res_row in res_rows:
                        final_rows.append(def_rows_g[0])
            else:
                final_rows.append(def_rows_g)
            row_id += row_group
            if row_id % row_group == 0:
                print(row_id)
        return scipy.sparse.vstack(final_rows, 'csr')
    return m1.dot(m2) 

In [5]:
def make_predictions(test=None, target_playlists=None, compute_MAP=False, row_group=100):
    """
        Produces a prediction dataframe for "test", where each row corresponds to a playlist in "target_playlists".
        If compute_MAP is true, then it print the MAP every "row_group" playlists.
        It's optimized for doing dot products for different playlist at once.
            "row_group" is the number of playlists in each of these optimized dot products.
            The higher is row_group, the faster are the predictions but more memory is used.
    """
    # Create predictions dataframe
    predictions = pd.DataFrame(target_playlists)
    predictions.index = target_playlists['playlist_id']
    predictions['track_ids'] = [np.array([]) for i in range(len(predictions))]
    ttracks = set(target_tracks['track_id'].values)
    if compute_MAP:
        test_good = get_playlist_track_list2(test)
        test_good.index = test_good.playlist_id.apply(lambda pl_id: playlist_to_num[pl_id])
        print(len(test_good))
    
    # This is the sum of all the AP of the playlists.
    # When we print the MAP, we divide "sum_ap" by the number of considered playlists.
    sum_ap = 0
    
    # Let's start the predictions!
    row_start = 0
    while row_start < len(target_playlists):
        # We'll do dot products for all playlists in "target_playlists" from "row_start" to "row_end"
        row_end = row_start + row_group if row_start + row_group <= len(target_playlists) else len(target_playlists)
        
        # "pl_group" is the set of the playlists that we want to make prediction for
        pl_group = target_playlists[row_start:row_end]
        
        # Now we need to build a matrix where, for each playlist in "pl_group", we take the correspondent URM row slice
        rows_URM_ii = []
        rows_URM_album = []
        rows_URM_artist = []
        for pl_id in pl_group.playlist_id:
            rows_URM_ii += [URM_ii[pl_id,:]]
            rows_URM_album += [URM_album[pl_id,:]]
            rows_URM_artist += [URM_artist[pl_id,:]]
        composed_URM_ii = scipy.sparse.vstack(rows_URM_ii, 'csr')
        composed_URM_album = scipy.sparse.vstack(rows_URM_album, 'csr')
        composed_URM_artist = scipy.sparse.vstack(rows_URM_artist, 'csr')
        
        # Compute predictions for current playlist group: here we do all the smart dot products...
        # "simil_ii" are the scores for playlists in common
        # "simil_album" and "simil_artist" are scores for albums and artists (captain obvious)
        simil_ii = np.array(np.divide(TTM.dot(composed_URM_ii.transpose()).transpose().todense(), TTM.sum(axis=1).transpose()))
        simil_album = np.array(np.divide(SYM_ALBUM.dot(composed_URM_album.transpose()).transpose().todense(), SYM_ALBUM.sum(axis=1).transpose()))
        simil_artist = np.array(np.divide(SYM_ARTIST.dot(composed_URM_artist.transpose()).transpose().todense(), SYM_ARTIST.sum(axis=1).transpose()))
        #simil_title = np.array(np.divide(SYM_TITLE.dot(composed_URM_artist.transpose()).transpose().todense(), SYM_TITLE.sum(axis=1).transpose()))
        
        # Now we should consider one playlist at a time, take its own personalized parameters and make the prediction
        for i,pl_id in enumerate(pl_group.playlist_id):
            # Retrieve parameters
            ii_param = playlist_params.loc[pl_id].ii_param_norm
            album_param = playlist_params.loc[pl_id].album_param_norm
            artist_param = playlist_params.loc[pl_id].artist_param_norm
            
            # Tracks that we know are in the playlist (so we shouldn't recommend them)
            pl_tracks = set(playlist_tracks.loc[pl_id]['track_ids'])

            pred = []
            
            # If you want to do some testing only on specific features, put the weights of the other features to zero.
            ii_param *= 1.5
            album_param *= 1
            artist_param *= 1.2
            
            # use best
            """if ii_param > album_param and ii_param > artist_param:
                album_param = 0
                artist_param = 0
            elif album_param > ii_param and album_param > artist_param:
                ii_param = 0
                artist_param = 0
            elif artist_param > ii_param and artist_param > album_param:
                ii_param = 0
                album_param = 0"""
            
            # Combine all the predictions and sort them from best to worst
            #print(simil_ii[i][simil_ii[i].nonzero()[0]].max())
            #print(simil_album[i][simil_album[i].nonzero()[0]].max())
            #print(simil_artist[i][simil_artist[i].nonzero()[0]].max())
            #print()
            
            simil = ii_param * simil_ii[i] + album_param * simil_album[i] + artist_param * simil_artist[i]
            #simil = simil_title[i]
            
            sorted_ind = simil.argsort()[::-1]

            # Predict...
            i = 0
            c = 0
            while i < len(sorted_ind) and c < 5:
                tr = sorted_ind[i]
                if (tr in ttracks) and (tr not in pl_tracks):
                    pred.append(num_to_tracks[tr])
                    c+=1
                i+=1
            predictions.loc[pl_id] = predictions.loc[pl_id].set_value('track_ids', np.array(pred))
            
            # Update MAP
            if compute_MAP:
                correct = 0
                ap = 0
                for it, t in enumerate(pred):
                    tr_ids = test_good.loc[pl_id]['track_ids']
                    if t in tr_ids:
                        correct += 1
                        ap += correct / (it+1)
                ap /= len(pred)
                sum_ap += ap
        
        # Update "row_start" to "row_end" and proceed to next pl_group
        row_start = row_end
        
        print(row_start)
        if compute_MAP:
            print(sum_ap / row_start)
            
    #predictions['playlist_id'] = predictions['playlist_id_tmp']
    return predictions

In [6]:
def from_num_to_id(df, row_num, column = 'track_id'):
    """ df must have a 'track_id' column """
    return df.iloc[row_num][column]

def from_id_to_num(df, tr_id, column='track_id'):
    """ df must have a 'track_id' column """
    return np.where(df[column].values == tr_id)[0][0]

# Read data

In [7]:
train = pd.read_csv('data/train_final.csv', delimiter='\t')
playlists = pd.read_csv('data/playlists_final.csv', delimiter='\t')
target_playlists = pd.read_csv('data/target_playlists.csv', delimiter='\t')
target_tracks = pd.read_csv('data/target_tracks.csv', delimiter = '\t')
tracks = pd.read_csv('data/tracks_final.csv', delimiter='\t')

In [None]:
# We load them just to compare the ones for testing with the original ones.
# NB: we shouldn't use them in training!
train_original = pd.read_csv('data/train_final.csv', delimiter='\t')
target_playlists_original = pd.read_csv('data/target_playlists.csv', delimiter='\t')

In [None]:
len(train), len(target_playlists), len(target_tracks)

In [None]:
train, test, target_playlists, target_tracks = train_test_split(train, test_size=1, min_playlist_tracks=13)

In [None]:
len(train), len(test), len(target_playlists), len(target_tracks)

In [None]:
full_target_playlists = pd.read_csv('data/target_playlists.csv', delimiter='\t')
print("Number of playlists in the new target_playlists that are also in the original target_playlists")
print(len(target_playlists[target_playlists.playlist_id.isin(full_target_playlists.playlist_id)]))

# Process data

In [8]:
# Almost all of these were taken from one of your notebook, so you probably understand them
tracks['track_id_tmp'] = tracks['track_id']

tracks['track_id'] = tracks.index

playlists['playlist_id_tmp'] = playlists['playlist_id']
playlists['playlist_id'] = playlists.index

train['playlist_id_tmp'] = train['playlist_id']
train['track_id_tmp'] = train['track_id']

track_to_num = pd.Series(tracks.index)
track_to_num.index = tracks['track_id_tmp']

playlist_to_num = pd.Series(playlists.index)
playlist_to_num.index = playlists['playlist_id_tmp']

num_to_tracks = pd.Series(tracks['track_id_tmp'])


train['track_id'] = train['track_id'].apply(lambda x : track_to_num[x])
train['playlist_id'] = train['playlist_id'].apply(lambda x : playlist_to_num[x])

tracks.tags = tracks.tags.apply(lambda s: np.array(eval(s), dtype=int))

playlists.title = playlists.title.apply(lambda s: np.array(eval(s), dtype=int))

target_playlists['playlist_id_tmp'] = target_playlists['playlist_id']
target_playlists['playlist_id'] = target_playlists['playlist_id'].apply(lambda x : playlist_to_num[x])

target_tracks['track_id_tmp'] = target_tracks['track_id']
target_tracks['track_id'] = target_tracks['track_id'].apply(lambda x : track_to_num[x])

# Create a dataframe that maps a playlist to the set of its tracks
playlist_tracks = pd.DataFrame(train['playlist_id'].drop_duplicates())
playlist_tracks.index = train['playlist_id'].unique()
playlist_tracks['track_ids'] = train.groupby('playlist_id').apply(lambda x : x['track_id'].values)
playlist_tracks = playlist_tracks.sort_values('playlist_id')

# Create a dataframe that maps a track to the set of the playlists it appears into
track_playlists = pd.DataFrame(train['track_id'].drop_duplicates())
track_playlists.index = train['track_id'].unique()
track_playlists['playlist_ids'] = train.groupby('track_id').apply(lambda x : x['playlist_id'].values)
track_playlists = track_playlists.sort_values('track_id')

# Substitute each bad album (i.e. an illformed album such as -1, None, etc) with the 0 album
bad_albums = 0
def transform_album_1(alb):
    global bad_albums
    ar = eval(alb)
    if len(ar) == 0 or (len(ar) > 0 and (ar[0] == None or ar[0] == -1)):
        ar = [0]
        bad_albums += 1
    return ar[0]

tracks.album = tracks.album.apply(lambda alb: transform_album_1(alb))

In [11]:
# Substitute each album with the most similar album according to playlist frequencies
UAM_album, UAM_album_no_norm, album_to_val = get_UAM_album(tracks, playlist_tracks, target_playlists, norm="okapi")

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000


In [12]:
tracks.tail()

Unnamed: 0,track_id,artist_id,duration,playcount,album,tags,track_id_tmp
99995,99995,293937,-1,,0,[],3022399
99996,99996,510388,-1,,0,[],1320641
99997,99997,27938,-1,,0,[],2584455
99998,99998,373892,-1,,0,[],2299706
99999,99999,567363,-1,,0,[],2739985


In [13]:
def transform_album_sim(tr_id):
    tot = np.zeros((1,max(tracks.album)+1))[0]
    for pl_id in track_playlists.loc[tr_id].playlist_ids:
        tot += np.log(UAM_album_no_norm[pl_id].toarray()[0] + 1)  
    if tot.max() != 0:
        best_1 = tot.argmax()
        best_2 = tot.argpartition(len(tot)-2)[-2]
        if best_1 == 0:
            return best_2
    return 0

corrected_albums = 0
for row in tracks[tracks.track_id.isin(track_playlists.track_id)].itertuples():
    if row.album == 0:
        new_album = transform_album_sim(row.track_id)
        if new_album != 0:
            tracks.set_value(row.track_id, "album", new_album)
            corrected_albums += 1
            if corrected_albums % 100 == 0:
                print(corrected_albums)

100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500
7600
7700
7800
7900
8000
8100
8200
8300
8400
8500
8600
8700
8800
8900
9000
9100
9200
9300
9400
9500
9600
9700
9800
9900
10000
10100
10200
10300
10400
10500
10600
10700
10800
10900
11000
11100
11200
11300
11400
11500
11600
11700
11800
11900
12000
12100
12200
12300
12400
12500
12600
12700
12800
12900
13000
13100
13200
13300
13400
13500
13600
13700
13800
13900
14000
14100
14200
14300
14400
14500
14600
14700
14800
14900
15000
15100
15200
15300
15400
15500
15600
15700
15800
15900
16000
16100
16200
16300
16400
16500
16600
16700
16800
16900
17000
17100
17200
17300
17400
17500
17600
17700
17800
17900
18000
18100
18200
18300
18400
1850

In [14]:
bad_albums, corrected_albums

(26756, 26737)

In [15]:
tracks.tail()

Unnamed: 0,track_id,artist_id,duration,playcount,album,tags,track_id_tmp
99995,99995,293937,-1,,102651,[],3022399
99996,99996,510388,-1,,28134,[],1320641
99997,99997,27938,-1,,47785,[],2584455
99998,99998,373892,-1,,17876,[],2299706
99999,99999,567363,-1,,152320,[],2739985


In [16]:
len(tracks[tracks.album == 0])

19

In [17]:
# Substitute each 0 album with a brand new album
def transform_album_2(alb):
    global next_album_id
    if alb == 0:
        alb = next_album_id
        next_album_id += 1
    return alb
last_album = tracks.album.max()
next_album_id = last_album + 1
tracks.album = tracks.album.apply(lambda alb: transform_album_2(alb))

In [18]:
len(tracks[tracks.album == 0])

0

# Target playlists analysis

# Training

## II
"II" means Item-Item collaborative filtering, i.e. playlists in common...

Steps:
1 - Create a URM (URM_sqrt) normalized with a modified IDF which has a sqrt.
2 - Compute TTM as URM_sqrt.dot(URM_sqrt.transpose()). Keep the K best for each row.
3 - Compute personalized parameters for each playlist. Here we compute the ii_parameter, which indicates how much a playlist is affine to be predicted using the TTM. This is done by doing the following things for each row:
    - compute a np.array by doing the sum of all the rows in the TTM that corresponds to a track in the considered playlist
    - compute the ii_parameter of the playlist by doing 1/(entropy_of_the_computed_array + 0.05). "0.05" is needed since it may happens that the entropy is zero and so the ratio goes to infinity.

In [None]:
def sigmoid(gamma):
    if gamma < 0:
        return 1 - 1/(1 + math.exp(gamma))
    else:
        return 1/(1 + math.exp(-gamma))

In [19]:
# User Rating Matrix URM
def get_URM(tracks, playlists, playlist_tracks, track_playlists, norm="no"):
    """
        possible normalizations: "no", "magnitude", "idf", "sqrt", "sigmoid", "pow", "target-idf".
        Default "no".
    """
    URM = lil_matrix((len(playlists), len(tracks)))
    num_playlists = len(playlist_tracks)

    i = 0
    
    for row in track_playlists.itertuples():
        track_id = row.track_id
        nq = len(row.playlist_ids)
        for pl_id in row.playlist_ids:
            if norm == "idf":
                URM[pl_id,track_id] = math.log((500)/(nq + 0.5))
            elif norm == "sqrt":
                URM[pl_id,track_id] = math.sqrt((500)/(nq + 0.5))
            elif norm == "pow":
                URM[pl_id,track_id] = math.pow((500)/(nq + 0.5), 0.3)
            elif norm == "sigmoid":
                URM[pl_id,track_id] = sigmoid(math.pow((500)/(nq + 0.5), 0.03))
            elif norm == "target-idf":
                if pl_id in target_playlists.playlist_id:
                    URM[pl_id,track_id] = math.log(500/(nq + 0.5)) * 2
                else:
                    URM[pl_id,track_id] = math.log(500/(nq + 0.5))
            else:
                URM[pl_id,track_id] = 1
        if i % 1000 == 0:
            print(i)
        i += 1
        
    if norm == "magnitude":
        for pl_id in playlists.playlist_id:
            magnitude = math.sqrt(len(URM.data[pl_id]))
            for col in URM.rows[pl_id]:
                URM[pl_id,col] /= magnitude
    
    return URM

#
# URM:
# 
#              tracks
#            _________
#           \         \
# playlists \         \
#           \_________\
#

In [20]:
URM_pow = get_URM(tracks, playlists, playlist_tracks, track_playlists, norm="pow")

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
57000
58000
59000
60000
61000
62000
63000
64000
65000
66000
67000
68000
69000
70000
71000
72000
73000
74000
75000
76000
77000
78000
79000
80000
81000
82000
83000
84000
85000
86000
87000
88000
89000
90000
91000
92000
93000
94000
95000
96000
97000
98000
99000


In [21]:
# Step 2: produce item-item matrix with cosine similarity
row_group = 10000
def_rows_i = URM_pow.transpose()[0:row_group].dot(URM_pow) # this is needed to fill some rows that would be all zeros otherwise...
TTM = dot_with_top(URM_pow.transpose(), URM_pow, def_rows_i, top=20, row_group=row_group, similarity="cosine")

10000
20000
30000
40000
50000
60000
70000
80000
90000
100000


In [22]:
URM_ii = get_URM(tracks, playlists, playlist_tracks, track_playlists, norm="pow")

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
57000
58000
59000
60000
61000
62000
63000
64000
65000
66000
67000
68000
69000
70000
71000
72000
73000
74000
75000
76000
77000
78000
79000
80000
81000
82000
83000
84000
85000
86000
87000
88000
89000
90000
91000
92000
93000
94000
95000
96000
97000
98000
99000


In [23]:
playlist_params = pd.DataFrame(playlist_tracks.playlist_id)

In [24]:
# Step 3: compute how much each playlist is affine to be classified with such similarity concept
playlist_params['ii_param'] = 0.0

counter = 0
for pl_id in playlist_tracks[playlist_tracks.playlist_id.isin(target_playlists.playlist_id)].playlist_id:
    trks = playlist_tracks.loc[pl_id].track_ids
    tot = np.zeros((1,TTM.shape[0]))[0]
    for tr_id in trks:
        tot += TTM[tr_id].toarray()[0]
    v = 1 / (scipy.stats.entropy(tot + 0.05))
    playlist_params.set_value(pl_id, "ii_param", v)
    counter += 1
    if counter % 500 == 0:
        print(counter)

500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
8000
8500
9000
9500
10000


In [25]:
playlist_params[playlist_params.ii_param > 0]

Unnamed: 0,playlist_id,ii_param
0,0,0.087026
2,2,0.086927
4,4,0.087698
5,5,0.087087
6,6,0.087194
8,8,0.087087
23,23,0.087058
28,28,0.087523
30,30,0.086940
38,38,0.087182


In [26]:
playlist_params[playlist_params.ii_param > 0].ii_param.describe()

count    10000.000000
mean         0.087537
std          0.001698
min          0.086889
25%          0.086974
50%          0.087136
75%          0.087570
max          0.116931
Name: ii_param, dtype: float64

## Album

<div style="white-space: pre-wrap;">
Steps:
1 - Compute the playlists_x_albums (i.e. the UAM_album matrix, where U stands for User) sparse matrix. I do this before computing the tracks_x_albums (i.e. the IAM_album matrix, where I stands for Item) sparse matrix because here I compute also the "album_to_val" dictionary, which contains the IDF value of each album obtained considering the playlists as document (and not the tracks). However at the moment I don't use this because I compute the IAM_album matrix without any normalization, so you may skip it...
2 - Compute the tracks_x_albums IAM_album sparse matrix.
3 - Compute the SYM_ALBUM tracks_x_tracks matrix by doing IAM_album.dot(IAM_album.transpose()). It's not big, so I don't need to keep the K best values...
4 - Compute the album_parameter, which means "how much each playlist is affine to album similarity". I do this by computing the entropy of the numpy array containing the occurrences of the albums in the playlist, and then doing 1/(entropy_of_array + 0.05).
</div>

In [27]:
def get_UAM_album(tracks, playlist_tracks, target_playlists, norm="no", OKAPI_K=1.7, OKAPI_B=0.75):
    """
        Possible norms are "no", "idf", okapi". Default to "no".
    """
    
    unique_albums = tracks.album.unique()
    
    i = 0

    UAM_album = lil_matrix((max(playlists.playlist_id)+1, max(unique_albums)+1))
    UAM_album_no_norm = lil_matrix((max(playlists.playlist_id)+1, max(unique_albums)+1))
    album_to_playlists = {}
    
    for row in playlist_tracks.itertuples():
        pl_id = row.playlist_id
        for tr_id in row.track_ids:
            alb = tracks.loc[tr_id].album
            UAM_album[pl_id,alb] += 1
            UAM_album_no_norm[pl_id,alb] += 1
            if alb not in album_to_playlists:
                album_to_playlists[alb] = [pl_id]
            else:
                album_to_playlists[alb].append(pl_id)
                
        i += 1
        if i % 1000 == 0:
            print(i)
    
    album_to_val = {}
    if norm == "okapi" or norm == "idf" or norm == "tf":
        avg_document_length = functools.reduce(lambda acc,tr_ids: acc + len(tr_ids), playlist_tracks.track_ids, 0) / len(playlist_tracks)
        N = len(playlist_tracks)
        
        i = 0

        for row in playlist_tracks.itertuples():
            pl_id = row.playlist_id
            albums = UAM_album.rows[pl_id]
            data = UAM_album.data[pl_id]
            for album in albums:
                fq = UAM_album[pl_id,album]
                nq = len(album_to_playlists[album])
                idf = math.log(500/(nq + 0.5))
                
                if album not in album_to_val:
                    album_to_val[album] = idf
                    
                if norm == "idf":
                    UAM_album[pl_id,album] = idf
                elif norm == "okapi":
                    UAM_album[pl_id,album] = idf*(fq*(OKAPI_K+1))/(fq + OKAPI_K*(1 - OKAPI_B + OKAPI_B * sum(data) / avg_document_length))
                elif norm == "tf":
                    UAM_album[pl_id,album] = (fq*(OKAPI_K+1))/(fq + OKAPI_K*(1 - OKAPI_B + OKAPI_B * sum(data) / avg_document_length))
            i += 1
            if i % 1000 == 0:
                print(i)
    
    return UAM_album, UAM_album_no_norm, album_to_val

In [28]:
# Step 1
UAM_album, UAM_album_no_norm, album_to_val = get_UAM_album(tracks, playlist_tracks, target_playlists, norm="tf")

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000


In [29]:
unique_albums = tracks.album.unique()
unique_albums

array([     7,      8,      9, ..., 217324, 114441, 217344])

In [30]:
def get_IAM_album(tracks, target_tracks, norm="no", most_similar=5):
    """
        Possible norms are "no", "idf", "most-similar".
        Default "no".
    """
    unique_albums = tracks.album.unique()
    IAM_album = lil_matrix((len(tracks), max(unique_albums)+1))
    
    num_tracks = len(tracks)
    i = 0
    
    if norm == "most-similar":
        def get_album_sim(alb, n_best=5):
            bests = []
            a = ALB_ALB_SYM[alb].toarray()[0]
            for i in range(n_best):
                bests.append(a.argpartition(len(a)-1-i)[-1-i])
            return bests

        for row in tracks[tracks.track_id.isin(track_playlists.track_id)].itertuples():
            bests = get_album_sim(row.album, n_best=5)
            for it,alb in enumerate(bests):
                IAM_album[row.track_id, alb] = 1 - it*0.1
            if i % 100 == 0:
                print(i)
            i += 1
            
    else:
        for row in tracks.itertuples():
            nq = 1
            if norm == "idf":
                if row.album in album_to_val:
                    IAM_album[row.track_id,row.album] = album_to_val[row.album]
                else:
                    IAM_album[row.track_id,row.album] = 0 # Give zero if the album is not in any playlist!
            else:
                IAM_album[row.track_id,row.album] = 1
            if i % 100 == 0:
                print(i)
            i += 1
    
    return IAM_album

In [31]:
# Step 2
IAM_album = get_IAM_album(tracks, target_tracks, norm="no")

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500
7600
7700
7800
7900
8000
8100
8200
8300
8400
8500
8600
8700
8800
8900
9000
9100
9200
9300
9400
9500
9600
9700
9800
9900
10000
10100
10200
10300
10400
10500
10600
10700
10800
10900
11000
11100
11200
11300
11400
11500
11600
11700
11800
11900
12000
12100
12200
12300
12400
12500
12600
12700
12800
12900
13000
13100
13200
13300
13400
13500
13600
13700
13800
13900
14000
14100
14200
14300
14400
14500
14600
14700
14800
14900
15000
15100
15200
15300
15400
15500
15600
15700
15800
15900
16000
16100
16200
16300
16400
16500
16600
16700
16800
16900
17000
17100
17200
17300
17400
17500
17600
17700
17800
17900
18000
18100
18200
18300
18400
18

In [32]:
SYM_ALBUM = IAM_album.dot(IAM_album.transpose())

In [33]:
URM_album = get_URM(tracks, playlists, playlist_tracks, track_playlists, norm="no")

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
57000
58000
59000
60000
61000
62000
63000
64000
65000
66000
67000
68000
69000
70000
71000
72000
73000
74000
75000
76000
77000
78000
79000
80000
81000
82000
83000
84000
85000
86000
87000
88000
89000
90000
91000
92000
93000
94000
95000
96000
97000
98000
99000


In [34]:
# Step 4: compute how much each user is affine to album similarity
playlist_params['album_param'] = 0.0

UAM_album_no_norm_csc = UAM_album_no_norm.tocsr()
counter = 0
for pl_id in playlist_tracks[playlist_tracks.playlist_id.isin(target_playlists.playlist_id)].playlist_id:
    v = 1 / (scipy.stats.entropy(UAM_album_no_norm_csc.getrow(pl_id).data) + 0.05)
    playlist_params.set_value(pl_id, "album_param", v)
    counter += 1
    if counter % 5000 == 0:
        print(counter)

5000
10000


In [35]:
playlist_params

Unnamed: 0,playlist_id,ii_param,album_param
0,0,0.087026,0.335974
1,1,0.000000,0.000000
2,2,0.086927,0.501025
3,3,0.000000,0.000000
4,4,0.087698,0.269041
5,5,0.087087,0.422634
6,6,0.087194,0.305292
7,7,0.000000,0.000000
8,8,0.087087,0.471876
9,9,0.000000,0.000000


## Artist
Same steps as for Album

In [36]:
# User Artist Matrix UAM
def get_UAM(tracks, playlist_tracks, target_playlists, norm="no", OKAPI_K=1.7, OKAPI_B=0.75):
    """
        Possible norms are "no", "idf", okapi". Default to "no".
    """
    
    unique_artists = tracks.artist_id.unique()
    
    i = 0

    UAM = lil_matrix((max(playlists.playlist_id)+1, max(unique_artists)+1))
    UAM_no_norm = lil_matrix((max(playlists.playlist_id)+1, max(unique_artists)+1))
    artist_to_playlists = {}
    
    for row in playlist_tracks.itertuples():
        pl_id = row.playlist_id
        for tr_id in row.track_ids:
            art = tracks.loc[tr_id].artist_id
            UAM[pl_id,art] += 1
            UAM_no_norm[pl_id,art] += 1
            if art not in artist_to_playlists:
                artist_to_playlists[art] = [pl_id]
            else:
                artist_to_playlists[art].append(pl_id)
                
        i += 1
        if i % 1000 == 0:
            print(i)
    
    artist_to_val = {}
    if norm == "okapi" or norm == "idf":
        avg_document_length = functools.reduce(lambda acc,tr_ids: acc + len(tr_ids), playlist_tracks.track_ids, 0) / len(playlist_tracks)
        N = len(playlist_tracks)

        i = 0

        for row in playlist_tracks.itertuples():
            pl_id = row.playlist_id
            artists = UAM.rows[pl_id]
            data = UAM.data[pl_id]
            for artist in artists:
                fq = UAM[pl_id,artist]
                nq = len(artist_to_playlists[artist])
                idf = math.log((N - nq + 0.5)/(nq + 0.5))
                
                if artist not in artist_to_val:
                    artist_to_val[artist] = idf
                
                if norm == "idf":
                    UAM[pl_id,artist] = idf
                else:
                    UAM[pl_id,artist] = idf*(fq*(OKAPI_K+1))/(fq + OKAPI_K*(1 - OKAPI_B + OKAPI_B * sum(data) / avg_document_length))
            i += 1
            if i % 1000 == 0:
                print(i)
    
    return UAM, UAM_no_norm, artist_to_val

In [37]:
# Step 1
UAM, UAM_no_norm, artist_to_val = get_UAM(tracks, playlist_tracks, target_playlists, norm="no")

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000


In [38]:
unique_artists = tracks.artist_id.unique()

In [39]:
# Item Artist Matrix
def get_IAM(tracks, target_tracks, norm="no", n_best=5):
    """
        Possible norms are "no", "idf", "most-similar". Default to "no".
    """
    unique_artists = tracks.artist_id.unique()
    IAM = lil_matrix((len(tracks), max(unique_artists)+1))
    
    num_tracks = len(tracks)
    i = 0
    
    if norm == "most-similar":
        def get_artist_sim(art, n_best=5):
            bests = []
            a = ART_ART_SYM[art].toarray()[0]
            for i in range(n_best):
                bests.append(a.argpartition(len(a)-1-i)[-1-i])
            return bests

        for row in tracks[tracks.track_id.isin(track_playlists.track_id)].itertuples():
            bests = get_artist_sim(row.artist_id, n_best=5)
            for it,art in enumerate(bests):
                IAM[row.track_id, art] = 1 - it*0.1
            if i % 100 == 0:
                print(i)
            i += 1
    else:
        for row in tracks.itertuples():
            if norm == "idf":
                if row.artist_id in artist_to_val:
                    IAM[row.track_id,row.artist_id] = artist_to_val[row.artist_id]
                else:
                    IAM[row.track_id,row.artist_id] = 0 # Give zero if the album is not in any playlist!
            else:
                IAM[row.track_id,row.artist_id] = 1

            if i % 1000 == 0:
                print(i)
            i += 1
    
    return IAM

In [40]:
# Step 2
IAM = get_IAM(tracks, target_tracks, norm="no")

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
57000
58000
59000
60000
61000
62000
63000
64000
65000
66000
67000
68000
69000
70000
71000
72000
73000
74000
75000
76000
77000
78000
79000
80000
81000
82000
83000
84000
85000
86000
87000
88000
89000
90000
91000
92000
93000
94000
95000
96000
97000
98000
99000


In [41]:
# Step 3
SYM_ARTIST = IAM.dot(IAM.transpose())

In [42]:
URM_artist = get_URM(tracks, playlists, playlist_tracks, track_playlists, norm="idf")

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
57000
58000
59000
60000
61000
62000
63000
64000
65000
66000
67000
68000
69000
70000
71000
72000
73000
74000
75000
76000
77000
78000
79000
80000
81000
82000
83000
84000
85000
86000
87000
88000
89000
90000
91000
92000
93000
94000
95000
96000
97000
98000
99000


In [43]:
UAM_csc = UAM.tocsc()
UAM_no_norm_csc = UAM_no_norm.tocsc()
IAM_csr_transpose = IAM.tocsr().transpose()

In [44]:
# Step 4: compute how much each user is affine to artist similarity
playlist_params['artist_param'] = 0.0

counter = 0
for pl_id in playlist_tracks[playlist_tracks.playlist_id.isin(target_playlists.playlist_id)].playlist_id:
    v = 1 / (scipy.stats.entropy(UAM_no_norm_csc.getrow(pl_id).data) + 0.05)
    playlist_params.set_value(pl_id, "artist_param", v)
    counter += 1
    if counter % 5000 == 0:
        print(counter)

5000
10000


In [45]:
playlist_params

Unnamed: 0,playlist_id,ii_param,album_param,artist_param
0,0,0.087026,0.335974,0.328328
1,1,0.000000,0.000000,0.000000
2,2,0.086927,0.501025,0.501025
3,3,0.000000,0.000000,0.000000
4,4,0.087698,0.269041,0.288358
5,5,0.087087,0.422634,0.494315
6,6,0.087194,0.305292,0.319708
7,7,0.000000,0.000000,0.000000
8,8,0.087087,0.471876,0.471876
9,9,0.000000,0.000000,0.000000


# Implicit

In [None]:
import implicit

In [None]:
# initialize a model
model_implicit = implicit.als.AlternatingLeastSquares(factors=100, num_threads=0, regularization=0.05)

In [None]:
URM_no_norm = get_URM(tracks, playlists, playlist_tracks, track_playlists, norm="no")

In [None]:
# train the model on a sparse matrix of item/user/confidence weights
model_implicit.fit(URM_no_norm)

In [None]:
# recommend items for a user
user_items = URM_no_norm.T.tocsr()

In [None]:
np.array([tup[0] for tup in model_implicit.recommend(0, user_items)])

In [None]:
# find related items
model_implicit.similar_items(0)

# Logistic MF

In [None]:
lmf = LogisticMF(URM_no_norm.toarray(), 50)

In [None]:
lmf.train_model()

In [None]:
U,I = lmf.get_vectors()

In [None]:
import time

class LogisticMF():

    def __init__(self, counts, num_factors, reg_param=0.6, gamma=1.0,
                 iterations=30):
        self.counts = counts
        self.num_users = counts.shape[0]
        self.num_items = counts.shape[1]
        self.num_factors = num_factors
        self.iterations = iterations
        self.reg_param = reg_param
        self.gamma = gamma

    def train_model(self):

        self.ones = np.ones((self.num_users, self.num_items))
        self.user_vectors = np.random.normal(size=(self.num_users,
                                                   self.num_factors))
        self.item_vectors = np.random.normal(size=(self.num_items,
                                                   self.num_factors))
        self.user_biases = np.random.normal(size=(self.num_users, 1))
        self.item_biases = np.random.normal(size=(self.num_items, 1))

        user_vec_deriv_sum = np.zeros((self.num_users, self.num_factors))
        item_vec_deriv_sum = np.zeros((self.num_items, self.num_factors))
        user_bias_deriv_sum = np.zeros((self.num_users, 1))
        item_bias_deriv_sum = np.zeros((self.num_items, 1))
        for i in range(self.iterations):
            t0 = time.time()
            # Fix items and solve for users
            # take step towards gradient of deriv of log likelihood
            # we take a step in positive direction because we are maximizing LL
            user_vec_deriv, user_bias_deriv = self.deriv(True)
            user_vec_deriv_sum += np.square(user_vec_deriv)
            user_bias_deriv_sum += np.square(user_bias_deriv)
            vec_step_size = self.gamma / np.sqrt(user_vec_deriv_sum)
            bias_step_size = self.gamma / np.sqrt(user_bias_deriv_sum)
            self.user_vectors += vec_step_size * user_vec_deriv
            self.user_biases += bias_step_size * user_bias_deriv

            # Fix users and solve for items
            # take step towards gradient of deriv of log likelihood
            # we take a step in positive direction because we are maximizing LL
            item_vec_deriv, item_bias_deriv = self.deriv(False)
            item_vec_deriv_sum += np.square(item_vec_deriv)
            item_bias_deriv_sum += np.square(item_bias_deriv)
            vec_step_size = self.gamma / np.sqrt(item_vec_deriv_sum)
            bias_step_size = self.gamma / np.sqrt(item_bias_deriv_sum)
            self.item_vectors += vec_step_size * item_vec_deriv
            self.item_biases += bias_step_size * item_bias_deriv
            t1 = time.time()

            print('iteration %i finished in %f seconds' % (i + 1, t1 - t0))

    def deriv(self, user):
        if user:
            vec_deriv = np.dot(self.counts, self.item_vectors)
            bias_deriv = np.expand_dims(np.sum(self.counts, axis=1), 1)

        else:
            vec_deriv = np.dot(self.counts.T, self.user_vectors)
            bias_deriv = np.expand_dims(np.sum(self.counts, axis=0), 1)
        A = np.dot(self.user_vectors, self.item_vectors.T)
        A += self.user_biases
        A += self.item_biases.T
        A = np.exp(A)
        A /= (A + self.ones)
        A = (self.counts + self.ones) * A

        if user:
            vec_deriv -= np.dot(A, self.item_vectors)
            bias_deriv -= np.expand_dims(np.sum(A, axis=1), 1)
            # L2 regularization
            vec_deriv -= self.reg_param * self.user_vectors
        else:
            vec_deriv -= np.dot(A.T, self.user_vectors)
            bias_deriv -= np.expand_dims(np.sum(A, axis=0), 1)
            # L2 regularization
            vec_deriv -= self.reg_param * self.item_vectors
        return (vec_deriv, bias_deriv)

    def log_likelihood(self):
        loglik = 0
        A = np.dot(self.user_vectors, self.item_vectors.T)
        A += self.user_biases
        A += self.item_biases.T
        B = A * self.counts
        loglik += np.sum(B)

        A = np.exp(A)
        A += self.ones

        A = np.log(A)
        A = (self.counts + self.ones) * A
        loglik -= np.sum(A)

        # L2 regularization
        loglik -= 0.5 * self.reg_param * np.sum(np.square(self.user_vectors))
        loglik -= 0.5 * self.reg_param * np.sum(np.square(self.item_vectors))
        return loglik

    def get_vectors(self):
        return self.user_vectors, self.item_vectors

# Playlist titles

In [None]:
playlists.head()

In [None]:
# Count distinct title tokens
token_playlists = {}
for row in playlists.itertuples():
    for token in row.title:
        if token in token_playlists:
            token_playlists[token].append(row.playlist_id)
        else:
            token_playlists[token] = [row.playlist_id]

In [None]:
np.array(([len(l) for l in token_playlists.values()])).mean()

In [None]:
np.array(([len(l) for l in token_playlists.values()])).max()

In [None]:
# Playlist Title Matrix
def get_PTM(playlists, playlist_tracks, target_playlists, norm="no", OKAPI_K=1.7, OKAPI_B=0.75):
    """
        Possible norms are "no", "idf", okapi". Default to "no".
    """
    
    i = 0

    PTM = lil_matrix((max(playlist_tracks.playlist_id)+1, max(token_playlists)+1))
    PTM_no_norm = lil_matrix((max(playlist_tracks.playlist_id)+1, max(token_playlists)+1))
    
    for row in playlists.itertuples():
        pl_id = row.playlist_id
        for token in row.title:
            if norm == "idf":
                PTM[pl_id,token] = math.log(10000/len(token_playlists[token]))
            else:
                PTM[pl_id,token] += 1
            PTM_no_norm[pl_id,token] += 1
        i += 1
        if i % 1000 == 0:
            print(i)
    
    return PTM, PTM_no_norm

In [None]:
PTM, PTM_no_norm = get_PTM(playlists, playlist_tracks, target_playlists, norm="idf")

In [None]:
playlists.head()

In [None]:
counter = 0
def get_best_token(tr_id):
    global counter
    if tr_id in track_playlists.track_id:
        pl_ids = track_playlists.loc[tr_id]
        tot = np.zeros((1,max(token_playlists)+1))[0]
        for pl_id in pl_ids:
            try:
                tot += PTM[pl_id].toarray()[0]
            except IndexError:
                pass
        best_token = tot.argmax()
        counter += 1
        if counter % 1000 == 0:
            print(counter)
        return best_token
    else:
        return 0
    
tracks["best_title_token"] = tracks.track_id.apply(lambda tr_id: get_best_token(tr_id))

In [None]:
# Item Title Matrix
def get_ITM_title(tracks, target_tracks, norm="no", n_best=5):
    """
        Possible norms are "no". Default to "no".
    """
    unique_tokens = tracks.best_title_token.unique()
    ITM_title = lil_matrix((len(tracks), max(unique_tokens)+1))
    
    num_tracks = len(tracks)
    i = 0
    
    for row in tracks.itertuples():
        ITM_title[row.track_id,row.best_title_token] = 1

        if i % 1000 == 0:
            print(i)
        i += 1
    
    return ITM_title

In [None]:
# Step 2
ITM_title = get_ITM_title(tracks, target_tracks, norm="no")

In [None]:
# Step 3
SYM_TITLE = ITM_title.dot(ITM_title.transpose())

## Adjust params

Here I just want to adjust the parameters so that they can be compaired... I do zeta scoring to them and put the means to 1. We should do some more experiments here.

In [46]:
# Execute only once
playlist_params_copy = playlist_params.copy(deep=True)

In [47]:
# Restore playlist_params
playlist_params = playlist_params_copy.copy(deep=True)

In [48]:
playlist_params

Unnamed: 0,playlist_id,ii_param,album_param,artist_param
0,0,0.087026,0.335974,0.328328
1,1,0.000000,0.000000,0.000000
2,2,0.086927,0.501025,0.501025
3,3,0.000000,0.000000,0.000000
4,4,0.087698,0.269041,0.288358
5,5,0.087087,0.422634,0.494315
6,6,0.087194,0.305292,0.319708
7,7,0.000000,0.000000,0.000000
8,8,0.087087,0.471876,0.471876
9,9,0.000000,0.000000,0.000000


In [49]:
playlist_params["ii_param_norm"] = playlist_params.ii_param
playlist_params["album_param_norm"] = playlist_params.album_param
playlist_params["artist_param_norm"] = playlist_params.artist_param

In [50]:
#playlist_params["ii_param_norm"] = np.sqrt(playlist_params.ii_param_norm)
#playlist_params["album_param_norm"] = np.sqrt(playlist_params.album_param_norm)
#playlist_params["artist_param_norm"] = np.sqrt(playlist_params.artist_param_norm)

In [51]:
playlist_params.describe()

Unnamed: 0,playlist_id,ii_param,album_param,artist_param,ii_param_norm,album_param_norm,artist_param_norm
count,45649.0,45649.0,45649.0,45649.0,45649.0,45649.0,45649.0
mean,28495.440382,0.019176,0.116945,0.320269,0.019176,0.116945,0.320269
std,16694.122188,0.036215,0.764502,2.141473,0.036215,0.764502,2.141473
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,14066.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,28229.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,43446.0,0.0,0.0,0.0,0.0,0.0,0.0
max,57559.0,0.116931,20.0,20.0,0.116931,20.0,20.0


In [52]:
ii_mean = playlist_params.iloc[playlist_params.ii_param_norm.nonzero()[0]].ii_param_norm.mean()
ii_std = playlist_params.iloc[playlist_params.ii_param_norm.nonzero()[0]].ii_param_norm.std()
album_mean = playlist_params.iloc[playlist_params.album_param_norm.nonzero()[0]].album_param_norm.mean()
album_std = playlist_params.iloc[playlist_params.album_param_norm.nonzero()[0]].album_param_norm.std()
artist_mean = playlist_params.iloc[playlist_params.artist_param_norm.nonzero()[0]].artist_param_norm.mean()
artist_std = playlist_params.iloc[playlist_params.artist_param_norm.nonzero()[0]].artist_param_norm.std()
ii_mean, ii_std, album_mean, album_std, artist_mean, artist_std

(0.08753707739270432,
 0.0016979150669284993,
 0.5338414526506273,
 1.563856181983162,
 1.4619949035539885,
 4.389359678757527)

In [53]:
playlist_params["ii_param_norm"] = ((playlist_params.ii_param_norm - ii_mean) / ii_std) + 1
playlist_params["album_param_norm"] = ((playlist_params.album_param_norm - album_mean) / album_std) + 1
playlist_params["artist_param_norm"] = ((playlist_params.artist_param_norm - artist_mean) / artist_std) + 1

In [54]:
playlist_params["ii_param_norm"] = playlist_params.ii_param_norm.clip(0)
playlist_params["album_param_norm"] = playlist_params.album_param_norm.clip(0)
playlist_params["artist_param_norm"] = playlist_params.artist_param_norm.clip(0)

In [55]:
#playlist_params.album_param_norm -= 0.2
#playlist_params.artist_param_norm -= 0.4

In [56]:
pl_param_target = playlist_params[playlist_params.playlist_id.isin(target_playlists.playlist_id)]
pl_param_target

Unnamed: 0,playlist_id,ii_param,album_param,artist_param,ii_param_norm,album_param_norm,artist_param_norm
0,0,0.087026,0.335974,0.328328,0.699287,0.873475,0.741724
2,2,0.086927,0.501025,0.501025,0.640841,0.979015,0.781068
4,4,0.087698,0.269041,0.288358,1.094895,0.830674,0.732618
5,5,0.087087,0.422634,0.494315,0.734657,0.928889,0.779540
6,6,0.087194,0.305292,0.319708,0.798007,0.853855,0.739760
8,8,0.087087,0.471876,0.471876,0.734848,0.960376,0.774427
23,23,0.087058,0.346835,0.346835,0.717736,0.880420,0.745940
28,28,0.087523,0.276414,0.264623,0.991902,0.835389,0.727210
30,30,0.086940,0.501025,0.501025,0.648635,0.979015,0.781068
38,38,0.087182,0.333527,0.329421,0.790871,0.871910,0.741973


In [57]:
pl_param_target.describe()

Unnamed: 0,playlist_id,ii_param,album_param,artist_param,ii_param_norm,album_param_norm,artist_param_norm
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,28572.0285,0.087537,0.533841,1.461995,1.0,1.0,1.0
std,16690.064311,0.001698,1.563856,4.38936,1.0,1.0,1.0
min,0.0,0.086889,0.195599,0.191345,0.618248,0.783712,0.710516
25%,14216.75,0.086974,0.286138,0.298617,0.668544,0.841607,0.734955
50%,28432.5,0.087136,0.362575,0.380251,0.763988,0.890485,0.753553
75%,43418.75,0.08757,0.477739,0.511207,1.019309,0.964126,0.783388
max,57559.0,0.116931,20.0,20.0,18.311604,13.447538,5.223396


In [58]:
# ii_param wins against other parameters
len(pl_param_target[(pl_param_target.ii_param_norm >= pl_param_target.album_param_norm) & (pl_param_target.ii_param_norm >= pl_param_target.artist_param_norm)])

3620

In [59]:
# album_param wins
len(pl_param_target[(pl_param_target.album_param_norm >= pl_param_target.ii_param_norm) & (pl_param_target.album_param_norm >= pl_param_target.artist_param_norm)])

5872

In [60]:
# artist_param wins
len(pl_param_target[(pl_param_target.artist_param_norm >= pl_param_target.album_param_norm) & (pl_param_target.artist_param_norm >= pl_param_target.ii_param_norm)])

508

# Analyse matrices

# Predictions

In [None]:
# Predictions for all the playlists in test
make_predictions(test=test, target_playlists=target_playlists, compute_MAP=True, row_group=1000)

In [67]:
predictions = make_predictions(test=train, target_playlists=target_playlists, compute_MAP=False, row_group=1000)

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000


In [68]:
predictions['playlist_id'] = predictions['playlist_id_tmp']

In [69]:
predictions = predictions.drop("playlist_id_tmp", axis=1)

In [70]:
predictions.head()

Unnamed: 0_level_0,playlist_id,track_ids
playlist_id,Unnamed: 1_level_1,Unnamed: 2_level_1
30680,10024884,"[1637241, 3738046, 2228906, 2799999, 3559096]"
37046,10624787,"[3166806, 3779369, 1980691, 2016284, 1788063]"
4069,4891851,"[1854954, 2238571, 2089117, 1371741, 1406862]"
24489,4267369,"[1890831, 106902, 2504992, 3004539, 1712316]"
8513,65078,"[1742595, 1455989, 3037073, 431268, 2611428]"


In [71]:
# Make the dataframe friendly for output -> convert np.array in string
predictions['track_ids'] = predictions['track_ids'].apply(lambda x : ' '.join(map(str, x)))
predictions.to_csv('results.csv', index=False)