In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from scipy.sparse import *
from scipy.sparse.linalg import svds
import math

from recsys.preprocess import *

import functools

#from recsys.utility import *

#RANDOM_STATE = 666

#np.random.seed(RANDOM_STATE)

%matplotlib inline

In [2]:
def train_test_split(train, test_size=0.3, min_playlist_tracks=7):
    """
        Standard train_test_split, no modifications.
    """
    playlists = train[train.playlist_id.isin(target_playlists_original.playlist_id)].groupby('playlist_id').count()

    # Only playlists with at least "min_playlist_tracks" tracks are considered.
    # If "min_playlists_tracks" = 7, then 28311 out of 45649 playlists in "train" are considered.
    to_choose_playlists = playlists[playlists['track_id'] >= min_playlist_tracks].index.values


    # Among these playlists, "test_size * len(to_choose_playlists)" distinct playlists are chosen for testing.
    # If "test_size" = 0.3, then 8493 playlists are chosen for testing.
    # It's a numpy array that contains playlis_ids.
    target_playlists = np.random.choice(to_choose_playlists, replace=False, size=int(test_size * len(to_choose_playlists)))

    target_tracks = np.array([])
    indexes = np.array([])
    for p in target_playlists:
        # Choose 5 random tracks of such playlist: since we selected playlists with at least "min_playlist_tracks"
        # tracks, if "min_playlist_tracks" is at least 5, we are sure to find them.
        selected_df = train[train['playlist_id'] == p].sample(5)

        selected_tracks = selected_df['track_id'].values
        target_tracks = np.union1d(target_tracks, selected_tracks)
        indexes = np.union1d(indexes, selected_df.index.values)

    test = train.loc[indexes].copy()
    train = train.drop(indexes)

    return train, test, pd.DataFrame(target_playlists, columns=['playlist_id']), pd.DataFrame(target_tracks, columns=['track_id'])


In [3]:
import numpy as np
import scipy
from scipy.sparse import *
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize

def dot_with_top(m1, m2, def_rows_g, top=-1, row_group=1, similarity="dot", shrinkage=0):
    """
        Produces the product between matrices m1 and m2.
        Possible similarities: "dot", "cosine". By default it goes on "dot".
        NB: Shrinkage is not implemented...
        Code taken from
            https://stackoverflow.com/questions/29647326/sparse-matrix-dot-product-keeping-only-n-max-values-per-result-row
            and optimized for smart dot products.
    """
    m2_transposed = m2.transpose()
    
    if top > 0:
        final_rows = []
        row_id = 0
        while row_id < m1.shape[0]:
            last_row = row_id + row_group if row_id + row_group <= m1.shape[0] else m1.shape[0]
            rows = m1[row_id:last_row]
            if rows.count_nonzero() > 0:
                if similarity == "cosine":
                    res_rows = cosine_similarity(rows, m2_transposed, dense_output=False)
                else:
                    res_rows = rows.dot(m2)
                if shrinkage > 0:
                    res_rows = apply_shrinkage(rows, res_rows, shrinkage)
                if res_rows.count_nonzero() > 0:
                    for res_row in res_rows:
                        if res_row.nnz > top:
                            args_ids = np.argsort(res_row.data)[-top:]
                            data = res_row.data[args_ids]
                            cols = res_row.indices[args_ids]
                            final_rows.append(csr_matrix((data, (np.zeros(top), cols)), shape=res_row.shape))
                        else:
                            final_rows.append(def_rows_g[0])
                else:
                    for res_row in res_rows:
                        final_rows.append(def_rows_g[0])
            else:
                final_rows.append(def_rows_g)
            row_id += row_group
            if row_id % row_group == 0:
                print(row_id)
        return scipy.sparse.vstack(final_rows, 'csr')
    return m1.dot(m2) 

In [4]:
def make_predictions(test=None, target_playlists=None, urm=None,
                     similarities=[], playlist_params_list=None, merger=lambda x,f: x[0],
                     compute_MAP=False, row_group=100, verbose=False):
    """
        Produces a prediction dataframe for "test", where each row corresponds to a playlist in "target_playlists".
        If compute_MAP is true, then it print the MAP every "row_group" playlists.
        It's optimized for doing dot products for different playlist at once.
            "row_group" is the number of playlists in each of these optimized dot products.
            The higher is row_group, the faster are the predictions but more memory is used.
    """
    # Create predictions dataframe
    predictions = pd.DataFrame(target_playlists)
    predictions.index = target_playlists['playlist_id']
    predictions['track_ids'] = [np.array([]) for i in range(len(predictions))]
    ttracks = set(target_tracks['track_id'].values)
    if compute_MAP:
        test_good = get_playlist_track_list2(test)
        test_good.index = test_good.playlist_id.apply(lambda pl_id: playlist_to_num[pl_id])
        print(len(test_good))
    
    # This is the sum of all the AP of the playlists.
    # When we print the MAP, we divide "sum_ap" by the number of considered playlists.
    sum_ap = 0
    
    # Let's start the predictions!
    row_start = 0
    while row_start < len(target_playlists):
        # We'll do dot products for all playlists in "target_playlists" from "row_start" to "row_end"
        row_end = row_start + row_group if row_start + row_group <= len(target_playlists) else len(target_playlists)
        
        # "pl_group" is the set of the playlists that we want to make prediction for
        pl_group = target_playlists[row_start:row_end]
        
        # Now we need to build a matrix where, for each playlist in "pl_group", we take the correspondent URM row slice
        rows_URM = []
        for pl_id in pl_group.playlist_id:
            rows_URM += [urm[pl_id,:]]
        composed_URM = scipy.sparse.vstack(rows_URM, 'csr')
        
        # Compute predictions for current playlist group: here we do all the smart dot products...
        simil_ar = []
        for SYM in similarities:
            simil_ar.append(np.array(np.divide(SYM.dot(composed_URM.transpose()).transpose().todense(), (SYM.sum(axis=1).transpose() + 1))))
            #simil_ar.append(np.array(SYM.dot(composed_URM.transpose()).transpose().todense()))
         
        # Now we should consider one playlist at a time, take its own personalized parameters and make the prediction
        for i,pl_id in enumerate(pl_group.playlist_id):
            # Tracks that we know are in the playlist (so we shouldn't recommend them)
            pl_tracks = set(playlist_tracks.loc[pl_id]['track_ids'])
            
            preds = []
            for playlist_params in playlist_params_list:
                # Retrieve parameters
                params = []
                for it,SYM in enumerate(similarities):
                    params.append(playlist_params.loc[pl_id]["param_" + str(it)])

                simil = params[0] * simil_ar[0][i]
                for p in range(1,len(simil_ar)):
                    simil += params[p] * simil_ar[p][i]
                sorted_ind = simil.argsort()[::-1]

                # Predict...  
                pred = []
                i = 0
                while i < len(sorted_ind) and len(pred) < 5:
                    tr = sorted_ind[i]
                    if (tr in ttracks) and (tr not in pl_tracks) and (num_to_tracks[tr] not in pred):
                        pred.append(num_to_tracks[tr])
                    i+=1
                preds.append(pred)
            
            pred = merger(preds, f=lambda x: x)[:5]
            predictions.loc[pl_id] = predictions.loc[pl_id].set_value('track_ids', np.array(pred))
            
            # Update MAP
            if compute_MAP:
                correct = 0
                ap = 0
                for it, t in enumerate(pred):
                    tr_ids = test_good.loc[pl_id]['track_ids']
                    if t in tr_ids:
                        correct += 1
                        ap += correct / (it+1)
                ap /= len(pred)
                sum_ap += ap
        
        # Update "row_start" to "row_end" and proceed to next pl_group
        row_start = row_end
        
        print(row_start)
        if compute_MAP:
            print(sum_ap / row_start)
            
    return predictions

In [5]:
def from_num_to_id(df, row_num, column = 'track_id'):
    """ df must have a 'track_id' column """
    return df.iloc[row_num][column]

def from_id_to_num(df, tr_id, column='track_id'):
    """ df must have a 'track_id' column """
    return np.where(df[column].values == tr_id)[0][0]

# Read data

In [6]:
train = pd.read_csv('data/train_final.csv', delimiter='\t')
playlists = pd.read_csv('data/playlists_final.csv', delimiter='\t')
target_playlists = pd.read_csv('data/target_playlists.csv', delimiter='\t')
target_tracks = pd.read_csv('data/target_tracks.csv', delimiter = '\t')
tracks = pd.read_csv('data/tracks_final.csv', delimiter='\t')

In [None]:
# We load them just to compare the ones for testing with the original ones.
# NB: we shouldn't use them in training!
train_original = pd.read_csv('data/train_final.csv', delimiter='\t')
target_playlists_original = pd.read_csv('data/target_playlists.csv', delimiter='\t')

In [None]:
len(train), len(target_playlists), len(target_tracks)

In [None]:
train, test, target_playlists, target_tracks = train_test_split(train, test_size=1, min_playlist_tracks=13)

In [None]:
len(train), len(test), len(target_playlists), len(target_tracks)

# Process data

In [7]:
# Almost all of these were taken from one of your notebook, so you probably understand them
tracks['track_id_tmp'] = tracks['track_id']

tracks['track_id'] = tracks.index

playlists['playlist_id_tmp'] = playlists['playlist_id']
playlists['playlist_id'] = playlists.index

train['playlist_id_tmp'] = train['playlist_id']
train['track_id_tmp'] = train['track_id']

track_to_num = pd.Series(tracks.index)
track_to_num.index = tracks['track_id_tmp']

playlist_to_num = pd.Series(playlists.index)
playlist_to_num.index = playlists['playlist_id_tmp']

num_to_tracks = pd.Series(tracks['track_id_tmp'])

train['track_id'] = train['track_id'].apply(lambda x : track_to_num[x])
train['playlist_id'] = train['playlist_id'].apply(lambda x : playlist_to_num[x])

tracks.tags = tracks.tags.apply(lambda s: np.array(eval(s), dtype=int))

playlists.title = playlists.title.apply(lambda s: np.array(eval(s), dtype=int))

target_playlists['playlist_id_tmp'] = target_playlists['playlist_id']
target_playlists['playlist_id'] = target_playlists['playlist_id'].apply(lambda x : playlist_to_num[x])

target_tracks['track_id_tmp'] = target_tracks['track_id']
target_tracks['track_id'] = target_tracks['track_id'].apply(lambda x : track_to_num[x])

# Create a dataframe that maps a playlist to the set of its tracks
playlist_tracks = pd.DataFrame(train['playlist_id'].drop_duplicates())
playlist_tracks.index = train['playlist_id'].unique()
playlist_tracks['track_ids'] = train.groupby('playlist_id').apply(lambda x : x['track_id'].values)
playlist_tracks = playlist_tracks.sort_values('playlist_id')

# Create a dataframe that maps a track to the set of the playlists it appears into
track_playlists = pd.DataFrame(train['track_id'].drop_duplicates())
track_playlists.index = train['track_id'].unique()
track_playlists['playlist_ids'] = train.groupby('track_id').apply(lambda x : x['playlist_id'].values)
track_playlists = track_playlists.sort_values('track_id')

# Substitute each bad album (i.e. an illformed album such as -1, None, etc) with the 0 album
bad_albums = 0
def transform_album_1(alb):
    global bad_albums
    ar = eval(alb)
    if len(ar) == 0 or (len(ar) > 0 and (ar[0] == None or ar[0] == -1)):
        ar = [0]
        bad_albums += 1
    return ar[0]

tracks.album = tracks.album.apply(lambda alb: transform_album_1(alb))

## Recover albums

In [None]:
# Substitute each album with the most similar album according to playlist frequencies
UAM_album, UAM_album_no_norm, album_to_val = get_UAM_album(tracks, playlist_tracks, target_playlists, norm="idf")

In [None]:
tracks.tail()

In [None]:
def transform_album_sim(tr_id):
    tot = np.zeros((1,max(tracks.album)+1))[0]
    for pl_id in track_playlists.loc[tr_id].playlist_ids:
        ar = UAM_album_no_norm[pl_id].toarray()[0]
        tot += np.log(ar + 1)  
        #tot += ar.clip(max=1)
    if tot.max() != 0:
        best_1 = tot.argmax()
        best_2 = tot.argpartition(len(tot)-2)[-2]
        if best_1 == 0:
            return best_2
    return 0

corrected_albums = 0
for row in tracks[tracks.track_id.isin(track_playlists.track_id)].itertuples():
    if row.album == 0:
        new_album = transform_album_sim(row.track_id)
        if new_album != 0:
            tracks.set_value(row.track_id, "album", new_album)
            corrected_albums += 1
            if corrected_albums % 100 == 0:
                print(corrected_albums)

In [None]:
bad_albums, corrected_albums

In [None]:
tracks.tail()

In [None]:
len(tracks[tracks.album == 0])

In [8]:
# Substitute each 0 album with a brand new album
def transform_album_2(alb):
    global next_album_id
    if alb == 0:
        alb = next_album_id
        next_album_id += 1
    return alb
last_album = tracks.album.max()
next_album_id = last_album + 1
tracks.album = tracks.album.apply(lambda alb: transform_album_2(alb))

In [9]:
len(tracks[tracks.album == 0])

0

## Recover tags

In [10]:
# Count distinct tags
tag_tracks = {}
for row in tracks.itertuples():
    for tag in row.tags:
        if tag in tag_tracks:
            tag_tracks[tag].append(row.track_id)
        else:
            tag_tracks[tag] = [row.track_id]

In [11]:
# User Tag Matrix UTM
def get_UTM(tracks, playlist_tracks, tag_tracks, norm="no", OKAPI_K=1.7, OKAPI_B=0.75, best_tag=False):
    """
        Possible norm are "no", "okapi", "idf", "tf". Default to "no".
    """
    
    if best_tag:
        unique_tags = list(best_tag_tracks.keys())
    else:
        unique_tags = list(tag_tracks.keys())
    
    i = 0

    UTM = lil_matrix((max(playlists.playlist_id)+1, max(unique_tags)+1))
    UTM_no_norm = lil_matrix((max(playlists.playlist_id)+1, max(unique_tags)+1))
    
    for row in playlist_tracks.itertuples():
        pl_id = row.playlist_id
        for tr_id in row.track_ids:
            tr_row = tracks.loc[tr_id]
            if best_tag:
                UTM[pl_id,tr_row.best_tag] += 1
                UTM_no_norm[pl_id,tr_row.best_tag] += 1
            else:
                for tag in tr_row.tags:
                    UTM[pl_id,tag] += 1
                    UTM_no_norm[pl_id,tag] += 1
                
        i += 1
        if i % 1000 == 0:
            print(i)
            
    if norm == "okapi" or norm == "idf" or norm == "tf":
        avg_document_length = sum(list(map(lambda l: sum(l), UTM.data)))/len(UTM.data)

        i = 0

        for row in playlist_tracks.itertuples():
            pl_id = row.playlist_id
            tags = UTM.rows[pl_id]
            data = UTM.data[pl_id]
            for tag in tags:
                fq = UTM[pl_id,tag]
                if best_tag:
                    nq = len(best_tag_tracks[tag])
                else:
                    nq = len(tag_tracks[tag])
                idf = math.log(28000/(nq + 0.5))
                
                if norm == "idf":
                    UTM[pl_id,tag] = idf
                elif norm == "okapi":
                    UTM[pl_id,tag] = idf*(fq*(OKAPI_K+1))/(fq + OKAPI_K*(1 - OKAPI_B + OKAPI_B * sum(data) / avg_document_length))
                elif norm == "tf":
                    UTM[pl_id,tag] = (fq*(OKAPI_K+1))/(fq + OKAPI_K*(1 - OKAPI_B + OKAPI_B * sum(data) / avg_document_length))
                    
            i += 1
            if i % 1000 == 0:
                print(i)
    
    return UTM, UTM_no_norm

In [12]:
UTM, UTM_no_norm = get_UTM(tracks, playlist_tracks, tag_tracks, norm="okapi", best_tag=False)

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000


In [13]:
def get_tags_sim(tr_id):
    tot = csr_matrix((1,max(tag_tracks)+1))
    tr_row = track_playlists.loc[tr_id]
    for pl_id in tr_row.playlist_ids:
        tot += UTM[pl_id]
    tot = tot.toarray()[0]
    return tot.argsort()[::-1][0:5]
    

corrected_tags = 0
for row in tracks[tracks.track_id.isin(track_playlists.track_id)].itertuples():
    if len(row.tags) == 0:
        new_tags = get_tags_sim(row.track_id)
        tracks.set_value(row.track_id, "tags", new_tags)
        
        corrected_tags += 1
        if corrected_tags % 100 == 0:
            print(corrected_tags)

100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700


In [14]:
tracks.tail()

Unnamed: 0,track_id,artist_id,duration,playcount,album,tags,track_id_tmp
99995,99995,293937,-1,,244077,"[67047, 66729, 157349, 57103, 155309]",3022399
99996,99996,510388,-1,,244078,"[115859, 112283, 70625, 157349, 76913]",1320641
99997,99997,27938,-1,,244079,"[97371, 115684, 186445, 135532, 56410]",2584455
99998,99998,373892,-1,,244080,"[266268, 32618, 46869, 32660, 183258]",2299706
99999,99999,567363,-1,,244081,"[94940, 43371, 32473, 112283, 94962]",2739985


# Training

## II
"II" means Item-Item collaborative filtering, i.e. playlists in common...

Steps:
1 - Create a URM (URM_sqrt) normalized with a modified IDF which has a sqrt.
2 - Compute TTM as URM_sqrt.dot(URM_sqrt.transpose()). Keep the K best for each row.
3 - Compute personalized parameters for each playlist. Here we compute the ii_parameter, which indicates how much a playlist is affine to be predicted using the TTM. This is done by doing the following things for each row:
    - compute a np.array by doing the sum of all the rows in the TTM that corresponds to a track in the considered playlist
    - compute the ii_parameter of the playlist by doing 1/(entropy_of_the_computed_array + 0.05). "0.05" is needed since it may happens that the entropy is zero and so the ratio goes to infinity.

In [15]:
def sigmoid(gamma):
    if gamma < 0:
        return 1 - 1/(1 + math.exp(gamma))
    else:
        return 1/(1 + math.exp(-gamma))

In [16]:
# User Rating Matrix URM
def get_URM(tracks, playlists, playlist_tracks, track_playlists, norm="no", pow_base=500, pow_exp=0.15):
    """
        possible normalizations: "no", "magnitude", "idf", "sqrt", "sigmoid", "pow", "target-idf".
        Default "no".
    """
    URM = lil_matrix((len(playlists), len(tracks)))
    num_playlists = len(playlist_tracks)

    i = 0
    
    for row in track_playlists.itertuples():
        track_id = row.track_id
        nq = len(row.playlist_ids)
        for pl_id in row.playlist_ids:
            if norm == "idf":
                URM[pl_id,track_id] = math.log((500)/nq)
            elif norm == "sqrt":
                URM[pl_id,track_id] = math.sqrt((500)/nq)
            elif norm == "pow":
                URM[pl_id,track_id] = math.pow((pow_base)/nq, pow_exp)
            elif norm == "sigmoid":
                URM[pl_id,track_id] = sigmoid(math.pow((500)/(nq + 0.5), 0.03))
            elif norm == "target-idf":
                if pl_id in target_playlists.playlist_id:
                    URM[pl_id,track_id] = math.log(500/(nq + 0.5)) * 2
                else:
                    URM[pl_id,track_id] = math.log(500/(nq + 0.5))
            else:
                URM[pl_id,track_id] = 1
        if i % 1000 == 0:
            print(i)
        i += 1
        
    if norm == "magnitude":
        for pl_id in playlists.playlist_id:
            magnitude = math.sqrt(len(URM.data[pl_id]))
            for col in URM.rows[pl_id]:
                URM[pl_id,col] /= magnitude
    
    return URM

#
# URM:
# 
#              tracks
#            _________
#           \         \
# playlists \         \
#           \_________\
#

In [17]:
URM_pow = get_URM(tracks, playlists, playlist_tracks, track_playlists, norm="pow", pow_base=500, pow_exp=0.15)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
57000
58000
59000
60000
61000
62000
63000
64000
65000
66000
67000
68000
69000
70000
71000
72000
73000
74000
75000
76000
77000
78000
79000
80000
81000
82000
83000
84000
85000
86000
87000
88000
89000
90000
91000
92000
93000
94000
95000
96000
97000
98000
99000


In [18]:
# Step 2: produce item-item matrix with cosine similarity
row_group = 10000
def_rows_i = csr_matrix((row_group, URM_pow.shape[1]))#URM_pow.transpose()[0:row_group].dot(URM_pow) # this is needed to fill some rows that would be all zeros otherwise...
TTM = dot_with_top(URM_pow.transpose(), URM_pow, def_rows_i, top=25, row_group=row_group, similarity="cosine")

10000
20000
30000
40000
50000
60000
70000
80000
90000
100000


## Album

<div style="white-space: pre-wrap;">
Steps:
1 - Compute the playlists_x_albums (i.e. the UAM_album matrix, where U stands for User) sparse matrix. I do this before computing the tracks_x_albums (i.e. the IAM_album matrix, where I stands for Item) sparse matrix because here I compute also the "album_to_val" dictionary, which contains the IDF value of each album obtained considering the playlists as document (and not the tracks). However at the moment I don't use this because I compute the IAM_album matrix without any normalization, so you may skip it...
2 - Compute the tracks_x_albums IAM_album sparse matrix.
3 - Compute the SYM_ALBUM tracks_x_tracks matrix by doing IAM_album.dot(IAM_album.transpose()). It's not big, so I don't need to keep the K best values...
4 - Compute the album_parameter, which means "how much each playlist is affine to album similarity". I do this by computing the entropy of the numpy array containing the occurrences of the albums in the playlist, and then doing 1/(entropy_of_array + 0.05).
</div>

In [19]:
unique_albums = tracks.album.unique()
unique_albums

array([     7,      8,      9, ..., 244079, 244080, 244081])

In [20]:
album_tracks = {}
for row in tracks.itertuples():
    if row.album in album_tracks:
        album_tracks[row.album].append(row.track_id)
    else:
        album_tracks[row.album] = [row.track_id]

In [21]:
def get_IAM_album(tracks, target_tracks, norm="no", most_similar=5):
    """
        Possible norms are "no", "idf", "most-similar".
        Default "no".
    """
    unique_albums = tracks.album.unique()
    IAM_album = lil_matrix((len(tracks), max(unique_albums)+1))
    
    num_tracks = len(tracks)
    i = 0
    
    if norm == "most-similar":
        def get_album_sim(alb, n_best=5):
            bests = []
            a = ALB_ALB_SYM[alb].toarray()[0]
            for i in range(n_best):
                bests.append(a.argpartition(len(a)-1-i)[-1-i])
            return bests

        for row in tracks[tracks.track_id.isin(track_playlists.track_id)].itertuples():
            bests = get_album_sim(row.album, n_best=5)
            for it,alb in enumerate(bests):
                IAM_album[row.track_id, alb] = 1 - it*0.1
            if i % 100 == 0:
                print(i)
            i += 1
            
    else:
        for row in tracks.itertuples():
            nq = 1
            if norm == "idf":
                nq = len(album_tracks[row.album])
                if row.album in album_to_val:
                    IAM_album[row.track_id,row.album] = math.log(500/(nq + 0.5))
                else:
                    IAM_album[row.track_id,row.album] = 0 # Give zero if the album is not in any playlist!
            else:
                IAM_album[row.track_id,row.album] = 1
            if i % 100 == 0:
                print(i)
            i += 1
    
    return IAM_album

In [22]:
# Step 2
IAM_album = get_IAM_album(tracks, target_tracks, norm="no")

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500
7600
7700
7800
7900
8000
8100
8200
8300
8400
8500
8600
8700
8800
8900
9000
9100
9200
9300
9400
9500
9600
9700
9800
9900
10000
10100
10200
10300
10400
10500
10600
10700
10800
10900
11000
11100
11200
11300
11400
11500
11600
11700
11800
11900
12000
12100
12200
12300
12400
12500
12600
12700
12800
12900
13000
13100
13200
13300
13400
13500
13600
13700
13800
13900
14000
14100
14200
14300
14400
14500
14600
14700
14800
14900
15000
15100
15200
15300
15400
15500
15600
15700
15800
15900
16000
16100
16200
16300
16400
16500
16600
16700
16800
16900
17000
17100
17200
17300
17400
17500
17600
17700
17800
17900
18000
18100
18200
18300
18400
18

In [23]:
SYM_ALBUM = IAM_album.dot(IAM_album.transpose())

## Artist
Same steps as for Album

In [24]:
unique_artists = tracks.artist_id.unique()

In [25]:
# Item Artist Matrix
def get_IAM(tracks, target_tracks, norm="no", n_best=5):
    """
        Possible norms are "no", "idf", "most-similar". Default to "no".
    """
    unique_artists = tracks.artist_id.unique()
    IAM = lil_matrix((len(tracks), max(unique_artists)+1))
    
    num_tracks = len(tracks)
    i = 0
    
    if norm == "most-similar":
        def get_artist_sim(art, n_best=5):
            bests = []
            a = ART_ART_SYM[art].toarray()[0]
            for i in range(n_best):
                bests.append(a.argpartition(len(a)-1-i)[-1-i])
            return bests

        for row in tracks[tracks.track_id.isin(track_playlists.track_id)].itertuples():
            bests = get_artist_sim(row.artist_id, n_best=5)
            for it,art in enumerate(bests):
                IAM[row.track_id, art] = 1 - it*0.1
            if i % 100 == 0:
                print(i)
            i += 1
    else:
        for row in tracks.itertuples():
            if norm == "idf":
                if row.artist_id in artist_to_val:
                    IAM[row.track_id,row.artist_id] = artist_to_val[row.artist_id]
                else:
                    IAM[row.track_id,row.artist_id] = 0 # Give zero if the album is not in any playlist!
            else:
                IAM[row.track_id,row.artist_id] = 1

            if i % 1000 == 0:
                print(i)
            i += 1
    
    return IAM

In [26]:
# Step 2
IAM = get_IAM(tracks, target_tracks, norm="no")

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
57000
58000
59000
60000
61000
62000
63000
64000
65000
66000
67000
68000
69000
70000
71000
72000
73000
74000
75000
76000
77000
78000
79000
80000
81000
82000
83000
84000
85000
86000
87000
88000
89000
90000
91000
92000
93000
94000
95000
96000
97000
98000
99000


In [27]:
# Step 3
SYM_ARTIST = IAM.dot(IAM.transpose())

# Tags

In [28]:
# Item Tag Matrix ITM
def get_ITM(tracks, tag_tracks, norm="no", best_tag=False):
    """
        Possible norm are "no", "sqrt", okapi". Default to "no".
    """
    if best_tag:
        unique_tags = list(best_tag_tracks.keys())
    else:
        unique_tags = list(tag_tracks.keys())
    ITM = lil_matrix((len(tracks), max(unique_tags)+1))
    
    num_tracks = len(tracks)
    i = 0
    
    if best_tag:
        tag_dict = best_tag_tracks
    else:
        tag_dict = tag_tracks
        
    for tag,track_ids in tag_dict.items():
        nq = len(track_ids)
        for track_id in track_ids:
            if norm == "okapi":
                ITM[track_id,tag] = math.log((num_tracks - nq + 0.5)/(nq + 0.5))
            elif norm == "sqrt":
                ITM[track_id,tag] = math.sqrt((num_tracks - nq + 0.5)/(nq + 0.5))
            else:
                ITM[track_id,tag] = 1
        if i % 1000 == 0:
            print(i)
        i += 1
    
    return ITM

In [29]:
ITM = get_ITM(tracks, tag_tracks, norm="no", best_tag=False)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000


In [30]:
# Step 2: produce item-item matrix with cosine similarity
row_group = 1000
def_rows_i = csr_matrix((row_group, ITM.shape[0])) # this is needed to fill some rows that would be all zeros otherwise...
SYM_TAG = dot_with_top(ITM, ITM.transpose(), def_rows_i, top=25, row_group=row_group, similarity="cosine")

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
57000
58000
59000
60000
61000
62000
63000
64000
65000
66000
67000
68000
69000
70000
71000
72000
73000
74000
75000
76000
77000
78000
79000
80000
81000
82000
83000
84000
85000
86000
87000
88000
89000
90000
91000
92000
93000
94000
95000
96000
97000
98000
99000
100000


# Other similarities...

In [31]:
def get_UAM_album(tracks, playlist_tracks, target_playlists, norm="no", OKAPI_K=1.7, OKAPI_B=0.75):
    """
        Possible norms are "no", "idf", okapi". Default to "no".
    """
    
    unique_albums = tracks.album.unique()
    
    i = 0

    UAM_album = lil_matrix((max(playlists.playlist_id)+1, max(unique_albums)+1))
    UAM_album_no_norm = lil_matrix((max(playlists.playlist_id)+1, max(unique_albums)+1))
    album_to_playlists = {}
    
    for row in playlist_tracks.itertuples():
        pl_id = row.playlist_id
        for tr_id in row.track_ids:
            alb = tracks.loc[tr_id].album
            UAM_album[pl_id,alb] += 1
            UAM_album_no_norm[pl_id,alb] += 1
            if alb not in album_to_playlists:
                album_to_playlists[alb] = [pl_id]
            else:
                album_to_playlists[alb].append(pl_id)
                
        i += 1
        if i % 1000 == 0:
            print(i)
    
    album_to_val = {}
    if norm == "okapi" or norm == "idf" or norm == "tf":
        avg_document_length = functools.reduce(lambda acc,tr_ids: acc + len(tr_ids), playlist_tracks.track_ids, 0) / len(playlist_tracks)
        N = len(playlist_tracks)
        
        i = 0

        for row in playlist_tracks.itertuples():
            pl_id = row.playlist_id
            albums = UAM_album.rows[pl_id]
            data = UAM_album.data[pl_id]
            for album in albums:
                fq = UAM_album[pl_id,album]
                nq = len(album_to_playlists[album])
                idf = math.log(500/(nq + 0.5))
                
                if album not in album_to_val:
                    album_to_val[album] = idf
                    
                if norm == "idf":
                    UAM_album[pl_id,album] = idf
                elif norm == "okapi":
                    UAM_album[pl_id,album] = idf*(fq*(OKAPI_K+1))/(fq + OKAPI_K*(1 - OKAPI_B + OKAPI_B * sum(data) / avg_document_length))
                elif norm == "tf":
                    UAM_album[pl_id,album] = (fq*(OKAPI_K+1))/(fq + OKAPI_K*(1 - OKAPI_B + OKAPI_B * sum(data) / avg_document_length))
            i += 1
            if i % 1000 == 0:
                print(i)
    
    return UAM_album, UAM_album_no_norm, album_to_val

In [32]:
# Step 1
UAM_album, UAM_album_no_norm, album_to_val = get_UAM_album(tracks, playlist_tracks, target_playlists, norm="no")

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000


In [33]:
# Step 2: produce item-item matrix with cosine similarity
row_group = 1000
def_rows_i = csr_matrix((row_group, UAM_album.shape[0]))#IAM_album[0:row_group].dot(UAM_album.transpose()) # this is needed to fill some rows that would be all zeros otherwise...
TR_PL_ALBUM = dot_with_top(IAM_album, UAM_album.transpose(), def_rows_i, top=10, row_group=row_group, similarity="cosine")

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
57000
58000
59000
60000
61000
62000
63000
64000
65000
66000
67000
68000
69000
70000
71000
72000
73000
74000
75000
76000
77000
78000
79000
80000
81000
82000
83000
84000
85000
86000
87000
88000
89000
90000
91000
92000
93000
94000
95000
96000
97000
98000
99000
100000


In [34]:
# Step 2: produce item-item matrix with cosine similarity
row_group = 1000
def_rows_i = csr_matrix((row_group, TR_PL_ALBUM.shape[0]))#TR_PL_ALBUM[0:row_group].dot(TR_PL_ALBUM.transpose()) # this is needed to fill some rows that would be all zeros otherwise...
SYM_ALBUM_COMPLEX = dot_with_top(TR_PL_ALBUM, TR_PL_ALBUM.transpose(), def_rows_i, top=10, row_group=row_group, similarity="cosine")

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
57000
58000
59000
60000
61000
62000
63000
64000
65000
66000
67000
68000
69000
70000
71000
72000
73000
74000
75000
76000
77000
78000
79000
80000
81000
82000
83000
84000
85000
86000
87000
88000
89000
90000
91000
92000
93000
94000
95000
96000
97000
98000
99000
100000


In [35]:
# User Artist Matrix UAM
def get_UAM(tracks, playlist_tracks, target_playlists, norm="no", OKAPI_K=1.7, OKAPI_B=0.75):
    """
        Possible norms are "no", "idf", okapi". Default to "no".
    """
    
    unique_artists = tracks.artist_id.unique()
    
    i = 0

    UAM = lil_matrix((max(playlists.playlist_id)+1, max(unique_artists)+1))
    UAM_no_norm = lil_matrix((max(playlists.playlist_id)+1, max(unique_artists)+1))
    artist_to_playlists = {}
    
    for row in playlist_tracks.itertuples():
        pl_id = row.playlist_id
        for tr_id in row.track_ids:
            art = tracks.loc[tr_id].artist_id
            UAM[pl_id,art] += 1
            UAM_no_norm[pl_id,art] += 1
            if art not in artist_to_playlists:
                artist_to_playlists[art] = [pl_id]
            else:
                artist_to_playlists[art].append(pl_id)
                
        i += 1
        if i % 1000 == 0:
            print(i)
    
    artist_to_val = {}
    if norm == "okapi" or norm == "idf":
        avg_document_length = functools.reduce(lambda acc,tr_ids: acc + len(tr_ids), playlist_tracks.track_ids, 0) / len(playlist_tracks)
        N = len(playlist_tracks)

        i = 0

        for row in playlist_tracks.itertuples():
            pl_id = row.playlist_id
            artists = UAM.rows[pl_id]
            data = UAM.data[pl_id]
            for artist in artists:
                fq = UAM[pl_id,artist]
                nq = len(artist_to_playlists[artist])
                idf = math.log((N - nq + 0.5)/(nq + 0.5))
                
                if artist not in artist_to_val:
                    artist_to_val[artist] = idf
                
                if norm == "idf":
                    UAM[pl_id,artist] = idf
                else:
                    UAM[pl_id,artist] = idf*(fq*(OKAPI_K+1))/(fq + OKAPI_K*(1 - OKAPI_B + OKAPI_B * sum(data) / avg_document_length))
            i += 1
            if i % 1000 == 0:
                print(i)
    
    return UAM, UAM_no_norm, artist_to_val

In [36]:
# Step 1
UAM, UAM_no_norm, artist_to_val = get_UAM(tracks, playlist_tracks, target_playlists, norm="no")

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000


In [37]:
# Step 2: produce item-item matrix with cosine similarity
row_group = 1000
def_rows_i = csr_matrix((row_group, UAM.shape[0]))#IAM[0:row_group].dot(UAM.transpose()) # this is needed to fill some rows that would be all zeros otherwise...
TR_PL_ARTIST = dot_with_top(IAM, UAM.transpose(), def_rows_i, top=10, row_group=row_group, similarity="cosine")

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
57000
58000
59000
60000
61000
62000
63000
64000
65000
66000
67000
68000
69000
70000
71000
72000
73000
74000
75000
76000
77000
78000
79000
80000
81000
82000
83000
84000
85000
86000
87000
88000
89000
90000
91000
92000
93000
94000
95000
96000
97000
98000
99000
100000


In [38]:
# Step 2: produce item-item matrix with cosine similarity
row_group = 1000
def_rows_i = csr_matrix((row_group, TR_PL_ARTIST.shape[0]))#TR_PL_ARTIST[0:row_group].dot(TR_PL_ARTIST.transpose()) # this is needed to fill some rows that would be all zeros otherwise...
SYM_ARTIST_COMPLEX = dot_with_top(TR_PL_ARTIST, TR_PL_ARTIST.transpose(), def_rows_i, top=10, row_group=row_group, similarity="cosine")

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
57000
58000
59000
60000
61000
62000
63000
64000
65000
66000
67000
68000
69000
70000
71000
72000
73000
74000
75000
76000
77000
78000
79000
80000
81000
82000
83000
84000
85000
86000
87000
88000
89000
90000
91000
92000
93000
94000
95000
96000
97000
98000
99000
100000


# Prepare matrices for schwiftyness!
Yoooo we're gonna get schwiftyyyyy

1,0.3,0.2 -> 0.077

In [39]:
URM_pow = get_URM(tracks, playlists, playlist_tracks, track_playlists, norm="pow", pow_base=500, pow_exp=0.15)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
57000
58000
59000
60000
61000
62000
63000
64000
65000
66000
67000
68000
69000
70000
71000
72000
73000
74000
75000
76000
77000
78000
79000
80000
81000
82000
83000
84000
85000
86000
87000
88000
89000
90000
91000
92000
93000
94000
95000
96000
97000
98000
99000


In [40]:
def print_URM(urm):
    file = open("test1/tracks_in_playlist.txt","w")
    
    urm_coo = urm.tocoo()

    rows = urm_coo.row;
    cols = urm_coo.col;
    data = urm_coo.data;

    file.write("{0} {1}\n".format(urm_coo.shape[0], urm_coo.shape[1]))

    for i in range(0,len(rows)):
        file.write("{0} ".format(rows[i]))
    file.write("\n")

    for i in range(0,len(rows)):
        file.write("{0} ".format(cols[i]))
    file.write("\n")

    for i in range(0,len(rows)):
        file.write("{0} ".format(data[i]))
    file.write("\n")
    
    for pl_id in target_playlists.playlist_id:
        file.write("{0} ".format(pl_id))
    file.write("\n")

    file.close() 

In [41]:
print_URM(URM_pow)

In [42]:
def print_similarity(matrix_to_print, number):
    matrix_to_print = matrix_to_print.tocoo()

    file = open("test1/similarity_" + str(number) + ".txt","w") 

    rows = matrix_to_print.row;
    cols = matrix_to_print.col;
    data = matrix_to_print.data;

    file.write("{0} {1}\n".format(matrix_to_print.shape[0], matrix_to_print.shape[1]))

    for i in range(0,len(rows)):
        file.write("{0} ".format(rows[i]))
    file.write("\n")

    for i in range(0,len(cols)):
        file.write("{0} ".format(cols[i]))
    file.write("\n")

    for i in range(0,len(data)):
        file.write("{0} ".format(data[i]))
    file.write("\n")

    file.close() 

In [43]:
print_similarity(TTM, 0)

In [44]:
print_similarity(SYM_ALBUM, 1)

In [45]:
print_similarity(SYM_ARTIST, 2)

In [46]:
print_similarity(SYM_TAG, 3)

In [47]:
print_similarity(SYM_ALBUM_COMPLEX, 4)

In [48]:
print_similarity(SYM_ARTIST_COMPLEX, 5)

In [49]:
from pandas import Series

def load_playlist_params(location, params_bitmask):
    content = None
    with open(os.path.join(location, 'playlist_params.txt'), 'r') as f:
        content = f.readlines()

    playlist_params = pd.DataFrame(playlists.playlist_id)
    p = 0
    for it,ch in enumerate(params_bitmask):
        param_name = "param_" + str(it)
        if ch == "1":
            p_list = list(map(float, content[p].strip().split(' ')))
            playlist_params[param_name] = Series(data=p_list, index=playlist_params.index)
            p += 1
        else:
            playlist_params[param_name] = 0

    return playlist_params

In [50]:
from subprocess import call

In [61]:
call(["./get_schwifty", "test1", "111000", "adadelta", "2000", "0.9", "0.3", "100"])

0

In [62]:
playlist_params1 = load_playlist_params("test1", "111000")
playlist_params1[playlist_params1.playlist_id.isin(target_playlists.playlist_id)]

Unnamed: 0,playlist_id,param_0,param_1,param_2,param_3,param_4,param_5
0,0,2.51504,2.50425,2.50425,0,0,0
2,2,4.48829,3.29339,3.29339,0,0,0
4,4,4.31583,2.56116,6.44029,0,0,0
5,5,3.91315,4.94402,8.51255,0,0,0
6,6,4.56661,3.52821,4.38447,0,0,0
8,8,8.81101,4.00308,8.49834,0,0,0
23,23,2.77789,2.42820,2.42820,0,0,0
28,28,10.58760,5.19836,5.19836,0,0,0
30,30,4.32950,3.00134,3.00134,0,0,0
38,38,5.63965,5.18385,5.71700,0,0,0


In [None]:
call(["./get_schwifty", "test1", "100011", "adadelta", "500", "0.9", "0.25", "100"])

In [None]:
playlist_params2 = load_playlist_params("test1", "100011")
playlist_params2[playlist_params2.playlist_id.isin(target_playlists.playlist_id)]

TTM: 0.057
ALBUM: 0.0606
ARTIST: 0.0600
TAG: 0.03
ALBUM_COMPLEX:  0.0568
ARTIST_COMPLEX: 0.0548

111000 : 0.0796
100011 : 0.0765
[111000, 100011] : 

In [53]:
def merge_predictions_scoring(predictions, f=lambda x: x):
    ar_len = len(predictions[0])
    acc = np.array([predictions[0],[f(i) for i in range(0,ar_len)]])
    for pred in predictions[1:]:
        other_pred = np.array([pred,[f(i) for i in range(0,ar_len)]])
        new_acc_0 = []
        new_acc_1 = []
        for i1,v1 in enumerate(acc[0]):
            for i2,v2 in enumerate(other_pred[0]):
                if v1 == v2:
                    new_acc_0.append(v1)
                    new_acc_1.append(acc[1][i1] + other_pred[1][i2])
        for i1,v1 in enumerate(acc[0]):
            if v1 not in new_acc_0:
                new_acc_0.append(v1)
                new_acc_1.append(acc[1][i1] + f(ar_len))
        for i2,v2 in enumerate(other_pred[0]):
            if v2 not in new_acc_0:
                new_acc_0.append(v2)
                new_acc_1.append(other_pred[1][i2] + f(ar_len))
        
        acc = np.array([new_acc_0, new_acc_1])
    acc_arg_sorted = acc[1].argsort()
    res = []
    for arg in acc_arg_sorted:
        res.append(acc[0][arg])
    return res

def merge_predictions_intersect_first(predictions, f=lambda x: x):
    if len(predictions) == 1:
        return predictions[0]
    
    intersect = np.intersect1d(predictions[0], predictions[1])
    for i in range(2,len(predictions)):
        intersect = np.intersect1d(intersect, predictions[i])
        
    res = []
    for t in intersect:
        res.append(t)
    
    for t in predictions[0]:
        if t not in res:
            res.append(t)
    
    return res

def merge_predictions_ordered_simple(predictions, f=lambda x: x):
    if len(predictions) == 1:
        return predictions[0]
    
    first = predictions[0]
    second = predictions[1]
    
    return [first[0], first[1], first[2], first[3], second[2]]

In [None]:
"""
    Predictions for all the playlists in test.
    mult:
        0 - ii
        1 - album
        2 - artist
        3 - title
    good values: 1.5, 1, 1.2, 0, ii pow, album no_norm, artist idf
"""

merger = merge_predictions_ordered_simple
similarities = [TTM, SYM_ALBUM, SYM_ARTIST, SYM_TAG, SYM_ALBUM_COMPLEX, SYM_ARTIST_COMPLEX]
playlist_params_list = [playlist_params1]

make_predictions(test=test, target_playlists=target_playlists, urm=URM_pow,
                 similarities=similarities, playlist_params_list=playlist_params_list, merger=merger,
                 compute_MAP=True, row_group=1000, verbose=False)

In [63]:
merger = merge_predictions_ordered_simple
similarities = [TTM, SYM_ALBUM, SYM_ARTIST, SYM_TAG, SYM_ALBUM_COMPLEX, SYM_ARTIST_COMPLEX]
playlist_params_list = [playlist_params1]

predictions = make_predictions(test=train, target_playlists=target_playlists, urm=URM_pow,
                               similarities=similarities, playlist_params_list=playlist_params_list, merger=merger,
                               compute_MAP=False, row_group=1000, verbose=False)

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000


In [64]:
predictions

Unnamed: 0_level_0,playlist_id,playlist_id_tmp,track_ids
playlist_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
30680,30680,10024884,"[1637241, 2228906, 3738046, 2799999, 3559096]"
37046,37046,10624787,"[3779369, 3166806, 2016284, 1083593, 1788063]"
4069,4069,4891851,"[1854954, 1371741, 1406862, 232126, 2238571]"
24489,24489,4267369,"[3004539, 2504992, 1712316, 2740798, 1890831]"
8513,8513,65078,"[1742595, 1455989, 119305, 2913000, 290972]"
309,309,10637124,"[2340644, 2123901, 1120194, 414791, 933471]"
24983,24983,3223162,"[1269244, 1117784, 221825, 2787106, 1672852]"
16810,16810,7541503,"[956454, 2805304, 2086779, 661053, 866218]"
6324,6324,6189367,"[495158, 1675280, 1195554, 2019507, 1054050]"
20718,20718,8459943,"[1654875, 2821213, 2665275, 1732579, 2752335]"


In [65]:
pr_copy = predictions.copy(deep=True)

In [66]:
predictions['playlist_id'] = predictions['playlist_id_tmp']

In [67]:
predictions = predictions.drop("playlist_id_tmp", axis=1)

In [68]:
predictions.head()

Unnamed: 0_level_0,playlist_id,track_ids
playlist_id,Unnamed: 1_level_1,Unnamed: 2_level_1
30680,10024884,"[1637241, 2228906, 3738046, 2799999, 3559096]"
37046,10624787,"[3779369, 3166806, 2016284, 1083593, 1788063]"
4069,4891851,"[1854954, 1371741, 1406862, 232126, 2238571]"
24489,4267369,"[3004539, 2504992, 1712316, 2740798, 1890831]"
8513,65078,"[1742595, 1455989, 119305, 2913000, 290972]"


In [69]:
# Make the dataframe friendly for output -> convert np.array in string
predictions['track_ids'] = predictions['track_ids'].apply(lambda x : ' '.join(map(str, x)))
predictions.to_csv('results.csv', index=False)

In [None]:
def load_similarity(location):
    row = []
    col = []
    data = []
    content = None
    with open(os.path.join(location, 'similarity_bpr.txt'), 'r') as f:
        content = f.readlines()

    row = list(map(int, content[1].strip().split(' ')))
    col = list(map(int, content[2].strip().split(' ')))
    data = list(map(float, content[3].strip().split(' ')))

    coo = coo_matrix((data, (row, col)), shape=(100000, 100000))
    csr = coo.tocsr()

    return csr