In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import scipy as sc
from sklearn import preprocessing
from sklearn import model_selection
import functools

from recsys.preprocess import *
from recsys.utility import *
from scipy.sparse import *

from sklearn.neighbors import NearestNeighbors

RANDOM_STATE = 2342

%matplotlib inline

In [2]:
# Load the tab-separated training interactions, playlist table and track table.
train = pd.read_csv('data/train_final.csv', delimiter='\t')
playlists = pd.read_csv('data/playlists_final.csv', delimiter='\t')
tracks = pd.read_csv('data/tracks_final.csv', delimiter='\t')
# The 'tags' column is read as text; each cell is evaluated and wrapped in a numpy array.
# NOTE(review): eval() executes whatever expression the CSV contains. If the column
# only ever holds plain literals, ast.literal_eval would be a safer parser - confirm
# against the data before switching.
tracks['tags'] = tracks['tags'].apply(lambda x: np.array(eval(x)))
# Index the track table by track id (the 'track_id' column is kept as well).
tracks.index = tracks.track_id

#target_playlists = pd.read_csv('data/target_playlists.csv', delimiter='\t')
#target_tracks = pd.read_csv('data/target_tracks.csv', delimiter = '\t')

In [3]:
# Hold out part of the interactions for local evaluation and derive the target
# playlists/tracks from that split.
# NOTE(review): `train` is rebound to the reduced training split, so re-running this
# cell without re-running the loading cell splits an already-split frame.
# NOTE(review): this call takes `min_playlist_tracks` and returns four values, so it is
# presumably the helper pulled in by the recsys wildcard imports, not sklearn's
# train_test_split - confirm.
train, test, target_playlists, target_tracks = train_test_split(train, test_size=0.3, min_playlist_tracks=8)

In [4]:
# Per-playlist track lists built from the training split. Later cells look rows up
# with .loc[playlist_id] and read the 'track_ids' column.
tracks_in_playlist = get_playlist_track_list2(train)

In [184]:
# Map every track seen in `train` to the playlists that contain it.
# Rows are in first-appearance order and indexed by track id, so
# playlists_of_track.loc[track_id] works.
playlists_of_track = pd.DataFrame(train['track_id'].drop_duplicates())
playlists_of_track.index = train['track_id'].unique()
# Select 'playlist_id' before apply so the function only sees that column
# (applying over the grouping column itself is deprecated in recent pandas).
# The assignment aligns on the track-id index.
playlists_of_track['playlist_ids'] = train.groupby('track_id')['playlist_id'].apply(lambda ids: ids.values)
playlists_of_track.head(10)

Unnamed: 0,track_id,playlist_ids
2801526,2801526,"[3271849, 6167325, 11692735, 8212417, 3438766,..."
727878,727878,"[5616275, 7892011, 5517473, 3504940, 8753109, ..."
2805283,2805283,"[11267488, 5927326, 11764851, 3299043, 3376358..."
1515105,1515105,"[10103900, 10111520, 3376560, 5030066, 8608218..."
2945623,2945623,"[3836898, 10405417, 318126, 6333704, 11742401,..."
2821391,2821391,"[5270369, 3790731, 9001472, 8228631, 4325130, ..."
1166185,1166185,"[3794808, 3793858, 8201204, 5440256, 3351165, ..."
2498280,2498280,"[7908370, 5486243, 8574412, 165862, 7656441, 2..."
282687,282687,"[11460733, 3538162, 6077589, 4206718, 10663661..."
676462,676462,"[5758965, 3874112, 6847844, 5849106, 8531307, ..."


In [52]:
def from_row_num_to_track_id(df, row_num):
    """Return the track id stored at positional row ``row_num`` of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'track_id' column.
    row_num : int
        Zero-based row position (not an index label).

    Returns
    -------
    The 'track_id' value at that position, in the column's own dtype.

    Raises
    ------
    IndexError
        If ``row_num`` is out of bounds.
    """
    # Index the column directly. df.iloc[row_num] would first materialise the
    # whole row as one Series, which upcasts an integer id to float whenever
    # the frame also holds float columns.
    return df['track_id'].iloc[row_num]

def from_track_id_to_row_num(df, tr_id):
    """Return the position of the first row of ``df`` whose 'track_id' equals ``tr_id``.

    ``df`` must have a 'track_id' column. Raises IndexError when no row matches.
    """
    matching_rows = np.flatnonzero(df.track_id.values == tr_id)
    return matching_rows[0]

## Create similarity matrix

In [121]:
# Keep the first five entries of the popularity ranking that are also target tracks.
most_popular = get_most_popular_tracks(train)
most_popular_tr_ids_5 = (
    most_popular
    .loc[most_popular.track_id.isin(target_tracks.track_id.values), 'track_id']
    .iloc[:5]
    .values
)
most_popular_tr_ids_5

array([1563309, 1363985, 3705881, 1595978, 3779477])

In [179]:
# Peek at the per-playlist track lists (first rows only).
tracks_in_playlist.head()

Unnamed: 0,playlist_id,track_ids
3271849,3271849,"[2801526, 1187176, 437130, 2989670, 675104, 28..."
5616275,5616275,"[727878, 2077024, 170846, 2182913, 1931420, 10..."
11267488,11267488,"[2805283, 3329183, 647761, 846652, 225495, 247..."
10103900,10103900,"[1515105, 496282, 2831388, 1739102, 301843, 24..."
3836898,3836898,"[2945623, 859729, 2012408, 350109, 83121, 7316..."


In [191]:
def build_sym_matrix(same_artist_param, same_album_param, common_tag_param, most_popular_param, in_same_playlist_param):
    """Build the sparse track x target-track similarity matrix.

    Row ``r`` is the r-th row of the global ``tracks``; column ``c`` is the
    c-th row of the global ``tracks_target_only``. Only pairs sharing an
    artist get a similarity score:

        same_artist_param
        + same_album_param       * (1 if same album else 0)
        + common_tag_param       * (number of shared tags)
        + in_same_playlist_param * (number of playlists containing both tracks)

    In addition, every row gets ``most_popular_param`` added in the columns of
    the tracks listed in the global ``most_popular_tr_ids_5``.

    Also reads the globals ``target_tracks`` and ``playlists_of_track``.

    Returns
    -------
    scipy.sparse.lil_matrix of shape (len(tracks), len(target_tracks)).
    """
    # NOTE(review): the width comes from `target_tracks` while columns are
    # positions in `tracks_target_only`; this assumes both have the same
    # number of rows - confirm.
    S = lil_matrix((len(tracks), len(target_tracks)))

    # Position of the first occurrence of every target track id, computed once
    # instead of scanning the whole column for each (track, target) pair.
    col_of_track = {}
    for col, target_id in enumerate(tracks_target_only.track_id.values):
        col_of_track.setdefault(target_id, col)

    # Loop-invariant: columns that receive the popularity bonus in every row.
    popular_cols = [col_of_track[tr_id] for tr_id in most_popular_tr_ids_5]

    # `x in series` tests index labels, not values; use the index explicitly,
    # which is also what the .loc lookups below rely on.
    known_tracks = playlists_of_track.index

    for r, (_, r1) in enumerate(tracks.iterrows()):
        if r1.track_id in known_tracks:
            r1_playlists = playlists_of_track.loc[r1.track_id].playlist_ids
        else:
            r1_playlists = None

        same_artist_targets = tracks_target_only[tracks_target_only.artist_id == r1.artist_id]
        for _, r2 in same_artist_targets.iterrows():
            c = col_of_track[r2.track_id]
            same_artist = 1  # sharing the artist is a prerequisite for being similar
            same_album = 1*(r1.album == r2.album)
            common_tags = len(np.intersect1d(r1.tags, r2.tags))
            if r1_playlists is None or r2.track_id not in known_tracks:
                # At least one of the two tracks never appears in the training playlists.
                in_same_playlist = 0
            else:
                in_same_playlist = len(np.intersect1d(r1_playlists, playlists_of_track.loc[r2.track_id].playlist_ids))
            S[r,c] += same_artist_param*same_artist + same_album_param*same_album + common_tag_param*common_tags + in_same_playlist*in_same_playlist_param

        for c in popular_cols:
            S[r,c] += most_popular_param
    return S

# Indexes of S:
#   - r: row number in 'tracks'
#   - c: row number in 'tracks_target_only'
#
# S is:
#        tracks_target_only
#          __________
#         |          |
# tracks  |          |   
#         |          |
#         |          |
#         |__________|

def predict_for_playlist(pl_id, target_tracks, num_predictions=5):
    """Recommend target tracks for one playlist.

    For every track already in the playlist, the matching row of the global
    CSR matrix ``S_csr`` is read and its non-zero entries are summed per
    target track. Candidates are ranked by total score (highest first) and
    the best ones not already in the playlist are returned.

    Parameters
    ----------
    pl_id
        Playlist id; must be a label of the global ``tracks_in_playlist``.
    target_tracks
        Unused; kept so existing callers keep working.
    num_predictions : int, default 5
        Maximum number of tracks to return.

    Returns
    -------
    numpy.ndarray with at most ``num_predictions`` track ids. It is shorter
    (possibly empty) when there are not enough candidates.

    Also reads the globals ``tracks`` and ``tracks_target_only``.
    """
    # Look the playlist up once instead of on every loop iteration.
    playlist_track_ids = tracks_in_playlist.loc[pl_id]['track_ids']

    scores = {}
    for tr_id in playlist_track_ids:
        row_S = from_track_id_to_row_num(tracks, tr_id)
        # CSR layout: the non-zeros of row i live in [indptr[i], indptr[i + 1]).
        r_start = S_csr.indptr[row_S]
        r_end = S_csr.indptr[row_S + 1]
        for c, value in zip(S_csr.indices[r_start:r_end], S_csr.data[r_start:r_end]):
            c_track_id = from_row_num_to_track_id(tracks_target_only, c)
            scores[c_track_id] = scores.get(c_track_id, 0) + value

    # Stable sort: ties keep the order in which candidates were first seen.
    ranked = sorted(scores, key=scores.get, reverse=True)

    already_in_playlist = set(playlist_track_ids)
    pred = []
    # Iterate over the candidates themselves so that running out of them ends
    # the loop instead of indexing past the end of the list.
    for candidate in ranked:
        if len(pred) >= num_predictions:
            break
        if candidate not in already_in_playlist:
            pred.append(candidate)
    return np.array(pred)

def make_predictions(target_playlists, target_tracks):
    """Predict tracks for every playlist in ``target_playlists``.

    Parameters
    ----------
    target_playlists : pandas.DataFrame
        Must have a 'playlist_id' column. It is not modified.
    target_tracks
        Forwarded unchanged to ``predict_for_playlist``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``target_playlists`` indexed by playlist id, with an extra
        'track_ids' column holding the result of ``predict_for_playlist`` for
        each playlist.
    """
    # pd.DataFrame(df) does not copy; without .copy() the index and column
    # assignments below could write through to the caller's frame (or trigger
    # SettingWithCopyWarning when the caller passed a slice).
    predictions = pd.DataFrame(target_playlists).copy()
    predictions.index = predictions['playlist_id']

    predictions['track_ids'] = predictions['playlist_id'].apply(lambda pl_id: predict_for_playlist(pl_id, target_tracks))

    return predictions

def grid_search(same_artist_param_list, same_album_param_list, common_tag_param_list, most_popular_param_list, in_same_playlist_param_list, num_playlists):
    """Exhaustively score every combination of similarity weights.

    For each combination the similarity matrix is rebuilt, predictions are
    made for the first ``num_playlists`` rows of the global
    ``target_playlists`` and scored with ``evaluate`` against the matching
    rows of the global ``test_good``. The parameters and the score of every
    combination are printed.

    Parameters
    ----------
    same_artist_param_list, same_album_param_list, common_tag_param_list,
    most_popular_param_list, in_same_playlist_param_list : iterable
        Candidate values for the corresponding ``build_sym_matrix`` weight.
    num_playlists : int
        Number of target playlists to evaluate on.

    Returns
    -------
    (best_score, best_params)
        ``best_params`` is
        [same_artist, same_album, common_tag, most_popular, in_same_playlist].
        When no combination scores above 0 (or a list is empty) the result is
        ``(0, [])``.
    """
    import itertools

    # predict_for_playlist reads the module-level S_csr. Assigning it as a
    # plain local here would leave that global stale (or undefined), so every
    # combination would be scored against the wrong matrix.
    global S_csr

    best_params = []
    best_score = 0
    # product() iterates the last list fastest, i.e. the same order as five
    # nested loops with the first list outermost.
    grid = itertools.product(
        same_artist_param_list,
        same_album_param_list,
        common_tag_param_list,
        most_popular_param_list,
        in_same_playlist_param_list,
    )
    for combo in grid:
        same_artist_param, same_album_param, common_tag_param, most_popular_param, in_same_playlist_param = combo
        S = build_sym_matrix(same_artist_param, same_album_param, common_tag_param, most_popular_param, in_same_playlist_param)
        S_csr = S.tocsr()
        predictions = make_predictions(target_playlists[:num_playlists], target_tracks)
        # NOTE(review): `test_good` is not defined in the visible cells - it must
        # exist in the kernel before this runs.
        score = evaluate(test_good[test_good.playlist_id.isin(predictions.playlist_id)], predictions, should_transform_test=False)
        print(same_artist_param, same_album_param, common_tag_param, most_popular_param, in_same_playlist_param)
        print(score)
        if score > best_score:
            best_score = score
            best_params = [same_artist_param, same_album_param, common_tag_param, most_popular_param, in_same_playlist_param]
    return (best_score, best_params)
    


In [None]:
# Single-point grid: one candidate value per weight, evaluated on 5 playlists.
sym_config = dict(
    same_artist_param_list=[1],
    same_album_param_list=[1.5],
    common_tag_param_list=[0.5],
    most_popular_param_list=[1],
    in_same_playlist_param_list=[0.2],
    num_playlists=5,
)

res = grid_search(**sym_config)
res