In [20]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from scipy.sparse import *
from scipy.sparse.linalg import svds

from recsys.preprocess import *
from recsys.utility import *

# Seed value used for NumPy's global random generator below.
RANDOM_STATE = 2342

# Seeds only NumPy's legacy global RNG; whether the recsys helpers or svds
# draw from it is not visible in this notebook — TODO confirm.
np.random.seed(RANDOM_STATE)

%matplotlib inline

In [21]:
# Load the raw tables; every file is tab-separated.
# The tuple of paths is in the same order as the names on the left-hand side.
train, playlists, target_playlists, target_tracks, tracks = (
    pd.read_csv(csv_path, delimiter='\t')
    for csv_path in (
        'data/train_final.csv',
        'data/playlists_final.csv',
        'data/target_playlists.csv',
        'data/target_tracks.csv',
        'data/tracks_final.csv',
    )
)

In [22]:
# Local hold-out split (project helper from the recsys star-imports).
# NOTE: this rebinds `train`, `target_playlists` and `target_tracks`, replacing
# the frames read from CSV above; re-running the cell splits the already
# reduced `train` again.
(train, test, target_playlists, target_tracks) = train_test_split(
    train,
    test_size=0.30,
    min_playlist_tracks=10,
)

In [23]:
# --- tracks: back up the original id, then use the frame index as the new id.
# (tracks comes straight from read_csv, so its index is the default 0..n-1.)
tracks['track_id_tmp'] = tracks['track_id']
tracks['track_id'] = tracks.index

# original track id -> new id, and the reverse lookup new id -> original id
track_to_num = pd.Series(tracks.index, index=tracks['track_id_tmp'])
num_to_tracks = pd.Series(tracks['track_id_tmp'])

# --- playlists: same scheme.
playlists['playlist_id_tmp'] = playlists['playlist_id']
playlists['playlist_id'] = playlists.index
playlist_to_num = pd.Series(playlists.index, index=playlists['playlist_id_tmp'])

# --- train: keep the original ids in *_tmp columns, then translate both id
# columns through the lookups built above (an unknown id raises KeyError).
train['playlist_id_tmp'] = train['playlist_id']
train['track_id_tmp'] = train['track_id']
train['track_id'] = train['track_id'].map(lambda orig_id: track_to_num[orig_id])
train['playlist_id'] = train['playlist_id'].map(lambda orig_id: playlist_to_num[orig_id])

In [24]:
# Target playlists: keep the original id, then switch to the new playlist id.
target_playlists['playlist_id_tmp'] = target_playlists['playlist_id']
target_playlists['playlist_id'] = target_playlists['playlist_id'].map(
    lambda orig_id: playlist_to_num[orig_id]
)

# Target tracks: same treatment, using the track lookup.
target_tracks['track_id_tmp'] = target_tracks['track_id']
target_tracks['track_id'] = target_tracks['track_id'].map(
    lambda orig_id: track_to_num[orig_id]
)

In [25]:
# Project helper (presumably from the recsys star-imports). Judging by how the
# result is used below, it yields one row per playlist with a 'track_ids'
# collection, indexed by playlist id — TODO confirm against the helper.
playlist_tracks = get_playlist_track_list2(train)

In [26]:
# Order the rows by playlist_id.
playlist_tracks = playlist_tracks.sort_values('playlist_id')

# Build the URM (playlists × tracks)

In [27]:
# Playlist × track matrix. Every track of a playlist receives the same
# length-dependent weight 100 / (number of tracks in the playlist + 3).
M = lil_matrix((len(playlists), len(tracks)))

for pl_id, row in playlist_tracks.iterrows():
    track_ids = row['track_ids']
    # In-place sort: the collection held by playlist_tracks is reordered too.
    track_ids.sort()
    weight = 100/(len(track_ids) + 3)
    for track_id in track_ids:
        M[pl_id, track_id] = weight
        

# Convert the URM to CSC format (the length weighting is already applied while building it)

In [28]:
# Switch from the incremental-build LIL format to CSC before the sparse SVD.
M = M.tocsc()

In [40]:
%%time
# Truncated SVD of the URM keeping k=200 singular triplets.
# NOTE: svds returns (u, s, vt); the third value is already the transposed
# right factor, so `V` here has one row per latent factor.
U, S, V = svds(M, k=200)

CPU times: user 3.97 s, sys: 54.4 ms, total: 4.02 s
Wall time: 1.02 s


In [49]:
# svds returns the singular values as a 1-D vector; the product below needs
# them as a diagonal matrix. np.diag applied to a 2-D array extracts the
# diagonal again, so an unguarded re-run of this cell would flip S back to
# 1-D. The guard makes the cell safe to run more than once.
if S.ndim == 1:
    S = np.diag(S)

In [None]:
# Scale the item factors by the singular values once, so that the score
# vector of a playlist p is simply U[p] · M2 in the prediction loop.
M2 = np.dot(S, V)

In [51]:
%%time
predictions = pd.DataFrame(target_playlists)
predictions.index = target_playlists['playlist_id']
predictions['track_ids'] = [np.array([]) for i in range(len(predictions))]
ttracks = set(target_tracks['track_id'].values)
for _,row in target_playlists.iterrows():
    pred = []
    pl_id = row['playlist_id']
    pl_tracks = set(playlist_tracks.loc[pl_id]['track_ids'])
    simil = np.dot(U[pl_id,:], M2)
    sorted_ind = simil.argsort()
    i = len(sorted_ind) - 1
    c = 0
    while i > 0 and c < 5:
        if (sorted_ind[i] in ttracks) and (sorted_ind[i] not in pl_tracks):
            pred.append(num_to_tracks[sorted_ind[i]])
            c+=1
        i-=1
    predictions.loc[row['playlist_id']] = predictions.loc[row['playlist_id']].set_value('track_ids', np.array(pred))

CPU times: user 2min 18s, sys: 14.8 s, total: 2min 33s
Wall time: 1min 41s


In [52]:
# Put the original playlist ids (saved earlier in playlist_id_tmp) back into
# the playlist_id column before evaluation.
predictions['playlist_id'] = predictions['playlist_id_tmp']

In [53]:
# Score the recommendations against the held-out split with the project's
# `evaluate` helper (the metric it computes is not visible in this notebook).
evaluate(test, predictions)

0.005260432378079437

In [66]:
def print_tag(id, tracks_df=None):
    """Serialise one track as a space-separated feature line.

    The fields are, in order: track_id, album (or -1 when empty),
    artist_id (or -1 when empty), duration (0 when not positive),
    playcount as an int (0 when it cannot be converted, e.g. NaN),
    the number of tags, and then each tag.

    Parameters
    ----------
    id : label
        Index label of the track row to serialise.
    tracks_df : pandas.DataFrame, optional
        Frame to read the row from. Defaults to the notebook-level `tracks`
        frame, which keeps existing `print_tag(id)` calls working.

    Returns
    -------
    str
        The fields joined by single spaces.

    Raises
    ------
    ValueError, SyntaxError
        If the `tags` cell is not a plain Python literal.
    """
    import ast  # local import so the cell does not depend on the import cell

    if tracks_df is None:
        tracks_df = tracks
    t = tracks_df.loc[id]
    res = []
    res.append(t['track_id'])

    # Drop the first and last character of the album cell (presumably the
    # surrounding brackets of a one-element list such as "[123]" — TODO confirm).
    album = t['album'][1:-1]
    if (album == ''):
        res.append(-1)
    else:
        res.append(album)

    artist_id = t['artist_id']
    if artist_id == '':
        res.append(-1)
    else:
        res.append(artist_id)

    duration = t['duration']
    res.append(duration if duration > 0 else 0)

    playcount = t['playcount']
    try:
        res.append(int(playcount))
    except ValueError:
        # int(NaN) raises ValueError; a missing playcount is written as 0.
        res.append(0)

    # The tags cell is text loaded from a file: parse it as a literal only.
    # literal_eval accepts list/number/string literals and never executes
    # code, unlike eval.
    tags = ast.literal_eval(t['tags'])
    res.append(len(tags))
    res.extend(tags)
    return ' '.join(map(str, res))

In [68]:
# Spot-check the serialisation on a single track.
print_tag(99999)

'99999 -1 567363 0 0 0'

In [70]:
# Dump every track as one feature line, preceded by a header line that holds
# the number of tracks. Rows are addressed by the labels 0..len(tracks)-1.
with open('tracks_num.txt', 'w') as out:
    n_tracks = len(tracks)
    out.write(str(n_tracks) + '\n')
    out.writelines(print_tag(track_num) + '\n' for track_num in range(n_tracks))
    