In [1]:
from scipy.sparse import *
import numpy as np
import pandas as pd
import sys

from recsys.preprocess import *
from recsys.utility import *

In [14]:
def load_things(location):
    global train, test, playlists, tracks, target_tracks, target_playlists, tracks_in_playlist, tracks_target_only

    train = pd.read_csv(os.path.join(location, 'train.csv'))
    target_playlists = pd.read_csv(os.path.join(location, 'target_playlists.csv'))
    target_tracks = pd.read_csv(os.path.join(location, 'target_tracks.csv'))

    playlists = pd.read_csv('data/playlists_final.csv', delimiter='\t')
    tracks = pd.read_csv('data/tracks_final.csv', delimiter='\t')

    tracks['tags'] = tracks['tags'].apply(lambda x: np.array(eval(x)))
    tracks.index = tracks.track_id

    tracks_in_playlist = get_playlist_track_list2(train)
    tracks_target_only = tracks[tracks.track_id.isin(target_tracks.track_id)]
    
    test = pd.read_csv(os.path.join(location, 'test.csv'))

In [3]:
def load_similarity(location):
    row = []
    col = []
    data = []
    content = None
    with open(os.path.join(location, 'similarity.txt'), 'r') as f:
        content = f.readlines()

    row = list(map(int, content[1].strip().split(' ')))
    col = list(map(int, content[2].strip().split(' ')))
    data = list(map(float, content[3].strip().split(' ')))

    coo = coo_matrix((data, (row, col)))
    csr = coo.tocsr()

    return csr

In [8]:
def from_num_to_id(df, row_num, column = 'track_id'):
    """ df must have a 'track_id' column """
    return df.iloc[row_num][column]

def from_id_to_num(df, tr_id, column='track_id'):
    """ df must have a 'track_id' column """
    return np.where(df[column].values == tr_id)[0][0]

In [6]:
S = load_similarity(location)

In [9]:
def build_id_to_num_map(df, column):
    a = pd.Series(np.arange(len(df)))
    a.index = df[column]
    return a

def build_num_to_id_map(df, column):
    a = pd.Series(df[column])
    a.index = np.arange(len(df))
    return a

In [10]:
def load_URM():
    tr_map = build_id_to_num_map(tracks, 'track_id')
    pl_map = build_id_to_num_map(playlists, 'playlist_id')
    
    train_new = pd.DataFrame()
    train_new['track_id'] = train['track_id'].apply(lambda x : tr_map[x])
    train_new['playlist_id'] = train['playlist_id'].apply(lambda x : pl_map[x])
    
    rows = train_new['playlist_id'].values
    cols = train_new['track_id'].values
    values = np.ones(len(train_new))
    
    M = coo_matrix((values, (rows, cols)))
    return M.tocsr()
    

# Set location of the folder with all data

In [4]:
location = 'test2/'

In [15]:
load_things(location)

## Load URM

In [11]:
%%time
M = load_URM()

CPU times: user 18.8 s, sys: 145 ms, total: 19 s
Wall time: 19 s


## Prediction #1: dot product

Best result so far: 1.5 0.7 0 0 0.1 0 0.01

In [20]:
S = load_similarity(location)
pl2id_map = build_num_to_id_map(playlists, 'playlist_id')
tr2id_map = build_num_to_id_map(tracks, 'track_id')
pl2num_map = build_id_to_num_map(playlists, 'playlist_id')

M = M.tocsr()
predictions = {}
for pl_id in target_playlists['playlist_id'].values:
    pl_num = pl2num_map[pl_id]
    r = M[pl_num,:].dot(S)
    idx = r.data.argsort()
    ranking = np.flip(r.indices[idx], 0)
    
    count = 0
    i = 0
    pred = []
    while count < 5 and i < len(ranking):
        tr_id = tr2id_map[ranking[i]]
        if tr_id not in tracks_in_playlist.loc[pl_id]['track_ids']:
            pred.append(tr_id)
            count +=1
        i+=1
    i=0
    if (len(pred) < 5):
        print("aaaargh len < 5")
        print("{0}".format(pl_num))
    while len(pred) < 5 and i < len(ranking):
        pred.append(tr2id_map[ranking[i]])
        i+=1
    predictions[pl_id] = np.array(pred)

pred = pd.DataFrame()
pred['playlist_id'] = predictions.keys()
pred['track_ids'] = list(predictions.values())
evaluate(test, pred)

0.0735092426952895

## Prediction #2: min distance

Best result so far: 1.5 0.7 0 0 0.1 0 0.01

In [19]:
S = load_similarity(location)
pl2id_map = build_num_to_id_map(playlists, 'playlist_id')
tr2id_map = build_num_to_id_map(tracks, 'track_id')
pl2num_map = build_id_to_num_map(playlists, 'playlist_id')

M = M.tocsr()
predictions = {}
for pl_id in target_playlists['playlist_id'].values:
    pl_num = pl2num_map[pl_id]
    
    tmp_a = M[pl_num,:].nonzero()[1]
    tmp_c = S[tmp_a,:]
    tmp_b = tmp_c.data.argsort()
    ranking = np.flip(tmp_c.indices[tmp_b], 0)
    
    count = 0
    i = 0
    pred = []
    while count < 5 and i < len(ranking):
        tr_id = tr2id_map[ranking[i]]
        if tr_id not in tracks_in_playlist.loc[pl_id]['track_ids']:
            pred.append(tr_id)
            count +=1
        i+=1
    i=0
    if (len(pred) < 5):
        print("aaaargh len < 5")
        print("{0}".format(pl_num))
    while len(pred) < 5:
        pred.append(0)
        i+=1
    predictions[pl_id] = np.array(pred)
pred = pd.DataFrame()
pred['playlist_id'] = predictions.keys()
pred['track_ids'] = list(predictions.values())
evaluate(test, pred)

0.06440838231535922