In [1]:
import ast
import math
import os
import sys

import numpy as np
import pandas as pd
from scipy.sparse import *

from recsys.preprocess import *
from recsys.utility import *

In [2]:
def load_things(location, has_test = True):
    """Read the competition CSV files into module-level globals.

    Parameters
    ----------
    location : str
        Folder containing ``train.csv``, ``target_playlists.csv``,
        ``target_tracks.csv`` and, when ``has_test`` is true, ``test.csv``.
    has_test : bool, default True
        When true, ``test.csv`` is also read into the global ``test``.
        When false, ``test`` is not assigned by this call.

    Globals set
    -----------
    train, target_playlists, target_tracks : DataFrames read from ``location``.
    playlists, tracks : DataFrames read from the fixed, tab-separated files
        ``data/playlists_final.csv`` and ``data/tracks_final.csv``.
    tracks_in_playlist : result of ``get_playlist_track_list2(train)``.
    tracks_target_only : rows of ``tracks`` whose ``track_id`` is listed in
        ``target_tracks``.
    """
    global train, test, playlists, tracks, target_tracks, target_playlists, tracks_in_playlist, tracks_target_only

    train = pd.read_csv(os.path.join(location, 'train.csv'))
    target_playlists = pd.read_csv(os.path.join(location, 'target_playlists.csv'))
    target_tracks = pd.read_csv(os.path.join(location, 'target_tracks.csv'))

    playlists = pd.read_csv('data/playlists_final.csv', delimiter='\t')
    tracks = pd.read_csv('data/tracks_final.csv', delimiter='\t')

    # The 'tags' cells are text that is parsed into an array. They used to be
    # passed to eval(), which executes any expression found in the file;
    # ast.literal_eval only accepts Python literals.
    # NOTE(review): assumes every cell is a plain literal such as "[1, 2, 3]" - confirm.
    tracks['tags'] = tracks['tags'].apply(lambda x: np.array(ast.literal_eval(x)))
    # Index the tracks by their id (the 'track_id' column is kept as well).
    tracks.index = tracks.track_id

    tracks_in_playlist = get_playlist_track_list2(train)
    tracks_target_only = tracks[tracks.track_id.isin(target_tracks.track_id)]

    if has_test:
        test = pd.read_csv(os.path.join(location, 'test.csv'))

In [3]:
def load_similarity(location, shape=None):
    """Load the sparse similarity matrix stored in ``<location>/similarity.txt``.

    The file is read as text. Its first line is ignored; the second, third
    and fourth lines hold the whitespace-separated row indices, column
    indices and values of the stored entries.

    Parameters
    ----------
    location : str
        Folder that contains ``similarity.txt``.
    shape : tuple of int, optional
        Shape passed to ``coo_matrix``. When omitted (the default) the shape
        is inferred from the largest row and column index in the file.

    Returns
    -------
    scipy.sparse.csr_matrix

    Raises
    ------
    ValueError
        If the file has fewer than four lines or the three lists differ in
        length.
    """
    path = os.path.join(location, 'similarity.txt')
    with open(path, 'r') as f:
        content = f.readlines()

    if len(content) < 4:
        raise ValueError(
            "{0}: expected at least 4 lines, found {1}".format(path, len(content)))

    # split() without an argument splits on any run of whitespace, so doubled
    # or trailing separators no longer produce empty tokens that int()/float()
    # cannot parse.
    row = [int(token) for token in content[1].split()]
    col = [int(token) for token in content[2].split()]
    data = [float(token) for token in content[3].split()]

    if not (len(row) == len(col) == len(data)):
        raise ValueError(
            "{0}: row/col/data lengths differ ({1}/{2}/{3})".format(
                path, len(row), len(col), len(data)))

    coo = coo_matrix((data, (row, col)), shape=shape)
    return coo.tocsr()

In [4]:
def from_num_to_id(df, row_num, column = 'track_id'):
    """Return the value of `column` at positional row `row_num`.

    `df` must contain `column` (defaults to 'track_id'); `row_num` is a
    0-based position, not an index label.
    """
    # Pick the column first. df.iloc[row_num] would build a row Series across
    # all columns, which coerces them to a common dtype (an integer id turns
    # into a float when the frame also has a float column).
    return df[column].iloc[row_num]

def from_id_to_num(df, tr_id, column='track_id'):
    """Return the position of the first row whose `column` equals `tr_id`.

    `df` must contain `column` (defaults to 'track_id'). An IndexError is
    raised when no row matches.
    """
    is_match = df[column].values == tr_id
    matching_positions = np.nonzero(is_match)[0]
    return matching_positions[0]

In [5]:
def build_id_to_num_map(df, column):
    """Build a Series that maps each value of `df[column]` to its row position.

    The Series is indexed by the values of `column` and holds 0..len(df)-1.
    """
    row_positions = np.arange(len(df))
    return pd.Series(row_positions, index=df[column])

def build_num_to_id_map(df, column):
    """Build a Series that maps each row position to the value of `df[column]`.

    The Series is indexed by 0..len(df)-1, whatever index `df` itself uses,
    and keeps the column's name.
    """
    ids = df[column]
    return pd.Series(ids.values, index=np.arange(len(df)), name=ids.name)

In [6]:
def _ids_to_positions(id_map, ids, what):
    """Translate `ids` to row positions through `id_map` in one vectorised lookup."""
    positions = id_map.reindex(ids.values)
    unknown = positions.isna().values
    if unknown.any():
        # Same exception type as a failed `id_map[x]` lookup, but it names the
        # offending ids instead of only the first one hit.
        raise KeyError("{0} value(s) missing from the {1} table, e.g. {2}".format(
            int(unknown.sum()), what, list(ids.values[unknown][:5])))
    return positions.values.astype(np.int64)


def load_URM():
    """Build the playlist x track interaction matrix from the loaded globals.

    Reads the globals ``tracks``, ``playlists`` and ``train``. Each row of
    ``train`` contributes a 1.0 at (position of its playlist_id in
    ``playlists``, position of its track_id in ``tracks``). Repeated pairs
    are summed when the COO matrix is converted to CSR.

    No shape is passed to ``coo_matrix``, so the matrix is only as large as
    the highest playlist and track positions that occur in ``train``.

    Returns
    -------
    scipy.sparse.csr_matrix

    Raises
    ------
    KeyError
        If ``train`` references an id that is absent from ``playlists`` or
        ``tracks``.
    """
    tr_map = build_id_to_num_map(tracks, 'track_id')
    pl_map = build_id_to_num_map(playlists, 'playlist_id')

    # One vectorised lookup per column instead of a Python-level call per row.
    rows = _ids_to_positions(pl_map, train['playlist_id'], 'playlist_id')
    cols = _ids_to_positions(tr_map, train['track_id'], 'track_id')
    values = np.ones(len(train))

    M = coo_matrix((values, (rows, cols)))
    return M.tocsr()
    

# Set location of the folder with all data

In [7]:
# Folder the input files (train.csv, target_*.csv, similarity.txt) are read
# from and results.csv is written to.
location = 'submission2/'

In [9]:
# has_test=False: test.csv is not read, so the global `test` is not set by this call.
load_things(location, False)

## Load URM

In [10]:
%%time
# Build the playlist x track matrix from the globals filled in by load_things.
M = load_URM()

CPU times: user 19.2 s, sys: 127 ms, total: 19.3 s
Wall time: 19.3 s


In [11]:
# Re-weight the URM column by column: every stored entry of a column becomes
# log(number of rows / number of stored entries in that column).
# Columns without stored entries are left untouched.
# NOTE(review): the following cell has the same execution count and overwrites
# these weights for every non-empty column - confirm whether both are meant to run.
M = M.tocsc()
n_rows = M.shape[0]
for col_start, col_end in zip(M.indptr[:-1], M.indptr[1:]):
    n_playlist = col_end - col_start
    if n_playlist < 1:
        continue
    # A scalar assigned to the slice fills every stored entry of the column.
    M.data[col_start:col_end] = math.log(n_rows / n_playlist)

In [11]:
# Re-weight the URM column by column: every stored entry of a column becomes
# log((largest column sum + 20) / number of stored entries in that column).
# NOTE(review): M.sum(0) adds up whatever values M currently holds, so the
# numerator differs depending on whether M still contains the raw URM or
# weights written by an earlier cell - confirm which is intended.
M = M.tocsc()
max_pl_length = M.sum(0).A.max()
smoothed_max = max_pl_length + 20
entries_per_column = np.diff(M.indptr)
for col, n_playlist in enumerate(entries_per_column):
    if n_playlist >= 1:
        first = M.indptr[col]
        # A scalar assigned to the slice fills every stored entry of the column.
        M.data[first:first + n_playlist] = math.log(smoothed_max / n_playlist)
    else:
        # A column with no stored entries has nothing to re-weight; only report it.
        print("argh")

argh


## Prediction #1: dot product

Best result so far: 

- 1.5 0.7 0 0 0.1 0 0.01   --> 0.0735
- 1.5 0.7 0 0 0.1 0 0 --> 0.0710
- 1 0.7 0.01 0 0.3 0 0 --> 0.0669
- 1.4 0.7 0.01 0 0.3 0 0.5 --> 0.0659
- 1.4 0.7 0.01 0 0.3 0.4 0.01 --> 0.0638
- 1.5 0.7 0 0 0.1 0 0 --> 0.07108
- 1.5 0.7 0 0 0.1 0 0.02 --> 0.07346
- 1.5 0.7 0 0 0.1 0 0.1 --> 0.0734
- 1.7 0.7 0 0 0.1 0 0.5 --> 0.0585

== dataset 2

- 1.5 0.7 0.1 0 0.1 0 0.01 --> 0.07207

(restoring tag idf)
- 1.5 0.7 0.1 0 0.1 0 0.01 --> 0.0742
- 1.5 0.7 0 0 0.3 0 0.01 --> 0.07166
- 1.5 0.5 0.01 0.01 0.1 0 0.01 --> 0.07165

== dataset 3
- 1.5 0.7 0.1 0 0.1 0 0.01 --> 0.0722

(using tag IDF log(100000/#tag_freq))

== dataset 5
- 1.5 0.7 0.1 0.1 0.1 0.1 0.1 --> 0.07548  (URM changed)
- 1 0.7 0.1 0.1 0.1 0.1 0.1 --> 0.0739   (URM changed)
- 1 0.7 0.1 0.1 0.1 0.1 0.1 --> 0.0715

== dataset 7
- 1.5 0.7 0.1 0.1 0.1 0.1 0.1 --> 0.0723
- 1.5 0.7 0.1 0.1 0.1 0.1 0.1 --> 0.07467 (URM changed)
- 1.5 0.7 0.1 0.1 0.1 0.1 0.1 --> 0.07585 (URM changed v.2)


In [12]:
# Prediction #1: score every track for a target playlist as
# (playlist row of M) x S and recommend the 5 best-scoring tracks that are not
# already in the playlist.
# NOTE(review): assumes S uses the same track numbering as tr2id_map - confirm.
S = load_similarity(location)
pl2id_map = build_num_to_id_map(playlists, 'playlist_id')
tr2id_map = build_num_to_id_map(tracks, 'track_id')
pl2num_map = build_id_to_num_map(playlists, 'playlist_id')

M = M.tocsr()
predictions = {}
for pl_id in target_playlists['playlist_id'].values:
    pl_num = pl2num_map[pl_id]
    r = M[pl_num,:].dot(S)
    # Stored scores sorted ascending, then reversed: best-scoring track first.
    idx = r.data.argsort()
    ranking = np.flip(r.indices[idx], 0)

    # Looked up once per playlist instead of once per candidate. Skipped when
    # there are no candidates, because the lookup was never reached in that case.
    known_tracks = tracks_in_playlist.loc[pl_id]['track_ids'] if len(ranking) > 0 else ()

    top_tracks = []
    for tr_num in ranking:
        if len(top_tracks) == 5:
            break
        tr_id = tr2id_map[tr_num]
        if tr_id not in known_tracks:
            top_tracks.append(tr_id)

    if len(top_tracks) < 5:
        print("aaaargh len < 5")
        print("{0}".format(pl_num))
        # Fallback: fill up from the top of the ranking, this time allowing
        # tracks that are already in the playlist. Tracks already recommended
        # are skipped so the same id is never emitted twice.
        for tr_num in ranking:
            if len(top_tracks) == 5:
                break
            tr_id = tr2id_map[tr_num]
            if tr_id not in top_tracks:
                top_tracks.append(tr_id)
    predictions[pl_id] = np.array(top_tracks)

# One row per target playlist; 'track_ids' holds the array of recommended ids.
pred = pd.DataFrame()
pred['playlist_id'] = list(predictions.keys())
pred['track_ids'] = list(predictions.values())
#evaluate(test, pred)

In [13]:
# Number of rows in pred (one per key of `predictions`).
len(pred)

10000

In [None]:
# Turn each collection of track ids into one space-separated string.
# Values that are already strings are kept as they are, so running this cell
# twice is harmless (joining a string would put spaces between its characters).
pred['track_ids'] = pred['track_ids'].apply(lambda x : x if isinstance(x, str) else ' '.join(map(str, x)))

In [16]:
# Write the predictions to results.csv inside `location`, without the row index column.
pred.to_csv(os.path.join(location,'results.csv'), index=False)

In [17]:
# Display the predictions table.
pred

Unnamed: 0,playlist_id,track_ids
0,10024884,2327471 2730084 2985464 3749918 1425053
1,10624787,2016284 3779369 381424 309262 3392560
2,4891851,1406862 1371741 1886350 2089117 166570
3,4267369,2504992 2575850 290643 2740798 3214480
4,65078,3138311 2225630 1210951 3341178 431268
5,10637124,2340644 2123901 414791 1120194 3559283
6,3223162,2787106 281865 303587 2296339 1510434
7,7541503,661053 2086779 585705 2889733 3219595
8,6189367,495158 1054050 276186 1195554 1675280
9,8459943,3129861 2955950 1070186 826298 1101374


## Prediction #2: min distance

Best result so far: 

- 1.5 0.7 0 0 0.1 0 0.01 --> 0.06
- 1.5 0.7 0 0 0.1 0 0 --> 0.0631
- 1 0.7 0.01 0 0.3 0 0 --> 0.0628
- 1.4 0.7 0.01 0 0.3 0 0.5 --> 0.0639
- 1.4 0.7 0.01 0 0.3 0.4 0.01 --> 0.0589
- 1.5 0.7 0 0 0.1 0 0 --> 0.06317
- 1.5 0.7 0 0 0.1 0 0.02 --> 0.0644
- 1.5 0.7 0 0 0.1 0 0.1 --> 0.06441
- 1.7 0.7 0 0 0.1 0 0.5 --> 0.06509

== dataset 2

- 1.5 0.7 0.1 0 0.1 0 0.01 --> 0.0632

(restoring tag idf)
- 1.5 0.7 0.1 0 0.1 0 0.01 --> 0.06777
- 1.5 0.7 0 0 0.3 0 0.01 --> 0.0677
- 1.5 0.5 0.01 0.01 0.1 0 0.01 --> 0.06784

(using tag IDF log(100000/#tag_freq))

== dataset 5
- 1.5 0.7 0.1 0.1 0.1 0.1 0.1 --> 0.0704

In [None]:
# Prediction #2: take the rows of S that belong to the tracks of a target
# playlist, order all their stored values from largest to smallest and
# recommend the first 5 distinct tracks that are not already in the playlist.
# NOTE(review): assumes S uses the same track numbering as tr2id_map - confirm.
S = load_similarity(location)
pl2id_map = build_num_to_id_map(playlists, 'playlist_id')
tr2id_map = build_num_to_id_map(tracks, 'track_id')
pl2num_map = build_id_to_num_map(playlists, 'playlist_id')

M = M.tocsr()
predictions = {}
for pl_id in target_playlists['playlist_id'].values:
    pl_num = pl2num_map[pl_id]

    # Column numbers of the entries stored in this playlist's row of M.
    seed_tracks = M[pl_num,:].nonzero()[1]
    seed_rows = S[seed_tracks,:]
    # Stored values sorted ascending, then reversed: largest value first.
    # The same column can occur once per selected row.
    order = seed_rows.data.argsort()
    ranking = np.flip(seed_rows.indices[order], 0)

    # Looked up once per playlist instead of once per candidate. Skipped when
    # there are no candidates, because the lookup was never reached in that case.
    known_tracks = tracks_in_playlist.loc[pl_id]['track_ids'] if len(ranking) > 0 else ()

    top_tracks = []
    for tr_num in ranking:
        if len(top_tracks) == 5:
            break
        tr_id = tr2id_map[tr_num]
        # A track reachable from several rows shows up several times in the
        # ranking; recommend it only once.
        if tr_id in top_tracks:
            continue
        if tr_id not in known_tracks:
            top_tracks.append(tr_id)

    if len(top_tracks) < 5:
        print("aaaargh len < 5")
        print("{0}".format(pl_num))
        # Pad with the placeholder id 0 so every playlist gets exactly 5 entries.
        top_tracks.extend([0] * (5 - len(top_tracks)))
    predictions[pl_id] = np.array(top_tracks)

# One row per target playlist; 'track_ids' holds the array of recommended ids.
pred = pd.DataFrame()
pred['playlist_id'] = list(predictions.keys())
pred['track_ids'] = list(predictions.values())
# Needs the global `test`, which load_things only reads when has_test is true.
evaluate(test, pred)

In [None]:
0.06932