In [1]:
from scipy.sparse import *
import numpy as np
import pandas as pd
import sys
import math

from recsys.preprocess import *
from recsys.utility import *

In [2]:
def load_things(location, has_test = True):
    global train, test, playlists, tracks, target_tracks, target_playlists, tracks_in_playlist, tracks_target_only

    train = pd.read_csv(os.path.join(location, 'train.csv'))
    target_playlists = pd.read_csv(os.path.join(location, 'target_playlists.csv'))
    target_tracks = pd.read_csv(os.path.join(location, 'target_tracks.csv'))

    playlists = pd.read_csv('data/playlists_final.csv', delimiter='\t')
    tracks = pd.read_csv('data/tracks_final.csv', delimiter='\t')

    tracks['tags'] = tracks['tags'].apply(lambda x: np.array(eval(x)))
    tracks.index = tracks.track_id

    tracks_in_playlist = get_playlist_track_list2(train)
    tracks_target_only = tracks[tracks.track_id.isin(target_tracks.track_id)]
    
    if has_test:
        test = pd.read_csv(os.path.join(location, 'test.csv'))

In [3]:
def from_num_to_id(df, row_num, column = 'track_id'):
    """ df must have a 'track_id' column """
    return df.iloc[row_num][column]

def from_id_to_num(df, tr_id, column='track_id'):
    """ df must have a 'track_id' column """
    return np.where(df[column].values == tr_id)[0][0]

In [4]:
def build_id_to_num_map(df, column):
    a = pd.Series(np.arange(len(df)))
    a.index = df[column]
    return a

def build_num_to_id_map(df, column):
    a = pd.Series(df[column])
    a.index = np.arange(len(df))
    return a

In [5]:
def load_URM():
    tr_map = build_id_to_num_map(tracks, 'track_id')
    pl_map = build_id_to_num_map(playlists, 'playlist_id')
    
    train_new = pd.DataFrame()
    train_new['track_id'] = train['track_id'].apply(lambda x : tr_map[x])
    train_new['playlist_id'] = train['playlist_id'].apply(lambda x : pl_map[x])
    
    rows = train_new['playlist_id'].values
    cols = train_new['track_id'].values
    values = np.ones(len(train_new))
    
    M = coo_matrix((values, (rows, cols)))
    return M.tocsr()

In [9]:
location = 'submission1'

In [33]:
load_things(location, False)

In [34]:
train, test, target_playlists, target_tracks = train_test_split(train)

In [56]:
URM = load_URM()

In [57]:
URM.data *= 100

In [58]:
N_FEATURES = 5
N_EPOCHS = 5

In [59]:
userValue = np.zeros((URM.shape[0], N_FEATURES))
userValue += 0.1

In [60]:
itemValue = np.zeros((N_FEATURES,URM.shape[1]))
itemValue += 0.1

In [61]:
def predictRating(user, item, features):
    return np.dot(userValue[user,:features+1], itemValue[:features+1,item])

In [62]:
lrate = 0.01
K = 0.02
def train_user(user, item, rating, feature):
    err = (rating - predictRating(user, item, feature))
    
    userValue[user,feature] += lrate * (err * itemValue[feature,item] - K*userValue[user,feature])
    itemValue[feature,item] += lrate * (err * userValue[user,feature] - K*itemValue[feature, item])

In [63]:
URM = URM.tocoo()

In [64]:
%%time
for f in range(N_FEATURES):
    for i in range(N_EPOCHS):
        print("training feature {0}, stage {1}".format(f, i))
        for r,c in zip(URM.row, URM.col):
            train_user(r, c, 1, f)

training feature 0, stage 0
training feature 0, stage 1
training feature 0, stage 2
training feature 0, stage 3
training feature 0, stage 4
training feature 1, stage 0
training feature 1, stage 1
training feature 1, stage 2
training feature 1, stage 3
training feature 1, stage 4
training feature 2, stage 0
training feature 2, stage 1
training feature 2, stage 2
training feature 2, stage 3
training feature 2, stage 4
training feature 3, stage 0
training feature 3, stage 1
training feature 3, stage 2
training feature 3, stage 3
training feature 3, stage 4
training feature 4, stage 0
training feature 4, stage 1
training feature 4, stage 2
training feature 4, stage 3
training feature 4, stage 4
CPU times: user 1min 40s, sys: 434 ms, total: 1min 40s
Wall time: 1min 40s


In [65]:
userValue

array([[ 0.3484106 ,  0.20450948,  0.19200359,  0.18347331,  0.17698956],
       [ 0.14564625,  0.1370501 ,  0.13463041,  0.13276513,  0.13122209],
       [ 0.15551108,  0.13950976,  0.13736335,  0.13569922,  0.1343081 ],
       ..., 
       [ 0.1       ,  0.1       ,  0.1       ,  0.1       ,  0.1       ],
       [ 0.1       ,  0.1       ,  0.1       ,  0.1       ,  0.1       ],
       [ 0.19261852,  0.14123138,  0.13508252,  0.13103149,  0.1280592 ]])

In [None]:
userValue

In [47]:
len(URM.row)

998057

In [66]:
sum((100 - predictRating(r,c, 5))**2 for r,c in zip(URM.row, URM.col))

9879373315.5900249

In [None]:
sum((1 - predictRating(r,c, 0))**2 for r,c in zip(URM.row, URM.col))

In [67]:
pl2id_map = build_num_to_id_map(playlists, 'playlist_id')
tr2id_map = build_num_to_id_map(tracks, 'track_id')
pl2num_map = build_id_to_num_map(playlists, 'playlist_id')

predictions = {}
for pl_id in target_playlists['playlist_id'].values:
    pl_num = pl2num_map[pl_id]
    
    r = np.dot(userValue[pl_num,:], itemValue)
    
    ranking = np.flip(r.argsort(), 0)
    
    count = 0
    i = 0
    pred = []
    while count < 5 and i < len(ranking):
        tr_id = tr2id_map[ranking[i]]
        if tr_id not in tracks_in_playlist.loc[pl_id]['track_ids']:
            pred.append(tr_id)
            count +=1
        i+=1
    i=0
    if (len(pred) < 5):
        print("aaaargh len < 5")
        print("{0}".format(pl_num))
    while len(pred) < 5 and i < len(ranking):
        pred.append(tr2id_map[ranking[i]])
        i+=1
    predictions[pl_id] = np.array(pred)
    
pred = pd.DataFrame()
pred['playlist_id'] = predictions.keys()
pred['track_ids'] = list(predictions.values())
print(evaluate(test, pred))

0.0


In [48]:
predictions

{3664309: array([2895247, 1128441, 3559131, 1583562,  268373]),
 554529: array([1128441, 3559131,  474273, 1583562, 2895247]),
 11596724: array([2895247, 1128441, 3559131, 1583562,  268373]),
 3984698: array([ 474273, 1128441, 3559131, 2763511, 1470393]),
 4757773: array([ 474273, 2763511, 1128441, 3559131, 1470393]),
 6844003: array([2895247, 1128441, 3559131, 1583562,  268373]),
 8324919: array([1128441, 3559131,  474273, 2895247, 1583562]),
 7474767: array([1128441, 2895247, 3559131, 1583562,  474273]),
 8933180: array([ 474273, 1128441, 2763511, 3559131, 1470393]),
 6619854: array([1128441,  474273, 3559131, 2763511, 1470393]),
 6431028: array([1128441, 2895247, 3559131, 1583562,  268373]),
 6842662: array([1128441, 3559131, 2895247,  474273,  268373]),
 5647012: array([1128441, 3559131,  474273, 2895247, 1583562]),
 3978607: array([2895247, 1128441, 3559131, 1583562,  268373]),
 3803509: array([2895247, 1128441, 3559131, 1583562,  268373]),
 6793527: array([1128441, 3559131, 28952

In [None]:
predictions

In [24]:
pred

Unnamed: 0,playlist_id,track_ids
0,6469473,"[1321053, 209196, 4733003, 204966, 3862221]"
1,6900006,"[2609171, 3752712, 1193299, 1286763, 2655526]"
2,8161698,"[3796108, 2863395, 2609171, 363983, 1495432]"
3,3231231,"[2609171, 1074579, 3705881, 3796108, 363983]"
4,8170745,"[2863395, 3866410, 3796108, 3687848, 840796]"
5,6544319,"[1563309, 3705881, 1363985, 2981369, 1074579]"
6,8972292,"[1286763, 1595978, 611551, 3752712, 2032052]"
7,11549852,"[1563309, 2981369, 2967714, 183353, 1363985]"
8,5776564,"[1563309, 2981369, 3705881, 1363985, 2967714]"
9,7257649,"[1563309, 2618210, 675104, 1187176, 3144248]"


In [26]:
test[test['playlist_id'] == 6469473]

Unnamed: 0,playlist_id,track_id
288513.0,6469473,879546
305670.0,6469473,1026210
558325.0,6469473,3673500
665729.0,6469473,1569485
725669.0,6469473,1818796


In [29]:
r = np.dot(U[0,:], V)

In [31]:
r[27900]

5.6324401818372275e-05

In [32]:
max(r)

0.00081480120319666022

In [30]:
from_id_to_num(tracks, 879546)

27900

In [49]:
from scipy.sparse.linalg import svds

In [50]:
U, S, V = svds(URM, 5)

In [51]:
S = np.diag(S)

In [52]:
userValue = U
itemValue = np.dot(S, V)

In [53]:
userValue

array([[  1.43533161e-03,  -1.89739212e-03,   8.84917365e-04,
         -3.22831800e-03,   2.22694683e-03],
       [ -2.11646831e-04,  -7.25027759e-05,   7.19953722e-05,
         -1.47237123e-04,   6.06536430e-05],
       [  5.83563050e-06,  -2.35980266e-04,   1.82595357e-04,
         -1.05983720e-04,   2.27673496e-04],
       ..., 
       [ -2.25980716e-19,  -6.86561078e-19,  -7.60702534e-20,
          1.26754214e-19,   5.63646040e-19],
       [  4.04454531e-19,   1.03231335e-18,   8.09980148e-20,
         -3.53345780e-19,  -7.59772015e-19],
       [ -1.54870715e-04,  -3.61666941e-04,   7.48578050e-04,
         -7.10443517e-04,   3.83701465e-04]])

In [54]:
itemValue

array([[-0.03759189,  0.01595046, -0.16346831, ..., -0.03630885,
        -0.01185321, -0.00138212],
       [ 0.04762887,  0.08761549,  0.17095259, ...,  0.05361291,
         0.01302699, -0.00029386],
       [-0.01201329,  0.12321235, -0.02149348, ...,  0.01583089,
        -0.01458195,  0.00192119],
       [-0.01444753,  0.12302933, -0.05037432, ..., -0.03164104,
        -0.00263382,  0.00066702],
       [ 0.01781703,  0.0757955 ,  0.08973941, ...,  0.03584901,
         0.00635305,  0.00079538]])