In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import scipy as sc
from sklearn import preprocessing
from sklearn import model_selection

from recsys import preprocess
from recsys import utility

RANDOM_STATE = 2342

%matplotlib inline

In [2]:
def _load_tsv(path):
    """Read one tab-separated data file into a DataFrame."""
    return pd.read_csv(path, delimiter='\t')


# Raw inputs: every file under data/ is tab-separated.
train = _load_tsv('data/train_final.csv')
playlists = _load_tsv('data/playlists_final.csv')
target_playlists = _load_tsv('data/target_playlists.csv')
target_tracks = _load_tsv('data/target_tracks.csv')
tracks = _load_tsv('data/tracks_final.csv')

In [48]:
# Scratch: build a frame with one row per distinct playlist_id found in `train`.
x = pd.DataFrame(train['playlist_id'].drop_duplicates())
# Re-label the rows with the playlist ids themselves (unique() keeps first-occurrence order,
# the same order drop_duplicates() produced above).
x.index = train['playlist_id'].unique()
# For each playlist, collect the values of its 'track_id' column; the assignment aligns the
# groupby result on the playlist_id index. Note the lambda parameter shadows the outer `x`.
x['tracks'] = train.groupby('playlist_id').apply(lambda x : x['track_id'].values)

In [10]:
df = pd.DataFrame(data = get_target_playlists())
df.index = get_target_playlists()['playlist_id']

In [11]:
df['x'] = [np.array([]) for i in range(len(df))]

In [13]:
df.loc[10024884,('x',)] = np.array([1])

In [14]:
df

Unnamed: 0_level_0,playlist_id,x
playlist_id,Unnamed: 1_level_1,Unnamed: 2_level_1
10024884,10024884,[1]
10624787,10624787,[]
4891851,4891851,[]
4267369,4267369,[]
65078,65078,[]
10637124,10637124,[]
3223162,3223162,[]
7541503,7541503,[]
6189367,6189367,[]
8459943,8459943,[]


In [58]:
pd.DataFrame?

In [21]:
%%time

from recsys.preprocess import *
from sklearn import model_selection
import numpy as np
from recsys.utility import *

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

train = get_train()
target_playlist = get_target_playlists()
target_tracks = get_target_tracks()


train, test, target_playlist, target_tracks = train_test_split(train, test_size=0.20)

most_popular = get_most_popular_tracks(train)
tracks_in_playlist = get_playlist_track_list2(train)

tracks_to_suggest = most_popular.index.values
predictions = []

predictions = pd.DataFrame(target_playlist)
predictions.index = target_playlist['playlist_id']
predictions['track_ids'] = [np.array([]) for i in range(len(predictions))]
predictions['track_ids'] = predictions['track_ids'].astype(object)

for it,row in target_playlist.iterrows():
    count = 0
    i = 0
    pred = []
    while count < 5:
        if tracks_to_suggest[i] not in tracks_in_playlist.loc[row['playlist_id']]['track_ids']:
            # Predict track i
            # IMPORTANT: should we check if the track to suggest is in target_tracks?
            pred.append(tracks_to_suggest[i])
            count += 1
        i += 1
    #predictions.loc[row['playlist_id'],'track_ids'] = np.array([])
    predictions.loc[row['playlist_id']] = predictions.loc[row['playlist_id']].set_value('track_ids', np.array(pred))
    #predictions.append(' '.join(map(str, pred)))

#results = pd.DataFrame(np.transpose([target_playlist['playlist_id'].values, predictions]), columns=['playlist_id', 'track_ids'])

#results.to_csv('results.csv', index=False)


CPU times: user 22.4 s, sys: 302 ms, total: 22.7 s
Wall time: 22.7 s


In [22]:
def evaluate(test, recommendations, should_transform_test=True):
    """
    Compute the mean average precision of "recommendations" against "test".

     - "test" is:
           if should_transform_test == True: a dataframe with columns "playlist_id" and
               "track_id"; it is first converted with preprocess.get_playlist_track_list2.
           else: an already converted dataframe, indexed by playlist id, whose
               "track_ids" column holds the held-out tracks of each playlist.
     - "recommendations" is a dataframe with a "playlist_id" column and a "track_ids"
       column holding the recommended tracks of each playlist, best first.

    For each playlist the average precision is the sum of precision@k over every rank k
    at which a held-out track was recommended, divided by the number of recommended
    tracks. A playlist with no recommended tracks contributes 0.

    Returns the mean over all rows of "recommendations" (0.0 if there are none).
    Raises KeyError if a recommended playlist is missing from the test structure.
    """
    if should_transform_test:
        # Group the raw (playlist_id, track_id) rows by playlist.
        test_df = preprocess.get_playlist_track_list2(test)
    else:
        test_df = test

    # Preview of the ground truth actually used for scoring.
    print(test_df.iloc[:10])

    # Nothing to score: avoid dividing by zero below.
    if len(recommendations) == 0:
        return 0.0

    mean_ap = 0
    for _, row in recommendations.iterrows():
        tracks = row['track_ids']
        if len(tracks) == 0:
            # No recommendation for this playlist: its average precision is 0.
            continue
        # Fetch the held-out tracks once per playlist rather than once per recommended track.
        relevant = test_df.loc[row['playlist_id']]['track_ids']
        correct = 0
        ap = 0
        for it, t in enumerate(tracks):
            if t in relevant:
                correct += 1
                ap += correct / (it + 1)
        mean_ap += ap / len(tracks)

    return mean_ap / len(recommendations)

In [23]:
test.head()

Unnamed: 0,playlist_id,track_id
24.0,4878824,648387
26.0,4144882,1081479
116.0,10560762,3748363
120.0,2494890,2677523
142.0,6441207,1629252


In [24]:
evaluate(recommendations=predictions, test=test)

          playlist_id                                      track_ids
4878824       4878824    [648387, 3191548, 720730, 3187360, 1258042]
4144882       4144882   [1081479, 1306289, 681191, 3422255, 1855073]
10560762     10560762   [3748363, 463561, 3179150, 2547193, 2009544]
2494890       2494890   [2677523, 848916, 3221830, 1844060, 3305608]
6441207       6441207  [1629252, 1717060, 1842924, 2017173, 3191688]
4103126       4103126    [2027988, 200308, 3232697, 3250658, 709079]
721037         721037  [2063857, 1254963, 2855946, 3481379, 2956696]
8789434       8789434   [3138322, 3214478, 2696008, 2971227, 620395]
11222729     11222729   [948134, 2834348, 2149448, 2266763, 1537605]
3020491       3020491    [1885450, 910115, 1635000, 1257890, 139306]


0.001113269751560108

In [25]:
predictions['track_ids'] = predictions['track_ids'].apply(lambda x : ' '.join(map(str, x)))

In [26]:
predictions

Unnamed: 0_level_0,playlist_id,track_ids
playlist_id,Unnamed: 1_level_1,Unnamed: 2_level_1
5631546,5631546,1563309 3705881 1363985 1595978 3779477
8180835,8180835,1563309 3705881 1363985 1595978 3779477
129558,129558,1563309 3705881 1363985 1595978 3779477
10048079,10048079,1563309 3705881 1363985 1595978 3779477
5810002,5810002,1563309 3705881 1363985 3779477 204966
11497662,11497662,1563309 3705881 1363985 1595978 3779477
7376648,7376648,1563309 3705881 1363985 1595978 3779477
10856669,10856669,1563309 3705881 1363985 1595978 3779477
3023131,3023131,1563309 3705881 1363985 1595978 3779477
6178128,6178128,1563309 3705881 1363985 1595978 3779477


In [57]:
from recsys.utility import *

In [None]:
evaluate(recommendations=predictions, test=test)

In [41]:
a = predictions.loc[10024884]

In [43]:
a.set_value('track_ids', np.array([1, 23]))

playlist_id    10024884
track_ids       [1, 23]
Name: 10024884, dtype: object

In [51]:
predictions.loc[10024884] = predictions.loc[10024884].set_value('track_ids', np.array([1, 2, 3]))

In [52]:
predictions

Unnamed: 0_level_0,playlist_id,track_ids
playlist_id,Unnamed: 1_level_1,Unnamed: 2_level_1
10024884,10024884,"[1, 2, 3]"
10624787,10624787,[]
4891851,4891851,[]
4267369,4267369,[]
65078,65078,[]
10637124,10637124,[]
3223162,3223162,[]
7541503,7541503,[]
6189367,6189367,[]
8459943,8459943,[]


In [None]:
predictions.loc[10024884].set_value

In [67]:
a = df['tracks']

In [68]:
a.dtype = np.array

AttributeError: can't set attribute

In [3]:
train, test = model_selection.train_test_split(train, test_size=0.20, random_state=RANDOM_STATE)

In [8]:
small_train = train.loc[1:1000,:]

In [12]:
s = small_train.groupby('track_id')

In [22]:
a = s.count()

In [27]:
a.sort_values('playlist_id', ascending=False, inplace=True)

In [234]:
df.to_csv?

In [39]:
a = pd.DataFrame(columns=['a', 'b'])

In [None]:
train = get_train()
target_playlist = get_target_playlists()
target_tracks = get_target_tracks()

train, test = model_selection.train_test_split(train, test_size=0.20, random_state=RANDOM_STATE)

In [29]:
most_popular = get_most_popular_tracks(train)
tracks_in_playlist = get_playlist_track_list(train)

In [32]:
most_popular = most_popular.index.values

In [41]:
from recsys.preprocess import *
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import scipy as sc
from sklearn import preprocessing
from sklearn import model_selection

RANDOM_STATE = 42

# Number of tracks recommended for every target playlist.
N_RECOMMENDATIONS = 5

# Keep only the popular tracks that are also target tracks, preserving the order of
# `most_popular`. np.intersect1d would return the ids sorted by value and lose the ranking.
# `most_popular` is expected to be the array of track ids produced by the previous cell;
# np.ravel flattens `target_tracks` the same way np.intersect1d did.
tracks_to_suggest = most_popular[np.isin(most_popular, np.ravel(target_tracks))]
predictions = []

for _, row in target_playlist.iterrows():
    # Look the playlist up once instead of once per candidate track.
    already_in_playlist = tracks_in_playlist[row['playlist_id']]
    pred = []
    # Bounded loop: stop at N_RECOMMENDATIONS, or earlier if the candidates run out.
    for candidate in tracks_to_suggest:
        if candidate not in already_in_playlist:
            # Recommend the very track that was checked (the old code checked
            # tracks_to_suggest[i] but appended most_popular[i]).
            pred.append(candidate)
            if len(pred) == N_RECOMMENDATIONS:
                break
    predictions.append(' '.join(map(str, pred)))

In [43]:
results = pd.DataFrame(np.transpose([target_playlist['playlist_id'].values, predictions]), columns=['playlist_id', 'track_ids'])

In [46]:
len(test)

208105

In [47]:
from recsys.utility import *

In [48]:
def evaluate(test, recommendations):
    """
    Compute the mean average precision of "recommendations" against "test".

     - "test" is passed to preprocess.get_playlist_track_list; the result is indexed by
       playlist id and must support the "in" operator on each entry.
     - "recommendations" is either an iterable of (playlist_id, tracks) pairs or a dict
       mapping playlist_id to tracks, with tracks ordered best first. A bare 2-D array of
       track ids is NOT accepted, because the playlist ids would be missing.

    For each playlist the average precision is the sum of precision@k over every rank k
    at which a held-out track was recommended, divided by the number of recommended
    tracks. A playlist with no recommended tracks contributes 0.

    Returns the mean over all playlists (0.0 if there are none).
    """
    test_good = preprocess.get_playlist_track_list(test)

    # Normalise the input to a list of pairs so it can be both iterated and counted.
    if isinstance(recommendations, dict):
        pairs = list(recommendations.items())
    else:
        pairs = list(recommendations)

    # Nothing to score: avoid dividing by zero below.
    if not pairs:
        return 0.0

    mean_ap = 0
    for pl_id, tracks in pairs:
        if len(tracks) == 0:
            # No recommendation for this playlist: its average precision is 0.
            continue
        # Fetch the held-out tracks once per playlist rather than once per recommended track.
        relevant = test_good[pl_id]
        correct = 0
        ap = 0
        for it, t in enumerate(tracks):
            if t in relevant:
                correct += 1
                ap += correct / (it + 1)
        mean_ap += ap / len(tracks)

    return mean_ap / len(pairs)


In [80]:
recomm = np.array([list(map(int,x.split(' '))) for x in results['track_ids'].values])

In [81]:
recomm

array([[1563309, 1363985, 3705881, 1595978, 3779477],
       [1563309, 1363985, 3705881, 1595978, 3779477],
       [1563309, 1363985, 3705881, 1595978, 3779477],
       ..., 
       [1563309, 1363985, 3705881, 1595978, 3779477],
       [1563309, 1363985, 3705881, 1595978, 3779477],
       [1563309, 1363985, 3705881, 1595978, 3779477]])

In [82]:
evaluate(test, recomm)

ValueError: too many values to unpack (expected 2)

In [225]:
def train_test_split(train, test_size=0.3):
    """
    Hold out 5 tracks from a random subset of the playlists in "train".

    Only playlists with more than 7 tracks are eligible; a fraction "test_size" of them is
    drawn without replacement, and 5 rows are sampled from each drawn playlist. Both draws
    use numpy's global random state, so call np.random.seed beforehand for a reproducible split.

     - "train" is a dataframe with columns "playlist_id" and "track_id"; its index labels
       are assumed to be unique. It is not modified.
     - "test_size" is the fraction of eligible playlists to draw.

    Returns (train, test, target_playlists, target_tracks):
     - train: the input rows minus the held-out ones.
     - test: a copy of the held-out rows, ordered by index label.
     - target_playlists: array with the ids of the drawn playlists.
     - target_tracks: sorted array of the distinct held-out track ids.
    """
    by_playlist = train.groupby('playlist_id')
    playlists = by_playlist.count()
    to_choose_playlists = playlists[playlists['track_id'] > 7].index.values
    target_playlists = np.random.choice(to_choose_playlists, replace=False,
                                        size=int(test_size * len(to_choose_playlists)))

    # Gather the pieces in lists and merge them once at the end. Growing arrays with
    # np.union1d from a float np.array([]) seed turned index labels and track ids into floats.
    track_parts = []
    index_parts = []
    for p in target_playlists:
        # Reuse the groupby instead of filtering the whole frame for every playlist.
        selected_df = by_playlist.get_group(p).sample(5)
        track_parts.append(selected_df['track_id'].values)
        index_parts.append(selected_df.index.values)

    if index_parts:
        indexes = np.unique(np.concatenate(index_parts))
        target_tracks = np.unique(np.concatenate(track_parts))
    else:
        # No playlist drawn: empty selections that keep the original dtypes.
        indexes = train.index[:0]
        target_tracks = train['track_id'].values[:0]

    # Take the held-out rows from the frame we were given, not from a notebook global.
    test = train.loc[indexes].copy()
    train = train.drop(indexes)

    return train, test, target_playlists, target_tracks
    

In [226]:
x,y,z,w = train_test_split(train)

In [227]:
len(train)

832417

In [228]:
len(x)

798407

In [229]:
len(y)

34010

In [230]:
len(z)

6802

In [231]:
len(w)

24281

In [233]:
len(y)/len(x)

0.04259732191726776

In [235]:
df.to_csv?

In [237]:
train[train['track_id'] == 252]

Unnamed: 0,playlist_id,track_id
269276,10550126,252
115607,8283678,252
999950,4322356,252
1007372,5554934,252
477645,1404694,252
405463,6102596,252
78094,7996678,252
701585,3853994,252
268901,11355891,252
590443,7333844,252


In [238]:
most_popular = get_most_popular_tracks(train)

In [241]:
most_popular

Unnamed: 0_level_0,playlist_id
track_id,Unnamed: 1_level_1
1563309,383
1363985,351
3705881,322
1595978,322
3779477,317
204966,311
2863395,301
3166665,292
1580480,290
1156143,282


In [243]:
target_tracks['track_id'].values

array([1316175, 3885714, 3091270, ..., 2739213, 2228646, 2265463])

In [251]:
len(np.intersect1d(most_popular.index.values[:10], target_tracks['track_id'].values))

10

In [247]:
len(target_tracks['track_id'])

32195

In [249]:
len(most_popular.index)

99789

In [252]:
most_popular

Unnamed: 0_level_0,playlist_id
track_id,Unnamed: 1_level_1
1563309,383
1363985,351
3705881,322
1595978,322
3779477,317
204966,311
2863395,301
3166665,292
1580480,290
1156143,282


In [215]:
playlists = train.groupby('playlist_id').count()
to_choose_playlists = playlists[playlists['track_id'] > 7].index.values

In [216]:
len(to_choose_playlists)

22676

In [217]:
len(playlists)

44884

In [185]:
playlists = a.groupby('playlist_id').count()

Unnamed: 0_level_0,track_id
playlist_id,Unnamed: 1_level_1
10732,1
18357,1
18766,1
21452,1
21567,1
22269,1
35761,1
35966,1
41685,1
45369,1


In [173]:
df.loc[[518209,598021]]

Unnamed: 0,playlist_id,track_id
518209,4546088,2285993
598021,4685686,3133472


In [131]:
a = np.array([1, 2, 3, 4, 5])

In [139]:
np.union1d(a,b)

array([      3,       4,       5,       6,       7,       8,  252429,
       1952325])

In [152]:
a = df.loc[[518209, 598021]]

In [157]:
len(df.drop([518209]))

832416

In [158]:
len(df)

832417

In [128]:
train[train['playlist_id'] == 6226465].sample(5)['track_id'].values

array([3667257, 2765634, 3001336, 3477637, 1277686])

In [117]:
len(to_choose_playlists)

18337

In [92]:
a = (train.groupby('playlist_id').count())

In [101]:
a[a['track_id'] > 10].index.values

array([    7692,     7816,     7912, ..., 11764165, 11764751, 11764851])

In [5]:
pd.Series?

In [24]:
a = pd.Series(dtype=np.ndarray)

Index([], dtype='object')

In [14]:
a

0      column
100       342
dtype: object

In [38]:
len(target_tracks)

32195

In [39]:
len(most_popular)

99789

In [40]:
len(np.intersect1d(target_tracks, most_popular))

32074