In [1]:
import pandas as pd
import numpy as np
import scipy.sparse as sp
from scipy import savetxt

In [2]:
# ---> initial stats
# Load the raw (playlist_id, track_id) interactions from the tab-separated file.
df = pd.read_csv(
    "./CsvFiles/train_final.csv",
    sep="\t",
    header=0,
    dtype={"playlist_id": np.int32, "track_id": np.int32},
)
# 'count' = number of interaction rows of the playlist each row belongs to.
playlist_sizes = df.groupby('playlist_id')['track_id'].transform('count')
df['count'] = playlist_sizes

n_interactions = df.shape[0]
n_playlists = df.groupby('playlist_id').ngroups
n_tracks = df['track_id'].unique().size
print('interaction: ' + str(n_interactions))
print('playlist: ' + str(n_playlists))
print('tracks: ' + str(n_tracks))

interaction: 1040522
playlist: 45649
tracks: 99999


In [3]:
# ---> filter playlists
# The playlist-size filters are currently disabled, so every playlist is kept.
# The disabled thresholds were:
#   df2 = df2[df2['count'] >= 8]
#   df2 = df2[df2['count'] <= 50]
df2 = df
playlists_before = df2.groupby('playlist_id').ngroups
print('playlist before filter: ' + str(playlists_before))
playlists_after = df2.groupby('playlist_id').ngroups
print('playlist after filter: ' + str(playlists_after))
print('interaction: ' + str(df2.shape[0]))
print('tracks: ' + str(df2['track_id'].unique().size))




playlist before filter: 45649
playlist after filter: 45649
interaction: 1040522
tracks: 99999


In [4]:
# ---> sample a subset of playlist (use all atm)
# Fixed seed so that the sampled playlists (in particular df_test_user) are the
# same on every Restart & Run All instead of changing with each execution.
SAMPLE_SEED = 42

# One row per playlist; computed once and reused for both samples.
playlist_counts = df2.groupby('playlist_id', as_index=False).count()

# n == number of playlists, so this currently keeps every playlist (shuffled).
dfr = playlist_counts.sample(n=len(playlist_counts), random_state=SAMPLE_SEED)
# Independent draw of 10000 playlists; a different seed keeps it from being
# tied to the permutation used for dfr.
df_test_user = playlist_counts.sample(10000, random_state=SAMPLE_SEED + 1)

# isin() already returns a boolean mask, no comparison with True needed.
df2 = df2[df2["playlist_id"].isin(dfr['playlist_id'].values)]
print('playlist after sample: ' + str(len(df2.groupby('playlist_id'))))
print('interaction: ' + str(len(df2.index)))
print('tracks: ' + str(len(df2.track_id.unique())))

playlist after sample: 45649
interaction: 1040522
tracks: 99999


In [6]:
# ---> filter tracks
# Keep only tracks that occur in at least this many playlists.
MIN_PLAYLISTS_PER_TRACK = 11

# Work on an explicit copy: assigning a column through a plain alias of df2
# would modify df2 itself (overwriting its playlist-size 'count' column) and,
# because df2 is a boolean-mask slice, can trigger SettingWithCopyWarning.
df3 = df2.copy()
print('tracks before filter: ' + str(len(df3.groupby('track_id'))))
# In df3, 'count' now holds the number of rows (playlists) per track.
df3['count'] = df3.groupby('track_id')['playlist_id'].transform('count')
df3 = df3[df3['count'] >= MIN_PLAYLISTS_PER_TRACK]
print('tracks after filter: ' + str(len(df3.groupby('track_id'))))
print('playlist: ' + str(len(df3.groupby('playlist_id'))))
print('interaction: ' + str(len(df3.index)))

tracks before filter: 99999
tracks after filter: 23298
playlist: 40800
interaction: 633871


In [7]:
# ---> align dataset using dictionary
def align_dataset(x1, y1):
    """Map playlist ids and track ids onto dense, zero-based indices.

    Each distinct id receives the next free index in order of first
    appearance, independently for playlists (x1) and tracks (y1).

    Parameters
    ----------
    x1 : indexable sequence of playlist ids
    y1 : indexable sequence of track ids, read at the same positions as x1

    Returns
    -------
    x2 : numpy array of re-indexed playlist ids
    y2 : numpy array of re-indexed track ids
    dict_p : dict mapping original playlist id -> new index
    dict_t : dict mapping original track id -> new index
    """
    playlist_index = {}
    track_index = {}
    aligned_playlists = []
    aligned_tracks = []

    for pos in range(len(x1)):
        playlist = x1[pos]
        track = y1[pos]
        # setdefault only inserts unseen ids; len(...) is the next free index.
        aligned_playlists.append(playlist_index.setdefault(playlist, len(playlist_index)))
        aligned_tracks.append(track_index.setdefault(track, len(track_index)))

    return np.array(aligned_playlists), np.array(aligned_tracks), playlist_index, track_index

# Raw id columns of the filtered interactions, as numpy arrays.
x1, y1 = df3['playlist_id'].values, df3['track_id'].values

# xa / ya: re-indexed ids; dict_p / dict_t: original id -> new index.
xa, ya, dict_p, dict_t = align_dataset(x1, y1)

print("dataset aligned")

dataset aligned


In [8]:
# ---> save dataset
def save(x, y, path="./CsvFiles/train_final2.csv"):
    """Write aligned (playlist, track) id pairs to a tab-separated file.

    The file starts with the header line ``playlist_id\\ttrack_ids`` followed
    by one ``<playlist>\\t<track>`` row per pair.

    Parameters
    ----------
    x : sequence of int
        Aligned playlist ids.
    y : sequence of int
        Aligned track ids; must have the same length as ``x``.
    path : str, optional
        Output file. Defaults to the location the notebook has always used.

    Raises
    ------
    ValueError
        If ``x`` and ``y`` differ in length.
    """
    n_rows = len(x)
    if len(y) != n_rows:
        raise ValueError(
            "x and y must have the same length, got %d and %d" % (n_rows, len(y))
        )

    # Size the table from the arguments themselves rather than from a notebook
    # global, and keep it integer throughout (no float64 round trip).
    M = np.zeros((n_rows, 2), dtype=int)
    M[:, 0] = x
    M[:, 1] = y

    # numpy.savetxt is the function the deprecated scipy.savetxt alias pointed to.
    np.savetxt(path, M, fmt='%d\t%d', header="playlist_id\ttrack_ids", comments='')

# Write the aligned playlist/track id pairs to disk.
save(xa, ya)
print('file saved')

file saved


In [9]:
# ---> create train and test set files
# MyReader is a project-local module; its behaviour is not visible in this notebook.
# NOTE(review): presumably it reads the aligned file written by save() and performs
# an 80/20 train/test split (trainPercentage = 0.8) -- confirm against MyReader.
# The captured log below reports that it saves URM_train and URM_test.
from MyReader import MyReader
dataReader = MyReader(splitTrainTest = True, trainPercentage = 0.8, loadPredefinedTrainTest = False)

# Interaction matrices for the two splits, as returned by the reader.
URM_train = dataReader.get_URM_train()
URM_test = dataReader.get_URM_test()
print ("end import")

PTReader: loading data...
Processed 100000 cells
Processed 200000 cells
Processed 300000 cells
Processed 400000 cells
Processed 500000 cells
Processed 600000 cells
MyReader: saving URM_train and URM_test
MyReader: loading complete
end import


In [13]:
# --> align target tracks
# Translate the target track ids into the aligned index space of dict_t;
# tracks that are not in dict_t are reported and left out.
dft = pd.read_csv(
    "./CsvFiles/target_tracks.csv",
    sep="\t",
    header=0,
    dtype={"track_id": np.int32},
)
t_set = dft['track_id']
ta_set = []

for t in t_set:
    aligned_id = dict_t.get(t)
    if aligned_id is None:
        print ("track not in dict: " + str(t))
        continue
    ta_set.append(aligned_id)

ta_set = np.array(ta_set)

print ("target tracks aligned and filtered")

track not in dict: 3626362
target tracks aligned and filtered


In [15]:
# --> align target playlist
# Translate the target playlist ids into the aligned index space of dict_p;
# playlists that are not in dict_p are reported and left out.
# The dtype hint must name the column that is actually read ("playlist_id");
# it previously named "track_id", a leftover from the target-tracks cell.
dfp = pd.read_csv("./CsvFiles/target_playlists.csv", sep= "\t", header = 0,dtype= {"playlist_id":np.int32} )
p_set = dfp['playlist_id']
pa_set = list()

for p in p_set:
    if p in dict_p:
        pa_set.append(dict_p[p])
    else:
        print ("playlist not in dict: " + str(p))
pa_set = np.array(pa_set)

print ("target playlists aligned and filtered")

target playlists aligned and filtered


In [None]:
# --> save aligned target tracks
# The previous version of this cell wrote `M` to train_final2.csv: `M` only
# exists inside save(), so the cell raised NameError, and the path would have
# overwritten the aligned training interactions.
# Write the aligned target track ids (ta_set) to their own file instead.
# NOTE(review): the output file name is an assumption that mirrors the
# train_final -> train_final2 naming; adjust it to whatever the consumer expects.
np.savetxt("./CsvFiles/target_tracks2.csv", ta_set.astype(int), fmt='%d', header="track_id", comments='')