In [1]:
import os
import json
import pandas as pd
from tqdm import tqdm
import gc
import codecs

In [2]:
import re
def normalize_name(name):
    name = name.lower()
    name = re.sub(r"[.,\/#!$%\^\*;:{}=\_`~()@-]", ' ', name)
    name = re.sub(r"&", 'and', name)
    name = re.sub(r'\s+', ' ', name).strip()
    return name

cold_start_recommendation = pd.read_csv('title_popularity_recs.csv')

# Read the Dataset

In [18]:
def create_df_data():
    #Save data from Columns of interest
    playlist_col = ['num_tracks', 'pid']
    tracks_col = ['album_uri', 'artist_name', 'artist_uri', 
                  'duration_ms', 'track_name', 'track_uri'] 
    playlist_test_col = ['name', 'num_holdouts', 'num_samples', 'num_tracks', 'pid']
    
    
    #The million playlist data path
    filenames = os.listdir('mpd/data/')
    
    #initialize the variables
    data_playlists = []
    data_tracks = []
    playlists = []
    tracks = set()
    total_time = 0
    
    print("*****************************************************")
    print("Reading the slices containing playlists")
    for filename in tqdm(filenames):
        fullpath = os.sep.join(('mpd//data/', filename))
        f = open(fullpath)
        js = f.read()
        f.close()
        
        #Load every slice
        mpd_slice = json.loads(js)
        
        for playlist in mpd_slice['playlists']:
            data_playlists.append([playlist[col] for col in playlist_col])
            for track in playlist['tracks']:
                playlists.append([playlist['pid'], track['track_uri'], track['pos']])
                if track['track_uri'] not in tracks:
                    data_tracks.append([track[col] for col in tracks_col])
                    tracks.add(track['track_uri'])
    gc.collect()
    print("Reading the challenge dataset")
    f = open('challenge/challenge_set.json')
    js = f.read()
    f.close()
    mpd_slice = json.loads(js)

    data_playlists_test = []
    playlists_test = []
    print(len(tracks))
    recs_cold_start= ['']
    
    #This is required for the online submission
    first_line = 'team_info,team_name,main,your@email.com'
    df_tracks_training = pd.DataFrame(data_tracks, columns=tracks_col)
    with codecs.open('Submission_1.csv', "w") as o:
        o.write("%s \n" %(first_line))
        o.write("\n")
    for playlist in tqdm((mpd_slice['playlists'])):       
        for track in playlist['tracks']:
            playlists_test.append([playlist['pid'], track['track_uri'], track['pos']])
            if track['track_uri'] not in tracks:
                data_tracks.append([track[col] for col in tracks_col])
                tracks.add(track['track_uri'])
        #Check if the challenge set contains any tracks
        if(playlist['num_samples'] == 0):
            name = playlist['name']
            pid = playlist['pid']
            name = normalize_name(name)
            #Find the playlists with same titles
            row = cold_start_recommendation[cold_start_recommendation['Names'] == name]
            row = row.dropna(axis=1, how='all')
            #If the title has 500 songs that can be recommended, add them to the final submission
            if row.shape[1] == 501:
                df_list = row.values.tolist()
                df_list = df_list.pop()[1:]
                with codecs.open('Submission_1.csv', "a") as o:
                    o.write("%s" %(pid) + ',')
                    recs = ','.join(map(str, df_list))
                    o.write(recs)
                    o.write("\n")
                    o.write("\n")
                recs = ['']
            else:  
                #Add random 2 tracks as seed tracks
                appendTracks = []
                mTracks = {'album_uri':'x', 'artist_name' :'y', 'artist_uri':'x', 
                      'duration_ms':0, 'track_name ':'f', 'track_uri': 'spotify:track:67fNcOYgQ0jnEqcn7U4Mzo', 'pos':0}
                mTracks1 = {'album_uri':'x', 'artist_name' :'y', 'artist_uri':'x', 
                      'duration_ms':0, 'track_name ':'f_1', 'track_uri': 'spotify:track:0GZoB8h0kqXn7XFm4Sj06k', 'pos':1}
                appendTracks.append(mTracks)
                appendTracks.append(mTracks1)
                playlist['tracks'] = appendTracks
                for track in playlist['tracks']:
                    playlists.append([playlist['pid'], track['track_uri'], track['pos']])
                    playlists_test.append([playlist['pid'], track['track_uri'], track['pos']])

        data_playlists_test.append([playlist.get(col, '') for col in playlist_test_col])
    #Store the data collected from playlists to dataframe           
    df_playlists_info = pd.DataFrame(data_playlists, columns=playlist_col)
    df_tracks = pd.DataFrame(data_tracks, columns=tracks_col)
    df_tracks['tid'] = df_tracks.index

    track_uri2tid = df_tracks.set_index('track_uri').tid

    df_playlists = pd.DataFrame.from_records(playlists, columns=['pid', 'tid', 'pos'])
    df_playlists.tid = df_playlists.tid.map(track_uri2tid)

    df_playlists_test_info = pd.DataFrame.from_records(data_playlists_test, columns=playlist_test_col)

    df_playlists_test = pd.DataFrame.from_records(playlists_test, columns=['pid', 'tid', 'pos'])
    df_playlists_test.tid = df_playlists_test.tid.map(track_uri2tid)

    print("df_tracks")
    print(df_tracks)
    print("df_tracks_info")
    print(df_playlists)
    print("df_playlists")
    print(df_playlists_info)
    print("df_playlists_test")
    print(df_playlists_test)
    print("df_playlists_test_info")
    print(df_playlists_test_info)
    
    #Store the dataframe in hdf files for further usage
    df_tracks.to_hdf('hdf_files/tracks_info.hdf', key ='a')
    df_playlists.to_hdf('hdf_files/tracks.hdf', key ='a')
    df_playlists_info.to_hdf('hdf_files/playlists_info.hdf', key='a')
    df_playlists_test.to_hdf('hdf_files/tracks_test.hdf', key='a')
    df_playlists_test_info.to_hdf('hdf_files/playlists_test_info.hdf',key='a')


In [19]:

if __name__ == "__main__":
    print(__doc__)
    create_df_data()

  0%|                                                                                            | 0/4 [00:00<?, ?it/s]

Automatically created module for IPython interactive environment
*****************************************************
Reading the slices containing playlists


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:09<00:00,  2.39s/it]


Reading the challenge dataset
110716


100%|██████████████████████████████████████████████████████████████████████████| 10000/10000 [00:09<00:00, 1072.20it/s]


df_tracks
                                   album_uri     artist_name  \
0       spotify:album:4S5MLjwRSi0NJ5nikflYnZ       AronChupa   
1       spotify:album:1qHVYbxQ6IS8YRviorKDJI       AronChupa   
2       spotify:album:4UEPxQx0cTcYNsE0n32MHV           Lorde   
3       spotify:album:0rmhjUgoVa17LZuS8xWQ3v           Lorde   
4       spotify:album:0rmhjUgoVa17LZuS8xWQ3v           Lorde   
...                                      ...             ...   
110711  spotify:album:6wNKjgN0Fo7jnOpAHXDLeF     Soft Swells   
110712  spotify:album:4gCXFcJXBeseiCQ0GCakO0     Paper Route   
110713  spotify:album:1NhFksWs1Nsz6wQI8ysTkv  Brandi Carlile   
110714  spotify:album:1wgGprEG5HgYbhFq3eMmtO    Doom & Gloom   
110715  spotify:album:1wgGprEG5HgYbhFq3eMmtO         Badwolf   

                                   artist_uri  duration_ms  \
0       spotify:artist:5vCOdeiQt9LyzdI87kt5Sh       163809   
1       spotify:artist:5vCOdeiQt9LyzdI87kt5Sh       166848   
2       spotify:artist:163tK9Wjr9P9

In [21]:
playlists = pd.read_hdf('hdf_files/playlists_test_info.hdf') 
df_temp = playlists.loc[:,['num_tracks','pid']]
df_temp = df_temp.set_index('pid')
#Create a list with the playlist_ID and num_tracks in each playlist
df_temp.to_hdf('hdf_files/df_playlist_test.hdf', key='a')

# Create the Song-Playlist and Playlist-Song Data

In [24]:
import sys

def main(argv):

    training_data = pd.read_hdf('hdf_files/tracks.hdf')
    testing_data = pd.read_hdf('hdf_files/tracks_test.hdf')
    print("training_data")
    print(training_data)
    print("testing_data")
    print(testing_data)
    
    tracks = pd.read_hdf('hdf_files/tracks.hdf')
   
    # Build playlist-song data for training and testing
    print("Building the song-playlist matrix for training set")
    pid = testing_data.groupby(by='tid')['pid'].apply(list)
    pos = testing_data.groupby(by='tid')['pos'].apply(list)
    sp_train = pd.concat([pid,pos],axis=1)
    print("sp_train")
    print(sp_train)
      
    
    print("Build song-playlist data for testing set")
    pid = testing_data.groupby(by='tid')['pid'].apply(list)
    pos = testing_data.groupby(by='tid')['pos'].apply(list)
    sp_test = pd.concat([pid,pos],axis=1)
    print("sp_test")
    print(sp_test)

 
    tid = tracks.groupby(by='tid')['pid'].apply(list)
    pos = tracks.groupby(by='tid')['pos'].apply(list)
    sp_complete = pd.concat([tid,pos],axis=1)   
    
    
    # Build playlist-song matrix
    print("Build playlist-song data for training set")
    tid = training_data.groupby(by='pid')['tid'].apply(list)
    pos = training_data.groupby(by='pid')['pos'].apply(list)
    ps_train = pd.concat([tid,pos],axis=1)
    print(ps_train)
      
    
    print("Build playlist-song data for testing set")
    tid = testing_data.groupby(by='pid')['tid'].apply(list)
    pos = testing_data.groupby(by='pid')['pos'].apply(list)
    ps_test = pd.concat([tid,pos],axis=1)
    print(ps_test)

    
    tid = tracks.groupby(by='pid')['tid'].apply(list)
    pos = tracks.groupby(by='pid')['pos'].apply(list)
    ps_complete = pd.concat([tid,pos],axis=1)    
    print(ps_complete)
    
    print("Saving the Data Files")
        
    sp_train.to_hdf("hdf_files/sp_train_new.hdf",key='abc')
    sp_test.to_hdf("hdf_files/sp_test_new.hdf",key='abc')
    sp_complete.to_hdf("hdf_files/sp_complete_new.hdf",key='abc')
    ps_train.to_hdf("hdf_files/ps_train_new.hdf",key='abc')
    ps_test.to_hdf("hdf_files/ps_test_new.hdf",key='abc')
    ps_complete.to_hdf("hdf_files/ps_complete_new.hdf",key='abc')
        
if __name__ =="__main__":
    main(sys.argv)

training_data
            pid    tid  pos
0       1000000      0    0
1       1000000      1    1
2       1000000      2    2
3       1000000      3    3
4       1000000      4    4
...         ...    ...  ...
480965  1002557      2    1
480966  1002589  48981    0
480967  1002589      2    1
480968  1002590  48981    0
480969  1002590      2    1

[480970 rows x 3 columns]
testing_data
            pid    tid  pos
0       1000047  48981    0
1       1000047      2    1
2       1000129  48981    0
3       1000129      2    1
4       1000165  48981    0
...         ...    ...  ...
281091  1006767  66241    0
281092  1006771   3490    0
281093  1006773  66242    0
281094  1006775  17771    0
281095  1006778   6339    0

[281096 rows x 3 columns]
Building the song-playlist matrix for training set
sp_train
                                                     pid  \
tid                                                        
0                                     [1000000, 1039848]   
1      

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index(['pid', 'pos'], dtype='object')]

  pytables.to_hdf(
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index(['tid', 'pos'], dtype='object')]

  pytables.to_hdf(


# Build the Playlist-Song Matrix

In [29]:
import pandas as pd
import numpy as np
from scipy.sparse import dok_matrix,save_npz,csc_matrix
import argparse
import sys
from multiprocessing import Pool
import os
import time
import pickle

def main():
    
    out_filename = "playlist_song.pickle"
   
    ps_train = pd.read_hdf("hdf_files/ps_train_new.hdf")
    print(ps_train)
    sp_complete = pd.read_hdf("hdf_files/sp_complete_new.hdf")
    ps_complete = pd.read_hdf("hdf_files/ps_complete_new.hdf")
    ps_test = pd.read_hdf("hdf_files/ps_test_new.hdf")
    pid_list_complete = list(ps_complete.index)
    pid_list_test= list(ps_test.index)
    print("Number of PIDs in the testing data ", len(pid_list_test) , " maximum Value ", max(pid_list_test))
    print("Number of PIDs in the training data", len(pid_list_complete) , " max ", max(pid_list_complete))

    # Get tid list
    tid_list = list(sp_complete.index)
    num_tid = len(tid_list)
    print("num_tid ", num_tid)
    dict_index = {k:v for k,v in zip(tid_list,range(0,num_tid))}

    
    # get pid list in train set
    pid_list_train = list(ps_train.index)
    num_pid = len(pid_list_train)
    print("num_pid ", num_pid)
    print("Create rating matrix")
    max_val_pid = max(max(pid_list_train),max(pid_list_test)) +1
    max_val_tid = max(tid_list)+1
    ps_matrix = dok_matrix((max_val_pid, max_val_tid), dtype=np.float32)
    
    del sp_complete
    
    for i in range(num_pid):
        pid = pid_list_train[i]
        tid = ps_train.loc[pid,'tid']
        index_pid = pid
        index_tid = [dict_index.get(i) for i in tid]
        #Update the value to 1 if track present in the playlist
        ps_matrix[index_pid,index_tid] = 1 

    print("Saving the file")
    with open(out_filename, 'wb') as f:
        pickle.dump(ps_matrix, f)    

if __name__ =="__main__":
    main()

                                                       tid  \
pid                                                          
0        [1573, 4252, 4815, 7609, 571, 1318, 4795, 1079...   
1        [1697, 66244, 66245, 66246, 66247, 66248, 6624...   
2        [66259, 66260, 66261, 66262, 29808, 66263, 662...   
3        [66289, 66290, 63238, 12498, 66291, 66292, 662...   
4        [15667, 3982, 4511, 66376, 426, 1781, 2415, 70...   
...                                                    ...   
1049269  [65913, 65914, 2, 4480, 65915, 17166, 7964, 12...   
1049300  [18199, 65937, 65938, 65939, 65940, 6609, 5178...   
1049316  [65961, 5391, 24287, 17277, 4120, 46031, 5872,...   
1049352  [21198, 66007, 19696, 20627, 1465, 1592, 24523...   
1049360  [3458, 14726, 8617, 14728, 32742, 1559, 2529, ...   

                                                       pos  
pid                                                         
0        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...  
1        [

# Build the Similarity Matrix for Playlists

In [33]:
import sklearn.preprocessing as pp
import scipy.sparse as sp
from scipy.sparse import dok_matrix,csc_matrix,csr_matrix,vstack
import pickle
import pandas as pd
from multiprocessing import Pool,Value
import time
import argparse
from sklearn.metrics.pairwise import cosine_similarity,paired_cosine_distances


def cosine_similarities_playlist(pid_list_test,ps_matrix):
    ps_matrix_norm = pp.normalize(ps_matrix, axis=1)
    ps_matrix_test = ps_matrix_norm[pid_list_test,:]
    return ps_matrix_test * ps_matrix_norm.T

def cosine_similarities_song(index_tid_list_test,ps_matrix):
    ps_matrix_norm = pp.normalize(ps_matrix, axis=0)
    ps_matrix_test = ps_matrix_norm[:,index_tid_list_test]
    return ps_matrix_test.T * ps_matrix_norm

def main():

    pickle_path = 'playlist_song.pickle'

    with open(pickle_path,'rb') as f:
        ps_matrix = pickle.load(f)      

    #Convert to sparse matrix
    ps_matrix_col = ps_matrix.tocsc() 
    ps_matrix_row = ps_matrix.tocsr()


    sp_train = pd.read_hdf("hdf_files/sp_train_new.hdf")
    sp_test = pd.read_hdf("hdf_files/sp_test_new.hdf")
    ps_test = pd.read_hdf("hdf_files/ps_test_new.hdf")
    
    
    pid_list_test = list(ps_test.index)
    print("Building the cosine similarity playlists")
    ps_sim_playlist = cosine_similarities_playlist(pid_list_test,ps_matrix_row)

    print("Save similarity matrix playlist")
    
    out_filename = "cosineSimMatrix_playlist.pickle"
    
    with open(out_filename, 'wb') as f:
        pickle.dump(ps_sim_playlist, f,protocol=4)  
        
    # Get tid list
    tid_list = list(sp_train.index)
    num_tid = len(tid_list)
    dict_index = {k:v for k,v in zip(tid_list,range(0,num_tid))}
    tid_list_test = list(sp_test.index)
    index_tid_list_test = [dict_index.get(i) for i in tid_list_test]
    
    print("Build cosine similarity matrix for songs")
    ps_sim_song = cosine_similarities_song(index_tid_list_test,ps_matrix_col)    
    
    print("Save similarity matrix song")

    out_filename = "cosineSimMatrix_song.pickle"
    
    with open(out_filename, 'wb') as f:
        pickle.dump(ps_sim_song, f,protocol = 4)      
    
if __name__ =="__main__":
    main()
  
   

Building the cosine similarity playlists
Save similarity matrix playlist
Build cosine similarity matrix for songs
Save similarity matrix song


# Generate the Recommendations

In [34]:
import pandas as pd
import numpy as np
from scipy.sparse import dok_matrix,csc_matrix
import argparse
import sys
import pickle
from multiprocessing import Pool,Value
import time

pickle_path = 'playlist_song.pickle'

print("Load Rating matrix")
with open(pickle_path,'rb') as f:
    ps_matrix = pickle.load(f)      

# Change to column sparse matrix because it is much faster to get column 12s -> 0.08s
ps_matrix = ps_matrix.tocsc() 



print("Load Similarity Matrix")
sim_path = 'cosineSimMatrix_playlist.pickle'
with open(sim_path,'rb') as f:
    sim_matrix = pickle.load(f)      
    
# Change to column sparse matrix because it is much faster to get column 12s -> 0.08s
sim_matrix = sim_matrix.tocsr()   

#Generate 600 tracks
num_of_tracks = 600

def chunkify(lst,n):
    return [lst[i::n] for i in range(n)]


Load Rating matrix
Load Similarity Matrix


In [52]:
def main():
    
    # Playlist-Song Matrix
    ps_train = pd.read_hdf("hdf_files/ps_train_new.hdf")
    ps_test = pd.read_hdf("hdf_files/ps_test_new.hdf")
    sp_train = pd.read_hdf("hdf_files/sp_train_new.hdf")
    ps_test_1 = pd.read_hdf("hdf_files/ps_test_new.hdf")   

    print("Building Playlist and Song List")
    # Get tid list
    tid_list = list(sp_train.index)
    num_tid = len(tid_list)   
    tid_index = list(np.arange(0,num_tid))
    
    # get pid list in training set
    pid_list = list(ps_train.index)  
    
    # get pid list in test set
    pid_list_test = list(ps_test.index)
    print(len(pid_list_test), "length of pid_list_test")
    
    #Save the recommendations  
    record = []
    #Required if the training terminates in between on HPC
#     df_read_ch_1 = pd.read_csv("Records.csv")
#     record = df_read_ch_1.values.tolist()
#     print("length record", len(record))

    #Divide data in chunks
    pid_list_test_divided = chunkify(pid_list_test, 15)
    for j in range(len(pid_list_test_divided)):
        iter_playlist = pid_list_test_divided[j]
        print("j ", j)
        for i in range(len(iter_playlist)):
            pid = iter_playlist[i]
            start = time.time()
            vector1 = ps_test.loc[pid,'tid']
            sim_vector_ = sim_matrix[i,:]     
            norm = np.sum(sim_vector_)

            rating = sim_vector_.dot(ps_matrix)       
            rating = rating / norm

            rating_array = rating.toarray()[0]
            counter_list = list(enumerate(rating_array, 0))

            # Filter songs already present in the training set
            counter_list_filter = [(x,y) for x,y in counter_list if x not in vector1]

            # Sort by rating
            sortedList = sorted(counter_list_filter, key=lambda x:x[1],reverse=True)

            new_tid = []
            for i,_ in sortedList[:(num_of_tracks)]:
                new_tid.append(i)
            record.append(new_tid)
#          Required for HPC
        df = pd.DataFrame (record)
#         print("*************************************************************")
#         print("Playlist Number saved", j)
        df.to_csv("Records.csv")
    
#     print("Create new dataframe")
#     df_read_ch = pd.read_csv("Records.csv")
#     abd = df_read_ch.values.tolist()
#     print("length record", len(abd))
    ps_test['tid'] = record
    
    print("Saving the test data")
    ps_test.to_hdf('hdf_files/ps_test_complete_CF_playlist.hdf', key='abc')

if __name__ =="__main__":
    main()

Building Playlist and Song List
9048 length of pid_list_test
j  0
j  1
j  2
j  3
j  4
j  5
j  6
j  7
j  8
j  9
j  10
j  11
j  12
j  13
j  14
Saving the test data


your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index(['tid', 'pos'], dtype='object')]

  pytables.to_hdf(


# Write Result to Submission file 

In [53]:
ps_test_older = pd.read_hdf("hdf_files/tracks_test.hdf")
tid_org_list = ps_test_older.groupby(by='pid')['tid'].apply(list).reset_index(name = 'tid')


In [54]:
tracks_info = pd.read_hdf("hdf_files/tracks_info.hdf")
ps_test = pd.read_hdf("hdf_files/ps_test_complete_CF_playlist.hdf")

ps_test.reset_index(drop=True)
print(ps_test['tid'])

pid_tid = ps_test['tid']

pid_tid = pid_tid.reset_index()

pid
1000000    [48981, 1133, 6180, 1015, 1108, 1109, 499, 156...
1000001    [1721, 11603, 6866, 11610, 15717, 15729, 11609...
1000009    [1058, 643, 215, 7004, 1968, 1059, 15734, 1573...
1000016    [7, 9, 8, 5, 1567, 3656, 574, 1132, 5807, 6, 4...
1000020    [11, 12, 10, 13, 14, 4390, 3563, 275, 1739, 93...
                                 ...                        
1049269    [2470, 5873, 1354, 6466, 667, 3272, 2043, 1994...
1049300    [1490, 404, 1491, 330, 1127, 4426, 5809, 2021,...
1049316    [841, 1494, 1492, 13938, 5595, 1493, 1495, 843...
1049352    [1500, 1498, 1195, 100, 1654, 6535, 8096, 3152...
1049360    [117, 2474, 1940, 2630, 1930, 1502, 3834, 331,...
Name: tid, Length: 9048, dtype: object


In [None]:
import codecs
print(pid_tid)

recs = ['']
tid_list_added = ['']

tid_info = tracks_info[['tid', 'track_uri']]
tid_info = tid_info.set_index('tid').track_uri


with codecs.open('Submission_1.csv', "a") as o:
    for index, row in pid_tid.iterrows():
#         print(index)
        pid = row['pid']
#         print("pid ", pid)
#         b = int(pid)
        tid_list = row['tid']
        list_tid = tid_org_list[tid_org_list['pid'] == pid]['tid']
        a = []
        for i in list_tid:
            a.append(i) 
        
        count = 0
        for i in range(600):
            if tid_list[i] not in tid_list_added and count<500 and tid_list[i] not in a[0]:
                count += 1
                tid_list_added.append(tid_list[i])
                recs.append(tid_info[tid_list[i]])
        o.write("%s" %(pid))
        recs = ','.join(map(str, recs))
        o.write(recs)
        o.write("\n")
        o.write("\n")
        tid_list_added = ['']
        recs = ['']



          pid                                                tid
0     1000000  [48981, 1133, 6180, 1015, 1108, 1109, 499, 156...
1     1000001  [1721, 11603, 6866, 11610, 15717, 15729, 11609...
2     1000009  [1058, 643, 215, 7004, 1968, 1059, 15734, 1573...
3     1000016  [7, 9, 8, 5, 1567, 3656, 574, 1132, 5807, 6, 4...
4     1000020  [11, 12, 10, 13, 14, 4390, 3563, 275, 1739, 93...
...       ...                                                ...
9043  1049269  [2470, 5873, 1354, 6466, 667, 3272, 2043, 1994...
9044  1049300  [1490, 404, 1491, 330, 1127, 4426, 5809, 2021,...
9045  1049316  [841, 1494, 1492, 13938, 5595, 1493, 1495, 843...
9046  1049352  [1500, 1498, 1195, 100, 1654, 6535, 8096, 3152...
9047  1049360  [117, 2474, 1940, 2630, 1930, 1502, 3834, 331,...

[9048 rows x 2 columns]


# Verify the Submission File

In [None]:
#ref : https://www.aicrowd.com/challenges/spotify-million-playlist-dataset-challenge
"""
  Verifies that a given challenge submision is properly constructed.

  Usage:

        python verify_submission.py challenge_set.json submission.csv
"""
import sys
import json

NTRACKS = 500


def verify_submission(challenge_path, submission_path):
    has_team_info = False
    error_count = 0

    try:
        f = open(challenge_path)
        js = f.read()
        f.close()
        challenge = json.loads(js)
    except FileNotFoundError:
        error_count += 1
        print("Can't read the challenge set")
        return error_count

    pids = set([playlist["pid"] for playlist in challenge["playlists"]])
    if len(challenge["playlists"]) != 10000:
        print("Bad challenge set")
        error_count += 1

    # seed_tracks contains seed tracks for each challenge playlist
    seed_tracks = {}
    for playlist in challenge["playlists"]:
        track_uris = [track["track_uri"] for track in playlist["tracks"]]
        seed_tracks[playlist["pid"]] = set(track_uris)

    found_pids = set()

    if error_count > 0:
        return error_count

    f = open(submission_path)
    for line_no, line in enumerate(f):
        line = line.strip()
        if not line:
            continue
        if line[0] == "#":
            continue

        if not has_team_info:
            if line.startswith("team_info"):
                has_team_info = True
                tinfo = line.split(",")
            else:
                print("missing team_info at line", line_no)
                error_count += 1

        else:
            fields = line.split(",")
            fields = [f.strip() for f in fields]
            try:
                pid = int(fields[0])
            except ValueError:
                print("bad pid (should be an integer)", fields[0], "at line", line_no)
                error_count += 1
                continue
            tracks = fields[1:]
            found_pids.add(pid)
            if not pid in pids:
                print("bad pid", pid, "at line", line_no)
                error_count += 1
            if len(tracks) != NTRACKS:
                print(
                    "wrong number of tracks, found",
                    len(tracks),
                    "should have",
                    NTRACKS,
                    "at",
                    line_no,
                )
                error_count += 1
            if len(set(tracks)) != NTRACKS:
                print(
                    "wrong number of unique tracks, found",
                    len(set(tracks)),
                    "should have",
                    NTRACKS,
                    "at",
                    line_no,
                )
                error_count += 1

            if seed_tracks[pid].intersection(set(tracks)):
                print(
                    "found seed tracks in the submission for playlist",
                    pid,
                    "at",
                    line_no,
                )
                error_count += 1

            for uri in tracks:
                if not is_track_uri(uri):
                    print("bad track uri", uri, "at", line_no)
                    error_count += 1

    if len(found_pids) != len(pids):
        print(
            "wrong number of playlists, found", len(found_pids), "expected", len(pids)
        )
        error_count += 1

    return error_count


def is_track_uri(uri):
    fields = uri.split(":")
    return (
        len(fields) == 3
        and fields[0] == "spotify"
        and fields[1] == "track"
        and len(fields[2]) == 22
    )


if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("usage: python verify_submission.py challenge_set.json submission.csv")
        sys.exit()
    errors = verify_submission("challenge/challenge_set.json", "Submission_1.csv")
    if errors == 0:
        print(
            "Submission is OK! Remember to gzip your submission before submitting it to the challenge."
        )
    else:
        print(
            "Your submission has",
            errors,
            "errors. If you submit it, it will be rejected.",
        )
