# Dataset Preprocessing

In [None]:
import os
import sys
import gzip
import json
import pickle as pkl

import numpy as np
from scipy.sparse import lil_matrix
from tqdm import tqdm

In [None]:
# Root folder holding every input and output file of this notebook.
data_dir = 'data'

# Folder containing the raw MPD slice files.
mpd_dir = os.path.join(data_dir, 'mpd')

# Input/output files, all located directly under `data_dir`.
fchallenge_json, fchallenge, fsongs, fsong2pop, fmftrain = (
    os.path.join(data_dir, basename)
    for basename in (
        'challenge_set.json',
        'challenge_set.pkl.gz',
        'songs_mpd.pkl.gz',
        'song2pop_mpd.pkl.gz',
        'mftrain_train.pkl.gz',
    )
)

## Load MPD

In [None]:
# Accumulators filled while parsing the MPD slices:
#   all_playlists -- one dict per playlist
#   all_songs     -- track id -> track metadata dict
all_playlists, all_songs = list(), {}

In [None]:
# Parse every MPD slice file, appending one dict per playlist to
# `all_playlists` and recording the metadata of each newly seen track in
# `all_songs` (keyed by the track id with its URI prefix removed).
for name in tqdm(sorted(os.listdir(mpd_dir))):
    assert name.startswith('mpd.slice.') and name.endswith('.json')
    fname = os.path.join(mpd_dir, name)
    # JSON is UTF-8 by specification; do not rely on the platform default encoding.
    with open(fname, 'r', encoding='utf-8') as fd:
        mpd_slice = json.load(fd)

    for pl in mpd_slice['playlists']:
        assert 'pid' in pl

        # The flag may be serialised as the text "true"/"false"; bool("false")
        # is True, so textual values have to be parsed explicitly.
        collaborative = pl.get('collaborative', False)
        if isinstance(collaborative, str):
            collaborative = collaborative.strip().lower() == 'true'

        playlist = {
            'pid': pl['pid'],
            'name': pl.get('name', ''),
            'collaborative': int(bool(collaborative)),
            'modified_at': pl.get('modified_at', 0),
            'num_albums': pl.get('num_albums', 0),
            'num_followers': pl.get('num_followers', 0),
            'num_edits': pl.get('num_edits', 0),
            'num_artists': pl.get('num_artists', 0),
            'duration_ms': pl.get('duration_ms', 0),
            'description': pl.get('description', ''),
        }

        assert 'num_tracks' in pl
        assert 'tracks' in pl
        assert pl['num_tracks'] == len(pl['tracks'])

        # Place each track id at the position given by its 'pos' field.
        num_tracks = pl['num_tracks']
        tracks = [None] * num_tracks
        for t in pl['tracks']:
            assert 'track_uri' in t
            assert 'pos' in t
            sid = t['track_uri'].split(':')[-1]  # discard prefix 'spotify:track:'
            tracks[t['pos']] = sid
            if sid not in all_songs:
                all_songs[sid] = {
                    'track_name': t.get('track_name', ''),
                    'duration_ms': t.get('duration_ms', 0),
                    'artist_name': t.get('artist_name', ''),
                    # discard prefix 'spotify:artist:'
                    'artist_uri': t.get('artist_uri', '').split(':')[-1],
                    # discard prefix 'spotify:album:'
                    'album_uri': t.get('album_uri', '').split(':')[-1],
                    'album_name': t.get('album_name', ''),
                }

        # A duplicated 'pos' would leave a slot unfilled; fail here instead of
        # letting a None track id leak into the later processing steps.
        assert None not in tracks, 'playlist {} has unfilled track positions'.format(pl['pid'])

        playlist['tracks'] = tracks
        all_playlists.append(playlist)

Number of playlists: 1,000,000

In [None]:
# Report how many playlists were collected, with a thousands separator.
num_playlists = len(all_playlists)
print('#playlists: {:,}'.format(num_playlists))

Number of unique tracks: 2,262,292

In [None]:
# Report how many distinct track ids were collected, with a thousands separator.
num_unique_tracks = len(all_songs)
print('#tracks: {:,}'.format(num_unique_tracks))

Save to file

In [None]:
# Pickle the track metadata into a gzip archive; the `with` block guarantees
# the archive is flushed and closed even if pickling fails.
with gzip.open(fsongs, 'wb') as fd:
    pkl.dump(all_songs, fd)

## Song popularity

In [None]:
# Maps a track id to the number of playlist slots it occupies.
song2pop = {}

In [None]:
# Tally every occurrence of every track across all playlists.
for playlist in tqdm(all_playlists):
    for sid in playlist['tracks']:
        song2pop[sid] = song2pop.get(sid, 0) + 1

Save to file

In [None]:
# Pickle the popularity counts into a gzip archive; the `with` block guarantees
# the archive is flushed and closed even if pickling fails.
with gzip.open(fsong2pop, 'wb') as fd:
    pkl.dump(song2pop, fd)

## Load challenge set

Organise playlists according to task types:
1. Title only
1. Title + first 1 track
1. Title + first 5 tracks
1. Title + first 10 tracks
1. Title + first 25 tracks
1. Title + first 100 tracks
1. First 5 tracks
1. First 10 tracks
1. Title + 25 random tracks
1. Title + 100 random tracks

In [None]:
# One independent, initially empty playlist list per task id (1 through 10).
tasks_dict = dict((task, list()) for task in range(1, 11))

In [None]:
# Load the challenge set and assign every playlist to one of the ten tasks,
# based on whether it has a title, how many seed tracks it provides and
# whether those seeds are the first tracks of the playlist.

# Sniff the gzip magic number so that both a plain JSON file and a
# gzip-compressed one can be read (gzip.open fails on plain-text input).
with open(fchallenge_json, 'rb') as fd:
    is_gzipped = fd.read(2) == b'\x1f\x8b'
with (gzip.open if is_gzipped else open)(fchallenge_json, 'rb') as fd:
    challenge_playlists = json.load(fd)

# Start from empty buckets so re-running this cell does not duplicate entries.
for task_playlists in tasks_dict.values():
    task_playlists.clear()

title_cnt = 0  # number of challenge playlists that carry a 'name' field
for pl in challenge_playlists['playlists']:
    assert 'num_samples' in pl
    assert 'tracks' in pl
    num_samples = pl['num_samples']
    track_ix = sorted(t['pos'] for t in pl['tracks'])
    pl['tracks'] = [t['track_uri'].split(':')[-1] for t in pl['tracks']]  # discard prefix 'spotify:track:'

    if 'name' in pl:
        title_cnt += 1
        if num_samples == 0:
            tasks_dict[1].append(pl)
        elif num_samples == 1:
            tasks_dict[2].append(pl)
        elif num_samples == 5:
            tasks_dict[3].append(pl)
        elif num_samples == 10:
            tasks_dict[4].append(pl)
        else:
            assert num_samples in (25, 100)
            # Seeds at positions 0..num_samples-1 are "first N" tracks; anything
            # else is a random sample. array_equal returns False (rather than
            # raising or broadcasting) when the number of positions differs.
            seeds_are_prefix = np.array_equal(track_ix, np.arange(num_samples))
            if num_samples == 25:
                tasks_dict[5 if seeds_are_prefix else 9].append(pl)
            else:
                tasks_dict[6 if seeds_are_prefix else 10].append(pl)
    else:
        if num_samples == 5:
            tasks_dict[7].append(pl)
        else:
            assert num_samples == 10
            tasks_dict[8].append(pl)

Number of partial playlists with title

In [None]:
# title_cnt was incremented once for every challenge playlist containing a 'name' field.
print(title_cnt)

Each task should have 1,000 playlists

In [None]:
# Print the number of playlists assigned to each of the ten tasks.
for task_id in range(1, 11):
    task_size = len(tasks_dict[task_id])
    print('#Playlists for task {:2d}: {:,}'.format(task_id, task_size))

Save to file

In [None]:
# Pickle the task -> playlists mapping to the challenge-set output path.
# `fchallenge` is the path defined in the configuration cell (the previously
# used name `ftest_challenge` is not defined anywhere in this notebook).
with gzip.open(fchallenge, 'wb') as fd:
    pkl.dump(tasks_dict, fd)

## Train data for matrix factorisation

In [None]:
# Assign each track id a column index following the sorted order of the ids.
song2index = dict(
    (sid, ix) for ix, sid in enumerate(sorted(all_songs))
)

In [None]:
# Build the binary playlist-by-song matrix: Y[i, j] is True when playlist i
# contains the song whose column index is j.
# The builtin `bool` is used as dtype because the `np.bool` alias was removed
# from NumPy (1.24+).
Y = lil_matrix((len(all_playlists), len(song2index)), dtype=bool)
for i, pl in enumerate(tqdm(all_playlists)):
    indices = [song2index[sid] for sid in pl['tracks']]
    Y[i, indices] = 1
# CSR is compact and efficient for the row-wise access used downstream.
Y = Y.tocsr()

Save to file

In [None]:
# Pickle the training matrix together with its song -> column index mapping;
# the `with` block guarantees the gzip archive is flushed and closed.
with gzip.open(fmftrain, 'wb') as fd:
    pkl.dump([Y, song2index], fd)