In [1]:
# Read json file
import json
import numpy as np
import requests
import os
import pandas as pd
from sklearn import preprocessing
from collections import Counter
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN, KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
import matplotlib.pyplot as plt
import warnings

# Dataframe

In [2]:
# Load the raw playlist and track-feature dumps. The context managers close
# both file handles as soon as parsing is done instead of leaking them.
with open('playlists.json', 'r') as playlists_file:
    playlists = json.load(playlists_file)
with open('track_features.json', 'r') as tracks_file:
    tracks = json.load(tracks_file)

In [115]:
playlists_df = pd.read_csv('playlists.csv')
playlists_df = playlists_df.drop(['key'], axis=1)

# Train test split
np.random.seed(42)
msk = np.random.rand(len(playlists_df)) < 0.8
# Take explicit copies: the frames are written to below, and assigning into a
# boolean-mask slice triggers pandas' SettingWithCopyWarning.
playlists_df, test_playlists_df = playlists_df[msk].copy(), playlists_df[~msk].copy()

headers = playlists_df.columns.values.tolist()

print(headers)

# Standardization statistics come from the training split only and are then
# applied to both splits. Columns from position 2 onwards are the features;
# the first two are 'index' and 'title'.
mean = playlists_df[headers[2:]].mean(axis=0)
std = playlists_df[headers[2:]].std(axis=0)

playlists_df[headers[2:]] = (playlists_df[headers[2:]] - mean) / std
test_playlists_df[headers[2:]] = (test_playlists_df[headers[2:]] - mean) / std

# Feature columns used for clustering.
cluster_headers = headers[2:]

['index', 'title', 'danceability', 'energy', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature']


In [116]:
# playlists_df = pd.read_csv('playlists_embeddings.csv')
# cols = ['danceability', 'energy', 'loudness', 'mode', 'speechiness',
#         'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
#         'duration_ms', 'time_signature']
# playlists_df = playlists_df.drop(cols + ['key'], axis=1)

# # Train test split
# np.random.seed(42)
# msk = np.random.rand(len(playlists_df)) < 0.8
# playlists_df, test_playlists_df = playlists_df[msk], playlists_df[~msk]

# headers = playlists_df.columns.values.tolist()

# print(headers)

# mean = playlists_df[headers[2:]].mean(axis=0)
# std = playlists_df[headers[2:]].std(axis=0)

# playlists_df[headers[2:]] = (playlists_df[headers[2:]] - mean) / std
# test_playlists_df[headers[2:]] = (test_playlists_df[headers[2:]] - mean) / std

# cluster_headers = headers[2:]

# Clustering

### K-means

In [117]:
def get_cluster_tracks(playlists_df, cluster_preds, num_clusters, verbose=False):
    """Collect the distinct tracks of the playlists assigned to each cluster.

    Args:
        playlists_df: frame with at least the 'index' and 'title' columns;
            'index' is used to look a playlist up in the global `playlists`.
        cluster_preds: cluster label per row of `playlists_df`.
        num_clusters: number of clusters; labels 0..num_clusters-1 are scanned.
        verbose: when true, print the number of distinct tracks per cluster.

    Returns:
        A list with one numpy array per cluster holding the track records of
        that cluster, de-duplicated by 'track_uri' (first occurrence is kept).
    """
    labelled = playlists_df.get(['index', 'title']).copy()
    labelled['cluster'] = cluster_preds

    cluster_tracks = []
    for cluster_id in range(num_clusters):
        member_indices = labelled[labelled['cluster'] == cluster_id]['index']

        # Insertion-ordered dict keyed by URI; setdefault keeps the first record seen.
        unique_tracks = {}
        for playlist_idx in member_indices:
            for track in playlists[playlist_idx][1]:
                unique_tracks.setdefault(track['track_uri'], track)

        cluster_tracks.append(np.array(list(unique_tracks.values())))

        if verbose:
            print('Cluster {}: {} tracks'.format(cluster_id, len(cluster_tracks[cluster_id])))

    return cluster_tracks

def cluster_custom_score(playlists_df, cluster_preds, num_clusters):
    """Summarise how the tracks are spread over the clusters.

    Returns a 3-tuple:
        1. sum of the per-cluster distinct-track counts divided by the number
           of entries in the global `tracks` (a track present in several
           clusters is counted once per cluster);
        2. size of the largest cluster divided by the number of entries in
           the global `tracks`;
        3. standard deviation of the cluster sizes divided by their mean.
    """
    per_cluster = get_cluster_tracks(playlists_df, cluster_preds, num_clusters)
    sizes = np.array([len(members) for members in per_cluster])

    total_tracks = len(tracks)
    coverage = np.sum(sizes) / total_tracks
    largest_share = np.max(sizes) / total_tracks
    relative_spread = np.std(sizes) / np.mean(sizes)

    return coverage, largest_share, relative_spread

In [118]:
NUM_CLUSTERS = 15
# Fit K-means on the standardized training features (every column after
# 'index' and 'title'); the seed is fixed so the clustering is repeatable.
kmeans = KMeans(n_clusters=NUM_CLUSTERS, random_state=42, n_init='auto')
kmeans.fit(playlists_df[headers[2:]])

In [119]:
# Group playlists by cluster
# `assign` returns a new frame instead of writing a column into the
# boolean-mask slice produced by the train/test split, so pandas does not
# raise a SettingWithCopyWarning here.
playlists_df = playlists_df.assign(cluster=kmeans.labels_)
playlists_clustered = playlists_df[['index', 'title', 'cluster']]

In [120]:
# Count most common titles
def count_words(titles):
    """Count how often each lower-cased, whitespace-separated word occurs in `titles`.

    Args:
        titles: iterable of playlist titles. Non-string values are converted
            with `str`; missing values (None or float NaN) are skipped.

    Returns:
        A `Counter` mapping word -> number of occurrences.
    """
    counts = Counter()
    for title in titles:
        # Missing titles may arrive as None or float NaN (NaN != NaN); skip them
        # so they are not counted as the literal words "none" / "nan".
        if title is None or (isinstance(title, float) and title != title):
            continue
        # split() without an argument collapses runs of whitespace, so repeated
        # or leading/trailing spaces never produce empty-string "words".
        counts.update(str(title).lower().split())
    return counts


# Show the ten most frequent title words of every training cluster.
for cluster_id in range(NUM_CLUSTERS):
    in_cluster = playlists_clustered['cluster'] == cluster_id
    cluster_titles = playlists_clustered[in_cluster]['title'].values.tolist()
    print(count_words(cluster_titles).most_common(10))

[('chill', 213), ('summer', 121), ('music', 83), ('good', 83), ('2017', 82), ('2016', 78), ('jams', 76), ('new', 70), ('songs', 64), ('playlist', 59)]
[('christmas', 188), ('disney', 101), ('chill', 70), ('songs', 44), ('music', 43), ('the', 42), ('oldies', 34), ('sleep', 29), ('feels', 29), ('musicals', 23)]
[('rap', 133), ('hop', 79), ('hip', 73), ('old', 59), ('school', 46), ('workout', 23), ('eminem', 22), ('the', 20), ('good', 19), ('throwback', 18)]
[('chill', 40), ('jazz', 16), ('house', 15), ('good', 12), ('2016', 11), ('music', 10), ('the', 10), ('2015', 10), ('electronic', 9), ('study', 8)]
[('classical', 57), ('music', 30), ('study', 29), ('instrumental', 19), ('piano', 16), ('sleep', 16), ('jazz', 13), ('the', 9), ('christmas', 9), ('of', 8)]
[('rock', 166), ('workout', 113), ('edm', 69), ('music', 52), ('playlist', 41), ('my', 39), ('metal', 39), ('new', 31), ('songs', 31), ('gym', 29)]
[('country', 754), ('rock', 142), ('summer', 131), ('songs', 76), ('music', 75), ('play

# Test data

In [121]:
# Label the held-out playlists with the K-means model fitted on the training
# split. `assign` builds a new frame rather than writing into the boolean-mask
# slice from the train/test split, which avoids a SettingWithCopyWarning.
test_playlists_df = test_playlists_df.assign(
    cluster=kmeans.predict(test_playlists_df[headers[2:]]))
test_playlists_clustered = test_playlists_df[['index', 'title', 'cluster']]

# Show the ten most frequent title words of every cluster on the test split.
for i in range(NUM_CLUSTERS):
    print(count_words(
        test_playlists_clustered[test_playlists_clustered['cluster'] == i]['title'].values.tolist()).most_common(10))

[('chill', 56), ('summer', 30), ('2017', 27), ('new', 26), ('vibes', 22), ('good', 19), ('songs', 18), ('2016', 17), ('playlist', 15), ('music', 15)]
[('christmas', 53), ('chill', 24), ('disney', 20), ('songs', 14), ('music', 11), ('sleep', 10), ('sad', 8), ('2017', 7), ('slow', 7), ('oldies', 6)]
[('rap', 26), ('hop', 12), ('old', 12), ('hip', 11), ('school', 9), ('party', 7), ('throwback', 6), ('chill', 5), ('gym', 5), ('workout', 5)]
[('chill', 8), ('house', 4), ('jazz', 4), ('2016', 4), ('playlist', 3), ('2017', 3), ('electronic', 3), ('summer', 3), ('2015', 3), ('music', 3)]
[('study', 14), ('music', 11), ('classical', 10), ('instrumental', 6), ('sleep', 5), ('piano', 5), ('time', 4), ('the', 4), ('movie', 3), ('soundtracks', 3)]
[('workout', 34), ('rock', 32), ('edm', 24), ('music', 13), ('mix', 12), ('songs', 12), ('punk', 8), ('up', 7), ('gym', 7), ('playlist', 7)]
[('country', 182), ('rock', 37), ('summer', 31), ('good', 19), ('new', 17), ('music', 16), ('the', 16), ('playlist

# Get recommendations (for test playlists)

K-nn is always based on Spotify features

In [122]:
# Keep the cluster labels; the frames themselves are rebuilt from the CSV below.
playlists_clusters = playlists_df['cluster'].copy()
test_playlists_clusters = test_playlists_df['cluster'].copy()

playlists_df = pd.read_csv('playlists.csv')
playlists_df = playlists_df.drop(['key'], axis=1)
# Spotify audio-feature columns used by the K-NN recommenders.
cols = ['danceability', 'energy', 'loudness', 'mode', 'speechiness',
        'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
        'duration_ms', 'time_signature']
# Reuse the boolean mask of the earlier split so the rows line up with the saved
# cluster labels. Explicit copies are taken because the frames are written to
# below (avoids SettingWithCopyWarning on a boolean-mask slice).
playlists_df, test_playlists_df = playlists_df[msk].copy(), playlists_df[~msk].copy()

headers = playlists_df.columns.values.tolist()

print(headers)

# Standardize with statistics of the training split only.
mean = playlists_df[headers[2:]].mean(axis=0)
std = playlists_df[headers[2:]].std(axis=0)

playlists_df[headers[2:]] = (playlists_df[headers[2:]] - mean) / std
test_playlists_df[headers[2:]] = (test_playlists_df[headers[2:]] - mean) / std

# Re-attach the cluster labels; concat aligns them on the row index.
playlists_df = pd.concat((playlists_df, playlists_clusters), axis=1)
test_playlists_df = pd.concat((test_playlists_df, test_playlists_clusters), axis=1)

['index', 'title', 'danceability', 'energy', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14']


In [123]:
# Keep only the statistics of the Spotify audio features. Selecting by column
# name (instead of taking the first len(cols) entries) keeps every statistic
# paired with its own feature regardless of the column order in `headers`.
mean = mean[cols]
std = std[cols]

The challenge withholds only a fixed number of tracks from each **input** playlist. For simplicity, **we instead withhold half of each playlist.**

The challenge **output** requires a **list of 500 recommended candidate tracks**, ordered by relevance in decreasing order. We omit the ordering since we do not evaluate the ordering.

In [124]:
# Distinct tracks of each cluster, gathered from the training playlists and their cluster labels.
cluster_tracks = get_cluster_tracks(playlists_df, playlists_df['cluster'], NUM_CLUSTERS)

PART_PERCENT = 0.5 # Fraction (0-1, not a percentage) of each test playlist kept as the visible part; the remaining tracks are withheld as prediction targets

In [125]:
def get_playlist_features(playlist_tracks, mean=mean, std=std):
    """Aggregate a playlist into one standardized feature vector.

    For every column in `cols` the mean over `playlist_tracks` is taken, then
    the vector is standardized with `mean` and `std`. The defaults are the
    module-level `mean` / `std` as they were when this function was defined.

    Returns:
        numpy array with one value per entry of `cols`.
    """
    column_means = []
    for col in cols:
        column_means.append(np.mean([track[col] for track in playlist_tracks]))

    standardized = (column_means - mean) / std
    return np.array(standardized)

def get_track_info(tracks):
    """Return a numpy array of [artist_name, track_name] pairs, one row per track."""
    rows = []
    for entry in tracks:
        rows.append([entry['artist_name'], entry['track_name']])
    return np.array(rows)

def get_track_features(tracks, mean=mean, std=std, hashmap=False):
    """Standardize the `cols` features of a collection of tracks.

    Args:
        tracks: with `hashmap=False`, an iterable of track records; with
            `hashmap=True`, a mapping of track URI -> track record.
        mean, std: per-feature statistics used for standardization. The
            defaults are the module-level values at definition time.
        hashmap: selects the input/output layout described here.

    Returns:
        `hashmap=False`: a 2-D numpy array, one row per track.
        `hashmap=True`: a dict of URI -> 1-D numpy feature vector.
    """
    if not hashmap:
        raw = np.array([[record[col] for col in cols] for record in tracks])
        return np.array((raw - np.array(mean)) / np.array(std))

    raw_by_uri = {}
    for uri in tracks:
        raw_by_uri[uri] = [tracks[uri][col] for col in cols]

    standardized = {}
    for uri, values in raw_by_uri.items():
        standardized[uri] = np.array((values - mean) / std)
    return standardized

In [126]:
def k_nn(k, needle_features, haystack_features):
    """
    Rank the rows of `haystack_features` by Euclidean distance to `needle_features`.

    Returns the row indexes (within the haystack) of the k closest rows,
    nearest first. Fewer than k indexes come back when the haystack has
    fewer than k rows.
    """
    offsets = needle_features - haystack_features
    ranking = np.argsort(np.linalg.norm(offsets, axis=1))
    return ranking[:k]

In [127]:
def r_precision(pred_tracks: set, target_tracks: set):
    """Fraction of the target tracks that appear among the predicted tracks.

    Computed over the whole `pred_tracks` set as
    |pred_tracks & target_tracks| / |target_tracks|.

    Returns 0.0 when `target_tracks` is empty (for example when every withheld
    track duplicates a visible one) instead of raising ZeroDivisionError.
    """
    if not target_tracks:
        return 0.0
    return len(pred_tracks.intersection(target_tracks)) / len(target_tracks)

K-NN to find the nearest playlists in the same cluster, then take their tracks in order (nearest playlist first) until 500 candidates are collected

In [128]:
def nearest_playlists_predictions(init_tracks, target_tracks, playlist_part_features, part_cluster):
    """Recommend tracks taken from the training playlists nearest to the partial playlist.

    Training playlists of cluster `part_cluster` are ranked by distance between
    their `cols` features and `playlist_part_features`. Their track URIs are
    then collected, nearest playlist first, until 500 URIs beyond `init_tracks`
    have been gathered (or the candidates run out).

    Returns:
        `r_precision` of the collected URIs (excluding `init_tracks`) against
        `target_tracks`.
    """
    quota = 500 + len(init_tracks)
    pred_tracks = init_tracks.copy()

    haystack_playlists = playlists_df[playlists_df['cluster'] == part_cluster]
    haystack_features = haystack_playlists[cols].values

    for position in k_nn(quota, playlist_part_features, haystack_features):
        # Map the position inside the cluster back to the playlist's global index.
        playlist_idx = haystack_playlists.iloc[position]['index']
        uris = [track['track_uri'] for track in playlists[playlist_idx][1]]

        for uri in uris:
            pred_tracks.add(uri)
            if len(pred_tracks) >= quota:
                break

        if len(pred_tracks) >= quota:
            break

    return r_precision(pred_tracks - init_tracks, target_tracks)

K-NN to find tracks nearest to playlist aggregate features

In [129]:
def nearest_aggregate_predictions(init_tracks, target_tracks, playlist_part_features, tracks_in_cluster, track_features):
    """Recommend the cluster tracks closest to the partial playlist's aggregate features.

    Rows of `track_features` are ranked by distance to `playlist_part_features`;
    row j must describe `tracks_in_cluster[j]`. URIs are collected nearest first
    until 500 URIs beyond `init_tracks` have been gathered.

    Returns:
        `r_precision` of the collected URIs (excluding `init_tracks`) against
        `target_tracks`.
    """
    quota = 500 + len(init_tracks)
    pred_tracks = init_tracks.copy()

    for neighbour in k_nn(quota, playlist_part_features, track_features):
        pred_tracks.add(tracks_in_cluster[neighbour]['track_uri'])
        if len(pred_tracks) >= quota:
            break

    return r_precision(pred_tracks - init_tracks, target_tracks)

K-NN to find the tracks nearest to each track in the visible (randomly selected) half of the playlist

In [130]:
def nearest_track_predictions(init_tracks, target_tracks, playlist_part_tracks, tracks_in_cluster, track_features, all_track_features_dict):
    """Recommend the cluster tracks closest to the individual tracks of the partial playlist.

    A nearest-neighbour ranking over `track_features` is computed for every
    track in `playlist_part_tracks`. Candidates are then taken round-robin:
    the nearest neighbour of each seed track, then the second nearest of each,
    and so on, until 500 URIs beyond `init_tracks` have been gathered.

    Args:
        init_tracks: set of URIs already in the partial playlist.
        target_tracks: set of withheld URIs to be recovered.
        playlist_part_tracks: track records of the partial playlist.
        tracks_in_cluster: track records; entry j corresponds to row j of
            `track_features`.
        track_features: 2-D array of standardized features of the cluster tracks.
        all_track_features_dict: URI -> standardized feature vector; must
            contain every track of `playlist_part_tracks`.

    Returns:
        `r_precision` of the collected URIs (excluding `init_tracks`) against
        `target_tracks`.
    """
    quota = 500 + len(init_tracks)
    pred_tracks = init_tracks.copy()

    # Never look deeper than the cluster has tracks: with fewer than 500 tracks
    # the neighbour lists are shorter than 500 and indexing rank 499 would
    # raise IndexError.
    depth = min(500, len(track_features))

    playlist_track_features = [all_track_features_dict[track['track_uri']] for track in playlist_part_tracks]
    playlist_track_nn = [k_nn(depth, features, track_features) for features in playlist_track_features] # bottleneck

    for rank in range(depth):
        for track_nn in playlist_track_nn:
            pred_tracks.add(tracks_in_cluster[track_nn[rank]]['track_uri'])

            if len(pred_tracks) >= quota:
                break

        if len(pred_tracks) >= quota:
            break

    pred_tracks = pred_tracks - init_tracks

    return r_precision(pred_tracks, target_tracks)

In [131]:
# Standardized feature matrix of every cluster's tracks, in cluster order.
cluster_track_features = []
for cluster_id in range(NUM_CLUSTERS):
    cluster_track_features.append(get_track_features(cluster_tracks[cluster_id]))

# URI -> standardized feature vector for every known track.
all_track_features_dict = get_track_features(tracks, hashmap=True)

In [132]:
playlists_predictions_score = 0
aggregate_predictions_score = 0
track_predictions_score = 0

# Only the first 1% of the test playlists is evaluated.
num_test = len(test_playlists_df) // 100

for test_i in range(num_test):

    i = test_playlists_df.iloc[test_i]['index']

    # Shuffle a copy so the global `playlists` data is not reordered in place.
    playlist_tracks = list(playlists[i][1])
    np.random.shuffle(playlist_tracks)

    # Visible part of the playlist; the remaining tracks are the targets.
    playlist_part_tracks = playlist_tracks[:int(
        len(playlist_tracks) * PART_PERCENT)]

    playlist_part_features = get_playlist_features(playlist_part_tracks)

    # Passing a one-row DataFrame (rather than a list holding a Series) keeps
    # the feature names the model was fitted with, so no blanket warning
    # suppression is needed.
    # NOTE(review): the cluster is predicted from this row's stored features,
    # not from `playlist_part_features` - confirm that is intended.
    part_cluster = kmeans.predict(test_playlists_df.iloc[[test_i]][cluster_headers])[0]

    tracks_in_cluster = cluster_tracks[part_cluster]
    track_features = cluster_track_features[part_cluster]

    init_tracks = set([track['track_uri'] for track in playlist_part_tracks])
    target_tracks = set([track['track_uri']
                        for track in playlist_tracks]) - init_tracks

    playlists_predictions_score += nearest_playlists_predictions(
        init_tracks, target_tracks, playlist_part_features, part_cluster)

    aggregate_predictions_score += nearest_aggregate_predictions(
        init_tracks, target_tracks, playlist_part_features, tracks_in_cluster, track_features)

    track_predictions_score += nearest_track_predictions(
        init_tracks, target_tracks, playlist_part_tracks, tracks_in_cluster, track_features, all_track_features_dict)
    
# Mean score of each recommender over the evaluated playlists, as a percentage.
print(f'{100 * playlists_predictions_score / num_test:.2f}%')
print(f'{100 * aggregate_predictions_score / num_test:.2f}%')
print(f'{100 * track_predictions_score / num_test:.2f}%')

20.98%
3.02%
2.90%
