In [40]:
# Imports
import json
import numpy as np
import requests
import os
import pandas as pd
from sklearn import preprocessing
from collections import Counter
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN, KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
import matplotlib.pyplot as plt
import warnings

# Dataframe

In [41]:
# Load the playlist feature table (one row per playlist); the 'key' column is
# not used as a clustering feature, so drop it up front.
playlists_df = pd.read_csv('playlists.csv')
playlists_df = playlists_df.drop(columns=['key'])

# Train test split
# Seeded random mask (~80% train / ~20% test) so the split is reproducible.
np.random.seed(42)
msk = np.random.rand(len(playlists_df)) < 0.8
# Take explicit copies: later cells assign standardized features and cluster
# labels to both frames, and writing into a boolean-mask slice triggers
# pandas' SettingWithCopyWarning.
playlists_df, test_playlists_df = playlists_df[msk].copy(), playlists_df[~msk].copy()

# Column names; everything after 'index' and 'title' is a numeric feature.
headers = playlists_df.columns.values.tolist()

print(headers)

['index', 'title', 'danceability', 'energy', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature']


In [42]:
# Standardize every feature column (everything after 'index' and 'title').
# The mean and standard deviation come from the training split only and are
# reused unchanged for the test split.
train_features = playlists_df[headers[2:]]
mean, std = train_features.mean(axis=0), train_features.std(axis=0)

for frame in (playlists_df, test_playlists_df):
    frame[headers[2:]] = (frame[headers[2:]] - mean) / std

In [43]:
# Load the raw playlist and track data. The context managers close the file
# handles immediately instead of leaving them open until garbage collection.
# JSON text is UTF-8, so decode explicitly rather than relying on the
# platform's default encoding.
with open('playlists.json', 'r', encoding='utf-8') as playlists_file:
    playlists = json.load(playlists_file)
with open('track_features.json', 'r', encoding='utf-8') as tracks_file:
    tracks = json.load(tracks_file)

# Clustering

### K-means

In [44]:
def get_cluster_tracks(playlists_df, cluster_preds, num_clusters, verbose=False):
    """
    Collect the distinct tracks of all playlists assigned to each cluster.

    Parameters:
        playlists_df: dataframe with at least the 'index' and 'title' columns;
            'index' is used to look the playlist up in the global `playlists`.
        cluster_preds: cluster label for every row of `playlists_df`.
        num_clusters: number of clusters; labels 0..num_clusters-1 are scanned.
        verbose: when True, print the number of tracks found per cluster.

    Returns:
        A list with one numpy array per cluster holding that cluster's track
        dicts, de-duplicated by 'track_uri' (first occurrence wins).
    """
    assignments = playlists_df.get(['index', 'title']).copy()
    assignments['cluster'] = cluster_preds

    cluster_tracks = []
    for cluster_id in range(num_clusters):
        member_indices = assignments[assignments['cluster'] == cluster_id]['index']

        # Keyed by URI so a track shared by several playlists is kept once.
        unique_tracks = {}
        for playlist_idx in member_indices:
            for track in playlists[playlist_idx][1]:
                unique_tracks.setdefault(track['track_uri'], track)

        cluster_tracks.append(np.array(list(unique_tracks.values())))

        if verbose:
            print('Cluster {}: {} tracks'.format(cluster_id, len(cluster_tracks[cluster_id])))

    return cluster_tracks

def cluster_custom_score(playlists_df, cluster_preds, num_clusters):
    """
    Summarize how tracks are spread over the clusters.

    Returns a 3-tuple:
        * sum of the per-cluster distinct-track counts divided by the number
          of entries in the global `tracks`,
        * the largest cluster's track count divided by that same number,
        * standard deviation of the per-cluster counts divided by their mean.
    """
    per_cluster = get_cluster_tracks(playlists_df, cluster_preds, num_clusters)

    sizes = np.array([len(members) for members in per_cluster])
    total_tracks = len(tracks)

    coverage = np.sum(sizes) / total_tracks
    largest_share = np.max(sizes) / total_tracks
    spread = np.std(sizes) / np.mean(sizes)
    return coverage, largest_share, spread

In [45]:
NUM_CLUSTERS = 15  # number of k-means clusters

# Fit k-means on the training feature columns; the fixed random_state keeps
# the clustering reproducible between runs.
kmeans_input = playlists_df[headers[2:]]
kmeans = KMeans(
    n_clusters=NUM_CLUSTERS,
    random_state=42,
    n_init='auto',
).fit(kmeans_input)

In [46]:
# Group playlists by cluster
# Store each training playlist's fitted k-means label, then keep a slim frame
# with only the identifying columns plus that label.
playlists_df['cluster'] = kmeans.labels_
playlists_clustered = playlists_df[['index', 'title', 'cluster']]

In [47]:
# Count most common titles
def count_words(titles):
    """
    Count how often each word occurs across a collection of playlist titles.

    Every title is converted with ``str``, lower-cased and split on runs of
    whitespace, so repeated, leading or trailing spaces never produce
    empty-string "words".

    Parameters:
        titles: iterable of titles (non-string values are stringified).

    Returns:
        A ``collections.Counter`` mapping word -> number of occurrences.
    """
    words = Counter()
    for title in titles:
        words.update(str(title).lower().split())
    return words


# Show the ten most frequent title words for every training cluster.
for cluster_id in range(NUM_CLUSTERS):
    in_cluster = playlists_clustered['cluster'] == cluster_id
    cluster_titles = playlists_clustered[in_cluster]['title'].values.tolist()
    print(count_words(cluster_titles).most_common(10))

[('chill', 213), ('summer', 121), ('music', 83), ('good', 83), ('2017', 82), ('2016', 78), ('jams', 76), ('new', 70), ('songs', 64), ('playlist', 59)]
[('christmas', 188), ('disney', 101), ('chill', 70), ('songs', 44), ('music', 43), ('the', 42), ('oldies', 34), ('sleep', 29), ('feels', 29), ('musicals', 23)]
[('rap', 133), ('hop', 79), ('hip', 73), ('old', 59), ('school', 46), ('workout', 23), ('eminem', 22), ('the', 20), ('good', 19), ('throwback', 18)]
[('chill', 40), ('jazz', 16), ('house', 15), ('good', 12), ('2016', 11), ('music', 10), ('the', 10), ('2015', 10), ('electronic', 9), ('study', 8)]
[('classical', 57), ('music', 30), ('study', 29), ('instrumental', 19), ('piano', 16), ('sleep', 16), ('jazz', 13), ('the', 9), ('christmas', 9), ('of', 8)]
[('rock', 166), ('workout', 113), ('edm', 69), ('music', 52), ('playlist', 41), ('my', 39), ('metal', 39), ('new', 31), ('songs', 31), ('gym', 29)]
[('country', 754), ('rock', 142), ('summer', 131), ('songs', 76), ('music', 75), ('play

# Test data

In [48]:
# Assign every held-out playlist to a cluster with the fitted k-means model,
# then show the ten most frequent title words per cluster.
test_playlists_df['cluster'] = kmeans.predict(test_playlists_df[headers[2:]])
test_playlists_clustered = test_playlists_df.get(['index', 'title', 'cluster'])

for cluster_id in range(NUM_CLUSTERS):
    in_cluster = test_playlists_clustered['cluster'] == cluster_id
    cluster_titles = test_playlists_clustered[in_cluster]['title'].values.tolist()
    print(count_words(cluster_titles).most_common(10))

[('chill', 56), ('summer', 30), ('2017', 27), ('new', 26), ('vibes', 22), ('good', 19), ('songs', 18), ('2016', 17), ('playlist', 15), ('music', 15)]
[('christmas', 53), ('chill', 24), ('disney', 20), ('songs', 14), ('music', 11), ('sleep', 10), ('sad', 8), ('2017', 7), ('slow', 7), ('oldies', 6)]
[('rap', 26), ('hop', 12), ('old', 12), ('hip', 11), ('school', 9), ('party', 7), ('throwback', 6), ('chill', 5), ('gym', 5), ('workout', 5)]
[('chill', 8), ('house', 4), ('jazz', 4), ('2016', 4), ('playlist', 3), ('2017', 3), ('electronic', 3), ('summer', 3), ('2015', 3), ('music', 3)]
[('study', 14), ('music', 11), ('classical', 10), ('instrumental', 6), ('sleep', 5), ('piano', 5), ('time', 4), ('the', 4), ('movie', 3), ('soundtracks', 3)]
[('workout', 34), ('rock', 32), ('edm', 24), ('music', 13), ('mix', 12), ('songs', 12), ('punk', 8), ('up', 7), ('gym', 7), ('playlist', 7)]
[('country', 182), ('rock', 37), ('summer', 31), ('good', 19), ('new', 17), ('music', 16), ('the', 16), ('playlist

# Get recommendations

Using the k-means cluster (on test playlists)

In [49]:
# Collect the de-duplicated tracks of all training playlists in each cluster.
cluster_tracks = get_cluster_tracks(
    playlists_df=playlists_df,
    cluster_preds=playlists_df['cluster'],
    num_clusters=NUM_CLUSTERS,
)

In [50]:
# Audio-feature keys read from each track dict, listed in the same order as
# the feature columns of the playlist dataframe (headers[2:]).
cols = [
    'danceability',
    'energy',
    'loudness',
    'mode',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
    'duration_ms',
    'time_signature',
]

def get_playlist_features(playlist_tracks, mean=mean, std=std):
    """
    Build one standardized feature vector for a playlist.

    Each feature in `cols` is averaged over the given track dicts, then the
    result is shifted by `mean` and scaled by `std` (by default the training
    statistics captured when this function is defined).

    Returns a numpy array with one value per entry of `cols`.
    """
    averaged = []
    for col in cols:
        averaged.append(np.mean([track[col] for track in playlist_tracks]))

    standardized = (averaged - mean) / std
    return np.array(standardized)

def get_track_info(tracks):
    """Return an array with one ``[artist_name, track_name]`` row per track dict."""
    rows = []
    for track in tracks:
        rows.append([track['artist_name'], track['track_name']])
    return np.array(rows)

def get_track_features(tracks, mean=mean, std=std):
    """
    Build a standardized feature matrix with one row per track.

    The columns follow `cols`; every row is shifted by `mean` and scaled by
    `std` (by default the training statistics captured when this function is
    defined).
    """
    raw = np.array([[track[col] for col in cols] for track in tracks])
    center = np.array(mean)
    scale = np.array(std)
    return np.array((raw - center) / scale)

def pretty_print(x):
    """Format a pair such as ``[artist_name, track_name]`` as ``"artist - track"``."""
    return "{} - {}".format(x[0], x[1])

Accuracy of clustering

In [51]:
PART_PERCENT = 0.5  # Fraction (0-1) of each playlist's tracks used to predict its cluster

# Build the partial-playlist feature vector for every test playlist.
part_features = []
for i in test_playlists_df['index']:
    # Shuffle a copy: shuffling playlists[i][1] directly would permanently
    # reorder the shared playlist data used by every other cell.
    playlist_tracks = list(playlists[i][1])
    np.random.shuffle(playlist_tracks)  # Shuffle tracks - improved accuracy

    # Always keep at least one track; an empty slice would average to NaN
    # features, which k-means cannot predict on.
    num_part_tracks = max(1, int(len(playlist_tracks) * PART_PERCENT))
    part_features.append(get_playlist_features(playlist_tracks[:num_part_tracks]))

# Predict all playlists in one call each. Passing dataframes that carry the
# training column names avoids scikit-learn's feature-name warning, so no
# blanket warning suppression is needed.
real_clusters = kmeans.predict(test_playlists_df[headers[2:]])
part_clusters = kmeans.predict(pd.DataFrame(part_features, columns=headers[2:]))

# Number of playlists whose partial prediction matches the full prediction.
count = int(np.sum(real_clusters == part_clusters))

print('Accuracy by clustering using only {}% of the playlist: {:.2f}%'.format(
    PART_PERCENT*100, 100*count / len(test_playlists_df)))

Accuracy by clustering using only 50.0% of the playlist: 79.73%


Sample random playlist (from test)

In [73]:
# Pick one held-out playlist at random.
test_i = np.random.randint(len(test_playlists_df))
i = test_playlists_df.iloc[test_i]['index']

playlist_name = playlists[i][0]
# Shuffle a copy so the shared `playlists` data keeps its original order.
playlist_tracks = list(playlists[i][1])
np.random.shuffle(playlist_tracks)

num_tracks = len(playlist_tracks)

# Only the first PART_PERCENT of the shuffled tracks is treated as known; keep
# at least one track so the averaged features are never NaN.
playlist_part_tracks = playlist_tracks[:max(1, int(num_tracks * PART_PERCENT))]

playlist_part_features = get_playlist_features(playlist_part_tracks)

playlist_features = np.array(test_playlists_df.iloc[test_i][headers[2:]].values, dtype='float32')

# Predict from dataframes that carry the training column names; this avoids
# scikit-learn's feature-name warning without silencing every warning.
real_cluster = kmeans.predict(test_playlists_df.iloc[[test_i]][headers[2:]])[0]
part_cluster = kmeans.predict(
    pd.DataFrame([playlist_part_features], columns=headers[2:]))[0]

# Candidate tracks for recommendation: everything in the predicted cluster.
tracks_in_cluster = cluster_tracks[part_cluster]
tracks_info = get_track_info(tracks_in_cluster)
track_features = get_track_features(tracks_in_cluster)

print("Playlist name: {}".format(playlist_name))
print("Number of tracks: {}".format(num_tracks))
print("Real cluster: {}".format(real_cluster))
print("Part cluster: {}".format(part_cluster))
print("Tracks: ")

print('\n'.join(map(pretty_print, get_track_info(playlist_tracks[:10]))))

Playlist name: Slow jamz
Number of tracks: 127
Real cluster: 9
Part cluster: 9
Tracks: 
Jay Sean - Down
Kanye West - Gold Digger
Plies - Put It On Ya (feat. Chris J)
T.I. - Dead And Gone (feat. Justin Timberlake)
The-Dream - IV Play
David Guetta - Nothing Really Matters (feat. will.i.am) - feat. will.i.am
Kanye West - Dark Fantasy
Hunter Hayes - I Want Crazy - Encore
B.o.B - So Hard To Breathe
Alicia Keys - New Day


In [74]:
def k_nn(k, needle_features, haystack_features):
    """
    Locate the k rows of `haystack_features` closest to `needle_features`.

    Closeness is the Euclidean distance between the needle vector and each
    haystack row. The result holds row indexes into the haystack, ordered
    from nearest to farthest; fewer than k are returned when the haystack
    has fewer than k rows.
    """
    offsets = needle_features - haystack_features
    ranking = np.linalg.norm(offsets, axis=1).argsort()
    return ranking[:k]

K-NN to find nearest playlist (then randomly sample tracks)

In [75]:
# playlists within a cluster
# Candidate playlists: training playlists in the cluster predicted from the
# partial playlist.
haystack_playlists = playlists_df[playlists_df['cluster'] == part_cluster]
haystack_features = haystack_playlists[headers[2:]].values

# Only the single closest playlist is used, so k=1 is sufficient.
nearest_pos = k_nn(1, playlist_part_features, haystack_features)[0]
pred_playlist = haystack_playlists.iloc[nearest_pos]

# Position of the predicted playlist inside the global `playlists` data.
i = pred_playlist['index']

pred_playlist_name = playlists[i][0]
# Shuffle a copy so the ten printed tracks are a random sample without
# reordering the shared playlist data.
pred_playlist_tracks = list(playlists[i][1])
np.random.shuffle(pred_playlist_tracks)
pred_num_tracks = len(pred_playlist_tracks)

print("Playlist name: {}".format(pred_playlist_name))
print("Number of tracks: {}".format(pred_num_tracks))
print("Tracks: ")

print('\n'.join(map(pretty_print, get_track_info(pred_playlist_tracks[:10]))))

Playlist name: summer feels
Number of tracks: 70
Tracks: 
J. Cole - No Role Modelz
Circa Waves - T-Shirt Weather
Childish Gambino - IV. sweatpants
Red Hot Chili Peppers - Dani California
Sage The Gemini - Now and Later
Fall Out Boy - Thnks fr th Mmrs
K.Flay - FML
K.Flay - Make Me Fade
Aer - Floats My Boat
MGMT - Electric Feel


K-NN to find tracks nearest to playlist aggregate features

In [76]:
# Rank the cluster's tracks by distance to the partial playlist's averaged
# features and keep the ten closest.
nearest_track_idx = k_nn(10, playlist_part_features, track_features)
playlist_part_features_pred = [tracks_info[idx] for idx in nearest_track_idx]
print("Playlist part feature predictions: \n{}\n".format(
    "\n".join(pretty_print(info) for info in playlist_part_features_pred)))

# playlist_features_pred = [tracks_info[i] for i in k_nn(10, playlist_features, track_features)]
# print("Playlist feature predictions: \n{}".format("\n".join(map(pretty_print, playlist_features_pred))))

Playlist part feature predictions: 
French Montana - Unforgettable - Tiësto vs. Dzeko AFTR:HRS Remix
Selena Gomez - Fetish (feat. Gucci Mane) - Galantis Remix
Lucy Woodward - He Got Away
Victor Muñoz - Mi Felicidad
Jennifer Hudson - Where You At
ODESZA - Late Night
Madonna - Living For Love
Rudimental - Lay It All On Me (feat. Ed Sheeran)
Glee Cast - Born This Way (Glee Cast Version)
Cobra Starship - #1Nite [One Night]



K-NN to find tracks nearest to random tracks in playlist

In [77]:
# Sample up to 10 seed tracks from the known part of the playlist.
# np.random.choice(..., replace=False) raises a ValueError when asked for more
# items than are available, so cap the sample size for short playlists.
num_seeds = min(10, len(playlist_part_tracks))
random_tracks = np.random.choice(playlist_part_tracks, num_seeds, replace=False)

random_tracks_info = get_track_info(random_tracks)
random_track_features = get_track_features(random_tracks)

random_tracks_pred = []
for seed_track, features in zip(random_tracks, random_track_features):
    nearest = k_nn(2, features, track_features)
    # The seed is its own nearest neighbour only when it also occurs among the
    # cluster's tracks. Skip it by URI instead of always discarding the
    # closest hit, which would throw away the best match for a seed that is
    # not in the cluster. Fall back to the closest hit if nothing else exists.
    match = next(
        (j for j in nearest
         if tracks_in_cluster[j]['track_uri'] != seed_track['track_uri']),
        nearest[0],
    )
    random_tracks_pred.append(tracks_info[match])

for random_track, random_track_pred in zip(random_tracks_info, random_tracks_pred):
    print("{} ->\n\t{}".format(pretty_print(random_track), pretty_print(random_track_pred)))

Chris Brown - Love More ->
	Ghostface Killah - Cherchez LaGhost
Pitbull - International Love ->
	Madden - Golden Light (feat. 6AM)
Plies - Put It On Ya (feat. Chris J) ->
	Jawga Boyz - Mudjug (Dip In My Lip)
Kelly Clarkson - My Life Would Suck Without You ->
	Big Time Rush - Halfway There
Missez - Love Song ->
	Meek Mill - Bad For You (feat. Nicki Minaj)
Jordin Sparks - Tattoo ->
	Leona Lewis - Better in Time
Hurricane Chris - Headboard ->
	Girl Talk - What It's All About
Miguel - Pussy is Mine ->
	Manu Chao - El viento
Mobb Deep - Give It To Me ->
	You Without Me - One Less Problem
Kanye West - Touch The Sky ->
	Kanye West - Drive Slow
