In [35]:
import pandas as pd
import numpy as np 
import pickle

from collections import Counter

# Data/Model Loading

In [123]:
SAMPLES_PARQUET = '../data/features/featureswithouttags.parquet'

samples = pd.read_parquet(SAMPLES_PARQUET)

feature_groups = [
    "audio_features",
    "metadata",
    "ranks",
    "social_features",
    "temporal_features",
    "album",
    "artist",
    "set",
    "track",
]

all_features = {f"{_group}" : [] for _group in feature_groups}


for feat_comb in samples.columns:
    _group = feat_comb[0]
    #print(feat_comb)
    _feature = feat_comb[1]
    all_features[_group].append(_feature)

all_features
#samples.head()

audio_features = samples['audio_features']
track_metadata = samples['metadata']
social_features = samples['social_features']
social_features.head()

Unnamed: 0_level_0,artist_discovery,artist_familiarity,artist_hotttnesss,song_currency,song_hotttnesss
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2,0.38899,0.38674,0.40637,0.0,0.0
3,0.38899,0.38674,0.40637,0.0,0.0
5,0.38899,0.38674,0.40637,0.0,0.0
10,0.557339,0.614272,0.798387,0.005158,0.354516
134,0.38899,0.38674,0.40637,0.0,0.0


In [37]:
MODEL_PKL = '../modeling/training/xgboost.pkl'
#loaded_model = pickle.load(open(MODEL_PKL))

model = None
with open(MODEL_PKL,'rb') as f:
    model = pickle.load(f)

# Playlist Creation
---


In [131]:
def create_playlist(tracks,social_features_list=None,random=False):
    if social_features_list is None:
        social_features_list = ['artist_discovery',
            'artist_familiarity',
            'artist_hotttnesss',
            'song_currency',
            'song_hotttnesss'
        ]
    social_features_tuple = [('social_features',_feat) for _feat in social_features_list]
    tracks = tracks.sort_values(by=social_features_tuple,ascending=False)
    return tracks[[('track','title'),('metadata','artist_name'),('track','genre_top'),('cluster','cluster')]]

## XGBoost Model

In [38]:
import xgboost as xgb
dtest_reg = xgb.DMatrix(audio_features)

In [97]:
y_test_pred = model.predict(dtest_reg)
y_test_pred

print(Counter(y_test_pred))

Counter({2.0: 9518, 1.0: 2610, 0.0: 1001})


In [121]:
clustered_samples = samples.copy()
clustered_samples[('cluster','cluster')] = y_test_pred
clustered_samples.head()

Unnamed: 0_level_0,audio_features,audio_features,audio_features,audio_features,audio_features,audio_features,audio_features,audio_features,metadata,metadata,...,track,track,track,track,track,track,track,track,track,cluster
Unnamed: 0_level_1,acousticness,danceability,energy,instrumentalness,liveness,speechiness,tempo,valence,album_date,album_name,...,interest,language_code,license,listens,lyricist,number,publisher,tags,title,cluster
track_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2,0.416675,0.675894,0.634476,0.010628,0.177647,0.15931,165.922,0.576661,,,...,4656,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,1293,,3,,[],Food,2.0
3,0.374408,0.528643,0.817461,0.001851,0.10588,0.461818,126.957,0.26924,,,...,1470,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,514,,4,,[],Electric Ave,2.0
5,0.043567,0.745566,0.70147,0.000697,0.373143,0.124595,100.26,0.621661,,,...,1933,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,1151,,6,,[],This World,2.0
10,0.95167,0.658179,0.924525,0.965427,0.115474,0.032985,111.562,0.96359,2008-03-11,Constant Hitmaker,...,54881,en,Attribution-NonCommercial-NoDerivatives (aka M...,50135,,1,,[],Freeway,2.0
134,0.452217,0.513238,0.56041,0.019443,0.096567,0.525519,114.29,0.894072,,,...,1126,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,943,,5,,[],Street Music,2.0


In [184]:
tracks_for_physical = clustered_samples[clustered_samples['cluster','cluster']==2.0]
#tracks_for_physical_sorted[('track','genre_top')].value_counts()

## 	artist_discovery	artist_familiarity	artist_hotttnesss	song_currency	song_hotttnesss
## select one or more in order of likeness
create_playlist(tracks=tracks_for_physical,social_features_list=['artist_familiarity']).sample(100)
#tracks_for_physical_sorted[('track','genre_top')].value_counts()


Unnamed: 0_level_0,track,metadata,track,cluster
Unnamed: 0_level_1,title,artist_name,genre_top,cluster
track_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
39686,4 The Model feat. Camil - Damen,The Model,Electronic,2.0
16897,Meatje,D'r Sjaak,,2.0
124528,Wait For You,Monolog,,2.0
124765,Ninja,Indikings,,2.0
21423,Birds Dazed And Confused,Terminal 11,Electronic,2.0
...,...,...,...,...
15297,Sticks to the Skin,1.6 Band,Rock,2.0
19250,Rag Doll Physics,Diablo Swing Orchestra,,2.0
20710,Extra,The Choke,Rock,2.0
109073,Killer,The Krauts,Rock,2.0


In [179]:
tracks_for_focus = clustered_samples[clustered_samples['cluster','cluster']==1.0]

## 	artist_discovery	artist_familiarity	artist_hotttnesss	song_currency	song_hotttnesss
## select one or more in order of likeness
print(tracks_for_focus[('track','genre_top')].value_counts())
create_playlist(tracks=tracks_for_focus,social_features_list=['artist_discovery']).sample()



(track, genre_top)
Rock                   589
Folk                   310
Electronic             250
Jazz                   224
Hip-Hop                212
Instrumental            72
Pop                     66
Old-Time / Historic     50
International           40
Blues                   29
Experimental             4
Name: count, dtype: int64


Unnamed: 0_level_0,track,metadata,track,cluster
Unnamed: 0_level_1,title,artist_name,genre_top,cluster
track_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
14585,The Dream of Sergei Prokofiev and Waltz,Ergo Phizmiz,Instrumental,1.0


In [161]:
tracks_for_relax = clustered_samples[clustered_samples['cluster','cluster']==0.0]

## 	artist_discovery	artist_familiarity	artist_hotttnesss	song_currency	song_hotttnesss
## select one or more in order of likeness
create_playlist(tracks=tracks_for_relax,social_features_list=['artist_familiarity'])
tracks_for_relax[('track','genre_top')].value_counts()

(track, genre_top)
Classical              265
Old-Time / Historic    221
Folk                   164
Rock                    60
Electronic              35
International           14
Pop                     12
Jazz                     4
Experimental             3
Blues                    2
Hip-Hop                  2
Name: count, dtype: int64