In [36]:
#Data analysis & manipulation
import pandas as pd
from sklearn.decomposition import IncrementalPCA
from sklearn.preprocessing import MinMaxScaler

#Data computation
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import MiniBatchKMeans
from sklearn.mixture import GaussianMixture

#Data visualization
import matplotlib.pyplot as plt

#Model evaluation
from sklearn.metrics import silhouette_score

#Utils
from tqdm.notebook import tqdm

In [37]:
# Read the Spotify tracks table from disk and preview its first five rows.
spotify = pd.read_csv(filepath_or_buffer='./dataset/spotify_tracks.csv')
spotify.head(n=5)

Unnamed: 0,genre,artist_name,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,Movie,Henri Salvador,C'est beau de faire un Show,0BRjO6ga9RKCKjfDqeFgWV,0,0.611,0.389,99373,0.91,0.0,C#,0.346,-1.828,Major,0.0525,166.969,4/4,0.814
1,Movie,Martin & les fées,Perdu d'avance (par Gad Elmaleh),0BjC1NfoEOOusryehmNudP,1,0.246,0.59,137373,0.737,0.0,F#,0.151,-5.559,Minor,0.0868,174.003,4/4,0.816
2,Movie,Joseph Williams,Don't Let Me Be Lonely Tonight,0CoSDzoNIKCRs124s9uTVy,3,0.952,0.663,170267,0.131,0.0,C,0.103,-13.879,Minor,0.0362,99.488,5/4,0.368
3,Movie,Henri Salvador,Dis-moi Monsieur Gordon Cooper,0Gc6TVm52BwZD07Ki6tIvf,0,0.703,0.24,152427,0.326,0.0,C#,0.0985,-12.178,Major,0.0395,171.758,4/4,0.227
4,Movie,Fabien Nataf,Ouverture,0IuslXpMROHdEPvSl1fTQK,4,0.95,0.331,82625,0.225,0.123,F,0.202,-21.15,Major,0.0456,140.576,4/4,0.39


The track_id column is only an identifier, so we drop it.

In [38]:
# Rebind instead of mutating in place so the cell can be re-run safely:
# errors='ignore' makes the drop a no-op once 'track_id' is already gone,
# where the in-place version raised KeyError on a second execution.
spotify = spotify.drop(columns=['track_id'], errors='ignore')

Let's transform the categorical variables into numerical ones

In [39]:
# Encode the musical key as an ordinal number: the natural notes C..B map to
# 1..7 and every sharp sits 0.5 above its natural note.
musical_scale = {
    'C': 1.0, 'C#': 1.5,
    'D': 2.0, 'D#': 2.5,
    'E': 3.0,
    'F': 4.0, 'F#': 4.5,
    'G': 5.0, 'G#': 5.5,
    'A': 6.0, 'A#': 6.5,
    'B': 7.0,
}

# Plain dict lookup per value: an unexpected key label raises KeyError
# rather than silently turning into NaN.
spotify['key'] = spotify['key'].apply(musical_scale.__getitem__)

In [40]:
def dummify(dataset, columns):
    """Replace categorical columns with dummy (indicator) columns.

    Each column named in ``columns`` is expanded with ``pd.get_dummies``
    (first level dropped), the indicator columns are appended to the frame
    and the source column is removed. Indicator columns are named after the
    category values themselves, without a prefix.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame holding the columns to encode; it is not modified.
    columns : iterable of str
        Names of the columns to encode, processed in order.

    Returns
    -------
    pandas.DataFrame
        The encoded frame (``dataset`` itself when ``columns`` is empty).
    """
    for column in columns:
        indicators = pd.get_dummies(dataset[column], drop_first=True)
        dataset = dataset.join(indicators).drop(columns=[column])

    return dataset

# Expand the two remaining categorical columns into indicator columns.
spotify = dummify(dataset=spotify, columns=['mode', 'time_signature'])

In [41]:
# Zero rows: display only the column headers to inspect the encoded layout.
spotify.head(n=0)

Unnamed: 0,genre,artist_name,track_name,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,speechiness,tempo,valence,Minor,1/4,3/4,4/4,5/4


In [42]:
# Keep every column from position 3 onward as the clustering features.
# NOTE(review): presumably this skips the leading text columns (genre,
# artist, track name) -- confirm against the actual column order.
dataset = spotify.iloc[:, 3:]

Normalizing the variables

In [43]:
# Fit the min-max scaler on the feature table, then replace the table with
# its scaled version; `scaler` stays bound to the fitted scaler.
scaler = MinMaxScaler()
scaler.fit(dataset)
dataset = scaler.transform(dataset)

Defining the metric functions

In [46]:
def elbow_metric(dataset, model, range_):
    """Compute the elbow curve (within-cluster sum of squares) for several k.

    For every ``k`` in ``range_`` a fresh ``model(n_clusters=k)`` is fitted on
    ``dataset`` and the squared Euclidean distance between each sample and
    its assigned centroid is summed over all samples and all features.

    Parameters
    ----------
    dataset : array-like of shape (n_samples, n_features)
        Samples to cluster; converted with ``np.asarray`` for the distance
        computation.
    model : callable
        Estimator class (or factory) accepting ``n_clusters``. The object
        returned by ``fit`` must expose ``cluster_centers_`` and ``predict``.
    range_ : iterable of int
        Candidate numbers of clusters.

    Returns
    -------
    list of float
        One sum of squared distances per ``k``, in iteration order.
    """
    data = np.asarray(dataset)
    elb = []
    for k in tqdm(range_):
        # Keep the fitted estimator under its own name: rebinding `model`
        # would replace the factory with an instance and make the next
        # iteration fail when it tries to call it.
        fitted = model(n_clusters=k).fit(dataset)

        centroids = np.asarray(fitted.cluster_centers_)
        pred_clusters = np.asarray(fitted.predict(dataset))

        # Distance to the assigned centroid across every feature, not just
        # the first two columns.
        residuals = data - centroids[pred_clusters]
        elb.append(float(np.sum(residuals ** 2)))
    return elb

def silhouette_metric(dataset, model, range_):
    """Return the silhouette score obtained for each candidate cluster count.

    For every ``k`` in ``range_`` a new ``model(n_clusters=k)`` is created,
    its ``fit_predict`` labels for ``dataset`` are scored with
    ``silhouette_score``, and the scores are collected in iteration order.

    Parameters
    ----------
    dataset : array-like
        Samples handed to both ``fit_predict`` and ``silhouette_score``.
    model : callable
        Estimator class (or factory) accepting ``n_clusters`` and providing
        ``fit_predict``.
    range_ : iterable of int
        Candidate numbers of clusters.

    Returns
    -------
    list
        One silhouette score per ``k``.
    """
    # Only the silhouette score is computed; other indices such as
    # Calinski-Harabasz or Davies-Bouldin are not evaluated here.
    return [
        silhouette_score(dataset, model(n_clusters=k).fit_predict(dataset))
        for k in tqdm(range_)
    ]

# Hierarchical clustering

# Partitional clustering

We use K-Means, specifically scikit-learn's MiniBatchKMeans variant

# Probabilistic clustering