In [52]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import os
from tabulate import tabulate
import seaborn as sns
import scikitplot as skplt
from numpy import percentile
import math
from numpy import arange
import IPython.display as ipd
import matplotlib.lines as mlines

#pandas
from pandas.plotting import parallel_coordinates
import datetime

#sklearn
from sklearn.preprocessing import PowerTransformer, QuantileTransformer
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler, RobustScaler, StandardScaler

from sklearn.decomposition import PCA
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import learning_curve

from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.metrics import multilabel_confusion_matrix, roc_curve, auc
from sklearn.metrics import roc_auc_score, precision_score, recall_score, make_scorer
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline

#scikitplot
from scikitplot.metrics import plot_roc
from scikitplot.metrics import plot_precision_recall
from scikitplot.metrics import plot_lift_curve
from scikitplot.metrics import plot_cumulative_gain

#tslearn
from tslearn.metrics import dtw, dtw_path, cdist_dtw, subsequence_cost_matrix

#mlxtend
from mlxtend.plotting import plot_decision_regions

#matplotlib
from matplotlib.ticker import (MultipleLocator, AutoMinorLocator)
from matplotlib import pyplot

#seaborn
import seaborn as sns

#scipy
import scipy.stats as stats
from scipy.stats import shapiro, mannwhitneyu, normaltest, kstest
from scipy.stats import boxcox, yeojohnson
from scipy.spatial.distance import euclidean, cityblock, cosine, minkowski
from scipy.spatial.distance import cdist

#matrixprofile
import matrixprofile
from matrixprofile import *
from matrixprofile.discords import discords

In [53]:
from sklearn.metrics import silhouette_score, calinski_harabasz_score, silhouette_samples, davies_bouldin_score
import numpy as np
import scipy.stats as stats
import collections
import math

#pyclustering
from pyclustering.cluster import xmeans
from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
from pyclustering.cluster import cluster_visualizer
from pyclustering.cluster import cluster_visualizer_multidim

<h2>Functions</h2>

In [54]:
def visualize_clusters(df, X, list_of_columns, clusters, centers):
    
    for x in range(len(list_of_columns)):
        for z in range(x+1, len(list_of_columns)):
            
            i = df.columns.values.tolist().index(list_of_columns[x])
            j = df.columns.values.tolist().index(list_of_columns[z])

            plt.title(list_of_columns[x].title() + " - " + list_of_columns[z].title())
            plt.xlabel(list_of_columns[x])
            plt.ylabel(list_of_columns[z])
    
            for indexes in clusters:
                plt.scatter(X[indexes,i], X[indexes,j], alpha=0.4)
            for c in centers:
                plt.scatter(c[i], c[j], s=100, edgecolors='k')
                
            plt.show()

In [55]:
def control_clusters(clusters):
    somma = 0
    print("Number of clusters: {}".format(len(clusters)))
    print()
    print()
    print("Content of each cluster")
    print()
    for i, cluster in enumerate(clusters):
        print("Cluster {}: {}".format(i, len(cluster)))
        somma+=len(cluster)

    print()
    print("Total elements in all clusters: {}".format(somma))

In [56]:
def inertia(number_of_clusters, X, labels):
    
    manual_SSE = 0
    for i in range(number_of_clusters):
        cluster = X[labels == i]
        if len(cluster) > 0:
                clusterMean = cluster.mean(axis = 0)
                manual_SSE += ((cluster - clusterMean) ** 2).sum()
                
    return manual_SSE

In [57]:
def cluster_insight(df):
    
    cluster_diz = pd.DataFrame(columns = ["Studio_Recording", "Live_Recording"], \
                                index = ["Cluster {}".format(str(i)) for i in np.unique(df['Labels'])])
    
    tot_entropies = []
    dim_clusters = []
    max_cluster = 0

    for i in np.unique(df['Labels']):

        cluster_list = list(df[df['Labels'] == i]['album_type'])

        counter = collections.Counter(cluster_list)

        cluster_diz.loc["Cluster {}".format(str(i)), 'Studio_Recording'] = counter[0]
        cluster_diz.loc["Cluster {}".format(str(i)), 'Live_Recording'] = counter[1]

        #cluster_diz.loc["Cluster {}".format(str(i)), 'Tot_Cluster'] = cluster_diz.loc["Cluster {}".format(str(i)), ['Studio_Recording', 'Live_Recording']].sum()

        lista = cluster_diz.loc['Cluster {}'.format(str(i)), ['Studio_Recording', 'Live_Recording']].values.tolist()
        tot = sum(lista)
        cluster_diz.loc["Cluster {}".format(str(i)), 'Tot_Cluster'] = tot
        dim_clusters.append(tot)

        '''
        Entropia singola: -(sommatoria di (pij log2 pij) per tutte le classi all'interno di un cluster.
        --> pij = mij/mj dove mj è il numero di valori nel cluster j e mij è il numero di valori della classe i
        nel cluster j.
        '''
        entropy_single = sum([(x/tot)*math.log2(x/tot) for x in lista if x != 0])
        if entropy_single != 0.0:
            entropy_single*=-1
        cluster_diz.loc["Cluster {}".format(str(i)), 'Entropy'] = "%.4f" %(entropy_single)
        tot_entropies.append(entropy_single)


        '''
        Purity singola: etichetta della classe più frequente/numero di valori nel cluster
        '''
        purity_single = max(lista)/tot
        cluster_diz.loc["Cluster {}".format(str(i)), 'Purity'] = "%.4f" %(purity_single)

        max_cluster += max(lista)


    new_row = pd.Series(cluster_diz[['Studio_Recording', 'Live_Recording', 'Tot_Cluster']].sum(axis=0).astype(int), name = "Total")
    cluster_diz = cluster_diz.append(new_row, ignore_index=False)

    '''
    Entropia complessiva: sommatoria delle entropie di ogni cluster pesate per la dimensione di ogni cluster.
    --> (mj/m) * ej --> dove mj è la dimensione del cluster j e m è il numero totale di punti. 
    '''

    last_tot = cluster_diz.loc['Total', 'Tot_Cluster']
    tot_entropy = 0

    for entropy, dim in zip(tot_entropies, dim_clusters):
        tot_entropy+=(entropy*(dim/last_tot))

    cluster_diz.loc["Total", 'Entropy'] = "%.4f" %(tot_entropy)

    '''
    Purity complessiva: sommatoria delle classi "corrette" (il valore più alto) di ogni cluster, divisa per il
    numero totale di valori.
    '''

    tot_purity = max_cluster/last_tot
    cluster_diz.loc["Total", 'Purity'] = "%.4f" %(tot_purity)



    cluster_diz['Tot_Cluster'] = cluster_diz['Tot_Cluster'].astype(int)

    ipd.display(cluster_diz)

In [58]:
def plot_general_distributions(df):
    
    for column in df.columns:

        if column != 'Labels':

            fig, ax = plt.subplots(nrows = 1, ncols = 2, figsize = (18, 5))
            cluster_list = []
            list_num_cluster = []

            for num_cluster in np.unique(df['Labels']):
                cluster_list.append(df[df['Labels'] == num_cluster][column])  #distribution
                list_num_cluster.append('Cluster %s' % num_cluster)           #num cluster

            ax[0].hist(cluster_list, label = list_num_cluster)
            ax[0].legend(title="Clusters", bbox_to_anchor = (1, 1.05), loc='upper left', \
                   frameon = True, shadow = True)
            ax[0].set_title(column + " Histogram")

            ax[1].hist(df[column], color = "#B8002E")
            ax[1].set_title(column + " Histogram (Distribution in the whole dataset)")

            fig.tight_layout()
            plt.show()

In [59]:
def analyze_single_clusters(df):
    
    for label in np.unique(df['Labels']):
        
        print("Cluster {}".format(label))
        print()
        
        Studio_0_dataset = df[(df['Labels'] == label) & (df['album_type'] == 0)]

        Live_1_dataset = df[(df['Labels'] == label) & (df['album_type'] == 1)]

        for column in df.columns:

            if (column != 'Labels' and column != 'album_type'):

                fig, ax = plt.subplots(nrows = 1, ncols = 2, figsize = (18, 5), sharey = True)

                ax[0].hist(Studio_0_dataset[column], label = 'Studio_Recording', color = "#8AB800")
                ax[0].legend(loc='upper right')
                ax[0].set_title(column + " Histogram - Cluster {}".format(label))

                ax[1].hist(Live_1_dataset[column], label = 'Live_Recording', color = "#FF3D3D")
                ax[1].legend(loc='upper right')
                ax[1].set_title(column + " Histogram - Cluster {}".format(label))

                plt.show()
        print("------------------------------------------------------------------------------------------------------------------")

In [60]:
def run_xmeans_vecchio(df_to_copy, X):
    
    max_k = 30
    sse_list = list()
    silhouette = list()
    calinski = list()
    bouldin = list()
    fig, ax = plt.subplots(nrows = 2, ncols = 2, figsize = (18, 10))

    for k in range(2, max_k+1):
        
        #-----------------------------------run algorithm------------------------------------#
        amount_initial_centers = 2
        initial_centers = kmeans_plusplus_initializer(X, amount_initial_centers).initialize()

        xmeans_instance = xmeans.xmeans(X, initial_centers, k)
        xmeans_instance.process()

        clusters = xmeans_instance.get_clusters()
        centers = xmeans_instance.get_centers()

        
        #--------------------------retrieve cluster labels for the dataset-------------------#
        
        label_index = list(np.arange(len(clusters))) #list of labels for clusters

        diz_temp = dict()
        for label, cluster_content in zip(label_index, clusters):
            diz_temp[label] = cluster_content

        df_copy = pd.DataFrame(X, columns = df_to_copy.columns, index = df_to_copy.index)

        for label, cluster in diz_temp.items():
            for index in cluster:
                df_copy.loc[df_copy.iloc[index].name, 'Labels'] = label

        df_copy['Labels'] = df_copy['Labels'].astype(int)

        xmeans_labels = np.array(df_copy['Labels'])
    
    
        #-----------------------------------calculate metrics--------------------------------#
        
        sse = inertia(len(clusters), X, xmeans_labels)
        sse_list.append(sse)
        
        sil = silhouette_score(X, xmeans_labels)
        silhouette.append(sil)
        
        cal = calinski_harabasz_score(X, xmeans_labels)
        calinski.append(cal)
        
        boul = davies_bouldin_score(X, xmeans_labels)
        bouldin.append(boul)
        
        print("k: ", k)
        print ('SSE: ', sse)
        print ('Silhouette Score: ', sil)
        print ("Calinski - Harabaz Score: ", cal)
        print ("Davies - Bouldin Score: ", boul)
        print()
        print("---------------------------------")
        print()
        
        
    
    index = [i for i in range(2, max_k + 1)]
    
    ax[0, 0].plot(range(2, len(sse_list) + 2), sse_list)
    ax[0, 0].set_ylabel('SSE', fontsize=15)
    ax[0, 0].set_xlabel('K', fontsize=15)

    ax[0, 1].plot(range(2, len(silhouette) + 2), silhouette)
    ax[0, 1].set_ylabel('SIL', fontsize=15)
    ax[0, 1].set_xlabel('K', fontsize=15)

    ax[1, 0].plot(range(2, len(calinski) + 2), calinski)
    ax[1, 0].set_ylabel('CAL', fontsize=15)
    ax[1, 0].set_xlabel('K', fontsize=15)

    ax[1, 1].plot(range(2, len(bouldin) + 2), bouldin)
    ax[1, 1].set_ylabel('BOUL', fontsize=15)
    ax[1, 1].set_xlabel('K', fontsize=15)
    
    for axis in ax.flat:
        axis.xaxis.set_major_locator(MultipleLocator(5))
        axis.xaxis.set_minor_locator(MultipleLocator(1))
        axis.tick_params(which='major', length=9)
        axis.tick_params(which='minor', length=7, color='r')

    plt.show()
    fig.tight_layout()

In [61]:
def run_xmeans(df_to_copy, X):
    
    max_k = 10
    silhouette = dict()

    #-------------------------------------Choose best k--------------------------------------#
    
    for k in range(3, max_k+1):
        
        #-----------------------------------run algorithm------------------------------------#
        amount_initial_centers = 2
        initial_centers = kmeans_plusplus_initializer(X, amount_initial_centers).initialize()

        xmeans_instance = xmeans.xmeans(X, initial_centers, k)
        xmeans_instance.process()

        clusters = xmeans_instance.get_clusters()
        centers = xmeans_instance.get_centers()

        
        #--------------------------retrieve cluster labels for the dataset-------------------#
        
        label_index = list(np.arange(len(clusters))) #list of labels for clusters

        diz_temp = dict()
        for label, cluster_content in zip(label_index, clusters):
            diz_temp[label] = cluster_content

        df_copy = pd.DataFrame(X, columns = df_to_copy.columns, index = df_to_copy.index)

        for label, cluster in diz_temp.items():
            for index in cluster:
                df_copy.loc[df_copy.iloc[index].name, 'Labels'] = label

        df_copy['Labels'] = df_copy['Labels'].astype(int)

        xmeans_labels = np.array(df_copy['Labels'])
    
    
        #-----------------------------------calculate metrics--------------------------------#
        
        sil = silhouette_score(X, xmeans_labels)
        
        silhouette[k] = sil
        
    #-----------------------------------Run xmeans with best k------------------------------------#
    
    best_k = max(silhouette, key=silhouette.get)
    
    amount_initial_centers = 2
    initial_centers = kmeans_plusplus_initializer(X, amount_initial_centers).initialize()

    xmeans_instance = xmeans.xmeans(X, initial_centers, best_k)
    xmeans_instance.process()

    clusters = xmeans_instance.get_clusters()
    centers = xmeans_instance.get_centers()
    
    #--------------------------------Retrieve labels and content---------------------------------#
    
    label_index = list(np.arange(len(clusters))) #list of labels for clusters

    diz_temp = dict()
    for label, cluster_content in zip(label_index, clusters):
        diz_temp[label] = cluster_content

    df_copy = pd.DataFrame(X, columns = df_to_copy.columns, index = df_to_copy.index)

    for label, cluster in diz_temp.items():
        for index in cluster:
            df_copy.loc[df_copy.iloc[index].name, 'Labels'] = label

    df_copy['Labels'] = df_copy['Labels'].astype(int)

    xmeans_labels = np.array(df_copy['Labels'])
    
    #--------------------------------Retrieve silhouette of best k---------------------------------#
    
    dictionary_of_best_k = dict()
    
    sil_of_best_k = silhouette_score(X, xmeans_labels)
    #sse_of_best_k = inertia(len(clusters), X, xmeans_labels)

    
    dictionary_of_best_k = {'Combination': list(df_to_copy.columns), \
                            'kmax': best_k, \
                            'Silhouette': float('%.4f'%(sil_of_best_k))}
    
    
    return dictionary_of_best_k

In [62]:
from itertools import combinations
from tqdm import tqdm

In [63]:
df_complete = pd.read_csv('group_20_fma_numeric.csv', index_col = 0)

In [64]:
df_complete

Unnamed: 0_level_0,chroma_cens_02,chroma_cens_04,chroma_cens_06,chroma_cens_07,chroma_cens_09,chroma_cens_11,chroma_stft_05,chroma_stft_08,chroma_stft_10,chroma_stft_12,...,track_genre_top_Easy_Listening,track_genre_top_Instrumental,track_bit_rate,track_duration,track_listens,track_date_created_year,track_date_created_season,artist_favorites,album_favorites,album_type
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,0.534301,0.311606,0.336365,0.341694,0.292243,0.276052,0.341598,0.430064,0.381936,0.529627,...,0,0,256000,168,1293,1,3,9,4,0
3,0.331490,0.296091,0.368770,0.550413,0.464814,0.363497,0.391794,0.581584,0.510926,0.540496,...,0,0,256000,237,514,1,3,9,4,0
5,0.438305,0.298619,0.358552,0.411062,0.515737,0.338845,0.335509,0.353870,0.418968,0.467409,...,0,0,256000,206,1151,1,3,9,4,0
10,0.414595,0.309863,0.438154,0.616182,0.399730,0.369890,0.357122,0.420764,0.370725,0.388618,...,0,0,192000,161,50135,1,3,74,4,0
20,0.354114,0.256808,0.361826,0.415287,0.368049,0.419011,0.390003,0.575782,0.556986,0.491322,...,0,0,256000,311,361,1,3,10,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155316,0.239374,0.506758,0.610919,0.257752,0.253840,0.242582,0.512779,0.418091,0.422479,0.476460,...,0,0,320000,162,102,10,1,0,0,1
155317,0.376784,0.453340,0.290186,0.281965,0.324159,0.404858,0.579709,0.609059,0.567302,0.643192,...,0,0,320000,217,165,10,1,0,0,1
155318,0.297417,0.480687,0.499205,0.311303,0.363107,0.317513,0.582218,0.529418,0.496228,0.579227,...,0,0,320000,404,168,10,1,0,0,1
155319,0.376004,0.361030,0.479526,0.444307,0.405410,0.283908,0.628887,0.527227,0.411554,0.615549,...,0,0,320000,146,294,10,1,0,0,1


In [65]:
df_complete.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 103708 entries, 2 to 155320
Data columns (total 56 columns):
 #   Column                               Non-Null Count   Dtype  
---  ------                               --------------   -----  
 0   chroma_cens_02                       103708 non-null  float64
 1   chroma_cens_04                       103708 non-null  float64
 2   chroma_cens_06                       103708 non-null  float64
 3   chroma_cens_07                       103708 non-null  float64
 4   chroma_cens_09                       103708 non-null  float64
 5   chroma_cens_11                       103708 non-null  float64
 6   chroma_stft_05                       103708 non-null  float64
 7   chroma_stft_08                       103708 non-null  float64
 8   chroma_stft_10                       103708 non-null  float64
 9   chroma_stft_12                       103708 non-null  float64
 10  mfcc_02                              103708 non-null  float64
 11  mfcc_03      

In [66]:
df_complete.shape

(103708, 56)

In [67]:
features_to_test = ['mfcc_20', 'mfcc_03', 'mfcc_07', \
                    'rmse_01', 'track_bit_rate', \
                    'track_duration', 'track_listens', 'album_favorites', 'artist_favorites']

In [68]:
len(features_to_test)

9

In [69]:
lista_totale = []

for i in range(2, len(features_to_test)+1):
    
    list_tot = list(combinations(features_to_test, i))
    
    for element in list_tot:
        lista_totale.append(list(element))

print(len(lista_totale))

502


In [70]:
global_dict = {'Combination': "", \
                'kmax': 0, \
                'Silhouette': 0}

for combination in tqdm(lista_totale):
    
    df = df_complete[combination]    #combination to test
    
    scaler = RobustScaler()
    X = scaler.fit_transform(df)     #scaled dataset
    
    dict_combination_temp = run_xmeans(df, X)
    
    if dict_combination_temp['Silhouette'] > global_dict['Silhouette']:
        global_dict = dict_combination_temp

100%|██████████| 502/502 [208:29:18<00:00, 1495.14s/it]    


In [71]:
global_dict

{'Combination': ['mfcc_20', 'rmse_01', 'track_listens', 'artist_favorites'],
 'kmax': 3,
 'Silhouette': 0.9126}

In [51]:
global_dict

{'Combination': ['mfcc_03', 'track_listens', 'artist_favorites'],
 'kmax': 3,
 'Silhouette': 0.9188}

In [41]:
global_dict

{'Combination': ['mfcc_20', 'track_listens'], 'kmax': 3, 'Silhouette': 0.8848}