# Evaluation

*This notebook contains evaluation and comparison of algorithms used in the project, alongside with different distance calculation methods and transformation used. Further analysis consisting of multiple interactive and non-interactive plots can be found in 'Report_plots' notebook and 'Thesis_plots' notebook in 'plots' directory.*

## I. Imports & functions

In [26]:
# required imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.offline import plot
from tslearn.clustering import TimeSeriesKMeans
from dtaidistance import dtw_ndim
from sklearn.cluster import AgglomerativeClustering, DBSCAN
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import euclidean_distances
import skfda

# using R inside python
import rpy2.robjects.packages as rpackages
from rpy2.robjects.packages import importr
import rpy2.robjects.numpy2ri
import rpy2.robjects.pandas2ri

rpy2.robjects.numpy2ri.activate()
rpy2.robjects.pandas2ri.activate()

# install R packages
utils = rpackages.importr('utils')
utils.chooseCRANmirror(ind=1)

# run if not installed previously from requirements.txt
# utils.install_packages('clValid')
# utils.install_packages('symbolicDA')

# load R packages
clValid = importr('clValid')
symbolicDA = importr('symbolicDA')
stats = importr('stats')

import warnings
warnings.filterwarnings(action='ignore')

%matplotlib inline
plt.rcParams['figure.figsize'] = (9, 6)

pd.options.display.max_columns = None
pd.options.display.max_colwidth = None


h5py not installed, hdf5 features will not be supported.
Install h5py to use hdf5 features: http://docs.h5py.org/



In [27]:
# functions
def kmeans_clustering(data: pd.DataFrame, n_clusters: int, metric: str) -> TimeSeriesKMeans:
    """
    Perform KMeans clustering.

    Args:
        data (pd.DataFrame): preprocessed dataframe with economic indexes
        n_clusters (int): number of clusters to be formed

    Returns:
        TimeSeriesKMeans: fitted clustering model
    """
    # transform input data into adequate structure - 3D numpy array
    data_agg = data.drop('year', axis=1).groupby(['countrycode', 'country']).agg(list)
    n_countries = data_agg.shape[0] # number of points (countries)
    time_range =  len(data['year'].drop_duplicates()) # time range
    n_vars = data.shape[1] - 3 # number of economic indexes
    # filling the array
    data_agg_arr = np.empty(shape=(n_countries, n_vars, time_range))
    for i in range(data_agg.shape[0]):
        for j in range(data_agg.shape[1]):
            data_agg_arr[i][j] = np.array(data_agg.iloc[i,j])
    # creating and fitting a model
    model = TimeSeriesKMeans(n_clusters=n_clusters, metric=metric)
    model.fit(data_agg_arr)
    return model

def agglomerative_clustering(matrix: np.matrix, n_clusters: int, linkage: str) -> AgglomerativeClustering:
    """
    Perform hierarchical clustering.

    Args:
        data (pd.DataFrame): preprocessed dataframe with economic indexes
        n_clusters (int): number of clusters to be formed
        linkage (str): type of linkage criterion; 'average', 'complete' or 'single'

    Returns:
        AgglomerativeClustering: fitted clustering model
    """
    # creating and fitting the model
    model = AgglomerativeClustering(
        n_clusters=n_clusters, affinity='precomputed', linkage=linkage, compute_distances=True)
    model.fit(matrix)
    return model

def dbscan_clustering(matrix: np.matrix, eps: float, min_samples: int) -> DBSCAN:
    """
    Perform DBSCAN clustering.

    Args:
        data (pd.DataFrame): preprocessed dataframe with economic indexes
        eps (float): maximum distance between two points for them to be considered as neighbouring
        min_samples (int): number of samples in a neighborhood for a point to be considered as a core point

    Returns:
        DBSCAN: fitted clustering model
    """
    # creating and fitting the model
    model = DBSCAN(eps=eps, min_samples=min_samples, metric='precomputed')
    model.fit(matrix)
    return model

In [28]:
def calculate_dtw(data: pd.DataFrame) -> np.matrix:
    """
    Calculate distances between countries according to Dynamic Time Warping method.
 
    Args:
        data (pd.DataFrame): preprocessed dataframe with economic indexes
        
    Returns:
        np.matrix: square matrix containing distances between countries
    """
    # creating distance matrix for searching for optimal parameters
    # transform input data into adequate structure - 3D numpy array
    data_t = data.melt(id_vars=['countrycode','country','year'])
    data_t = data_t.groupby(['countrycode','country','year','variable'])['value'].aggregate('mean').unstack('year')
    data_t = data_t.reset_index().drop('variable', axis=1).groupby(['countrycode', 'country']).agg(list)
    n_countries = data_t.shape[0] # number of points (countries)
    time_range =  data_t.shape[1] # time range
    n_vars = data.shape[1] - 3 # number of economic indexes
    # filling the array
    data_t_arr = np.empty(shape=(n_countries, time_range, n_vars))
    for i in range(n_countries):
        for j in range(time_range):
            data_t_arr[i][j] = np.array(data_t.iloc[i,j])
    # calculating distances between points (countries)
    dtw_matrix = dtw_ndim.distance_matrix_fast(data_t_arr, n_vars)
    return dtw_matrix

def calculate_euc(data: pd.DataFrame) -> np.matrix:
    """
    Calculate distances between countries according to Euclidean distance measure.

    Args:
        data (pd.DataFrame): preprocessed dataframe with economic indexes
        
    Returns:
        np.matrix: square matrix containing distances between countries
    """
    # creating distance matrix for searching for optimal parameters
    # transform input data into adequate structure - 3D numpy array
    data_t = data.melt(id_vars=['countrycode','country','year'])
    data_t = data_t.groupby(['countrycode','country','year', 'variable'])['value'].aggregate('mean').unstack('variable')
    data_t = data_t.reset_index().drop('year', axis=1).groupby(['countrycode', 'country']).agg(list)
    n_countries = data_t.shape[0] # number of points (countries)
    n_vars =  data.shape[1] - 3 # number of economic indexes
    time_range = len(data_t.iloc[0,0]) # time range
    # filling the array
    data_t_arr = np.empty(shape=(n_countries, n_vars, time_range))
    for i in range(n_countries):
        for j in range(n_vars):
            data_t_arr[i][j] = np.array(data_t.iloc[i,j])
    data_t_arr_flat = np.empty(shape=(n_countries, n_vars*time_range))
    for i in range(data_t_arr.shape[0]):
        data_t_arr_flat[i] = np.concatenate(data_t_arr[i])
    # calculating distances between points (countries)
    euc_matrix = euclidean_distances(data_t_arr_flat, data_t_arr_flat)
    return euc_matrix

## II. Data

In [29]:
# reading data (after different transformations)
data = pd.read_csv('data/data.csv')
data_box = pd.read_csv('data/data_box.csv')
data_log = pd.read_csv('data/data_log.csv')
data_out = pd.read_csv('data/data_out.csv')

In [30]:
# extracting list of pairs (country name + country code) for plots
countries = data[['countrycode','country']].drop_duplicates().reset_index(drop=True)

In [31]:
# calculating distance matrix for data after different transformations
euc_matrix = calculate_euc(data)
dtw_matrix = calculate_dtw(data)
euc_matrix_box = calculate_euc(data_box)
dtw_matrix_box = calculate_dtw(data_box)
euc_matrix_log = calculate_euc(data_log)
dtw_matrix_log = calculate_dtw(data_log)
euc_matrix_out = calculate_euc(data_out)
dtw_matrix_out = calculate_dtw(data_out)

In [335]:
# evaluation of KMeans and Agglomerative clustering methods
# comparison of different distance calculation methods and preprocessing pipelines
matrices = [euc_matrix, euc_matrix_box, euc_matrix_log, euc_matrix_out, dtw_matrix, dtw_matrix_box, dtw_matrix_log, dtw_matrix_out]
dataframes = [data, data_box, data_log, data_out]
metrics = ['euclidean', 'dtw']

k_max = 8
silhouette = []
chscore = []
dunnindex = []
# KMeans
for m in metrics:
    for d in dataframes:
        for k in range(2, k_max+1):
            kmeans = kmeans_clustering(d, k, m)
            silhouette.append(silhouette_score(dtw_matrix, kmeans.labels_))
            chscore.append(symbolicDA.index_G1d(dtw_matrix, kmeans.labels_+1)[0])
            dunnindex.append(clValid.dunn(dtw_matrix, kmeans.labels_+1)[0])
# Agglomerative (different linkages)
for m in matrices:
    for link in ['average', 'complete', 'single']: 
        for k in range(2, k_max+1):
            agg = agglomerative_clustering(m, k, linkage=link)
            silhouette.append(silhouette_score(m, agg.labels_))
            chscore.append(symbolicDA.index_G1d(m, agg.labels_+1)[0])
            dunnindex.append(clValid.dunn(m, agg.labels_+1)[0])
# dataframe with all the results (Hierarchical = Agglomerative, code written before nomenclature was changed)           
metrics = pd.DataFrame({'silhouette' : silhouette, 'chscore' : chscore, 'dunnindex' : dunnindex})
metrics['data'] = pd.Series(['Euc']*7 + ['Euc_box']*7 + ['Euc_log']*7 + ['Euc_out']*7 + ['Dtw']*7 + ['Dtw_box']*7 + ['Dtw_log']*7 + ['Dtw_out']*7 + ['Euc']*21 + ['Euc_box']*21 + ['Euc_log']*21 + ['Euc_out']*21 + ['Dtw']*21 + ['Dtw_box']*21 + ['Dtw_log']*21 + ['Dtw_out']*21)
metrics['algorithm'] = pd.concat([pd.Series(['KMeans']*56), pd.Series((['Hierarchical average']*7 + ['Hierarchical complete']*7 + ['Hierarchical single']*7)*8).reset_index(drop=True)], axis=0).reset_index(drop=True)
metrics['n_clusters'] = pd.Series([x for x in range(2,9)]*4*8)
metrics = metrics[['data', 'algorithm', 'n_clusters', 'silhouette', 'chscore', 'dunnindex']]

In [336]:
# creating a column with combination of distance method name, transformation name and algorithm name
metrics['name'] = metrics['data'] + " " + metrics['algorithm']
metrics = metrics.drop('data', axis=1)
metrics['algorithm'] = metrics['name']
metrics = metrics.drop('name', axis=1)

In [339]:
# selecting best results for each combination of distance method, transformation and algorithm
idx = metrics.groupby(['algorithm'])['silhouette'].transform(max) == metrics['silhouette']
metrics[idx].sort_values('algorithm')

Unnamed: 0,algorithm,n_clusters,silhouette,chscore,dunnindex
141,Dtw Hierarchical average,3,0.313889,6.571529,0.414722
149,Dtw Hierarchical complete,4,0.310625,62.274435,0.379686
154,Dtw Hierarchical single,2,0.37956,4.298158,0.511341
32,Dtw KMeans,6,0.256338,55.625135,0.421798
161,Dtw_box Hierarchical average,2,0.523408,23.191869,0.196471
171,Dtw_box Hierarchical complete,5,0.409175,98.027287,0.243128
176,Dtw_box Hierarchical single,3,0.245828,14.8256,0.196471
35,Dtw_box KMeans,2,0.090977,48.268526,0.176115
182,Dtw_log Hierarchical average,2,0.322313,3.837429,0.455987
191,Dtw_log Hierarchical complete,4,0.313243,75.468281,0.386062


In [302]:
# plotting the results 
# initializing figure
fig = go.Figure()
buttons = list()
for i in range(metrics.shape[1]-2):
    m = metrics.columns[i+2,]
    df_test = metrics[['algorithm','n_clusters', m]]

    # transposing data
    df_test_transposed = df_test.pivot_table(index='algorithm', columns=['n_clusters'], values=m).reset_index()
    df_test_final = df_test_transposed.rename_axis('').rename_axis("", axis="columns").set_index('algorithm')

    # adding traces
    for alg in df_test_final.index:
        if i==0: # setting first layer to be visible on the load
            fig.add_trace(go.Scatter(x=df_test_final.columns, y=df_test_final.loc[alg],
                    name=alg, visible=True))            
        else:
            fig.add_trace(go.Scatter(x=df_test_final.columns, y=df_test_final.loc[alg],
                    name=alg, visible=False))
    n_of_countries = df_test_final.shape[0]
    # setting visibility
    visible = [False]*n_of_countries*i + [True]*n_of_countries + [False]*n_of_countries*(n_of_countries-i-1)
    buttons.append(dict(label = m,
                method = 'update',
                args = [{'visible': visible},
                        {'title': m}]))    
fig.update_layout(dict(updatemenus=[dict(
    type='dropdown', buttons=buttons, xanchor='right', x=1, y=1.15, active=0)],
    title='Metrics'))
# saving plot to HTML file
plot(fig, filename='plots/metrics_2.html')

'metrics.html'

In [32]:
# evaluation of DBSCAN clustering method
# comparison of different distance calculation methods and preprocessing pipelines
matrices = [euc_matrix, euc_matrix_box, euc_matrix_log, euc_matrix_out, dtw_matrix, dtw_matrix_box, dtw_matrix_log, dtw_matrix_out]
silhouette = []
chscore = []
dunnindex = []
n_clusters = []
params = []
min_grid = [x for x in range(2, 11, 1)] # min_samples parameter
eps_grid = np.arange(0.1, 10.1, 0.1) # eps parameter
for matrix in matrices:
    for m in min_grid:
        for e in eps_grid:
            dbscan = dbscan_clustering(eps = e, min_samples = m, matrix=matrix)
            if len(set(dbscan.labels_)) < 3: # obtaining one cluster and outliers is not considered as a valid grouping
                silhouette.append(-2) # adding value from outside metrics range to ignore those results during aggregation
                chscore.append(-2)
                dunnindex.append(-2)
            else:
                silhouette.append(silhouette_score(matrix, dbscan.labels_))
                chscore.append(symbolicDA.index_G1d(matrix, dbscan.labels_+2)[0]) 
                dunnindex.append(clValid.dunn(matrix, dbscan.labels_+2)[0])
            n_clusters.append(len(set(dbscan.labels_))-1)
            params.append('[' + str(m) + ', ' + str(e) + ']')
metrics2 = pd.DataFrame({'params' : params, 'n_clusters' : n_clusters, 'silhouette' : silhouette, 'chscore' : chscore, 'dunnindex' : dunnindex})
metrics2['data'] = pd.Series(['Euc']*900 + ['Euc_box']*900 + ['Euc_log']*900 + ['Euc_out']*900 + ['Dtw']*900 + ['Dtw_box']*900 + ['Dtw_log']*900 + ['Dtw_out']*900)
metrics2 = metrics2[['data', 'params', 'n_clusters', 'silhouette', 'chscore', 'dunnindex']]

In [349]:
# selecting best results for each combination of distance method and transformation
idx2 = metrics2.groupby(['data'])['silhouette'].transform(max) == metrics2['silhouette']
metrics2[idx2].drop(['params'], axis=1).drop_duplicates()

Unnamed: 0,data,n_clusters,silhouette,chscore,dunnindex
31,Euc,2,0.188728,37.439123,0.289175
994,Euc_box,2,0.332514,23.861521,0.152342
2025,Euc_log,2,0.214227,37.68593,0.288846
2733,Euc_out,2,0.179228,36.650473,0.294404
3631,Dtw,2,0.188416,37.454872,0.289175
4573,Dtw_box,2,0.377507,31.992544,0.112566
5625,Dtw_log,2,0.213558,37.686497,0.288846
6333,Dtw_out,2,0.179019,36.66464,0.294404
