# Report plots

*This notebook contains the code responsible for creating plots which are presented on the 'Report' page of the application.*

## I. Imports & functions

In [15]:
# required imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objs as go 
from plotly.offline import plot
from tslearn.clustering import TimeSeriesKMeans
from dtaidistance import dtw_ndim
from sklearn.cluster import AgglomerativeClustering, DBSCAN
from sklearn.metrics import silhouette_score

# using R inside python
import rpy2.robjects.packages as rpackages
from rpy2.robjects.packages import importr
import rpy2.robjects.numpy2ri
import rpy2.robjects.pandas2ri

# Session-wide setup for the R bridge and for display options.
# The activate() calls switch on rpy2's numpy / pandas conversion globally; later cells
# rely on this when they hand numpy arrays straight to R functions.
rpy2.robjects.numpy2ri.activate()
rpy2.robjects.pandas2ri.activate()

# install R packages
# 'utils' is the R package that exposes the CRAN mirror choice and install_packages.
utils = rpackages.importr('utils')
# ind=1 selects a mirror by its position in R's mirror list - presumably the first entry;
# TODO confirm which mirror this resolves to in the target environment.
utils.chooseCRANmirror(ind=1)

# run if not installed previously from requirements.txt
# utils.install_packages('clValid')
# utils.install_packages('symbolicDA')

# load R packages
# Both handles are used later for cluster validity indices
# (clValid.dunn and symbolicDA.index_G1d); importr fails if the R package is missing.
clValid = importr('clValid')
symbolicDA = importr('symbolicDA')

import warnings
# NOTE(review): this silences every Python warning for the rest of the session,
# including deprecation warnings from the clustering libraries.
warnings.filterwarnings(action='ignore')

# Render matplotlib figures inline and set the default figure size (in inches).
%matplotlib inline
plt.rcParams['figure.figsize'] = (9, 6)

# Show every column and the full cell content when a DataFrame is displayed.
pd.options.display.max_columns = None
pd.options.display.max_colwidth = None

In [16]:
# functions
def kmeans_clustering(data: pd.DataFrame, n_clusters: int, random_state: int = None) -> TimeSeriesKMeans:
    """
    Perform KMeans clustering of multivariate country time series under the DTW metric.

    Args:
        data (pd.DataFrame): preprocessed dataframe with economic indexes; the code expects
            the columns 'countrycode', 'country' and 'year' plus one column per index,
            and the same number of yearly observations for every country
        n_clusters (int): number of clusters to be formed
        random_state (int, optional): seed forwarded to TimeSeriesKMeans so that a run can be
            reproduced; the default (None) keeps the previous, unseeded behaviour

    Returns:
        TimeSeriesKMeans: fitted clustering model

    Raises:
        ValueError: if the countries do not all have the same number of observations,
            so that no rectangular array can be built
    """
    id_cols = ['countrycode', 'country']
    # Sort chronologically first: agg(list) keeps the row order inside each group, so this
    # guarantees that every per-country list runs along the time axis.
    series_per_country = (
        data.sort_values(id_cols + ['year'])
        .drop('year', axis=1)
        .groupby(id_cols)
        .agg(list)
    )
    # Each cell holds the yearly values of one index for one country, therefore the nested
    # lists form an array of shape (n_countries, n_vars, time_range).
    vars_by_time = np.array(series_per_country.values.tolist(), dtype=float)
    # tslearn expects time series datasets shaped (n_ts, sz, d) = (countries, time, indexes).
    # The previous layout had the last two axes swapped, which made DTW warp along the
    # index axis instead of along time.
    data_agg_arr = np.transpose(vars_by_time, (0, 2, 1))
    # creating and fitting a model
    model = TimeSeriesKMeans(n_clusters=n_clusters, metric='dtw', random_state=random_state)
    model.fit(data_agg_arr)
    return model

def agglomerative_clustering(data: pd.DataFrame, n_clusters: int, linkage: str) -> AgglomerativeClustering:
    """
    Perform hierarchical clustering on a precomputed multivariate DTW distance matrix.

    Args:
        data (pd.DataFrame): preprocessed dataframe with economic indexes; the code expects
            the columns 'countrycode', 'country' and 'year' plus one column per index
        n_clusters (int): number of clusters to be formed
        linkage (str): type of linkage criterion; 'average', 'complete' or 'single'

    Returns:
        AgglomerativeClustering: fitted clustering model; labels_ follows the sorted
            ('countrycode', 'country') order produced by the groupby below
    """
    id_cols = ['countrycode', 'country']
    # long format -> one row per (country, index) with one column per year
    long_form = data.melt(id_vars=id_cols + ['year'])
    per_year = (
        long_form.groupby(id_cols + ['year', 'variable'])['value']
        .aggregate('mean')
        .unstack('year')
    )
    # collapse the indexes of each country: every cell becomes the list of index values
    # observed in that year
    data_t = per_year.reset_index().drop('variable', axis=1).groupby(id_cols).agg(list)
    n_vars = data.shape[1] - 3 # number of economic indexes
    # nested lists -> float array of shape (n_countries, time_range, n_vars)
    data_t_arr = np.array(data_t.values.tolist(), dtype=float)
    # calculating distances between points (countries)
    dtw_matrix = dtw_ndim.distance_matrix_fast(data_t_arr, n_vars)
    # creating and fitting the model
    options = dict(n_clusters=n_clusters, linkage=linkage, compute_distances=True)
    try:
        # scikit-learn >= 1.2 names this parameter 'metric'
        model = AgglomerativeClustering(metric='precomputed', **options)
    except TypeError:
        # older scikit-learn releases only know the former name 'affinity'
        model = AgglomerativeClustering(affinity='precomputed', **options)
    model.fit(dtw_matrix)
    return model

## II. Data

In [17]:
# Three versions of the same dataset, differing only in how far preprocessing went.
# Box-Cox transformed data
data_box = pd.read_csv(filepath_or_buffer='./../data/data_box.csv')
# data after imputation only
data_orig = pd.read_csv(filepath_or_buffer='./../data/data_imputed.csv')
# data after the standard preprocessing (normalization, imputation, smoothing)
data = pd.read_csv(filepath_or_buffer='./../data/data.csv')

In [18]:
# Pairwise DTW distances between countries, reused below when searching for optimal parameters.
# Step 1: long format -> one row per (country, index) with one column per year
data_t = (
    data.melt(id_vars=['countrycode','country','year'])
    .groupby(['countrycode','country','year','variable'])['value']
    .aggregate('mean')
    .unstack('year')
)
# Step 2: one row per country; each year column now holds the list of index values
data_t = (
    data_t.reset_index()
    .drop('variable', axis=1)
    .groupby(['countrycode', 'country'])
    .agg(list)
)
# number of points (countries) and length of the time range
n_countries, time_range = data_t.shape
# number of economic indexes (all columns except countrycode, country, year)
n_vars = data.shape[1] - 3
# Step 3: copy the nested lists into a 3D array indexed as (country, year, index)
data_t_arr = np.empty(shape=(n_countries, time_range, n_vars))
for i in range(n_countries):
    for j, yearly_values in enumerate(data_t.iloc[i]):
        data_t_arr[i, j] = yearly_values
# Step 4: distances between points (countries)
dtw_matrix = dtw_ndim.distance_matrix_fast(data_t_arr, n_vars)

## III. Metrics (KMeans & Agglomerative)

In [5]:
# Clustering algorithms comparison: K-Means and agglomerative clustering (three linkages)
# are fitted for every k in 2..k_max and scored with three internal validity indices.
# The cell is slow - results are already saved to csv and read in the next chunk.
k_max = 8
k_values = list(range(2, k_max + 1))

# (display name, callable fitting a model for a given k); `link=link` freezes the loop
# variable so each lambda keeps its own linkage.
candidates = [('K-Means', lambda k: kmeans_clustering(data, k))]
candidates += [
    (f'Agglomerative {link}-linkage', lambda k, link=link: agglomerative_clustering(data, k, linkage=link))
    for link in ['average', 'complete', 'single']
]

# arrays for metrics values
algorithm = []
n_clusters = []
silhouette = []
chscore = []
dunnindex = []
for name, fit_model in candidates:
    for k in k_values:
        labels = fit_model(k).labels_
        algorithm.append(name)
        n_clusters.append(k)
        # dtw_matrix holds pairwise distances, so it has to be declared as precomputed;
        # otherwise its rows would be treated as feature vectors.
        # NOTE: this changes the silhouette values compared with the previously saved csv.
        silhouette.append(silhouette_score(dtw_matrix, labels, metric='precomputed'))
        # the R functions receive the cluster ids shifted by one (1, 2, ...)
        chscore.append(symbolicDA.index_G1d(dtw_matrix, labels + 1)[0])
        dunnindex.append(clValid.dunn(dtw_matrix, labels + 1)[0])

# Column order: identifiers first, then one column per metric. The metric column names
# double as the labels shown by the plot built from this table.
metrics = pd.DataFrame({
    'algorithm': algorithm,
    'n_clusters': n_clusters,
    'Silhouette': silhouette,
    'Calinski-Harabasz Index': chscore,
    'Dunn Index': dunnindex,
})

In [10]:
# saving results
# metrics.to_csv('./../data/metrics_results.csv', index=False)

In [22]:
# load the previously computed metric values instead of recomputing them
metrics = pd.read_csv(filepath_or_buffer='./../data/metrics_results.csv')

In [23]:
# Line chart of every validity metric against the number of clusters, one line per
# algorithm; a dropdown switches between the metrics.
fig = go.Figure()
# every column after 'algorithm' and 'n_clusters' is treated as a metric
metric_names = list(metrics.columns[2:])
# trace_metric[t] is the position of the metric that trace number t belongs to
trace_metric = []
for i, m in enumerate(metric_names):
    df_test = metrics[['algorithm','n_clusters', m]]

    # transposing data: one row per algorithm, one column per number of clusters
    df_test_final = (
        df_test.pivot_table(index='algorithm', columns=['n_clusters'], values=m)
        .rename_axis("", axis="columns")
    )

    # adding traces; only the first metric is visible on load
    for alg in df_test_final.index:
        fig.add_trace(go.Scatter(x=df_test_final.columns, y=df_test_final.loc[alg],
                name=alg, visible=(i == 0)))
        trace_metric.append(i)

# One button per metric. The visibility mask has exactly one entry per trace and is
# derived from the recorded ownership, so it stays correct for any number of metrics
# and algorithms.
buttons = [
    dict(label = m,
         method = 'update',
         args = [{'visible': [owner == i for owner in trace_metric]},
                 {'title': m}])
    for i, m in enumerate(metric_names)
]
fig.update_layout(
    updatemenus=[dict(type='dropdown', buttons=buttons, xanchor='right', x=1, y=1.15, active=0)],
    title='Metrics', xaxis_title="Number of clusters",
    yaxis_title="Metric value",
    legend_title="Algorithm", legend_font_size=16,
    legend_title_font_size=18)
fig.update_xaxes(tickfont_size= 16, title_font_size=18)
fig.update_yaxes(tickfont_size= 16, title_font_size=18)
# saving plot to HTML file
plot(fig, filename='metrics.html')

'metrics.html'

## IV. DBSCAN

In [12]:
# Choropleth of the groupings created by DBSCAN for a grid of (eps, min_samples) values:
# one map layer per parameter pair, selected with a slider.
# The rows of dtw_matrix follow the sorted (countrycode, country) order of the groupby that
# built it, so the lookup table is sorted the same way to keep labels and countries aligned.
countries = (
    data[['countrycode', 'country']]
    .drop_duplicates()
    .sort_values(['countrycode', 'country'])
    .reset_index(drop=True)
)
eps_grid = [3, 3.1, 3.2, 3.3, 3.4, 3.5]
min_samples_grid = [3, 4, 5, 6, 7]
param_grid = [(eps, min_samples) for eps in eps_grid for min_samples in min_samples_grid]
plot_data = []
steps = []
for i, (eps, min_samples) in enumerate(param_grid):
    model = DBSCAN(eps=eps, min_samples=min_samples, metric='precomputed')
    model.fit(dtw_matrix)
    # the label -1 is shown as 'outlier' in the hover text
    cluster_names = np.where(model.labels_ == -1, 'outlier', model.labels_.astype(str))
    # The template is kept on one line: a backslash continuation inside the string would
    # embed the source indentation in the hover label.
    hover_text = [
        f"<b>{country}</b><br>ISO code: {code}<br>Cluster: {cluster} "
        for country, code, cluster in zip(countries['country'], countries['countrycode'], cluster_names)
    ]
    # adding layers with different groupings
    plot_data.append(dict(type='choropleth',
                          locations=countries['countrycode'].astype(str),
                          z=model.labels_,  # numeric colour values
                          colorscale=[[0, '#718355'], [0.33, '#ffe8d6'], [0.6, '#ddbea9'], [1, '#cb997e']],
                          showscale=False,
                          text=hover_text,
                          hoverinfo="text",
                          # the slider does not apply its active step on load, so the
                          # starting state is set on the traces themselves
                          visible=(i == 0)))
    # slider step showing only this layer
    layer_visibility = [False] * len(param_grid)
    layer_visibility[i] = True
    steps.append(dict(method='restyle',
                      args=['visible', layer_visibility],
                      label='[{}, {}]'.format(eps, min_samples)))
# adding slider with parameters values
sliders = [dict(active=0,
                steps=steps,
                currentvalue={'prefix': 'Eps, min_samples - '},
                len=0.9,
                xanchor='center',
                pad={"l":20,"r":20, "t":1},
                ticklen=8,
                x=0.5)]
# customizing figure layout
layout = dict(geo=dict(projection={'type': 'conic conformal'}, lataxis={'range': [35, 75]},
                        lonaxis={'range': [-15, 45]}), sliders=sliders, title='DBSCAN')
fig = go.Figure(dict(data=plot_data, layout=layout))
fig.update_traces(showlegend=False, selector=dict(type='choropleth'))
fig.update_layout(showlegend=False, margin=dict(l=10, r=10, t=50, b=20), paper_bgcolor='rgba(0,0,0,0)',
                    hoverlabel=dict(bgcolor="white", font_size=14), title_x=0.5, title_xref='paper')
# saving plot to HTML file
plot(fig, filename='dbscan.html')

'dbscan.html'

## V. Series

In [9]:
# Line chart presenting all economic indicators for all countries: one line per country,
# with a dropdown that switches between the indicators.
fig = go.Figure()
# country names for the hover labels; built once because it does not depend on the indicator
countries = data[['countrycode', 'country']].drop_duplicates().reset_index(drop=True).set_index('countrycode')
# every column after the first three (countrycode, country, year in the frames used here)
# is treated as an indicator
indicators = list(data_orig.columns[3:])
# trace_indicator[t] is the position of the indicator that trace number t belongs to
trace_indicator = []
for i, ind in enumerate(indicators):
    df_test = data_orig[['countrycode', 'year', ind]]
    # transposing data: one row per country, one column per year
    df_test_final = (
        df_test.pivot_table(index='countrycode', columns=['year'], values=ind)
        .rename_axis("", axis="columns")
    )
    n_years = len(df_test_final.columns)
    # adding traces; only the first indicator is visible on load
    for countrycode in df_test_final.index:
        fig.add_trace(go.Scatter(x=df_test_final.columns, y=df_test_final.loc[countrycode],
                                    name=countrycode, visible=(i == 0),
                                    # one hover label per plotted point
                                    text=[countries.loc[countrycode, 'country']] * n_years,
                                    hovertemplate=
                                    "Country: %{text}<br>" +
                                    "Year: %{x}<br>" +
                                    "Value: %{y}" +
                                    "<extra></extra>",
                                    ))
        trace_indicator.append(i)
# One button per indicator. The visibility mask has exactly one entry per trace and is
# derived from the recorded ownership rather than from assumed counts.
buttons = [
    dict(label=ind, method='update',
         args=[{'visible': [owner == i for owner in trace_indicator]}, {'title': ind}])
    for i, ind in enumerate(indicators)
]
# customizing map layout
updatemenus = list([dict(active=0, buttons=buttons, xanchor='right', x=1, y=1.15)])
fig.update_layout(updatemenus=updatemenus, title='Series',
                    title_x=0, title_xref='paper', margin=dict(l=20, r=20, t=20, b=20))
# saving plot to HTML file
plot(fig, filename='series.html')

'plots/series.html'

## VI. Segments

In [22]:
# Analyzing changes in grouping through the years based on 10-year long segments, taken
# each 3 years: one choropleth layer per segment, selected with a slider.
year_grid = [x for x in range(1995, 2020, 3)]
segment_length = 10  # years covered by each segment, ending at the grid year (inclusive)
# The labels returned by agglomerative_clustering follow the sorted (countrycode, country)
# order of its groupby, so the lookup table is sorted the same way to stay aligned.
countries = (
    data_box[['countrycode', 'country']]
    .drop_duplicates()
    .sort_values(['countrycode', 'country'])
    .reset_index(drop=True)
)
# Order in which countries are visited when cluster ids are renumbered: rows 11 and 30 are
# swapped to the front so that their clusters always receive the first ids, which preserves
# the same colours for a given cluster across segments. Equivalent to the previously
# hard-coded 39-element list; requires more than 30 countries.
anchor_rows = (11, 30)
order = list(range(len(countries)))
for position, anchor in enumerate(anchor_rows):
    order[position], order[anchor] = order[anchor], order[position]

plot_data = []
steps = []
for i, y in enumerate(year_grid):
    # single combined mask instead of chained .loc calls with a misaligned boolean Series
    in_segment = (data_box.year > y - segment_length) & (data_box.year <= y)
    data_trimmed = data_box.loc[in_segment, :]
    model = agglomerative_clustering(data_trimmed, 4, 'complete')
    # renumber clusters by first appearance along `order`, without mutating model.labels_
    relabel = {}
    for j in order:
        relabel.setdefault(model.labels_[j], len(relabel))
    labels = np.array([relabel[original] for original in model.labels_])
    segment = countries.assign(cluster=labels.astype(str))
    hover_text = [
        f"<b>{country}</b><br>ISO code: {code}<br>Cluster: {cluster} "
        for country, code, cluster in zip(segment['country'], segment['countrycode'], segment['cluster'])
    ]
    # adding layers with groupings
    plot_data.append(dict(type='choropleth',
                            locations=segment['countrycode'].astype(str),
                            # per-country values instead of three literal column names
                            customdata=segment[['country', 'countrycode', 'cluster']].values.tolist(),
                            text=hover_text,
                            hoverinfo="text",
                            z=labels, showscale=False,
                            colorscale=[[0, '#f1faee'], [0.33, '#a8dadc'], [0.66, '#457b9d'], [1, '#1d3557']],
                            # the slider does not apply its active step on load, so the
                            # starting state is set on the traces themselves
                            visible=(i == 0)))
    # setting visibility: the step shows only this segment's layer
    layer_visibility = [False] * len(year_grid)
    layer_visibility[i] = True
    steps.append(dict(method='restyle',
                      args=['visible', layer_visibility],
                      label='{}'.format(y)))
# adding slider with time range
sliders = [dict(active=0,
                pad={"t": 1},
                steps=steps)]
# customizing map layout
layout = dict(geo=dict(projection={'type': 'conic conformal'}, lataxis={'range': [35, 75]},
                        lonaxis={'range': [-15, 45]}),
                sliders=sliders, showlegend=False)

fig = go.Figure(dict(data=plot_data,layout=layout))
fig.update_layout(title='Business cycles synchronization', showlegend=False, margin=dict(l=10, r=10, t=50, b=20), paper_bgcolor='rgba(0,0,0,0)',
                    hoverlabel=dict(bgcolor="white", font_size=14), title_x=0.5, title_xref='paper')
# saving plot to HTML file
plot(fig, filename='segments.html')

'segments.html'