# Thesis plots

*This notebook contains the code that generates the plots included in the thesis.*

## I. Imports & functions

In [1]:
# required imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objs as go 
from plotly.offline import plot
import plotly.express as px
from dtaidistance import dtw_ndim
from sklearn.cluster import AgglomerativeClustering

# using R inside python
import rpy2.robjects.packages as rpackages
from rpy2.robjects.packages import importr
import rpy2.robjects.numpy2ri
import rpy2.robjects.pandas2ri

# activate rpy2's numpy and pandas converters
rpy2.robjects.numpy2ri.activate()
rpy2.robjects.pandas2ri.activate()

# R 'utils' package: gives access to install_packages; select the CRAN mirror with index 1
utils = rpackages.importr('utils')
utils.chooseCRANmirror(ind=1)

# one-off installation of the R dependencies - uncomment and run only if they were not
# installed previously from requirements.txt
# utils.install_packages('clValid')
# utils.install_packages('symbolicDA')

# load R packages
# NOTE(review): clValid and symbolicDA are not referenced in the cells visible here - confirm they are still needed
clValid = importr('clValid')
symbolicDA = importr('symbolicDA')

# silence all warnings raised in this notebook
# NOTE(review): this also hides deprecation warnings from the libraries used below
import warnings
warnings.filterwarnings(action='ignore')

# render matplotlib figures inside the notebook and set the default figure size
%matplotlib inline
plt.rcParams['figure.figsize'] = (9, 6)

# display dataframes without truncating the number of columns or the cell width
pd.options.display.max_columns = None
pd.options.display.max_colwidth = None

In [2]:
# functions
def agglomerative_clustering(data: pd.DataFrame, n_clusters: int, linkage: str) -> AgglomerativeClustering:
    """
    Perform hierarchical clustering of countries based on multivariate DTW distances.

    Every country is turned into a multivariate time series (one observation per year,
    one dimension per economic index). Pairwise distances between countries are computed
    with ``dtw_ndim.distance_matrix_fast`` and passed to the model as a precomputed matrix.

    Args:
        data (pd.DataFrame): preprocessed dataframe with economic indexes; must contain the
            columns 'countrycode', 'country' and 'year' - every other column is treated as
            an economic index
        n_clusters (int): number of clusters to be formed
        linkage (str): type of linkage criterion; 'average', 'complete' or 'single'

    Returns:
        AgglomerativeClustering: fitted clustering model; ``labels_`` follow the sorted
        ('countrycode', 'country') order, because the groupby below sorts its keys
    """
    # transform input data into adequate structure - 3D numpy array
    # long format: one row per (country, year, economic index)
    data_t = data.melt(id_vars=['countrycode','country','year'])
    # wide format: one row per (country, economic index), one column per year
    data_t = data_t.groupby(['countrycode','country','year','variable'])['value'].aggregate('mean').unstack('year')
    # one row per country; every cell holds the list of index values for the given year
    data_t = data_t.reset_index().drop('variable', axis=1).groupby(['countrycode', 'country']).agg(list)
    n_countries = data_t.shape[0] # number of points (countries)
    time_range =  data_t.shape[1] # time range
    n_vars = data.shape[1] - 3 # number of economic indexes (all columns except the 3 id columns)
    # filling the array
    data_t_arr = np.empty(shape=(n_countries, time_range, n_vars))
    for i in range(n_countries):
        for j in range(time_range):
            data_t_arr[i][j] = np.array(data_t.iloc[i,j])
    # calculating distances between points (countries)
    dtw_matrix = dtw_ndim.distance_matrix_fast(data_t_arr, n_vars)
    # creating and fitting the model
    model_kwargs = dict(n_clusters=n_clusters, linkage=linkage, compute_distances=True)
    try:
        # scikit-learn >= 1.2: the distance type is passed as `metric`
        model = AgglomerativeClustering(metric='precomputed', **model_kwargs)
    except TypeError:
        # older scikit-learn releases only know the (since removed) `affinity` keyword
        model = AgglomerativeClustering(affinity='precomputed', **model_kwargs)
    model.fit(dtw_matrix)
    return model

def plot_clustering(countries_df: pd.DataFrame, labels: np.array, colors: np.array) -> "go.Figure":
    """
    Plot cartogram presenting clustering results for given countries.

    The input dataframe is not modified - cluster labels are attached to a copy.

    Args:
        countries_df (pd.DataFrame): dataframe containing at least the columns 'countrycode'
            (ISO-3166 alpha-3 codes of countries) and 'country' (name shown on hover)
        labels (np.array): cluster assignment generated by clustering model for given
            countries, given in the same row order as countries_df
        colors (np.array): list of colors to be assigned to clusters; the i-th color goes to
            the i-th cluster label in sorted (string) order, surplus colors are ignored

    Returns:
        plotly.graph_objs.Figure: choropleth figure restricted to Europe; pass it to
        plotly's ``plot`` to write it to an HTML file.

    Raises:
        ValueError: if the number of labels differs from the number of rows in countries_df
    """
    # string labels make plotly treat the clusters as discrete categories
    cluster_labels = np.asarray(labels).astype(str)
    # attach labels by position on a copy: the caller's dataframe stays untouched and the
    # result does not depend on the dataframe's index
    plot_df = countries_df.assign(cluster=cluster_labels)

    # np.unique returns the distinct labels already sorted
    color_map = dict(zip(np.unique(cluster_labels), colors))

    fig = px.choropleth(plot_df, locations='countrycode', color="cluster",
                        projection='conic conformal', color_discrete_map=color_map,
                        hover_name="country", custom_data=["country", 'countrycode', 'cluster'],
                        title='Clustering results')
    fig.update_geos(lataxis_range=[35, 75], lonaxis_range=[-15, 45])  # customized to show Europe only
    fig.update_layout(margin=dict(l=20, r=20, t=40, b=20), paper_bgcolor='rgba(0,0,0,0)',
                      hoverlabel=dict(bgcolor="white", font_size=14), title_x=0.18, title_xref='paper',
                      legend_title="Cluster", legend_font_size=22,
                      legend_title_font_size=26)
    return fig

## II. Data

In [3]:
# reading data after standard preprocessing (normalization, imputation, smoothing);
# this frame is clustered below for the comparison with the article
data = pd.read_csv('./../data/data.csv')
# reading data after imputation (only)
# NOTE(review): data_orig is not referenced in the cells visible here - confirm it is still needed
data_orig = pd.read_csv('./../data/data_imputed.csv')
# reading data after box cox transformation; this frame is used for the 10-year segment analysis
data_box = pd.read_csv('./../data/data_box.csv')

In [4]:
# distance matrix used when searching for optimal clustering parameters
# step 1: reshape the panel so that every country is one row and every cell holds the
# list of economic index values for a given year
data_t = (
    data
    .melt(id_vars=['countrycode', 'country', 'year'])
    .groupby(['countrycode', 'country', 'year', 'variable'])['value']
    .aggregate('mean')
    .unstack('year')
    .reset_index()
    .drop('variable', axis=1)
    .groupby(['countrycode', 'country'])
    .agg(list)
)
# number of points (countries) and length of the time range
n_countries, time_range = data_t.shape
# number of economic indexes = all columns except the three id columns
n_vars = data.shape[1] - 3
# step 2: copy the per-year lists into a 3D numpy array (country, year, economic index)
data_t_arr = np.empty(shape=(n_countries, time_range, n_vars))
for i in range(n_countries):
    for j in range(time_range):
        data_t_arr[i, j] = np.array(data_t.iloc[i, j])
# step 3: pairwise DTW distances between points (countries)
dtw_matrix = dtw_ndim.distance_matrix_fast(data_t_arr, n_vars)

In [6]:
# unique (country code, country name) pairs, used to label the plots
countries = data.loc[:, ['countrycode', 'country']].drop_duplicates().reset_index(drop=True)

In [64]:
# ISO-3166 alpha-3 codes of the countries analysed in the article; the obtained results
# are compared against the article on this subset
list_of_countries = [
    'AUT', 'BEL', 'BGR', 'CHE', 'CZE', 'DEU', 'DNK', 'ESP', 'EST',
    'FIN', 'FRA', 'GBR', 'GRC', 'HRV', 'HUN', 'IRL', 'ITA', 'LTU',
    'LVA', 'NLD', 'NOR', 'POL', 'PRT', 'ROU', 'SVK', 'SVN', 'SWE',
]

# keep only the article's countries and renumber the rows from 0
article_countries = countries.loc[countries['countrycode'].isin(list_of_countries)].reset_index(drop=True)

In [65]:
# cluster labels taken from the article for its 3-, 4- and 5-cluster solutions
# (27 entries each - presumably one per row of article_countries, in the same order; verify)
labels_3 = np.array([
    0, 0, 2, 0, 2, 0, 0, 0, 1,
    0, 0, 0, 2, 2, 0, 0, 0, 1,
    1, 0, 0, 0, 0, 2, 2, 2, 0,
])
labels_4 = np.array([
    0, 0, 2, 0, 0, 0, 0, 3, 1,
    0, 0, 0, 3, 2, 0, 2, 0, 1,
    1, 0, 3, 3, 3, 2, 2, 2, 0,
])
labels_5 = np.array([
    0, 0, 4, 0, 0, 0, 0, 3, 1,
    0, 0, 0, 3, 2, 0, 3, 0, 1,
    1, 0, 3, 3, 3, 4, 2, 2, 0,
])

In [67]:
# obtained grouping: 4 clusters, complete linkage, fitted on the preprocessed data
agg_4 = agglomerative_clustering(data=data, n_clusters=4, linkage='complete')

In [93]:
# renumbering the obtained clusters so that the maps comparing the results with the
# article use consistent colors
countries = data.loc[:, ['countrycode', 'country']].drop_duplicates().reset_index(drop=True)
# side-by-side table: obtained labels next to the ones proposed in the article
# (restricted to the countries covered by the article)
temp = pd.concat([countries, pd.Series(agg_4.labels_)], axis=1).reset_index(drop=True)
temp = pd.concat(
    [
        temp.loc[temp.countrycode.isin(list_of_countries)].reset_index(drop=True),
        pd.Series(labels_4),
    ],
    axis=1,
)
temp.columns = ['countrycode', 'country', 'agg_old', 'article']
# old label -> new label: 2 -> 0, 0 -> 3, 1 -> 2, 3 -> 1
# (default=8 is an abstract value, not really needed)
temp['agg_new'] = np.select(
    [temp.agg_old == old_label for old_label in (2, 0, 1, 3)],
    [0, 3, 2, 1],
    default=8,
)
# the same renumbering applied to the labels of all clustered countries
agg_4_labels_new = np.select(
    [agg_4.labels_ == old_label for old_label in (2, 0, 1, 3)],
    [0, 3, 2, 1],
    default=8,
)

In [97]:
# palette for the plots, cut down to (number of distinct clusters + 1) colors
colors = [
    '#ffe8d6', '#ddbea9', '#cb997e', '#b7b7a4', '#a5a58d',
    '#6b705c', '#787D6B', '#848978', '#8F9484', '#999E8F',
][:np.unique(agg_4_labels_new).size + 1]

In [116]:
# map of the grouping proposed in the article (4 clusters), written to map1.html
plot(
    plot_clustering(countries_df=article_countries, labels=labels_4, colors=colors),
    filename='map1.html',
)

'map1.html'

In [114]:
# map of the obtained grouping (renumbered labels), written to map2.html
plot(
    plot_clustering(countries_df=countries, labels=agg_4_labels_new, colors=colors),
    filename='map2.html',
)

'map2.html'

In [5]:
# analyzing changes in grouping through the years: 10-year long segments, one ending every 3 years
year_grid = list(range(1995, 2020, 3))
countries = data_box.loc[:, ['countrycode', 'country']].drop_duplicates().reset_index(drop=True)
labels = []
for y in year_grid:
    # keep the rows with y - 10 < year <= y
    data_trimmed = data_box.loc[(data_box.year <= y) & (data_box.year > y - 10), :]
    model = agglomerative_clustering(data_trimmed, 4, 'complete')
    labels.append(model.labels_)

In [7]:
# groupings presented in the thesis; positions in `labels` follow year_grid:
# 1 -> 1998, 4 -> 2007, 5 -> 2010, 7 -> 2016
labels_98, labels_07, labels_10, labels_16 = (labels[position] for position in (1, 4, 5, 7))

In [8]:
# one column of cluster labels per presented segment, next to the country identifiers
segments = pd.concat(
    [countries] + [pd.Series(segment_labels) for segment_labels in (labels_98, labels_07, labels_10, labels_16)],
    axis=1,
).reset_index(drop=True)
segments.columns = ['countrycode', 'country', 'year_98', 'year_07', 'year_10', 'year_16']
# renumbering the clusters of the '98 and '10 segments
# (presumably to keep cluster colors consistent across the four segment maps - confirm)
# '98 segment, old label -> new label: 1 -> 0, 2 -> 1, 0 -> 3, 3 -> 2
segments['year_98_new'] = np.select(
    [segments.year_98 == old_label for old_label in (1, 2, 0, 3)],
    [0, 1, 3, 2],
    default=8,
)
# '10 segment, old label -> new label: 1 -> 0, 0 -> 1, 2 -> 3, 3 -> 2
segments['year_10_new'] = np.select(
    [segments.year_10 == old_label for old_label in (1, 0, 2, 3)],
    [0, 1, 3, 2],
    default=8,
)

In [9]:
# first grouping (98'): palette cut down to (number of distinct clusters + 1) colors,
# then the map is written to seg1.html
colors = [
    '#ffe8d6', '#ddbea9', '#cb997e', '#b7b7a4', '#a5a58d',
    '#6b705c', '#787D6B', '#848978', '#8F9484', '#999E8F',
][:np.unique(segments.year_98_new).size + 1]
plot(
    plot_clustering(countries_df=countries, labels=segments.year_98_new, colors=colors),
    filename='seg1.html',
)

'seg1.html'

In [10]:
# second grouping (07'), original cluster numbering, written to seg2.html
plot(
    plot_clustering(countries_df=countries, labels=segments.year_07, colors=colors),
    filename='seg2.html',
)

'seg2.html'

In [11]:
# third grouping (10'), renumbered clusters, written to seg3.html
plot(
    plot_clustering(countries_df=countries, labels=segments.year_10_new, colors=colors),
    filename='seg3.html',
)

'seg3.html'

In [12]:
# fourth grouping (16'), original cluster numbering, written to seg4.html
plot(
    plot_clustering(countries_df=countries, labels=segments.year_16, colors=colors),
    filename='seg4.html',
)

'seg4.html'