In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn.cluster
from sklearn.cluster import AgglomerativeClustering

In [9]:
"plot dendrogram function from scikit-learn's example"
def plot_dendrogram(model, **kwargs):
    # Create linkage matrix and then plot the dendrogram

    # create the counts of samples under each node
    counts = np.zeros(model.children_.shape[0])
    n_samples = len(model.labels_)
    for i, merge in enumerate(model.children_):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                current_count += 1  # leaf node
            else:
                current_count += counts[child_idx - n_samples]
        counts[i] = current_count

    linkage_matrix = np.column_stack(
        [model.children_, model.distances_, counts]
    ).astype(float)

    # Plot the corresponding dendrogram
    dendrogram(linkage_matrix, **kwargs)

In [10]:
cleaned_data = pd.read_csv("../data/cleaned_sustainability_w_regions.csv")
metadata_columns = ['Country Name', 'Country Code', 'Year', 'Continent', 'sub-region', 'intermediate-region', 'sub-region-code', 'intermediate-region-code']
data_columns = [col for col in cleaned_data.columns if col not in metadata_columns]

metadata_dict = {col: cleaned_data[col] for col in metadata_columns}
metadata = pd.DataFrame(metadata_dict)

data_dict = {col: cleaned_data[col] for col in data_columns}
data = pd.DataFrame(data_dict)

data = data.fillna(0)
data

Unnamed: 0,Access to electricity (% of population) - EG.ELC.ACCS.ZS,Adjusted net national income per capita (annual % growth) - NY.ADJ.NNTY.PC.KD.ZG,"Adjusted net savings, excluding particulate emission damage (% of GNI) - NY.ADJ.SVNX.GN.ZS",Adjusted savings: carbon dioxide damage (% of GNI) - NY.ADJ.DCO2.GN.ZS,Adjusted savings: natural resources depletion (% of GNI) - NY.ADJ.DRES.GN.ZS,Adjusted savings: net forest depletion (% of GNI) - NY.ADJ.DFOR.GN.ZS,Adjusted savings: particulate emission damage (% of GNI) - NY.ADJ.DPEM.GN.ZS,"Automated teller machines (ATMs) (per 100,000 adults) - FB.ATM.TOTL.P5",Broad money (% of GDP) - FM.LBL.BMNY.GD.ZS,Children out of school (% of primary school age) - SE.PRM.UNER.ZS,...,"Annual production-based emissions of carbon dioxide (CO2), measured in million tonnes",Gini index (World Bank estimate) - SI.POV.GINI,Income Classification (World Bank Definition),Individuals using the Internet (% of population) - IT.NET.USER.ZS,"Life expectancy at birth, total (years) - SP.DYN.LE00.IN","Population, total - SP.POP.TOTL",Regime Type (RoW Measure Definition),Rural population (% of total population) - SP.RUR.TOTL.ZS,Total natural resources rents (% of GDP) - NY.GDP.TOTL.RT.ZS,Urban population (% of total population) - SP.URB.TOTL.IN.ZS
0,91.660398,0.000000,15.392211,2.389134,0.000190,0.000190,0.000000,0.000000,51.834542,1.60268,...,2.378,0.0,High income,0.000000,0.000,0.0,0,53.283,0.000000,0.000
1,100.000000,0.000000,11.491930,2.509046,0.000214,0.000214,0.000000,0.000000,53.552517,0.32258,...,2.407,0.0,High income,17.100000,73.853,92892.0,0,53.661,0.000712,46.339
2,100.000000,0.000000,3.893620,2.721385,0.000223,0.000223,0.000000,0.000000,58.520863,1.81634,...,2.437,0.0,High income,18.800000,73.937,94992.0,0,54.028,0.000657,45.972
3,100.000000,0.000000,6.713863,2.733676,0.000243,0.000243,0.000000,0.000000,61.196794,3.32156,...,2.561,0.0,High income,20.800000,74.038,97016.0,0,54.394,0.000700,45.606
4,100.000000,0.000000,9.801576,2.678363,0.000256,0.000256,0.000000,0.000000,57.290048,2.17652,...,2.616,0.0,High income,23.000000,74.156,98744.0,0,54.760,0.000685,45.240
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3282,32.300000,0.763955,-14.094023,2.310859,1.573499,0.000000,1.426705,6.034256,22.451986,0.00000,...,11.973,0.0,Low income,16.364740,58.410,13586710.0,Electoral Autocracy,67.496,5.531514,32.504
3283,33.700000,10.749009,-18.882831,2.381654,0.844003,0.000000,1.492774,7.157096,23.724442,0.00000,...,12.170,0.0,Low income,22.742818,59.534,13814642.0,Electoral Autocracy,67.615,4.561981,32.385
3284,39.676228,-0.716048,-11.199213,2.101249,0.823740,0.000000,1.496494,7.073321,27.438656,0.00000,...,10.815,0.0,Low income,23.119989,60.294,14030338.0,Electoral Autocracy,67.704,4.473632,32.296
3285,40.144283,28.172572,-12.700238,2.102515,1.300127,0.000000,1.422511,6.889214,32.799112,0.00000,...,10.247,44.3,Low income,24.400000,60.812,14236599.0,Electoral Autocracy,67.763,4.844665,32.237


In [11]:
column_headers = data.columns
np_data = np.array(data)
data = pd.DataFrame(np_data)
data.columns = column_headers

data

Unnamed: 0,Access to electricity (% of population) - EG.ELC.ACCS.ZS,Adjusted net national income per capita (annual % growth) - NY.ADJ.NNTY.PC.KD.ZG,"Adjusted net savings, excluding particulate emission damage (% of GNI) - NY.ADJ.SVNX.GN.ZS",Adjusted savings: carbon dioxide damage (% of GNI) - NY.ADJ.DCO2.GN.ZS,Adjusted savings: natural resources depletion (% of GNI) - NY.ADJ.DRES.GN.ZS,Adjusted savings: net forest depletion (% of GNI) - NY.ADJ.DFOR.GN.ZS,Adjusted savings: particulate emission damage (% of GNI) - NY.ADJ.DPEM.GN.ZS,"Automated teller machines (ATMs) (per 100,000 adults) - FB.ATM.TOTL.P5",Broad money (% of GDP) - FM.LBL.BMNY.GD.ZS,Children out of school (% of primary school age) - SE.PRM.UNER.ZS,...,"Annual production-based emissions of carbon dioxide (CO2), measured in million tonnes",Gini index (World Bank estimate) - SI.POV.GINI,Income Classification (World Bank Definition),Individuals using the Internet (% of population) - IT.NET.USER.ZS,"Life expectancy at birth, total (years) - SP.DYN.LE00.IN","Population, total - SP.POP.TOTL",Regime Type (RoW Measure Definition),Rural population (% of total population) - SP.RUR.TOTL.ZS,Total natural resources rents (% of GDP) - NY.GDP.TOTL.RT.ZS,Urban population (% of total population) - SP.URB.TOTL.IN.ZS
0,91.660398,0.0,15.392211,2.389134,0.00019,0.00019,0.0,0.0,51.834542,1.60268,...,2.378,0.0,High income,0.0,0.0,0.0,0,53.283,0.0,0.0
1,100.0,0.0,11.49193,2.509046,0.000214,0.000214,0.0,0.0,53.552517,0.32258,...,2.407,0.0,High income,17.1,73.853,92892.0,0,53.661,0.000712,46.339
2,100.0,0.0,3.89362,2.721385,0.000223,0.000223,0.0,0.0,58.520863,1.81634,...,2.437,0.0,High income,18.8,73.937,94992.0,0,54.028,0.000657,45.972
3,100.0,0.0,6.713863,2.733676,0.000243,0.000243,0.0,0.0,61.196794,3.32156,...,2.561,0.0,High income,20.8,74.038,97016.0,0,54.394,0.0007,45.606
4,100.0,0.0,9.801576,2.678363,0.000256,0.000256,0.0,0.0,57.290048,2.17652,...,2.616,0.0,High income,23.0,74.156,98744.0,0,54.76,0.000685,45.24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3282,32.3,0.763955,-14.094023,2.310859,1.573499,0.0,1.426705,6.034256,22.451986,0.0,...,11.973,0.0,Low income,16.36474,58.41,13586710.0,Electoral Autocracy,67.496,5.531514,32.504
3283,33.7,10.749009,-18.882831,2.381654,0.844003,0.0,1.492774,7.157096,23.724442,0.0,...,12.17,0.0,Low income,22.742818,59.534,13814642.0,Electoral Autocracy,67.615,4.561981,32.385
3284,39.676228,-0.716048,-11.199213,2.101249,0.82374,0.0,1.496494,7.073321,27.438656,0.0,...,10.815,0.0,Low income,23.119989,60.294,14030338.0,Electoral Autocracy,67.704,4.473632,32.296
3285,40.144283,28.172572,-12.700238,2.102515,1.300127,0.0,1.422511,6.889214,32.799112,0.0,...,10.247,44.3,Low income,24.4,60.812,14236599.0,Electoral Autocracy,67.763,4.844665,32.237


In [15]:
n = 5
model = AgglomerativeClustering(distance_threshold=0, n_clusters=None)
"set linkage = Ward, Maximum or complete linkage, Average linkage, Single linkage"
model.fit(data)
plt.title("World Sustainability Dendrogram")
plot_dendrogram(model, truncate_mode="level", p=3)
plt.xlabel("x label")
plt.show()

ValueError: could not convert string to float: 'High income'