# Global Societal Endangerment Index (GSEI)

Development notebook for step 5: Normalisation

In [16]:
import pandas as pd
import numpy as np
import categories as my_cat
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler

In [17]:
# Inputs for this step: the category mapping returned by my_cat.load('4.1'),
# the selected-countries table, and the multivariate indicator table.
categories = my_cat.load('4.1')
countries = pd.read_csv("data/processing/selected_countries.csv")
df = pd.read_csv("data/processing/multivariate_all_data.csv")

# Preview the first ten rows of the indicator table
df.head(n=10)

Unnamed: 0,Temp Change max 2012-2022,Disaster Affected Population 2020-2025,Disaster Damage US$T 2020-2025,Undernourishment (%),Renewable Freshwater per Capita (m3),Population below Poverty Line (%),Health Expenditure per Capita (US$),RoW Health Expenditure (%),No Education (%),Education Expenditure (US$M),...,pol_PC4,pol_PC5,pol_PC6,water_air_PC1,water_air_PC2,water_air_PC3,disease_PC1,disease_PC2,disease_PC3,disease_PC4
0,2.012,13016060.0,0.0,30.4,1178.737859,23.304235,81.0,8.0,58.38,819.55782,...,-0.157403,0.451415,-0.505144,1.653817,-0.551784,0.344034,0.090144,-0.350564,-0.636542,-0.235712
1,1.752,4544311.0,0.0,23.2,4285.82652,23.304235,101.0,7.0,23.932941,3047.73855,...,-0.231028,-0.00622,-0.390847,0.824321,-0.093015,-0.409678,1.992797,-0.404285,0.069539,-0.540231
2,2.028,84.0,0.0,4.5,9567.281462,23.138889,414.0,0.0,3.28,470.23552,...,0.586811,-0.234792,0.104068,-0.578045,-0.176162,0.080052,-1.05448,-0.055411,0.311029,0.26108
3,3.243,114957.3,13714920000.0,2.609091,4027.359502,12.5125,3.192,0.076923,3.064792,96.51866,...,0.219355,-0.118686,0.035244,-1.481279,0.295753,-0.300041,-2.077185,0.333092,-0.497454,-0.189044
4,1.699,346.0,0.0,2.7,15.665548,12.5125,2.315,0.076923,9.72,28330.640427,...,0.846905,0.356944,-0.048044,-1.44353,0.43028,-0.053021,-0.416271,-0.269504,1.131575,0.000319
5,1.123,979425.0,938469000.0,3.2,6444.169076,23.8,1.371,0.0,0.93,35105.55042,...,-0.065861,-0.021548,0.164119,-0.988338,-0.112748,-0.349038,-0.968767,0.076174,0.150992,-0.377158
6,2.772,21265.0,0.0,2.5,2315.430578,18.3,675.0,0.0,0.6,312.11636,...,0.133574,0.267761,-0.06094,-0.476079,0.129807,0.687241,-0.784127,-0.1249,0.362462,0.285434
7,1.131,525404.4,783647500.0,5.556757,563.081354,23.138889,1.085,0.0,7.609643,9814.788208,...,0.055407,-0.265085,-0.123456,-0.892232,-0.092651,-0.250677,-0.358819,-0.138401,-0.149787,-0.059672
8,1.499,290558.0,17135350000.0,2.5,19154.841667,12.5125,6.731,0.0,0.72,68159.44641,...,-0.129113,-0.056934,-0.283562,-1.573867,0.422967,-0.221956,-1.98958,0.30821,-0.402311,-0.163854
9,2.524,2220.0,0.0,2.5,6141.273635,12.5,5.852,0.0,1.29,22413.54147,...,-0.170182,-0.206508,0.250451,-1.554774,0.50779,-0.066242,-2.093434,0.355491,-0.613993,-0.151653


In [18]:
# Column names that are not listed under categories['pca'], numbered from 0
df.columns.drop(categories['pca']).to_series().reset_index(drop=True)

0                  Temp Change max 2012-2022
1     Disaster Affected Population 2020-2025
2             Disaster Damage US$T 2020-2025
3                       Undernourishment (%)
4       Renewable Freshwater per Capita (m3)
5          Population below Poverty Line (%)
6        Health Expenditure per Capita (US$)
7                 RoW Health Expenditure (%)
8                           No Education (%)
9               Education Expenditure (US$M)
10                                Crime Rate
11                Government Debt (% of GDP)
12       Total Reserves (% of External Debt)
13                          Gini Coefficient
14                             Inflation (%)
15                    Youth Unemployment (%)
16          Renewable Energy Consumption (%)
17                   Secure Internet Servers
18                      Future Conflict Risk
19                Current Conflict Intensity
20     COVID-19 Expenditure per Capita (US$)
dtype: object

## Normalisation

Methods:

| Column Type                    | Best Scaling Method          |
|--------------------------------|------------------------------|
| Inversely related Indicators   | Invert, Then Scale           |
| Most Variables (With Outliers) | RobustScaler                 |
| Economic/Expenditure Variables | Log Transform → StandardScaler |
| Close to normal distribution   | StandardScaler               |
| Percentage-Based (0-100)       | Min-Max Scaling              |



In [19]:
# log transform: replace each listed column with log(1 + x)
log_transform_cols = [
    'Health Expenditure per Capita (US$)',
    'Education Expenditure (US$M)',
    'COVID-19 Expenditure per Capita (US$)',
    'Secure Internet Servers',
]

# NOTE: df is overwritten in place, so re-running this cell on its own
# applies the transform a second time.
df[log_transform_cols] = df[log_transform_cols].pipe(np.log1p)

# Display the transformed columns
df[log_transform_cols]

Unnamed: 0,Health Expenditure per Capita (US$),Education Expenditure (US$M),COVID-19 Expenditure per Capita (US$),Secure Internet Servers
0,4.406719,6.709984,1.098612,3.801282
1,4.624973,8.022483,1.098612,3.718870
2,6.028279,6.155358,4.400603,7.107698
3,1.433178,4.580044,5.642640,9.590117
4,1.198458,10.251734,5.940171,7.659483
...,...,...,...,...
187,5.468060,9.191747,4.400603,6.428868
188,3.663562,8.022483,1.098612,2.073919
189,6.347389,9.804633,1.098612,9.601571
190,4.343805,6.872947,1.514128,4.024647


In [20]:
# Indicators that inversely correlate with the GSEI: negate them so that,
# after scaling, a larger value always points in the same direction.
inverse_indicators = [
    'Renewable Freshwater per Capita (m3)',
    'Renewable Energy Consumption (%)',
    'Total Reserves (% of External Debt)',
    'Secure Internet Servers',
    'Education Expenditure (US$M)',
    'Health Expenditure per Capita (US$)',
    'COVID-19 Expenditure per Capita (US$)',
]

# NOTE: in-place sign flip; re-running this cell on its own flips the sign back.
df[inverse_indicators] = df[inverse_indicators].mul(-1)

In [21]:
# Min-max scaling: map each listed column onto the [0, 1] range.
# 'Renewable Energy Consumption (%)' was negated in the inversion cell, so its
# largest original values end up closest to 0 here.
min_max_cols = [
    'Undernourishment (%)',
    'Population below Poverty Line (%)',
    'No Education (%)',
    'Gini Coefficient',
    'Crime Rate',
    'Renewable Energy Consumption (%)',
]

# Fit on the selected columns, then write the scaled values back into df
scaler = MinMaxScaler().fit(df[min_max_cols])
df[min_max_cols] = scaler.transform(df[min_max_cols])

# Display the scaled columns
df[min_max_cols]

Unnamed: 0,Undernourishment (%),Population below Poverty Line (%),No Education (%),Gini Coefficient,Crime Rate,Renewable Energy Consumption (%)
0,0.571721,0.281428,0.782889,0.387175,0.905229,0.790356
1,0.424180,0.281428,0.320946,0.387175,0.612745,0.445493
2,0.040984,0.278625,0.043986,0.538530,0.668301,0.560797
3,0.002235,0.098517,0.041100,0.229940,0.232026,0.807128
4,0.004098,0.098517,0.130347,0.229940,0.687908,0.989518
...,...,...,...,...,...,...
187,0.059426,0.278625,0.102047,0.538530,0.081699,0.622642
188,0.758197,0.281428,0.770953,0.387175,0.750000,0.961216
189,0.114754,0.281428,0.075768,0.387175,0.831699,0.898323
190,0.674180,0.612203,0.187207,0.419032,0.553922,0.129979


In [22]:
# z-score scaling (zero mean, unit variance) for the two columns named here
# plus every column that was log-transformed earlier.
z_score_cols = [
    'Temp Change max 2012-2022',
    'Youth Unemployment (%)',
    *log_transform_cols,
]

# Fit on the selected columns, then write the standardised values back into df
scaler = StandardScaler().fit(df[z_score_cols])
df[z_score_cols] = scaler.transform(df[z_score_cols])

# Display the standardised columns
df[z_score_cols]

Unnamed: 0,Temp Change max 2012-2022,Youth Unemployment (%),Health Expenditure per Capita (US$),Education Expenditure (US$M),COVID-19 Expenditure per Capita (US$),Secure Internet Servers
0,0.201103,0.129327,-0.242164,0.749734,1.457810,1.003209
1,-0.252046,1.060859,-0.352886,0.047849,1.457810,1.030544
2,0.228989,0.779430,-1.064797,1.046331,-0.486849,-0.093482
3,2.346589,-0.255005,1.266343,1.888762,-1.218329,-0.916865
4,-0.344419,-0.805661,1.385419,-1.144288,-1.393555,-0.276501
...,...,...,...,...,...,...
187,-0.795825,-0.237101,-0.780593,-0.577439,-0.486849,0.131677
188,-0.192701,1.434025,0.134847,0.047849,1.457810,1.576151
189,-0.149216,3.794299,-1.226685,-0.905192,1.457810,-0.920664
190,-0.778396,-0.521899,-0.210247,0.662586,1.213098,0.929123


In [23]:
def find_outliers(df, column, whisker=1.5):
    """Return the rows of ``df`` whose ``column`` value is an IQR-rule outlier.

    A value is an outlier when it lies strictly below ``Q1 - whisker * IQR``
    or strictly above ``Q3 + whisker * IQR``, where Q1 and Q3 are the 25th and
    75th percentiles of the column and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the data.
    column : str
        Label of the numeric column to test.
    whisker : float, default 1.5
        Multiplier applied to the IQR when building the bounds. The default
        reproduces the conventional 1.5 * IQR rule.

    Returns
    -------
    pandas.DataFrame
        The subset of ``df`` (all columns, original index) whose ``column``
        value falls outside the bounds. Rows where the value is NaN are never
        returned, because NaN compares as False against both bounds.
    """
    values = df[column]
    # Quartiles and interquartile range of the tested column
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    # Outlier boundaries
    lower_bound = q1 - whisker * iqr
    upper_bound = q3 + whisker * iqr
    # Keep only the rows outside the boundaries
    return df[(values < lower_bound) | (values > upper_bound)]

# Count IQR-rule outliers for every column that has not been scaled yet
# (i.e. not min-max scaled, not z-scored, and not a PCA component).
unscaled_cols = df.drop(columns=min_max_cols + z_score_cols + categories['pca']).columns

# Collect one record per indicator and build the frame once, instead of
# growing a DataFrame with pd.concat inside the loop (which also upcast the
# counts to float because the seed frame was empty).
outlier_records = []
for col in unscaled_cols:
    # Rows flagged for `col` always hold a non-null value in it (NaN never
    # compares outside the bounds), so the row count equals the outlier count.
    n_outliers = len(find_outliers(df, col))
    outlier_records.append({
        'Indicator': col,
        'Outliers': n_outliers,
        'Outlier Percentage': round(n_outliers / len(df) * 100, 2),
    })

outliers = pd.DataFrame(outlier_records, columns=['Indicator', 'Outliers', 'Outlier Percentage'])

# Show the indicators with the most outliers first
outliers.sort_values(by='Outliers', ascending=False)

Unnamed: 0,Indicator,Outliers,Outlier Percentage
8,Current Conflict Intensity,41.0,21.35
1,Disaster Damage US$T 2020-2025,39.0,20.31
0,Disaster Affected Population 2020-2025,36.0,18.75
7,Future Conflict Risk,34.0,17.71
2,Renewable Freshwater per Capita (m3),25.0,13.02
4,Government Debt (% of GDP),24.0,12.5
3,RoW Health Expenditure (%),22.0,11.46
6,Inflation (%),22.0,11.46
5,Total Reserves (% of External Debt),12.0,6.25


In [24]:
# Robust scaling for every remaining column: everything that was neither
# min-max scaled nor z-scored and is not a PCA component.
robust_cols = list(
    df.drop(columns=[*min_max_cols, *z_score_cols, *categories['pca']]).columns
)

# Fit on the remaining columns, then write the scaled values back into df
scaler = RobustScaler().fit(df[robust_cols])
df[robust_cols] = scaler.transform(df[robust_cols])

# Display the robust-scaled columns
df[robust_cols]

Unnamed: 0,Disaster Affected Population 2020-2025,Disaster Damage US$T 2020-2025,Renewable Freshwater per Capita (m3),RoW Health Expenditure (%),Government Debt (% of GDP),Total Reserves (% of External Debt),Inflation (%),Future Conflict Risk,Current Conflict Intensity
0,6.856345,-0.027431,0.164936,16.000000,2.026878,-0.725598,2.295912,3.341684,27.916667
1,2.338438,-0.027431,-0.125424,14.000000,2.026878,1.076412,0.945002,1.204861,0.000000
2,-0.084957,-0.027431,-0.618980,0.000000,0.125264,0.090114,-0.201440,-0.053975,0.000000
3,-0.023696,17.448211,-0.101270,0.153846,0.502856,-0.130478,-0.028030,0.055209,2.320000
4,-0.084818,-0.027431,0.273626,0.153846,-0.840134,-0.130478,-0.605732,-0.045190,0.000000
...,...,...,...,...,...,...,...,...,...
187,0.195191,0.971098,-2.513770,0.000000,0.056763,-0.158099,0.029005,-0.081607,0.000000
188,0.920435,0.002571,0.269806,14.000000,0.879738,1.066536,2.295912,3.598530,75.833333
189,6.467445,4.955499,0.207019,14.000000,2.026878,0.666403,-0.031862,1.554474,0.416667
190,6.465925,-0.027431,-0.107225,0.000000,-0.432099,1.473344,0.588905,0.000848,0.000000
