# Global Societal Endangerment Index (GSEI)

Development notebook for step 2: Data Selection

In [2]:
import pandas as pd
import numpy as np
import json

# my custom scripts
import country_standardisation as my_cs

## Data Loading and Preprocessing
Bring datasets into the right format and perform initial transformations to form the base indicators.

Total indicator count to load: 9 + 7 + 8 + 11 + 7 = 42

In [3]:
"""
Preprocessing for specific sources
"""

def transpose_world_bank_data(wb_df, year_range):
    """
    Reshape a World Bank export into one row per country.

    The input has one row per (country, series) pair and one column per year
    (headers such as '2020 [YR2020]'). The result has a 'Country Name' column
    followed by numeric columns named '<year>_<Series Name>' for every year
    in ``year_range``.

    Parameters:
        wb_df: raw export with 'Country Name', 'Country Code', 'Series Name',
            'Series Code' and the yearly columns.
        year_range: iterable of years to keep; each is converted with ``str``
            and must exist as a column once the ' [YR....]' suffix is removed.

    Returns:
        DataFrame with 'Country Name' plus one numeric column per
        (year, series) combination; unparseable values become NaN.
    """
    # ANT, CHI, XKX are dropped because mapping them to another country would
    # produce duplicate country rows that are very complicated to merge;
    # all remaining codes in the list are not countries
    excluded_codes = [
        'ANT', 'CHI', 'XKX',
        'AFE', 'AFW', 'ARB', 'CSS', 'CEB', 'EAR', 'EAS', 'EAP', 'TEA', 'EMU',
        'ECS', 'ECA', 'TEC', 'EUU', 'FCS', 'HPC', 'HIC', 'IBD', 'IBT', 'IDB',
        'IDX', 'IDA', 'LTE', 'LCN', 'LAC', 'TLA', 'LDC', 'LMY', 'LIC', 'LMC',
        'MEA', 'MNA', 'TMN', 'MIC', 'NAC', 'INX', 'OED', 'OSS', 'PSS', 'PST',
        'PRE', 'SST', 'SAS', 'TSA', 'SSF', 'SSA', 'TSS', 'UMC', 'WLD',
        'FTI', 'LNX',
    ]
    # rows whose name field carries the export's "Data from ..." / "Last Updated ..." notes
    note_rows = (
        wb_df['Country Name'].str.contains("Data from", na=False)
        | wb_df['Country Name'].str.contains("Last Updated", na=False)
    )
    keep_rows = ~wb_df['Country Code'].isin(excluded_codes) & ~note_rows
    cleaned = (
        wb_df[keep_rows]
        .dropna(how='all', axis=0)   # drop completely empty rows
        .replace('..', np.nan)       # '..' marks a missing value in the export
    )

    # country names are derived from the country code (exact matches only)
    cleaned['Country Name'] = my_cs.standardise_countries(cleaned['Country Code'], fuzzy_threshold=100)
    cleaned = cleaned.drop(columns=['Country Code', 'Series Code'])
    # '2020 [YR2020]' -> '2020'; other headers are left alone
    cleaned = cleaned.rename(columns=lambda col: col.split(" [")[0] if "[YR" in col else col)

    # one row per country, one column per (year, series) pair
    year_labels = [str(year) for year in year_range]
    wide = cleaned.pivot(index='Country Name', columns='Series Name', values=year_labels)
    wide.columns = ['_'.join(pair).strip() for pair in wide.columns.values]
    wide = wide.reset_index()

    # everything except the country column is coerced to numbers
    numeric_part = wide.drop(columns='Country Name').apply(pd.to_numeric, errors='coerce')
    return pd.concat([wide['Country Name'], numeric_part], axis=1)

def transpose_who_data(who_df, year_range):
    """
    Reshape WHO data into one row per country.

    The input has one row per (country, indicator) pair and one column per
    year. The result has a 'Countries' column followed by numeric columns
    named '<year>_<indicator>' for every year in ``year_range``.

    Parameters:
        who_df: frame with 'Countries', 'Indicators' and one column per year
            (named by the year as a string). It is not modified.
        year_range: iterable of years to keep; each is converted with ``str``.

    Returns:
        DataFrame with 'Countries' plus one numeric column per
        (year, indicator) combination; unparseable values become NaN.
    """
    # work on a copy so the column assignments below do not alter the caller's frame
    who_df = who_df.copy()
    # clean up rows
    who_df['Countries'] = who_df['Countries'].replace("occupied Palestinian territory, including east Jerusalem", "Palestine, State of")
    # clean up columns
    who_df['Countries'] = my_cs.standardise_countries(who_df['Countries'], fuzzy_threshold=80)
    # pivot to wide format
    who_df = who_df.pivot(index='Countries', columns='Indicators', values=[str(year) for year in year_range])
    who_df.columns = ['_'.join(col).strip() for col in who_df.columns.values]
    who_df = who_df.reset_index()
    # convert to numeric
    num_cols = who_df.drop('Countries', axis=1).apply(pd.to_numeric, errors='coerce')
    who_df = pd.concat([who_df['Countries'], num_cols], axis=1)
    return who_df

def indicator_yearly_availability(processed_df):
    """
    Share of non-missing values per indicator and year.

    Expects the first column to be the country column and every other column
    to be named '<year>_<indicator name>'. Returns one row per indicator
    ('indicatorName') and one column per year holding the fraction of rows
    with a non-null value.
    """
    value_cols = processed_df.columns[1:]
    share_present = processed_df[value_cols].notnull().mean()
    # Series -> two-column frame: original column name + availability share
    long_df = (
        share_present
        .to_frame(name='value')
        .reset_index()
        .rename(columns={'index': 'indicator'})
    )
    # '<year>_<indicator name>' -> year / indicator name (split on the first '_' only)
    long_df[['year', 'indicatorName']] = long_df['indicator'].str.split('_', n=1, expand=True)
    long_df = long_df.drop(columns='indicator')
    # indicators down the rows, years across the columns
    wide_df = long_df.pivot(index='indicatorName', columns='year', values='value')
    return wide_df.reset_index()

In [4]:
def merge(out_df, in_df, indicator_cols):
    """
    Left-merge indicator columns from in_df into out_df on 'Country'.

    Any of the indicator columns that already exist in out_df are dropped
    first, so re-running a cell replaces the previous values instead of
    producing '_x'/'_y' duplicates.

    Parameters:
        out_df: target frame with a 'Country' column.
        in_df: source frame with 'Country' and every column in indicator_cols.
        indicator_cols: names of the columns to bring over.

    Returns:
        A new frame with all rows of out_df plus the indicator columns
        (NaN where in_df has no matching country). The resulting column
        names are printed as a progress check.
    """
    indicator_cols = list(indicator_cols)
    # drop exactly the columns that are present; checking only the first name would
    # fail (or leave duplicates) when out_df holds just part of the list
    stale_cols = [col for col in indicator_cols if col in out_df.columns]
    if stale_cols:
        out_df = out_df.drop(columns=stale_cols)
    out_df = out_df.merge(in_df[['Country'] + indicator_cols], on='Country', how='left')

    print(out_df.columns.values)
    return out_df

### Environmental Risks


In [5]:
# start with comprehensive country list
# (a single 'Country' column; the environmental indicators below are merged onto it)
env_data = pd.DataFrame(my_cs.all_countries(), columns=['Country'])

env_data.head(10)

Unnamed: 0,Country
0,Aruba
1,Afghanistan
2,Angola
3,Anguilla
4,Åland Islands
5,Albania
6,Andorra
7,United Arab Emirates
8,Argentina
9,Armenia


In [6]:
# Climate Change Vulnerability

# INDICATOR: Maximum relative temperature change (°C) over 2012-2022 (11 yearly values), compared to a 1951-1980 baseline

temp_change = pd.read_csv("data/UN_FAO_climate_change_indicators.csv")
# drop the aggregate 'World' row; take a copy so the column assignments below write to
# an independent frame rather than a filtered slice (avoids SettingWithCopyWarning)
temp_change = temp_change[temp_change['Country'] != 'World'].copy()
# standardise country names (derived from the ISO3 column)
temp_change['Country'] = my_cs.standardise_countries(temp_change['ISO3'])
# calculate maximum value between F2012 and F2022 (both inclusive)
indicator_cols = ['Temp Change max 2012-2022']
year_cols = [f'F{year}' for year in range(2012, 2023)]
temp_change[indicator_cols[0]] = temp_change[year_cols].max(axis=1)
# merge
env_data = merge(env_data, temp_change[['Country'] + indicator_cols], indicator_cols)

['Country' 'Temp Change max 2012-2022']


In [7]:
# Exposure to Natural Disasters (earthquakes, floods, hurricanes)

disasters = pd.read_excel("data/EM-DAT_natural_disasters_ALL_2020-2025.xlsx")
# standardise country names
disasters['Country'] = my_cs.standardise_countries(disasters['ISO'])
# group by country, summing up INDICATORS: affected people + total damage
damage_col = "Total Damage, Adjusted ('000 US$)"
# min_count=1: a country whose events all lack a figure stays NaN (missing) instead of
# being reported as 0, so it is handled as missing data in the imputation step
disasters_grouped = (
    disasters
    .groupby('Country')[['No. Affected', damage_col]]
    .sum(min_count=1)
    .reset_index()
)
# the source column is expressed in thousands of US$; convert to US$
disasters_grouped[damage_col] = disasters_grouped[damage_col] * 1000
# rename columns
indicator_cols = ['Disaster Affected Population 2020-2025', 'Disaster Damage US$T 2020-2025']
disasters_grouped.columns = ['Country'] + indicator_cols
# merge
env_data = merge(env_data, disasters_grouped, indicator_cols)

Using special case for 'SPI' -> 'Spain'
Using special case for 'SPI' -> 'Spain'
['Country' 'Temp Change max 2012-2022'
 'Disaster Affected Population 2020-2025' 'Disaster Damage US$T 2020-2025']


In [8]:
# Air and Water Pollution Levels

# INDICATORS:
# Years of lost life due to unsafe water, sanitation, and handwashing
# Years of lost life due to air pollution

indicator_cols = ['Unsafe water, sanitation (YLL)', 'Air pollution (YLL)']

air_water = pd.read_csv("data/IHME_GBD_environmental_risk_export_2021.csv")
# map the source's location names onto the standard country names
air_water['Location'] = my_cs.standardise_countries(air_water['Location'], fuzzy_threshold=79)
# relabel by position: first column -> 'Country', remaining columns -> indicator names
# NOTE(review): relies on the CSV's column order matching indicator_cols — confirm against the file
air_water = air_water.set_axis(['Country'] + indicator_cols, axis=1)
# attach to the environmental table
env_data = merge(env_data, air_water, indicator_cols)

Using fuzzy match for 'Cape Verde' -> 'Cabo Verde'
Using fuzzy match for 'Cote d'Ivoire' -> 'Côte d'Ivoire'
Using special case for 'Democratic Republic of Congo' -> 'Congo, Democratic Republic of the'
Using fuzzy match for 'Iran (Islamic Republic of)' -> 'Iran, Islamic Republic of'
Using fuzzy match for 'Libyan Arab Jamahiriya' -> 'Libya'
Using fuzzy match for 'Republic of Congo' -> 'Congo'
Using fuzzy match for 'Republic of Korea' -> 'Korea, Republic of'
Using special case for 'Turkey' -> 'Türkiye'
['Country' 'Temp Change max 2012-2022'
 'Disaster Affected Population 2020-2025' 'Disaster Damage US$T 2020-2025'
 'Unsafe water, sanitation (YLL)' 'Air pollution (YLL)']


In [9]:
# load the second World Bank development export and reshape it to one row per country
# with '<year>_<Series Name>' columns for the years 2014-2023
wb_dev2 = pd.read_csv("data/Worldbank_development_2.csv")
wb_dev2 = transpose_world_bank_data(wb_dev2, range(2014, 2024))

# check availability of data for each indicator over the years
indicator_yearly_availability(wb_dev2)

year,indicatorName,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,Access to electricity (% of population),0.995349,0.995349,0.995349,0.995349,0.995349,0.995349,0.995349,0.995349,0.995349,0.0
1,"External debt stocks, total (DOD, current US$)",0.544186,0.553488,0.553488,0.553488,0.553488,0.553488,0.553488,0.553488,0.553488,0.553488
2,Individuals using the Internet (% of population),0.934884,0.930233,0.944186,0.953488,0.813953,0.865116,0.893023,0.883721,0.851163,0.274419
3,"Inflation, consumer prices (annual %)",0.874419,0.869767,0.869767,0.846512,0.832558,0.832558,0.804651,0.804651,0.8,0.75814
4,People using safely managed drinking water ser...,0.637209,0.637209,0.64186,0.64186,0.637209,0.637209,0.637209,0.627907,0.613953,0.0
5,Prevalence of moderate or severe food insecuri...,0.0,0.455814,0.460465,0.539535,0.572093,0.627907,0.683721,0.706977,0.702326,0.0
6,Prevalence of undernourishment (% of population),0.795349,0.795349,0.795349,0.795349,0.795349,0.790698,0.795349,0.795349,0.795349,0.0
7,Renewable energy consumption (% of total final...,0.986047,0.986047,0.986047,0.986047,0.986047,0.986047,0.986047,0.986047,0.330233,0.0
8,Renewable internal freshwater resources per ca...,0.851163,0.851163,0.851163,0.851163,0.851163,0.851163,0.851163,0.851163,0.0,0.0
9,Secure Internet servers (per 1 million people),0.976744,0.986047,0.995349,0.995349,0.990698,0.986047,0.995349,1.0,1.0,1.0


#### Year selection for Worldbank Development Indicators (dataset 2)

Based on the availability of data for the Worldbank Development Indicators, one year is selected for each indicator that balances recency and data availability. The missing data will be handled in the imputation step. The selected years are:

- 2022 (99%) <- Access to electricity (% of population)
- 2023 (55%) <- "External debt stocks, total (DOD, current US$)"
- 2022 (85%) <- Individuals using the Internet (% of population)
- 2023 (75%) <- "Inflation, consumer prices (annual %)"
- 2022 (61%) <- People using safely managed drinking water services (% of population)
- 2022 (70%) <- Prevalence of moderate or severe food insecurity in the population (%)
- 2022 (80%) <- Prevalence of undernourishment (% of population)
- 2021 (98%) <- Renewable energy consumption (% of total final energy consumption)
- 2021 (85%) <- Renewable internal freshwater resources per capita (cubic meters)
- 2023 (100%) <- Secure Internet servers (per 1 million people)
- 2021 (44%) <- Total reserves (% of total external debt)
- 2023 (85%) <- "Unemployment, total (% of total labor force) (modeled ILO estimate)"
- 2023 (85%) <- "Unemployment, youth total (% of total labor force ages 15-24) (modeled ILO estimate)"


In [10]:
# Water Scarcity & Food Security

# INDICATORS:
# Renewable internal freshwater resources per capita (cubic meters)
# People using safely managed drinking water services (% of population)
# Prevalence of moderate or severe food insecurity in the population (%)
# Prevalence of undernourishment (% of population)

# source column (selected year + series) -> indicator label
water_food_labels = {
    '2022_People using safely managed drinking water services (% of population)': 'Safe Drinking Water (%)',
    '2022_Prevalence of moderate or severe food insecurity in the population (%)': 'Food Insecurity (%)',
    '2022_Prevalence of undernourishment (% of population)': 'Undernourishment (%)',
    '2021_Renewable internal freshwater resources per capita (cubic meters)': 'Renewable Freshwater per Capita (m3)',
}
indicator_cols = list(water_food_labels.values())
# pick the chosen columns and relabel them in one step
water_food = (
    wb_dev2[['Country Name'] + list(water_food_labels)]
    .rename(columns={'Country Name': 'Country', **water_food_labels})
)
# attach to the environmental table
env_data = merge(env_data, water_food, indicator_cols)

['Country' 'Temp Change max 2012-2022'
 'Disaster Affected Population 2020-2025' 'Disaster Damage US$T 2020-2025'
 'Unsafe water, sanitation (YLL)' 'Air pollution (YLL)'
 'Safe Drinking Water (%)' 'Food Insecurity (%)' 'Undernourishment (%)'
 'Renewable Freshwater per Capita (m3)']


### Political Instability & Governance


In [11]:
# start with comprehensive country list (single 'Country' column)
pol_data = pd.DataFrame(my_cs.all_countries(), columns=['Country'])

In [12]:
# Governance Quality

# load the World Bank governance export and reshape it to one row per country
# with '<year>_<Series Name>' columns for the years 2014-2023
wb_gov = pd.read_csv('data/Worldbank_governance.csv')
wb_gov = transpose_world_bank_data(wb_gov, range(2014, 2024))

# check availability of data for each indicator over the years
indicator_yearly_availability(wb_gov)

year,indicatorName,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,Control of Corruption: Estimate,0.981132,0.990566,0.990566,0.990566,0.990566,0.990566,0.990566,0.990566,1.0,1.0
1,Government Effectiveness: Estimate,0.981132,0.990566,0.990566,0.990566,0.990566,0.990566,0.990566,0.990566,1.0,1.0
2,Political Stability and Absence of Violence/Te...,0.990566,0.990566,0.990566,0.990566,1.0,1.0,1.0,1.0,1.0,0.995283
3,Regulatory Quality: Estimate,0.981132,0.990566,0.990566,0.990566,0.990566,0.990566,0.990566,0.990566,1.0,1.0
4,Rule of Law: Estimate,0.981132,0.990566,0.990566,0.990566,0.990566,0.990566,0.990566,0.990566,1.0,1.0
5,Voice and Accountability: Estimate,0.957547,0.957547,0.957547,0.957547,0.971698,0.976415,0.976415,0.976415,0.976415,0.962264


In [13]:
# use 2023 since all data is >95% available

# INDICATORS:
# Corruption Control
# Rule of Law
# Political Stability
# Government Effectiveness
# Regulatory Quality
# Voice and Accountability

# indicator label -> prefix of the 2023 World Bank column it is taken from.
# Columns are matched by name, not by position: the pivot in transpose_world_bank_data
# orders the series alphabetically, so assigning the labels positionally would attach
# 'Rule of Law', 'Government Effectiveness' and 'Regulatory Quality' to the wrong series.
governance_series = {
    'Corruption Control': '2023_Control of Corruption',
    'Rule of Law': '2023_Rule of Law',
    'Political Stability': '2023_Political Stability',
    'Government Effectiveness': '2023_Government Effectiveness',
    'Regulatory Quality': '2023_Regulatory Quality',
    'Voice and Accountability': '2023_Voice and Accountability',
}
governance_rename = {'Country Name': 'Country'}
for label, prefix in governance_series.items():
    matches = [col for col in wb_gov.columns if col.startswith(prefix)]
    # fail loudly rather than silently picking a wrong or ambiguous series
    if len(matches) != 1:
        raise ValueError(f"Expected exactly one column starting with '{prefix}', found {matches}")
    governance_rename[matches[0]] = label
indicator_cols = list(governance_series)
# select & rename columns
governance = wb_gov[list(governance_rename)].rename(columns=governance_rename)
# merge
pol_data = merge(pol_data, governance, indicator_cols)

['Country' 'Corruption Control' 'Rule of Law' 'Political Stability'
 'Government Effectiveness' 'Regulatory Quality'
 'Voice and Accountability']


In [14]:
# INDICATOR: Democracy, not Autocracy

regime = pd.read_csv('data/V-Dem-Institute_ERT_democracy.csv')
# select most recent year; copy so the column assignment below writes to an
# independent frame rather than a filtered slice (avoids SettingWithCopyWarning)
regime = regime[regime['year'] == 2023].copy()
# standardise country names (exact matches only) and drop codes that could not be resolved
regime['country_name'] = my_cs.standardise_countries(regime['country_text_id'], fuzzy_threshold=100)
regime = regime[regime['country_name'] != my_cs.UNKNOWN_COUNTRY]
# select & rename columns
regime = regime[['country_name', 'reg_type']]
indicator_cols = ['Democracy, not Autocracy']
regime.columns = ['Country'] + indicator_cols
# merge
pol_data = merge(pol_data, regime, indicator_cols)

Country 'XKX' not found (best fuzzy match too low: Denmark (36))
Country 'PSG' not found (best fuzzy match too low: Cyprus (44))
Country 'SML' not found (best fuzzy match too low: Åland Islands (60))
Country 'ZZB' not found (best fuzzy match too low: Uzbekistan (60))
['Country' 'Corruption Control' 'Rule of Law' 'Political Stability'
 'Government Effectiveness' 'Regulatory Quality'
 'Voice and Accountability' 'Democracy, not Autocracy']


### Social Vulnerability


In [15]:
# start with comprehensive country list (single 'Country' column)
soc_data = pd.DataFrame(my_cs.all_countries(), columns=['Country'])

In [16]:
# load the first World Bank development export (poverty series) and reshape it to one
# row per country with '<year>_<Series Name>' columns for the years 2014-2023
wb_dev1 = pd.read_csv("data/Worldbank_development_1_poverty.csv")
wb_dev1 = transpose_world_bank_data(wb_dev1, range(2014, 2024))

# check availability of data for each indicator over the years
indicator_yearly_availability(wb_dev1)

year,indicatorName,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,Gini index,0.376744,0.390698,0.372093,0.35814,0.423256,0.35814,0.302326,0.330233,0.130233,0.018605
1,Income share held by highest 10%,0.376744,0.390698,0.372093,0.35814,0.423256,0.353488,0.302326,0.330233,0.130233,0.018605
2,Income share held by lowest 20%,0.376744,0.390698,0.372093,0.35814,0.423256,0.353488,0.302326,0.330233,0.130233,0.018605
3,Poverty headcount ratio at societal poverty li...,0.376744,0.390698,0.372093,0.35814,0.423256,0.35814,0.302326,0.330233,0.130233,0.018605


#### Year selection for Worldbank Development Indicators (dataset 1)

- 2021 (33%) <- Gini index
- 2021 (33%) <- Income share held by highest 10%
- 2021 (33%) <- Income share held by lowest 20%
- 2021 (33%) <- Poverty headcount ratio at societal poverty line (% of population)

In [17]:
# Poverty

# INDICATOR: Population percentage below societal poverty line

# source column (selected year + series) -> indicator label
poverty_labels = {
    '2021_Poverty headcount ratio at societal poverty line (% of population)': 'Population below Poverty Line (%)',
}
indicator_cols = list(poverty_labels.values())
poverty = (
    wb_dev1[['Country Name'] + list(poverty_labels)]
    .rename(columns={'Country Name': 'Country', **poverty_labels})
)
# attach to the social table
soc_data = merge(soc_data, poverty, indicator_cols)

['Country' 'Population below Poverty Line (%)']


In [18]:
# load the selected WHO Global Health Expenditure data and reshape it to one row per
# country with '<year>_<indicator>' columns for the years 2020-2023
who_ghe = pd.read_csv("data/WHO_GHE_data_selected.csv")
who_ghe = transpose_who_data(who_ghe, range(2020, 2024))

# check availability of data for each indicator over the years
indicator_yearly_availability(who_ghe)

Using special case for 'Democratic Republic of the Congo' -> 'Congo, Democratic Republic of the'
Using special case for 'Democratic Republic of the Congo' -> 'Congo, Democratic Republic of the'
Using special case for 'Democratic Republic of the Congo' -> 'Congo, Democratic Republic of the'
Using special case for 'Democratic Republic of the Congo' -> 'Congo, Democratic Republic of the'
Using special case for 'Democratic Republic of the Congo' -> 'Congo, Democratic Republic of the'
Using fuzzy match for 'Bolivia (Plurinational State of)' -> 'Bolivia, Plurinational State of'
Using fuzzy match for 'Bolivia (Plurinational State of)' -> 'Bolivia, Plurinational State of'
Using fuzzy match for 'Bolivia (Plurinational State of)' -> 'Bolivia, Plurinational State of'
Using fuzzy match for 'Bolivia (Plurinational State of)' -> 'Bolivia, Plurinational State of'
Using fuzzy match for 'Venezuela (Bolivarian Republic of)' -> 'Venezuela, Bolivarian Republic of'
Using fuzzy match for 'Venezuela (Bolivar

year,indicatorName,2020,2021,2022,2023
0,Current Health Expenditure (CHE),0.994845,0.989691,0.984536,0.103093
1,Current Health Expenditure (CHE) as % Gross Do...,1.0,1.0,0.994845,0.103093
2,Current Health Expenditure (CHE) per Capita in...,1.0,0.994845,0.989691,0.103093
3,Expenditure on COVID-19 per Capita in US$,0.309278,0.298969,0.175258,0.025773
4,Population (in thousands),0.989691,0.989691,0.989691,0.103093
5,Rest of the World (RoW) as % of Current Health...,0.278351,0.242268,0.21134,0.041237


#### Year selection for WHO Global Health Expenditure Indicators

- 2022 (98%) <- Current Health Expenditure (CHE)
- 2022 (99%) <- Current Health Expenditure (CHE) as % Gross Domestic Product (GDP)
- 2022 (99%) <- Current Health Expenditure (CHE) per Capita in US$
- 2021 (30%) <- Expenditure on COVID-19 per Capita in US$
- 2022 (21%) <- Rest of the World (RoW) as % of Current Health Expenditure (CHE)


In [19]:
# Health System Strength

# INDICATORS:
# Current Health Expenditure (CHE) per Capita in US$
# Rest of the World (RoW) as % of Current Health Expenditure (CHE)

# source column (selected year + indicator) -> indicator label
health_labels = {
    '2022_Current Health Expenditure (CHE) per Capita in US$': 'Health Expenditure per Capita (US$)',
    '2022_Rest of the World (RoW) as % of Current Health Expenditure (CHE)': 'RoW Health Expenditure (%)',
}
indicator_cols = list(health_labels.values())
health = (
    who_ghe[['Countries'] + list(health_labels)]
    .rename(columns={'Countries': 'Country', **health_labels})
)
# attach to the social table
soc_data = merge(soc_data, health, indicator_cols)

['Country' 'Population below Poverty Line (%)'
 'Health Expenditure per Capita (US$)' 'RoW Health Expenditure (%)']


In [20]:
# yearly columns from 2024 back to 2010, followed by the 2005 and 2000 columns
edu_years = list(range(2024, 2009, -1)) + [2005, 2000]
wb_edu = transpose_world_bank_data(pd.read_csv("data/Worldbank_education.csv"), edu_years)

# check availability of data for each indicator over the years
indicator_yearly_availability(wb_edu)

year,indicatorName,2000,2005,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
0,"Adult literacy rate, population 15+ years, bot...",0.20362,0.099548,0.239819,0.266968,0.208145,0.153846,0.239819,0.190045,0.162896,0.149321,0.357466,0.004525,0.0,0.0,0.0,0.0,0.0
1,Barro-Lee: Percentage of population age 15+ wi...,0.651584,0.651584,0.651584,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Completion rate, lower secondary education, bo...",0.253394,0.280543,0.190045,0.303167,0.158371,0.285068,0.330317,0.108597,0.135747,0.104072,0.140271,0.013575,0.0,0.0,0.0,0.0,0.0
3,"Government expenditure on education, US$ (mill...",0.542986,0.497738,0.556561,0.533937,0.497738,0.542986,0.520362,0.488688,0.493213,0.520362,0.280543,0.085973,0.0,0.0,0.0,0.0,0.0


#### Year selection for Worldbank Education Indicators

- 2018 (35%) <- "Adult literacy rate, population 15+ years, both sexes (%)"
- 2010 (65%) <- Barro-Lee: Percentage of population age 15+ with no education
- 2014 (33%) <- "Completion rate, lower secondary education, both sexes (%)"
- 2017 (52%) <- "Government expenditure on education, US$ (millions)"

Older years are acceptable as education undergoes slower changes and has long-term effects.


In [21]:
# Access to Education

# INDICATORS:
# Population age 15+ literacy rate
# Population age 15+ with no education
# Government expenditure on education
# Lower secondary school completion rate

# source column (selected year + series) -> indicator label
education_labels = {
    '2018_Adult literacy rate, population 15+ years, both sexes (%)': 'Literacy Rate (%)',
    '2010_Barro-Lee: Percentage of population age 15+ with no education': 'No Education (%)',
    '2014_Completion rate, lower secondary education, both sexes (%)': 'Lower Secondary Completion Rate (%)',
    '2017_Government expenditure on education, US$ (millions)': 'Education Expenditure (US$M)',
}
indicator_cols = list(education_labels.values())
education = (
    wb_edu[['Country Name'] + list(education_labels)]
    .rename(columns={'Country Name': 'Country', **education_labels})
)
# attach to the social table
soc_data = merge(soc_data, education, indicator_cols)

['Country' 'Population below Poverty Line (%)'
 'Health Expenditure per Capita (US$)' 'RoW Health Expenditure (%)'
 'Literacy Rate (%)' 'No Education (%)'
 'Lower Secondary Completion Rate (%)' 'Education Expenditure (US$M)']


In [22]:
# INDICATOR: Crime Rate

indicator_cols = ['Crime Rate']

crime = pd.read_csv("data/global_organised_crime_index_2023.csv")
# map the source's country names onto the standard country names
crime['Country'] = my_cs.standardise_countries(crime['Country'], fuzzy_threshold=79)
# keep only the 'Criminality' score and give it the indicator label
crime = crime[['Country', 'Criminality']].rename(columns={'Criminality': indicator_cols[0]})
# attach to the social table
soc_data = merge(soc_data, crime, indicator_cols)

Using special case for 'Turkey' -> 'Türkiye'
Using special case for 'Korea, DPR' -> 'Korea, Democratic People's Republic of'
Using fuzzy match for 'Congo, Rep.' -> 'Congo'
Using fuzzy match for 'Russia' -> 'Russian Federation'
Using fuzzy match for 'Korea, Rep.' -> 'Korea, Republic of'
Using fuzzy match for 'Brunei' -> 'Brunei Darussalam'
Using special case for 'Congo, Dem. Rep.' -> 'Congo, Democratic Republic of the'
Using fuzzy match for 'St. Kitts and Nevis' -> 'Saint Kitts and Nevis'
Using fuzzy match for 'Micronesia (Federated States of)' -> 'Micronesia, Federated States of'
Using fuzzy match for 'St. Vincent and the Grenadines' -> 'Saint Vincent and the Grenadines'
Using fuzzy match for 'St. Lucia' -> 'Saint Lucia'
['Country' 'Population below Poverty Line (%)'
 'Health Expenditure per Capita (US$)' 'RoW Health Expenditure (%)'
 'Literacy Rate (%)' 'No Education (%)'
 'Lower Secondary Completion Rate (%)' 'Education Expenditure (US$M)'
 'Crime Rate']


### Economic Instability & Infrastructure


In [23]:
# start with comprehensive country list (single 'Country' column)
eco_data = pd.DataFrame(my_cs.all_countries(), columns=['Country'])

In [24]:
# Debt & Economic Resilience

# INDICATORS:
# General government debt as % of GDP
# Total reserves (% of total external debt)

# --- government debt (IMF) ---
# 'no data' cells become NaN; the column headed
# 'General Government Debt (Percent of GDP)' holds the country names
debt = (
    pd.read_csv("data/IMF_gov_debt.csv")
    .replace('no data', np.nan)
    .rename(columns={'General Government Debt (Percent of GDP)': 'Country'})
)
# standardise country names and discard rows that could not be resolved
debt['Country'] = my_cs.standardise_countries(debt['Country'], fuzzy_threshold=80)
debt = debt[debt['Country'] != my_cs.UNKNOWN_COUNTRY]
# every remaining column is coerced to numbers
num_cols = debt.drop(columns=['Country']).apply(pd.to_numeric, errors='coerce')
debt = pd.concat([debt['Country'], num_cols], axis=1)
# take the 2023 column as the indicator
indicator_cols = ['Government Debt (% of GDP)']
debt_latest = debt[['Country', '2023']].rename(columns={'2023': indicator_cols[0]})
eco_data = merge(eco_data, debt_latest, indicator_cols)

# --- total reserves (World Bank) ---
indicator_cols = ['Total Reserves (% of External Debt)']
reserves = (
    wb_dev2[['Country Name', '2021_Total reserves (% of total external debt)']]
    .rename(columns={
        'Country Name': 'Country',
        '2021_Total reserves (% of total external debt)': indicator_cols[0],
    })
)
eco_data = merge(eco_data, reserves, indicator_cols)

Using fuzzy match for 'China, People's Republic of' -> 'China'
Using special case for 'Congo, Dem. Rep. of the' -> 'Congo, Democratic Republic of the'
Country 'Kosovo' not found (best fuzzy match too low: Comoros (46))
Using fuzzy match for 'Micronesia, Fed. States of' -> 'Micronesia, Federated States of'
Using fuzzy match for 'North Macedonia ' -> 'North Macedonia'
Using fuzzy match for 'Taiwan Province of China' -> 'Taiwan, Province of China'
Using fuzzy match for 'Türkiye, Republic of' -> 'Türkiye'
['Country' 'Government Debt (% of GDP)']
['Country' 'Government Debt (% of GDP)'
 'Total Reserves (% of External Debt)']


In [25]:
# Income Inequality

# INDICATORS:
# Gini index
# Income share held by highest 10%

# source column (selected year + series) -> indicator label
income_labels = {
    '2021_Gini index': 'Gini Coefficient',
    '2021_Income share held by highest 10%': 'Income Share Top 10%',
}
indicator_cols = list(income_labels.values())
income = (
    wb_dev1[['Country Name'] + list(income_labels)]
    .rename(columns={'Country Name': 'Country', **income_labels})
)
# attach to the economic table
eco_data = merge(eco_data, income, indicator_cols)

['Country' 'Government Debt (% of GDP)'
 'Total Reserves (% of External Debt)' 'Gini Coefficient'
 'Income Share Top 10%']


In [26]:
# INDICATOR: Inflation, Consumer Prices

# source column (selected year + series) -> indicator label
inflation_labels = {
    '2023_Inflation, consumer prices (annual %)': 'Inflation (%)',
}
indicator_cols = list(inflation_labels.values())
inflation = (
    wb_dev2[['Country Name'] + list(inflation_labels)]
    .rename(columns={'Country Name': 'Country', **inflation_labels})
)
# attach to the economic table
eco_data = merge(eco_data, inflation, indicator_cols)

['Country' 'Government Debt (% of GDP)'
 'Total Reserves (% of External Debt)' 'Gini Coefficient'
 'Income Share Top 10%' 'Inflation (%)']


In [27]:
# Unemployment

# INDICATORS:
# Unemployment, total (% of total labor force)
# Unemployment, youth (% ages 15-24)

# source column (selected year + series) -> indicator label
unemployment_labels = {
    '2023_Unemployment, total (% of total labor force) (modeled ILO estimate)': 'Unemployment (%)',
    '2023_Unemployment, youth total (% of total labor force ages 15-24) (modeled ILO estimate)': 'Youth Unemployment (%)',
}
indicator_cols = list(unemployment_labels.values())
unemployment = (
    wb_dev2[['Country Name'] + list(unemployment_labels)]
    .rename(columns={'Country Name': 'Country', **unemployment_labels})
)
# attach to the economic table
eco_data = merge(eco_data, unemployment, indicator_cols)

['Country' 'Government Debt (% of GDP)'
 'Total Reserves (% of External Debt)' 'Gini Coefficient'
 'Income Share Top 10%' 'Inflation (%)' 'Unemployment (%)'
 'Youth Unemployment (%)']


In [28]:
# Energy Security & Infrastructure

# INDICATORS:
# Access to electricity (% of population)
# Renewable energy consumption (% of total final energy consumption)

# source column (selected year + series) -> indicator label
energy_labels = {
    '2022_Access to electricity (% of population)': 'Electricity Access (%)',
    '2021_Renewable energy consumption (% of total final energy consumption)': 'Renewable Energy Consumption (%)',
}
indicator_cols = list(energy_labels.values())
energy = (
    wb_dev2[['Country Name'] + list(energy_labels)]
    .rename(columns={'Country Name': 'Country', **energy_labels})
)
# attach to the economic table
eco_data = merge(eco_data, energy, indicator_cols)

['Country' 'Government Debt (% of GDP)'
 'Total Reserves (% of External Debt)' 'Gini Coefficient'
 'Income Share Top 10%' 'Inflation (%)' 'Unemployment (%)'
 'Youth Unemployment (%)' 'Electricity Access (%)'
 'Renewable Energy Consumption (%)']


In [29]:
# Internet Access & Security

# INDICATORS:
# Individuals using the Internet (% of population)
# Secure Internet servers (per 1 million people)

# source column (selected year + series) -> indicator label
internet_labels = {
    '2022_Individuals using the Internet (% of population)': 'Internet Users (%)',
    '2023_Secure Internet servers (per 1 million people)': 'Secure Internet Servers',
}
indicator_cols = list(internet_labels.values())
internet = (
    wb_dev2[['Country Name'] + list(internet_labels)]
    .rename(columns={'Country Name': 'Country', **internet_labels})
)
# attach to the economic table
eco_data = merge(eco_data, internet, indicator_cols)

['Country' 'Government Debt (% of GDP)'
 'Total Reserves (% of External Debt)' 'Gini Coefficient'
 'Income Share Top 10%' 'Inflation (%)' 'Unemployment (%)'
 'Youth Unemployment (%)' 'Electricity Access (%)'
 'Renewable Energy Consumption (%)' 'Internet Users (%)'
 'Secure Internet Servers']


### Global & Regional Threats


In [30]:
# start with comprehensive country list (single 'Country' column)
threat_data = pd.DataFrame(my_cs.all_countries(), columns=['Country'])

In [31]:
# Geopolitical Tensions

# INDICATORS:
# Projected Conflict Probability
# Current Conflict Intensity

indicator_cols = ['Future Conflict Risk', 'Current Conflict Intensity']

conflict = pd.read_csv("data/Conflict_Risk_and_Intensity_2024.csv")
# map the source's country names onto the standard country names
conflict['Country'] = my_cs.standardise_countries(conflict['Country'], fuzzy_threshold=80)
# relabel by position: first column -> 'Country', remaining columns -> indicator names
# NOTE(review): relies on the CSV's column order matching indicator_cols — confirm against the file
conflict = conflict.set_axis(['Country'] + indicator_cols, axis=1)
# attach to the threat table
threat_data = merge(threat_data, conflict, indicator_cols)

Using special case for 'Congo DR' -> 'Congo, Democratic Republic of the'
Using special case for 'Korea DPR' -> 'Korea, Democratic People's Republic of'
Using fuzzy match for 'Korea Republic of' -> 'Korea, Republic of'
Using fuzzy match for 'Lao PDR' -> 'Lao People's Democratic Republic'
Using fuzzy match for 'Micronesia' -> 'Micronesia, Federated States of'
Using fuzzy match for 'Moldova Republic of' -> 'Moldova, Republic of'
Using fuzzy match for 'Palestine' -> 'Palestine, State of'
['Country' 'Future Conflict Risk' 'Current Conflict Intensity']


In [32]:
# Pandemic Preparedness

# INDICATORS:
# Expenditure on COVID-19 per Capita in US$

# source column (selected year + indicator) -> indicator label
disease_labels = {
    '2021_Expenditure on COVID-19 per Capita in US$': 'COVID-19 Expenditure per Capita (US$)',
}
indicator_cols = list(disease_labels.values())
disease = (
    who_ghe[['Countries'] + list(disease_labels)]
    .rename(columns={'Countries': 'Country', **disease_labels})
)
# attach to the threat table
threat_data = merge(threat_data, disease, indicator_cols)

['Country' 'Future Conflict Risk' 'Current Conflict Intensity'
 'COVID-19 Expenditure per Capita (US$)']


In [33]:
# Disease Burden

# INDICATORS:
# Prevalence of infectious diseases

indicator_cols = [
    'Prevalence HIV/AIDS',
    'Prevalence TB/Respiratory',
    'Prevalence Malaria/Tropical',
    'Prevalence Other Infectious',
]

disease = pd.read_csv("data/IHME_GBD_infectious_diseases_export_2021.csv")
# map the source's location names onto the standard country names
disease['Location'] = my_cs.standardise_countries(disease['Location'], fuzzy_threshold=79)
# relabel by position: first column -> 'Country', remaining columns -> indicator names
# NOTE(review): relies on the CSV's column order matching indicator_cols — confirm against the file
disease = disease.set_axis(['Country'] + indicator_cols, axis=1)
# attach to the threat table
threat_data = merge(threat_data, disease, indicator_cols)

Using fuzzy match for 'Cape Verde' -> 'Cabo Verde'
Using fuzzy match for 'Cote d'Ivoire' -> 'Côte d'Ivoire'
Using special case for 'Democratic Republic of Congo' -> 'Congo, Democratic Republic of the'
Using fuzzy match for 'Iran (Islamic Republic of)' -> 'Iran, Islamic Republic of'
Using fuzzy match for 'Libyan Arab Jamahiriya' -> 'Libya'
Using fuzzy match for 'Republic of Congo' -> 'Congo'
Using fuzzy match for 'Republic of Korea' -> 'Korea, Republic of'
Using special case for 'Turkey' -> 'Türkiye'
['Country' 'Future Conflict Risk' 'Current Conflict Intensity'
 'COVID-19 Expenditure per Capita (US$)' 'Prevalence HIV/AIDS'
 'Prevalence TB/Respiratory' 'Prevalence Malaria/Tropical'
 'Prevalence Other Infectious']


In [34]:
# env_data
# pol_data
# soc_data
# eco_data
# threat_data

In [42]:
# save preprocessed data
# category key -> its indicator table (the order fixes both the file and the JSON order)
category_frames = {
    'env': env_data,
    'pol': pol_data,
    'soc': soc_data,
    'eco': eco_data,
    'threat': threat_data,
}

# outer-join the five category tables on 'Country', starting from the environmental one
all_data = env_data
for _frame in (pol_data, soc_data, eco_data, threat_data):
    all_data = all_data.merge(_frame, on='Country', how='outer')

all_data.to_csv('data/processing/preprocessed_all_data.csv', index=False)
for _name, _frame in category_frames.items():
    _frame.to_csv(f'data/processing/preprocessed_{_name}_data.csv', index=False)

# store the category->indicator mappings (every column except 'Country')
category_columns = {
    _name: _frame.drop(columns=['Country']).columns.tolist()
    for _name, _frame in category_frames.items()
}
# save to JSON file
with open('data/processing/category_indicator_map.json', 'w') as json_file:
    json.dump(category_columns, json_file, indent=4)

# save intermediate data (the reshaped source tables)
intermediate_frames = [
    ('wb_dev1', wb_dev1),
    ('wb_dev2', wb_dev2),
    ('wb_gov', wb_gov),
    ('who_ghe', who_ghe),
    ('wb_edu', wb_edu),
]
for _name, _frame in intermediate_frames:
    _frame.to_csv(f'data/processing/preprocessed_{_name}.csv', index=False)