# Global Societal Endangerment Index (GSEI)

Development notebook

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from fuzzywuzzy import process
import matplotlib.pyplot as plt
import pycountry

# Data Loading and Preprocessing
Bring datasets into the right format and perform initial transformations to form the base indicators.

Total indicator count to load: 9 + 7 + 8 + 11 + 4 = 39

In [82]:
"""
Country standardisation functions
"""

def all_countries():
    return [c.name for c in pycountry.countries]

def lookup_country(country_name: str, fuzzy_threshold=80):
    try:
        return pycountry.countries.lookup(country_name).name
    except LookupError: pass
    
    countries = all_countries()
    c_match = [c.casefold() for c in countries]
    try:
        if country_name.casefold() in c_match:
            return countries[c_match.index(country_name.casefold())]
    except Exception: pass
    
    # special cases
    if country_name == 'Canary Islands' or country_name == 'SPI':
        print(f"Using special case for '{country_name}' -> 'Spain'")
        return 'Spain'
    elif country_name == 'Turkey':
        print(f"Using special case for '{country_name}' -> 'Türkiye'")
        return 'Türkiye'
    if country_name == 'Channel Islands' or country_name == 'CHI':
        print(f"Using special case for '{country_name}' -> 'United Kingdom'")
        return 'United Kingdom'
    if country_name == 'Kosovo' or country_name == 'XKX':
        print(f"Using special case for '{country_name}' -> 'Serbia'")
        return 'Serbia'
    
    # fuzzy match
    try:
        best_match, score = process.extractOne(country_name, countries)
        if score > fuzzy_threshold:
            print(f"Using fuzzy match for '{country_name}' -> '{best_match}'")
            return best_match
        else:
            raise LookupError(f"Country '{country_name}' not found (best fuzzy match too low: {best_match} ({score}))")
    except LookupError as e:
        print(e)
        return 'aaa.Unknown'

def standardise_countries(country_col: pd.Series, fuzzy_threshold=80):
    return country_col.map(lambda c: lookup_country(c, fuzzy_threshold=fuzzy_threshold))

In [79]:
def transpose_world_bank_data(wb_df, year_range):
    """
    Transpose World Bank data from long to wide format.
    """
    # clean up rows
    codes_to_remove = ['CHI','XKX', 
    'AFE','AFW','ARB','CSS','CEB','EAR','EAS','EAP','TEA','EMU','ECS','ECA','TEC','EUU','FCS','HPC','HIC','IBD','IBT','IDB','IDX','IDA','LTE','LCN','LAC','TLA','LDC','LMY','LIC','LMC','MEA','MNA','TMN','MIC','NAC','INX','OED','OSS','PSS','PST','PRE','SST','SAS','TSA','SSF','SSA','TSS','UMC','WLD']
    wb_df = wb_df[~wb_df['Country Code'].isin(codes_to_remove)]
    wb_df = wb_df[~wb_df['Country Name'].str.contains("Data from", na=False)]
    wb_df = wb_df[~wb_df['Country Name'].str.contains("Last Updated", na=False)]
    wb_df = wb_df.dropna(how='all', axis=0)  # drop completely empty rows
    wb_df = wb_df.replace('..', np.nan)  # replace missing values with NaN
    # clean up columns
    wb_df['Country Name'] = standardise_countries(wb_df['Country Code'], fuzzy_threshold=100)
    wb_df = wb_df.drop(columns=['Country Code', 'Series Code'])
    wb_df.columns = [col.split(" [")[0] if "[YR" in col else col for col in wb_df.columns]
    # Pivot to wide format
    wb_df = wb_df.pivot(index='Country Name', columns='Series Name', values=[str(year) for year in year_range])
    wb_df.columns = ['_'.join(col).strip() for col in wb_df.columns.values]
    wb_df = wb_df.reset_index()
    return wb_df

def indicator_yearly_availability(processed_wb_df):
    """
    Calculate the availability of data for each indicator and year.
    """
    # Select indicator columns
    indicator_cols = processed_wb_df.columns[1:]
    # Calculate availability (non-null percentage)
    availability = processed_wb_df[indicator_cols].notnull().mean()
    # Reshape to DataFrame
    availability_df = availability.to_frame(name='value').reset_index()
    availability_df.rename(columns={'index': 'indicator'}, inplace=True)
    # Split indicator column
    availability_df[['year', 'indicatorName']] = availability_df['indicator'].str.split('_', n=1, expand=True)
    # Drop original indicator column
    availability_df.drop('indicator', axis=1, inplace=True)
    # Pivot to desired format
    return availability_df.pivot(index='indicatorName', columns='year', values='value').reset_index()

### Environmental Risks


In [4]:
# start with comprehensive country list
env_data = pd.DataFrame(all_countries(), columns=['Country'])

env_data

Unnamed: 0,Country
0,Aruba
1,Afghanistan
2,Angola
3,Anguilla
4,Åland Islands
...,...
244,Samoa
245,Yemen
246,South Africa
247,Zambia


In [5]:
# Climate Change Vulnerability

# Maximum relative temperature change (°C) in the last 10 years, compared to a 1951-1980 baseline
temp_change = pd.read_csv("data/UN_FAO_climate_change_indicators.csv")
# drop row with Country 'world' in temp_change
temp_change = temp_change[temp_change['Country'] != 'World']
# standardise country names
temp_change['Country'] = standardise_countries(temp_change['ISO3'])
# INDICATOR: calculate maximum value between F2012 and F2022
temp_change['Temp Change max 2012-2022'] = temp_change[['F2012', 'F2013', 'F2014', 'F2015', 'F2016', 'F2017', 'F2018', 'F2019', 'F2020', 'F2021', 'F2022']].max(axis=1)
# merge
if 'Temp Change max 2012-2022' not in env_data.columns:
    env_data = env_data.merge(temp_change[['Country', 'Temp Change max 2012-2022']], on='Country', how='left')

print(env_data.columns.values)

['Country' 'Temp Change max 2012-2022']


In [6]:
# Exposure to Natural Disasters (earthquakes, floods, hurricanes)

disasters = pd.read_excel("data/EM-DAT_natural_disasters_ALL_2020-2025.xlsx")
# standardise country names
disasters['Country'] = standardise_countries(disasters['ISO'])
# group by country, summing up INDICATORS: affected people + total damage
disasters_grouped = disasters.groupby('Country')[['No. Affected', 'Total Damage, Adjusted (\'000 US$)']].sum().reset_index()
disasters_grouped['Total Damage, Adjusted (\'000 US$)'] = disasters_grouped['Total Damage, Adjusted (\'000 US$)'] * 1000
# rename columns
disasters_grouped.columns = ['Country', 'Disaster Affected Population 2020-2025', 'Disaster Damage US$ 2020-2025']
# merge
if 'Disaster Affected Population 2020-2025' not in env_data.columns:
    env_data = env_data.merge(disasters_grouped, on='Country', how='left')

print(env_data.columns.values)

Using special case for 'SPI' -> 'Spain'
Using special case for 'SPI' -> 'Spain'
['Country' 'Temp Change max 2012-2022'
 'Disaster Affected Population 2020-2025' 'Disaster Damage US$ 2020-2025']


In [7]:
# Air and Water Pollution Levels

# Years of lost life due to unsafe water, sanitation, and handwashing
# Years of lost life due to air pollution

air_water = pd.read_csv("data/IHME_GBD_environmental_risk_export_2021.csv")
# standardise country names
air_water['Location'] = standardise_countries(air_water['Location'], fuzzy_threshold=79)
# rename INDICATORS: Years of lost life due to unsafe water, sanitation, and handwashing + Years of lost life due to air pollution
air_water.columns = ['Country', 'Unsafe water, sanitation (YLL)', 'Air pollution (YLL)']
# merge
if 'Unsafe water, sanitation (YLL)' not in env_data.columns:
    env_data = env_data.merge(air_water[['Country', 'Unsafe water, sanitation (YLL)', 'Air pollution (YLL)']], on='Country', how='left')

print(env_data.columns.values)

Using fuzzy match for 'Cape Verde' -> 'Cabo Verde'
Using fuzzy match for 'Cote d'Ivoire' -> 'Côte d'Ivoire'
Using fuzzy match for 'Democratic Republic of Congo' -> 'Congo, The Democratic Republic of the'
Using fuzzy match for 'Iran (Islamic Republic of)' -> 'Iran, Islamic Republic of'
Using fuzzy match for 'Libyan Arab Jamahiriya' -> 'Libya'
Using fuzzy match for 'Republic of Congo' -> 'Congo'
Using fuzzy match for 'Republic of Korea' -> 'Korea, Republic of'
Using special case for 'Turkey' -> 'Türkiye'
['Country' 'Temp Change max 2012-2022'
 'Disaster Affected Population 2020-2025' 'Disaster Damage US$ 2020-2025'
 'Unsafe water, sanitation (YLL)' 'Air pollution (YLL)']


In [86]:
wb_dev2 = pd.read_csv("data/Worldbank_development_2.csv")
wb_dev2 = transpose_world_bank_data(wb_dev2, range(2014, 2024))

wb_dev2

Unnamed: 0,Country Name,2014_Access to electricity (% of population),"2014_External debt stocks, total (DOD, current US$)",2014_Individuals using the Internet (% of population),"2014_Inflation, consumer prices (annual %)",2014_People using safely managed drinking water services (% of population),2014_Prevalence of moderate or severe food insecurity in the population (%),2014_Prevalence of undernourishment (% of population),2014_Renewable energy consumption (% of total final energy consumption),2014_Renewable internal freshwater resources per capita (cubic meters),...,"2023_Inflation, consumer prices (annual %)",2023_People using safely managed drinking water services (% of population),2023_Prevalence of moderate or severe food insecurity in the population (%),2023_Prevalence of undernourishment (% of population),2023_Renewable energy consumption (% of total final energy consumption),2023_Renewable internal freshwater resources per capita (cubic meters),2023_Secure Internet servers (per 1 million people),2023_Total reserves (% of total external debt),"2023_Unemployment, total (% of total labor force) (modeled ILO estimate)","2023_Unemployment, youth total (% of total labor force ages 15-24) (modeled ILO estimate)"
0,Afghanistan,89.5,2529865267.8,7,4.67399603536305,22.9443007139289,,19.3,19.1,1437.82776335935,...,,,,,,,43.758544404586,,13.991,17.291
1,Albania,100,8512452310,54.3,1.6258650440261,70.5632610188294,,4.5,38.6,9310.84516168334,...,4.75976421930107,,,,,,1220.33290943972,56.806329270167,10.108,24.817
2,Algeria,99.3,5521188948.6,29.5,2.91692692067458,76.6182319353339,,2.7,0.1,286.876447055991,...,9.32217375928322,,,,,,100.294126063305,1110.23630387233,11.701,30.447
3,American Samoa,,,,,89.165529706952,,,0.2,,...,,,,,,,462.953220681383,,,
4,Andorra,100,,86.1,,90.6400003669046,,,19.4,4280.07648805891,...,,,,,,,14618.5811813595,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,"Virgin Islands, British",99.2,,,,,,,1,,...,,,,,,,1982403.48852123,,,
211,"Virgin Islands, U.S.",100,,50.07,,97.9360043349244,,,4.2,,...,,,,,,,305.003002373305,,12.32,25.824
212,Yemen,66.1,7723142216.8,22.55,8.10472583623915,,,32.4,0.8,69.4758992902508,...,,,,,,,6.95593912679964,,17.091,32.395
213,Zambia,27.9,9675902727,6.5,7.80687553566333,,,36,84.6,5045.51183792205,...,10.884531691282,,,,,,54.9605251697733,,5.905,9.752


In [84]:
# check availability of data for each indicator over the years
indicator_yearly_availability(wb_dev2)

year,indicatorName,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,Access to electricity (% of population),0.995349,0.995349,0.995349,0.995349,0.995349,0.995349,0.995349,0.995349,0.995349,0.0
1,"External debt stocks, total (DOD, current US$)",0.544186,0.553488,0.553488,0.553488,0.553488,0.553488,0.553488,0.553488,0.553488,0.553488
2,Individuals using the Internet (% of population),0.934884,0.930233,0.944186,0.953488,0.813953,0.865116,0.893023,0.883721,0.851163,0.274419
3,"Inflation, consumer prices (annual %)",0.874419,0.869767,0.869767,0.846512,0.832558,0.832558,0.804651,0.804651,0.8,0.75814
4,People using safely managed drinking water ser...,0.637209,0.637209,0.64186,0.64186,0.637209,0.637209,0.637209,0.627907,0.613953,0.0
5,Prevalence of moderate or severe food insecuri...,0.0,0.455814,0.460465,0.539535,0.572093,0.627907,0.683721,0.706977,0.702326,0.0
6,Prevalence of undernourishment (% of population),0.795349,0.795349,0.795349,0.795349,0.795349,0.790698,0.795349,0.795349,0.795349,0.0
7,Renewable energy consumption (% of total final...,0.986047,0.986047,0.986047,0.986047,0.986047,0.986047,0.986047,0.986047,0.330233,0.0
8,Renewable internal freshwater resources per ca...,0.851163,0.851163,0.851163,0.851163,0.851163,0.851163,0.851163,0.851163,0.0,0.0
9,Secure Internet servers (per 1 million people),0.976744,0.986047,0.995349,0.995349,0.990698,0.986047,0.995349,1.0,1.0,1.0


#### Year selection for Worldbank Development Indicators (dataset 2)

Based on the availability of data for the Worldbank Development Indicators, one year is selected for each indicator that balances recency and data availability. The missing data will be handled in the imputation step. The selected years are:

- 2022 (99%) <- Access to electricity (% of population)
- 2023 (55%) <- "External debt stocks, total (DOD, current US$)"
- 2022 (85%) <- Individuals using the Internet (% of population)
- 2023 (75%) <- "Inflation, consumer prices (annual %)"
- 2022 (61%) <- People using safely managed drinking water services (% of population)
- 2022 (70%) <- Prevalence of moderate or severe food insecurity in the population (%)
- 2022 (80%) <- Prevalence of undernourishment (% of population)
- 2021 (98%) <- Renewable energy consumption (% of total final energy consumption)
- 2021 (85%) <- Renewable internal freshwater resources per capita (cubic meters)
- 2023 (100%) <- Secure Internet servers (per 1 million people)
- 2021 (44%) <- Total reserves (% of total external debt)
- 2023 (85%) <- "Unemployment, total (% of total labor force) (modeled ILO estimate)"
- 2023 (85%) <- "Unemployment, youth total (% of total labor force ages 15-24) (modeled ILO estimate)"


In [104]:
# Water Scarcity & Food Security

water_food = wb_dev2[['Country Name', '2022_People using safely managed drinking water services (% of population)', '2022_Prevalence of moderate or severe food insecurity in the population (%)', '2022_Prevalence of undernourishment (% of population)', '2021_Renewable internal freshwater resources per capita (cubic meters)']]
water_food.columns = ['Country', 'Safe Drinking Water (%)', 'Food Insecurity (%)', 'Undernourishment (%)', 'Renewable Freshwater per Capita (m3)']
numeric_cols = water_food.drop('Country', axis=1)
numeric_cols = numeric_cols.apply(pd.to_numeric, errors='coerce')
water_food = pd.concat([water_food['Country'], numeric_cols], axis=1)
water_food

Unnamed: 0,Country,Safe Drinking Water (%),Food Insecurity (%),Undernourishment (%),Renewable Freshwater per Capita (m3)
0,Afghanistan,30.034098,80.9,30.4,1178.737859
1,Albania,70.736068,32.2,4.5,9567.281462
2,Algeria,70.597934,18.9,2.5,251.267289
3,American Samoa,,,,
4,Andorra,90.640001,,,4027.359502
...,...,...,...,...,...
210,"Virgin Islands, British",,,,
211,"Virgin Islands, U.S.",,,,
212,Yemen,,72.5,39.5,56.542461
213,Zambia,,,35.4,4091.083850


In [100]:
# merge
if 'Safe Drinking Water (%)' in env_data.columns:
    env_data = env_data.drop(columns=water_food.columns)
env_data = env_data.merge(water_food, on='Country', how='left')

print(env_data.columns.values)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  water_food[numeric_cols.columns] = numeric_cols.astype('float')


KeyError: 'Country'

In [97]:
env_data['Safe Drinking Water (%)']

0                   NaN
1      30.0340981310563
2                   NaN
3                   NaN
4                   NaN
             ...       
244    62.1917137502586
245                 NaN
246                 NaN
247                 NaN
248    26.5164278963248
Name: Safe Drinking Water (%), Length: 249, dtype: object

In [90]:
env_data

Unnamed: 0,Country,Temp Change max 2012-2022,Disaster Affected Population 2020-2025,Disaster Damage US$ 2020-2025,"Unsafe water, sanitation (YLL)",Air pollution (YLL),Safe Drinking Water (%),Food Insecurity (%),Undernourishment (%),Renewable Freshwater per Capita (m3)
0,Aruba,1.303,,,,,,,,
1,Afghanistan,2.012,13016058.0,0.000000e+00,1422.29,4283.98,30.0340981310563,80.9,30.4,1178.73785900005
2,Angola,1.752,4544311.0,0.000000e+00,1710.48,2429.79,,79.2,23.2,4285.82651976205
3,Anguilla,1.224,,,,,,,,
4,Åland Islands,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...
244,Samoa,1.440,,,109.99,3146.24,62.1917137502586,23.6,5.4,
245,Yemen,,1885344.0,2.354600e+07,471.74,2859.66,,72.5,39.5,56.5424608302103
246,South Africa,1.811,12286813.0,3.910613e+09,1360.48,1806.03,,19.4,8.1,728.424453839783
247,Zambia,1.450,12283962.0,0.000000e+00,1903.05,3103.66,,,35.4,4091.08385002821


### Political Instability & Governance


In [None]:
# Governance Quality


In [None]:
# Regime Type


### Social Vulnerability


### Economic Instability & Infrastructure


### Global & Regional Threats


In [None]:
env_data

In [None]:
### Heatmap Correlation Matrix

# Create dummy data (replace with your actual data)
np.random.seed(42)
data = np.random.rand(100, 24)  # 100 rows, 24 columns
columns = ['Unemployment', 'Oil exports', 'Repression', 'Empowerment rights', 'Democracy', 'Lack of democracy',
           'State capacity', 'Corruption', 'Ethnic exclusion', 'Income inequality', 'Youth bulge',
           'Food security', 'Child mortality', 'GDP per capita, log', 'Structural constraints',
           'Neighboring conflict', 'Trade openness', 'Recent internal conflict', 'Years since last conflict',
           'Homicide rate', 'Temperature change', 'Droughts', 'Transnational ethnic ties', 'Population, log']
df = pd.DataFrame(data, columns=columns)

# Calculate correlation matrix
correlation_matrix = df.corr()

# Create heatmap with mask
plt.figure(figsize=(15, 10))
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))
sns.heatmap(correlation_matrix, vmin=-1, vmax=1, annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5, mask=mask)
plt.title("Pairwise Correlations Between Variables")
plt.show()