# WDI plots

These plots are made before ingestion, using data downloaded as CSV from https://datatopics.worldbank.org/world-development-indicators/.
Currently, all of the plots are created with the same plotting function, just to get an overview of trends. The plot type, labels, or interpolation might not be suitable for every indicator.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Read the WDI bulk CSV, skipping file lines 1-71809 (line 0, the header, is kept) so that
# only rows with country-level and world data remain (no continents, income groups, etc.)
df = pd.read_csv('WDICSV.csv', skiprows=range(1, 71810))

# every column after the four identifier columns represents a year
year_columns = df.columns[4:]

# reshape to long form: one row per country / indicator / year
df_long = df.melt(
    id_vars=['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code'],
    value_vars=year_columns,
    var_name='Year',
    value_name='Value',
)
df_long['Year'] = pd.to_numeric(df_long['Year'])

# blank out the erroneous Tonga values for 2002
ton_conditions = (
    df_long['Country Code'].eq('TON')
    & df_long['Year'].eq(2002)
    & df_long['Indicator Code'].isin(['SE.SEC.ENRR.MA', 'SE.SEC.ENRR'])
)
df_long.loc[ton_conditions, 'Value'] = np.nan

# pivot back to wide form (one column per indicator code) for easier plotting
df_wide = pd.pivot_table(
    df_long,
    index=['Country Name', 'Country Code', 'Year'],
    columns='Indicator Code',
    values='Value',
).reset_index()

# Show the first rows to verify (we should have 1498 columns)
print(df_wide.head())

### Spaghetti Plot function

In [None]:
def plot_indicator(df_wide, indicator_code, selected_countries, start_year=1980, interpolate=True, world_value=True):
    """
    Plot one indicator over time for all countries, highlighting selected countries and a reference line.

    Every country is drawn as a thin light-grey line, the countries in ``selected_countries``
    are drawn on top in distinct colours with legend labels, and a dashed black line shows
    either the world value or the cross-country average for the indicator.

    Parameters
    ----------
    df_wide : pd.DataFrame
        The wide-format DataFrame containing yearly data for multiple indicators.
        Must contain columns for 'Country Name', 'Country Code', 'Year', and the specified indicator.

    indicator_code : str
        The code of the WDI indicator to plot.

    selected_countries : list of str
        List of country names to be highlighted in the plot. These countries will be shown
        in distinct colors with labels, while other countries are displayed in light grey.

    start_year : int, optional (default=1980)
        The starting year for the plot. Data before this year will be excluded from the plot.

    interpolate : bool, optional (default=True)
        Whether to fill missing values with log-linear interpolation (see Behavior).

    world_value : bool, optional (default=True)
        If True, the dashed line is the dataset's own world series (rows where 'Country Code'
        is 'WLD'), labelled 'World'. If False, it is the unweighted mean over all other rows
        per year, labelled 'Average'. The world value is not suitable for all indicators,
        such as absolute values.

    Raises
    ------
    KeyError
        If ``indicator_code`` is not a column of ``df_wide``.

    Behavior
    --------
    -> The indicator's display name is read from the module-level long-format DataFrame
       ``df_long`` (columns 'Indicator Code' and 'Indicator Name'), which must exist before
       this function is called. If the code is not found there, the code itself is used as label.
    -> Interpolation is linear on the log scale, done separately per country, and only fills
       missing values that lie between two existing observations (``limit_area='inside'``).
       No limit on the length of a gap is applied. Consecutive rows are treated as equally spaced.
    -> Only strictly positive observations are used as anchors for the interpolation.
       Observed values (including zeros and negatives) are never overwritten.
    -> The 'WLD' row is not drawn as one of the grey country lines.

    Example Usage
    -------------
        selected_countries = ['Afghanistan', 'United States', 'China', 'India']
        plot_indicator(df_wide, 'SE.SEC.ENRR', selected_countries, start_year=1990, interpolate=False)

        selected_countries = ['United States', 'Ethiopia', 'China', 'Iraq']
        plot_indicator(df_wide, 'NY.GDP.PCAP.KD', selected_countries, start_year=1980, interpolate=True)
    """

    if indicator_code not in df_wide.columns:
        raise KeyError(f"Indicator '{indicator_code}' is not a column of df_wide")

    # Look up the indicator name in the module-level long-format df; fall back to the code
    name_matches = df_long.loc[df_long['Indicator Code'] == indicator_code, 'Indicator Name']
    indicator_name = indicator_code if name_matches.empty else name_matches.iloc[0]

    # Keep years from start_year onwards. sort_values returns a new frame, so the assignment
    # below never writes into df_wide, and rows are in year order for the interpolation.
    df_filtered = df_wide.loc[
        df_wide['Year'] >= start_year,
        ['Country Name', 'Country Code', 'Year', indicator_code],
    ].sort_values(['Country Code', 'Year'])

    # Interpolation: fill only the missing values, leaving every observed value untouched
    if interpolate:
        observed = df_filtered[indicator_code]
        log_filled = (
            np.log(observed.where(observed > 0))  # the log is only defined for positive values
            .groupby(df_filtered['Country Code'], group_keys=False)
            .apply(lambda group: group.interpolate(method='linear', limit_area='inside'))
        )
        df_filtered[indicator_code] = observed.fillna(np.exp(log_filled))

    # Separate the world aggregate from the countries
    is_world = df_filtered['Country Code'] == 'WLD'
    countries_only = df_filtered[~is_world]
    if world_value:
        reference_data = df_filtered[is_world]
        reference_label = 'World'
    else:
        reference_data = countries_only.groupby('Year')[indicator_code].mean().reset_index()
        reference_label = 'Average'

    # Plot all countries in grey
    plt.figure(figsize=(12, 7))
    for _, country_data in countries_only.groupby('Country Name'):
        plt.plot(
            country_data['Year'],
            country_data[indicator_code],
            color='lightgrey',
            linewidth=0.6,
            alpha=0.9
        )

    # Highlight selected countries
    for country in selected_countries:
        country_data = df_filtered[df_filtered['Country Name'] == country]
        plt.plot(country_data['Year'], country_data[indicator_code], label=country, linewidth=2.5)

    # Plot the world value / average
    plt.plot(
        reference_data['Year'],
        reference_data[indicator_code],
        label=reference_label,
        color='black',
        linestyle='--',
        linewidth=2.5
    )

    # Six evenly spaced ticks starting at the first year with data. The gap is at least one
    # year so that short series do not produce duplicate ticks, and tick placement is
    # skipped entirely when the indicator has no data at all.
    years = df_filtered.loc[df_filtered[indicator_code].notna(), 'Year']
    if years.empty:
        plt.xticks(fontsize=12)
    else:
        first_year, last_year = int(years.min()), int(years.max())
        tick_gap = max(int(round((last_year - first_year) / 5)), 1)
        plt.xticks([first_year + i * tick_gap for i in range(6)], fontsize=12)
    plt.yticks(fontsize=12)

    # Customize
    plt.xlabel('Year', fontsize=12)
    plt.ylabel(indicator_name, fontsize=12)
    plt.title(f'{indicator_name} Over Time by Country', fontsize=12)

    plt.legend(title='Country', bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=12, title_fontsize=14)
    plt.grid(True)

In [None]:
# Countries to highlight in every plot of this cell
selected_countries = [
    'Afghanistan',
    'United States',
    'China',
    'India',
    'Chile',
    'Ethiopia',
    'Indonesia',
    'Nigeria',
    'Russia',
    'Saudi Arabia',
]

# Indicator codes to plot with the default settings
indicator_codes = [
    'SN.ITK.DEFC.ZS',
    'SH.H2O.SMDW.ZS',
    'EG.ELC.ACCS.ZS',
    'EG.ELC.ACCS.UR.ZS',
    'SL.GDP.PCAP.EM.KD',
    'EG.EGY.PRIM.PP.KD',
    'MS.MIL.XPND.ZS',
    'SL.TLF.TOTL.FE.ZS',
    'SM.POP.TOTL.ZS',
    'SP.DYN.IMRT.IN',
    'SH.DYN.MORT.FE',
    'SP.POP.0014.FE.ZS',
    'SP.POP.1564.FE.ZS',
    'SP.POP.65UP.FE.ZS',
    'SP.URB.TOTL.IN.ZS',
    'SM.POP.REFG.OR',
    'SP.DYN.IMRT.FE.IN',
    'NY.GDP.PCAP.KD',
    'SP.DYN.LE00.IN',
    'SE.PRM.NENR',
]

# One figure per indicator
for code in indicator_codes:
    plot_indicator(df_wide, code, selected_countries)

### other viewser indicators

In [None]:
# Countries to highlight in every plot of this cell
selected_countries = [
    'Afghanistan',
    'United States',
    'China',
    'India',
    'Chile',
    'Ethiopia',
    'Indonesia',
    'Nigeria',
    'Russia',
    'Saudi Arabia',
]

# Indicator codes to plot with the default settings
indicator_codes = [
    'SP.POP.65UP.MA.ZS',
    'SP.POP.DPND',
    'SE.SEC.NENR',
    'SH.H2O.BASW.ZS',
    'SP.DYN.TFRT.IN',
    'SL.UEM.ADVN.MA.ZS',
    'SP.POP.1564.MA.ZS',
    'SP.POP.0014.MA.ZS',
]

# One figure per indicator
for code in indicator_codes:
    plot_indicator(df_wide, code, selected_countries)

##### Log-linear interpolation not suitable 

In [None]:
# Countries to highlight in every plot of this cell
selected_countries = [
    'Afghanistan',
    'United States',
    'China',
    'India',
    'Chile',
    'Ethiopia',
    'Indonesia',
    'Nigeria',
    'Russia',
    'Saudi Arabia',
]

# Indicator codes that are plotted without interpolation
indicator_codes_zeroes = [
    'SE.SEC.ENRR',
    'SE.SEC.CUAT.LO.ZS',
    'SE.SEC.ENRR.FE',
    'SE.SEC.ENRR.MA',
    'SH.STA.SMSS.ZS',
    'EG.CFT.ACCS.ZS',
    'ER.MRN.PTMR.ZS',
    'VC.IHR.PSRC.P5',
    'DT.ODA.ODAT.GN.ZS',
    'EN.ATM.PM25.MC.T2.ZS',
    'SE.ENR.PRIM.FM.ZS',
    'SE.ENR.PRSC.FM.ZS',
    'SH.STA.STNT.ZS',
    'SH.STA.MALN.ZS',
    'SM.POP.NETM',
    'SP.POP.GROW',
    'NY.ADJ.DFOR.CD',
    'EN.GHG.CO2.MT.CE.AR5',
    'EN.GHG.CO2.IP.MT.CE.AR5',
    'AG.LND.FRST.K2',
    # threatened fish species only available for 1 single year, switching to tonnes of
    # fish captured (but then also less meaningful for SDG14)
    'ER.FSH.CAPT.MT',
    # world data not available, and too much missing data to calculate a meaningful world average
    'SP.M15.2024.FE.ZS',
]

# One figure per indicator, using the raw (non-interpolated) series
for code in indicator_codes_zeroes:
    plot_indicator(df_wide, code, selected_countries, interpolate=False)

### Lots of missing data/no data for world:

In [None]:
# Define the selected countries here as well, so that this cell does not depend on an
# earlier plotting cell having been run first
selected_countries = ['Afghanistan', 'United States', 'China', 'India', 'Chile', 'Ethiopia', 'Indonesia', 'Nigeria', 'Russia', 'Saudi Arabia']

# Indicator codes plotted with the cross-country average instead of the world value
indicator_codes = [
'NY.GDP.MKTP.PP.KD',
'NY.GDP.MKTP.CD',
'NY.GNP.MKTP.PP.KD', 
'IQ.CPA.ENVR.XQ', # lots of missing data
'IQ.CPA.PUBS.XQ', # lots of missing data
'SG.DMK.SRCR.FN.ZS', # world data not available, and too much missing data to calculate a meaningful world average
'EN.GHG.ALL.MT.CE.AR5',
'SP.POP.TOTL', 
'NV.AGR.TOTL.CN', 
'NV.AGR.TOTL.KD',
'NV.AGR.TOTL.KN',  
'AG.LND.TOTL.RU.K2',
'AG.LND.TOTL.K2',
'AG.SRF.TOTL.K2', 
'SL.UEM.ADVN.FE.ZS', 
'SL.UEM.NEET.ZS', 
'SL.UEM.NEET.FE.ZS', 
'SL.UEM.ADVN.ZS',
'NV.MNF.TECH.ZS.UN' 
] 

# Loop through each indicator code and plot (world_value=False draws the 'Average' line)
for indicator_code in indicator_codes:
    plot_indicator(df_wide, indicator_code, selected_countries, world_value=False)

#### Table of indicator codes used and corresponding names 

In [None]:
# All indicator codes to list in the lookup table below
used_indicator_codes = [
    'SN.ITK.DEFC.ZS',
    'SE.SEC.ENRR',
    'SE.SEC.CUAT.LO.ZS',
    'SE.SEC.ENRR.FE',
    'SE.SEC.ENRR.MA',
    'SH.H2O.SMDW.ZS',
    'SH.STA.SMSS.ZS',
    'EG.ELC.ACCS.ZS',
    'EG.ELC.ACCS.UR.ZS',
    'EG.CFT.ACCS.ZS',
    'SL.GDP.PCAP.EM.KD',
    'SG.DMK.SRCR.FN.ZS',
    'ER.MRN.PTMR.ZS',
    'VC.IHR.PSRC.P5',
    'DT.ODA.ODAT.GN.ZS',
    'EG.EGY.PRIM.PP.KD',
    'EN.ATM.PM25.MC.T2.ZS',
    'MS.MIL.XPND.GD.ZS',
    'MS.MIL.XPND.ZS',
    'SE.ENR.PRIM.FM.ZS',
    'SE.ENR.PRSC.FM.ZS',
    'SH.STA.STNT.ZS',
    'SH.STA.MALN.ZS',
    'SL.TLF.TOTL.FE.ZS',
    'SM.POP.NETM',
    'SM.POP.TOTL.ZS',
    'SP.DYN.IMRT.IN',
    'SH.DYN.MORT.FE',
    'SP.POP.0014.FE.ZS',
    'SP.POP.1564.FE.ZS',
    'SP.POP.65UP.FE.ZS',
    'SP.POP.GROW',
    'SP.URB.TOTL.IN.ZS',
    'SM.POP.REFG.OR',
    'SP.DYN.IMRT.FE.IN',
    'NY.GDP.PCAP.KD',
    'SP.DYN.LE00.IN',
    'SE.PRM.NENR',
    'NY.ADJ.DFOR.CD',
    'SP.POP.65UP.MA.ZS',
    'SP.POP.DPND',
    'SE.SEC.NENR',
    'SH.H2O.BASW.ZS',
    'SP.DYN.TFRT.IN',
    'SL.UEM.ADVN.MA.ZS',
    'SP.POP.1564.MA.ZS',
    'SL.UEM.ADVN.FE.ZS',
    'SL.UEM.NEET.ZS',
    'SL.UEM.NEET.FE.ZS',
    'SP.POP.0014.MA.ZS',
    'SL.UEM.ADVN.ZS',
    'NY.GDP.MKTP.PP.KD',
    'NY.GDP.MKTP.CD',
    'NY.GNP.MKTP.PP.KD',
    'EN.GHG.ALL.MT.CE.AR5',
    'EN.GHG.CO2.MT.CE.AR5',
    'EN.GHG.CO2.IP.MT.CE.AR5',
    'IQ.CPA.ENVR.XQ',
    'IQ.CPA.PUBS.XQ',
    'SP.POP.TOTL',
    'NV.AGR.TOTL.CN',
    'NV.AGR.TOTL.KD',
    'NV.AGR.TOTL.KN',
    'AG.LND.FRST.K2',
    'AG.LND.TOTL.RU.K2',
    'AG.LND.TOTL.K2',
    'AG.SRF.TOTL.K2',
    'ER.FSH.CAPT.MT',
    'NV.MNF.TECH.ZS.UN',
    'SP.M15.2024.FE.ZS',
]

# One row per used indicator (code and name), with a fresh 0..n-1 index for readability
indicator_table = (
    df_long.loc[
        df_long['Indicator Code'].isin(used_indicator_codes),
        ['Indicator Code', 'Indicator Name'],
    ]
    .drop_duplicates()
    .reset_index(drop=True)
)

# Lift the row limit so the whole table is shown, then display it
pd.set_option('display.max_rows', None)
display(indicator_table)

### Bubble plot function

In [None]:
def plot_bubble(df_long, df_wide, x_indicator_code, y_indicator_code, size_indicator_code, year, selected_countries):
    """
    Draw a bubble plot of two indicators for a single year, sized by a third indicator.

    Each country with a value for all three indicators in ``year`` is drawn as one bubble.
    Countries in ``selected_countries`` are drawn in their own colours and annotated with
    their country code; all remaining countries share a single colour. The world aggregate
    (country code 'WLD') is left out.

    Parameters
    ----------
    df_long : pd.DataFrame
        Long-format data with columns 'Indicator Code' and 'Indicator Name'; used to look up
        the indicator names for the axis labels and the title.
    df_wide : pd.DataFrame
        Wide-format data with columns 'Country Name', 'Country Code', 'Year' and one column
        per indicator code.
    x_indicator_code, y_indicator_code : str
        Indicator codes plotted on the x and y axis.
    size_indicator_code : str
        Indicator code controlling the bubble size. The marker size passed to matplotlib is
        the indicator value divided by 100000.
    year : int
        The year to plot.
    selected_countries : list of str
        Country names (as in 'Country Name') to highlight and annotate.

    Returns
    -------
    None
        Problems (unknown indicator, no data for the year, plotting errors) are reported with
        a printed message starting with "An error occurred" instead of raising.
    """
    try:
        indicator_codes = [x_indicator_code, y_indicator_code, size_indicator_code]

        # Get indicator names (this also checks that every code is known to df_long)
        indicator_names = {}
        for code in indicator_codes:
            name_matches = df_long.loc[df_long['Indicator Code'] == code, 'Indicator Name']
            if name_matches.empty:
                print(f"An error occurred: indicator '{code}' not found in df_long")
                return
            indicator_names[code] = name_matches.iloc[0]
        x_indicator_name = indicator_names[x_indicator_code]
        y_indicator_name = indicator_names[y_indicator_code]

        # Filter for the specified year, exclude 'WLD' and drop countries with incomplete data
        df_filtered = df_wide.loc[df_wide['Year'] == year, ['Country Name', 'Country Code'] + indicator_codes]
        df_filtered = df_filtered[df_filtered['Country Code'] != 'WLD'].dropna(subset=indicator_codes)

        # Stop before creating a figure when there is nothing to draw
        if df_filtered.empty:
            print(f"An error occurred: no country has data for all three indicators in {year}")
            return

        # Selected countries are drawn separately below, so keep them out of the base layer
        is_selected = df_filtered['Country Name'].isin(selected_countries)
        other_countries = df_filtered[~is_selected]

        # Plot the bubble plot
        plt.figure(figsize=(12, 7))
        plt.scatter(
            other_countries[x_indicator_code],
            other_countries[y_indicator_code],
            s=other_countries[size_indicator_code] / 100000,
            alpha=0.4,
            edgecolors="black",
            linewidth=0.9
        )

        # Highlight selected countries
        for country in selected_countries:
            country_data = df_filtered[df_filtered['Country Name'] == country]
            if not country_data.empty:
                plt.scatter(
                    country_data[x_indicator_code],
                    country_data[y_indicator_code],
                    s=country_data[size_indicator_code] / 100000,
                    label=country,
                    alpha=0.45,
                    edgecolors="black",
                    linewidth=0.9
                )
                # Add country code text next to bubble (with offset)
                for _, row in country_data.iterrows():
                    plt.text(row[x_indicator_code] * 1.005, row[y_indicator_code], row['Country Code'], fontsize=10, ha='left')

        # axis limits
        plt.xlim(0, df_filtered[x_indicator_code].max() * 1)
        plt.ylim(0, df_filtered[y_indicator_code].max() * 1.2)

        # Customize plot
        plt.xlabel(x_indicator_name, fontsize=12)
        plt.ylabel(y_indicator_name, fontsize=12)
        plt.title(f'{x_indicator_name} vs {y_indicator_name} ({year})', fontsize=14)
        plt.grid(alpha=0.2)

    except Exception as exc:
        # Best-effort plotting: keep the notebook running, but say what went wrong
        print(f"An error occurred: {type(exc).__name__}: {exc}")

In [None]:
# Countries highlighted in all three bubble plots
bubble_countries = [
    'United States',
    'China',
    'India',
    'Norway',
    'Ethiopia',
    'Nigeria',
    'Afghanistan',
    'Singapore',
    'Brazil',
    'United Arab Emirates',
]

# Same x axis, bubble size and year each time; only the y-axis indicator changes
for y_code in ('SE.SEC.ENRR', 'SP.DYN.IMRT.IN', 'SL.TLF.TOTL.FE.ZS'):
    plot_bubble(df_long, df_wide, 'NY.GDP.PCAP.KD', y_code, 'SP.POP.TOTL', 2022, bubble_countries)

## V-Dem Regime Types – Stacked area chart

In [None]:
# Load the V-Dem CSV (core dataset v14, going by the file name) from the working directory
df_vdem = pd.read_csv(filepath_or_buffer='V-Dem-CY-Core-v14.csv')

In [None]:
def plot_v2x_libdem_stacked_area(df_vdem, startyear=1900, show_proportion=True, show_world_average=False):
    """
    Plots the v2x_libdem variable over time as a stacked area chart with categories 
    'Autocratic', 'Electoral Authoritarian', 'Minimally Democratic' and 'Democratic',
    and optionally includes a secondary y-axis with the average index value per year. The categories are based on the version _4C in the V-Dem codebook 
    (v14 March 2024, p. 375, originally Lindberg (2016))

    Category thresholds: <= 0.25 'Autocratic', <= 0.5 'Electoral Authoritarian',
    <= 0.75 'Minimally Democratic', above 0.75 'Democratic'. Rows with a missing v2x_libdem
    value are not assigned to any category and are left out of the counts and the average.
    The DataFrame passed in is not modified.

    -> df_vdem: DataFrame containing the V-Dem data, with columns 'year' and 'v2x_libdem' (index, 0-1)
    -> startyear: (Int) the starting year. Data are available from 1800, but default is 1900 since the earlier data are sparse
    -> show_proportion: (Bool) if True shows the proportion of countries on the y-axis, otherwise shows the absolute number of countries, default is True
    -> show_world_average: (Bool) if True includes a secondary y-axis with the world average index per year, default is False

    Raises ValueError if no row from startyear onwards has a v2x_libdem value.
    """

    category_order = ['Autocratic', 'Electoral Authoritarian', 'Minimally Democratic', 'Democratic']

    # Limit data to startyear onwards; work on a copy so the caller's DataFrame stays untouched
    df_period = df_vdem.loc[df_vdem['year'] >= startyear, ['year', 'v2x_libdem']].copy()

    # Categorize v2x_libdem values. The bins are right-closed, i.e. (-inf, 0.25], (0.25, 0.5],
    # (0.5, 0.75], (0.75, inf). Missing values stay missing instead of falling into a category.
    df_period['category'] = pd.cut(
        df_period['v2x_libdem'],
        bins=[-np.inf, 0.25, 0.5, 0.75, np.inf],
        labels=category_order,
    ).astype(object)

    if not df_period['category'].notna().any():
        raise ValueError(f"No v2x_libdem values available from {startyear} onwards")

    # Aggregate by year and category (rows without a category are dropped by groupby),
    # and get world mean
    df_agg = df_period.groupby(['year', 'category']).size().unstack(fill_value=0)
    df_avg = df_period.groupby('year')['v2x_libdem'].mean()

    # Fix the column order; a category that never occurs in the period becomes a zero column
    df_agg = df_agg.reindex(columns=category_order, fill_value=0)

    # Calculate proportion of countries
    if show_proportion:
        df_agg = df_agg.div(df_agg.sum(axis=1), axis=0)

    # Plot the chart
    fig, ax1 = plt.subplots(figsize=(12, 8))
    df_agg.plot(kind='area', stacked=True, alpha=0.9, cmap='viridis', ax=ax1)

    # Customize primary y-axis
    ax1.set_xlabel('Year', fontsize=12)
    ax1.set_ylabel('Proportion of Countries' if show_proportion else 'Number of Countries', fontsize=12)
    ax1.set_title('Aggregated Regime Types (V-Dem)', fontsize=14)
    ax1.grid(True, alpha=0.25)
    # (reverse the legend so it reads top-down like the stacked areas)
    handles, labels = ax1.get_legend_handles_labels()
    ax1.legend(handles[::-1], labels[::-1], title='Category', bbox_to_anchor=(1.05, 1))

    # adjust axis limits to remove buffer/margin
    # (the upper limit is scaled from the largest single-category value, not the stacked total)
    ax1.set_xlim(df_agg.index.min(), df_agg.index.max())
    if show_proportion:
        ax1.set_ylim(0, df_agg.max().max() * 1.14)
    else:
        ax1.set_ylim(0, df_agg.max().max() * 1.4)

    # secondary y-axis if world average is included
    if show_world_average:
        ax2 = ax1.twinx()
        ax2.plot(df_avg.index, df_avg.values, color='black', linestyle='--', linewidth=2, label='Average Index Value (0-1)')
        ax2.set_ylim(0, 1) 
        ax2.set_ylabel('Average Index', fontsize=12)
        ax2.grid(True, alpha=0.25)
        ax2.legend(loc='upper left')

# Example 1: absolute number of countries from 1850 onwards, without the average line
plot_v2x_libdem_stacked_area(
    df_vdem,
    startyear=1850,
    show_proportion=False,
    show_world_average=False,
)

# Example 2: proportions from the default start year, with the average index on a second axis
plot_v2x_libdem_stacked_area(
    df_vdem,
    show_proportion=True,
    show_world_average=True,
)