In [1]:
import pandas as pd

In [65]:
# LOAD DATA
# Replication data from the 2021 paper: country metadata plus the observed
# (ground-truth) conflict outcomes, joined on the Gleditsch-Ward country code.
country_metadata = pd.read_csv("../data/Analysis/data/tables/countries.csv")
actual_outcomes = (
    pd.read_csv("../data/Analysis/data/tables/acd.csv")
    .merge(country_metadata[['gwcode', 'alpha3']], how='left', on="gwcode")
    .drop_duplicates()
)

import country_converter as coco
cc = coco.CountryConverter()
# NOTE(review): this assignment replaces the 'alpha3' column that the merge above
# just added, so the metadata join only influences which rows drop_duplicates()
# treats as identical -- confirm the merge is still needed.
actual_outcomes['alpha3'] = cc.pandas_convert(series=actual_outcomes['gwcode'], to='ISO3', src='gwcode')

# Rows whose converted code is the literal string 'not found' are excluded
# from the analysis.
print('For these NOT FOUND countries, the gwcodes are not valid, we exclude them from analysis')
actual_outcomes = actual_outcomes[actual_outcomes['alpha3'].ne('not found')]

816 not found in GWcode
751 not found in GWcode
678 not found in GWcode
345 not found in GWcode


For these NOT FOUND countries, the gwcodes are not valid, we exclude them from analysis


## Load VDEM v13 Core Data and Select Vars

In [66]:
# The five high-level V-Dem democracy indicators used throughout the notebook
core_indicators = ['v2x_polyarchy', 'v2x_libdem', 'v2x_partipdem', 'v2x_delibdem', 'v2x_egaldem']

# Read the V-Dem core v13 table and keep only the country identifiers, the year
# and the selected indicators (in this column order).
vdem_core = pd.read_csv("../data/vdem_core_v13/vdem_core_v13.csv").loc[
    :,
    ['country_name', 'country_text_id', 'country_id', 'year', *core_indicators],
]

vdem_core

Unnamed: 0,country_name,country_text_id,country_id,year,v2x_polyarchy,v2x_libdem,v2x_partipdem,v2x_delibdem,v2x_egaldem
0,Mexico,MEX,3,1789,0.028,0.042,0.006,,
1,Mexico,MEX,3,1790,0.028,0.042,0.006,,
2,Mexico,MEX,3,1791,0.028,0.042,0.006,,
3,Mexico,MEX,3,1792,0.028,0.042,0.006,,
4,Mexico,MEX,3,1793,0.028,0.042,0.006,,
...,...,...,...,...,...,...,...,...,...
27550,Piedmont-Sardinia,SPD,373,1857,0.207,0.135,0.065,,
27551,Piedmont-Sardinia,SPD,373,1858,0.210,0.136,0.065,,
27552,Piedmont-Sardinia,SPD,373,1859,0.210,0.136,0.065,,
27553,Piedmont-Sardinia,SPD,373,1860,0.213,0.137,0.064,,


## Expand the `actual_outcomes` dataframe to all four possible conflict 'profiles' per country and year

In [67]:
# Make the grouping keys categorical; together with observed=False below this
# is what lets the groupby emit a row for every category combination, not only
# the combinations present in the data.
actual_outcomes = actual_outcomes.astype(
    {key: 'category' for key in ['year', 'gwcode', 'minor_actual', 'major_actual']}
)
actual_outcomes = actual_outcomes.drop(columns="gwcode")

# Step 0: per (year, alpha3, minor_actual, major_actual) profile, count the
# source rows; the count is stored as 'true_false' (0 means the profile did not occur).
actual_outcomes = (
    actual_outcomes
    .groupby(['year', 'alpha3', 'minor_actual', 'major_actual'], observed=False)
    .count()[['intensity_level']]
    .rename(columns={'intensity_level': 'true_false'})
    .reset_index()
)

# Step 1 & 2: total 'true_false' per (alpha3, year). Despite its name this frame
# holds the sum for every group; the two indexes below split it into groups with
# no recorded profile (sum == 0) and groups with at least one (sum > 0).
groups_with_all_false = actual_outcomes.groupby(['alpha3', 'year'], observed=False)[['true_false']].sum()
index_to_update = groups_with_all_false[groups_with_all_false['true_false'].eq(0)].index
index_to_retain = groups_with_all_false[groups_with_all_false['true_false'].gt(0)].index
# Neither index is applied to actual_outcomes in this cell.


actual_outcomes

Unnamed: 0,year,alpha3,minor_actual,major_actual,true_false
0,1946,AFG,0.0,0.0,0
1,1946,AFG,0.0,1.0,0
2,1946,AFG,1.0,0.0,0
3,1946,AFG,1.0,1.0,0
4,1946,AGO,0.0,0.0,0
...,...,...,...,...,...
34159,2018,ZAF,1.0,1.0,0
34160,2018,ZWE,0.0,0.0,0
34161,2018,ZWE,0.0,1.0,0
34162,2018,ZWE,1.0,0.0,0


In [68]:
# Define a function to apply to each group
def check_and_update(group):
    """Mark the no-conflict profile for a country-year that has no recorded profile.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of one (year, alpha3) group with the columns 'minor_actual',
        'major_actual' and 'true_false'. 'true_false' is a row count, so it
        may be larger than 1.

    Returns
    -------
    pandas.DataFrame
        The group itself when any profile has a positive 'true_false';
        otherwise a copy in which the row(s) with minor_actual == 0 and
        major_actual == 0 have 'true_false' set to 1.
    """
    # 'true_false' holds counts: test for "> 0" rather than "== 1" so that a
    # profile observed more than once is still recognised as observed.
    if (group['true_false'] > 0).any():
        return group

    # Nothing was observed for this country-year: flag the (0, 0) profile.
    # Work on a copy because mutating the object handed over by groupby.apply
    # is not supported by pandas.
    no_conflict = (group['major_actual'] == 0) & (group['minor_actual'] == 0)
    updated = group.copy()
    updated.loc[no_conflict, 'true_false'] = 1
    return updated

# Run check_and_update on every (year, alpha3) group and flatten the result
# back to a plain 0..n-1 row index.
actual_outcomes = (
    actual_outcomes
    .groupby(by=['year', 'alpha3'], observed=False)
    .apply(check_and_update)
    .reset_index(drop=True)
)

# Display the updated frame to inspect the result
actual_outcomes

  actual_outcomes = actual_outcomes.groupby(['year', 'alpha3'], observed=False).apply(check_and_update).reset_index(drop=True)


Unnamed: 0,year,alpha3,minor_actual,major_actual,true_false
0,1946,AFG,0.0,0.0,1
1,1946,AFG,0.0,1.0,0
2,1946,AFG,1.0,0.0,0
3,1946,AFG,1.0,1.0,0
4,1946,AGO,0.0,0.0,1
...,...,...,...,...,...
34159,2018,ZAF,1.0,1.0,0
34160,2018,ZWE,0.0,0.0,1
34161,2018,ZWE,0.0,1.0,0
34162,2018,ZWE,1.0,0.0,0


## Merge actual outcome data with vdem

In [69]:
# Attach the V-Dem indicators to every outcome row, matching ISO3 code and year
# (default inner join: rows without a match on either side are dropped).
data_merged = actual_outcomes.merge(
    vdem_core[['country_text_id', 'year', *core_indicators]],
    left_on=['alpha3', 'year'],
    right_on=['country_text_id', 'year'],
)

# Restrict to the 1992-2009 window (both bounds inclusive)
data_merged = data_merged[data_merged['year'].between(1992, 2009)].reset_index(drop=True)
data_merged

Unnamed: 0,year,alpha3,minor_actual,major_actual,true_false,country_text_id,v2x_polyarchy,v2x_libdem,v2x_partipdem,v2x_delibdem,v2x_egaldem
0,1992,AFG,0.0,0.0,0,AFG,0.094,0.038,0.030,0.034,0.067
1,1992,AFG,0.0,1.0,1,AFG,0.094,0.038,0.030,0.034,0.067
2,1992,AFG,1.0,0.0,0,AFG,0.094,0.038,0.030,0.034,0.067
3,1992,AFG,1.0,1.0,0,AFG,0.094,0.038,0.030,0.034,0.067
4,1992,AGO,0.0,0.0,0,AGO,0.150,0.077,0.063,0.055,0.068
...,...,...,...,...,...,...,...,...,...,...,...
8203,2009,ZAF,1.0,1.0,0,ZAF,0.774,0.654,0.519,0.672,0.527
8204,2009,ZWE,0.0,0.0,1,ZWE,0.250,0.181,0.202,0.213,0.157
8205,2009,ZWE,0.0,1.0,0,ZWE,0.250,0.181,0.202,0.213,0.157
8206,2009,ZWE,1.0,0.0,0,ZWE,0.250,0.181,0.202,0.213,0.157


In [70]:
# Per column: does it contain at least one missing value?
data_merged.isna().any(axis=0)

year               False
alpha3             False
minor_actual       False
major_actual       False
true_false         False
country_text_id    False
v2x_polyarchy      False
v2x_libdem         False
v2x_partipdem      False
v2x_delibdem       False
v2x_egaldem        False
dtype: bool

In [71]:
# Discretise every core indicator into n_bins bins with pd.cut and store the
# integer bin code as a categorical '<indicator>_b' column.
n_bins = 3
core_indicators_b = [f'{name}_b' for name in core_indicators]

for column, col in zip(core_indicators, core_indicators_b):
    data_merged[col] = pd.cut(data_merged[column], bins=n_bins, labels=False).astype('category')
data_merged

Unnamed: 0,year,alpha3,minor_actual,major_actual,true_false,country_text_id,v2x_polyarchy,v2x_libdem,v2x_partipdem,v2x_delibdem,v2x_egaldem,v2x_polyarchy_b,v2x_libdem_b,v2x_partipdem_b,v2x_delibdem_b,v2x_egaldem_b
0,1992,AFG,0.0,0.0,0,AFG,0.094,0.038,0.030,0.034,0.067,0,0,0,0,0
1,1992,AFG,0.0,1.0,1,AFG,0.094,0.038,0.030,0.034,0.067,0,0,0,0,0
2,1992,AFG,1.0,0.0,0,AFG,0.094,0.038,0.030,0.034,0.067,0,0,0,0,0
3,1992,AFG,1.0,1.0,0,AFG,0.094,0.038,0.030,0.034,0.067,0,0,0,0,0
4,1992,AGO,0.0,0.0,0,AGO,0.150,0.077,0.063,0.055,0.068,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8203,2009,ZAF,1.0,1.0,0,ZAF,0.774,0.654,0.519,0.672,0.527,2,2,2,2,1
8204,2009,ZWE,0.0,0.0,1,ZWE,0.250,0.181,0.202,0.213,0.157,0,0,0,0,0
8205,2009,ZWE,0.0,1.0,0,ZWE,0.250,0.181,0.202,0.213,0.157,0,0,0,0,0
8206,2009,ZWE,1.0,0.0,0,ZWE,0.250,0.181,0.202,0.213,0.157,0,0,0,0,0


In [80]:
# Derive the UN region of each row from its ISO3 code and store it as a categorical feature
data_merged['un_region'] = cc.pandas_convert(series=data_merged['alpha3'], src='ISO3', to='UNREGION')
data_merged = data_merged.astype({'un_region': 'category'})
# Re-check that no column (including the new one) contains missing values
data_merged.isna().any(axis=0)

year               False
alpha3             False
minor_actual       False
major_actual       False
true_false         False
country_text_id    False
v2x_polyarchy      False
v2x_libdem         False
v2x_partipdem      False
v2x_delibdem       False
v2x_egaldem        False
v2x_polyarchy_b    False
v2x_libdem_b       False
v2x_partipdem_b    False
v2x_delibdem_b     False
v2x_egaldem_b      False
un_region          False
dtype: bool

In [116]:
def _outcome_probabilities(data_years, indicator, outcome):
    """Count and normalise 'true_false' for one outcome column ('minor' or 'major').

    Returns one row per observed (un_region, indicator, '<outcome>_actual')
    combination with the summed 'true_false' (renamed to
    '<outcome>_count_country_years') and its share within the
    (un_region, indicator) group ('prob_<outcome>').
    """
    group_keys = ['un_region', indicator]
    counts = (
        data_years
        .groupby(group_keys + [f'{outcome}_actual'], observed=True)['true_false']
        .sum()
        .reset_index()
    )
    # Total per (un_region, indicator), broadcast back to every row of the group
    totals = counts.groupby(group_keys, observed=True)['true_false'].transform('sum')
    # A group whose total is 0 yields 0/0 = NaN; report that as probability 0.0
    counts[f'prob_{outcome}'] = (counts['true_false'] / totals).fillna(0.0)
    return counts.rename(columns={'true_false': f'{outcome}_count_country_years'})


def get_baseline_predictions(year_lower, year_upper, indicator, data=None):
    """Baseline conflict probabilities per UN region and binned indicator.

    Parameters
    ----------
    year_lower, year_upper : int
        Inclusive bounds of the year window used for the estimate.
    indicator : str
        Name of the binned indicator column to condition on. When `data` is
        omitted it must be one of `core_indicators_b`.
    data : pandas.DataFrame, optional
        Frame with the columns 'year', 'un_region', `indicator`,
        'minor_actual', 'major_actual' and 'true_false'. Defaults to the
        notebook-level `data_merged`.

    Returns
    -------
    pandas.DataFrame
        One row per (un_region, indicator, outcome value). The minor and major
        tables are joined where minor_actual equals major_actual, so each row
        reports the marginal count/probability of that same outcome value for
        minor conflict and for major conflict; it is not a joint profile.

    Raises
    ------
    AssertionError
        If `data` is omitted and `indicator` is not in `core_indicators_b`.
    """
    if data is None:
        assert indicator in core_indicators_b
        data = data_merged

    # Filter year range
    in_window = (data['year'] >= year_lower) & (data['year'] <= year_upper)
    data_years = data[in_window].reset_index(drop=True)

    grouped_sum_minor = _outcome_probabilities(data_years, indicator, 'minor')
    grouped_sum_major = _outcome_probabilities(data_years, indicator, 'major')

    return pd.merge(
        grouped_sum_minor,
        grouped_sum_major,
        how='left',
        left_on=['un_region', indicator, 'minor_actual'],
        right_on=['un_region', indicator, 'major_actual'],
    )

# Baseline probabilities for 1992-2000, conditioned on the binned v2x_libdem indicator
predictions = get_baseline_predictions(
    year_lower=1992,
    year_upper=2000,
    indicator='v2x_libdem_b',
)

predictions


Unnamed: 0,un_region,v2x_libdem_b,minor_actual,minor_count_country_years,prob_minor,major_actual,major_count_country_years,prob_major
0,Australia and New Zealand,2,0.0,9,1.000000,0.0,9,1.000000
1,Australia and New Zealand,2,1.0,0,0.000000,1.0,0,0.000000
2,Caribbean,0,0.0,23,1.000000,0.0,23,1.000000
3,Caribbean,0,1.0,0,0.000000,1.0,0,0.000000
4,Caribbean,1,0.0,4,1.000000,0.0,4,1.000000
...,...,...,...,...,...,...,...,...
79,Western Asia,1,1.0,1,0.111111,1.0,8,0.888889
80,Western Asia,2,0.0,9,0.500000,0.0,18,1.000000
81,Western Asia,2,1.0,9,0.500000,1.0,0,0.000000
82,Western Europe,2,0.0,18,1.000000,0.0,18,1.000000


In [117]:
# Persist the baseline table next to the notebook, without the row index
predictions.to_csv(path_or_buf='prediction_baseline.csv', index=False)