# UCDP_GED data preprocessing - Conflict Analysis and Data Preparation for Wider Study Area

This notebook preprocesses the conflict data to obtain monthly conflict variables at the admin-1 level and for each ethnic group in the study area. The admin-1 units of Kenya, Ethiopia, South Sudan (including the period when it was still part of Sudan), Uganda and Somalia are preprocessed, because the final part of the thesis required a wider study scope.

## Settings

In [None]:
#required packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import geopandas as gpd
import matplotlib as mpl
from cycler import cycler
from dateutil.relativedelta import relativedelta
from datetime import datetime as dt
import math

In [None]:
#required paths -- each assignment below is a placeholder that must be completed with a path before the notebook can run
path_conflict_data = #path to directory with conflict data (inputs are read from its 'raw' subfolder, results are written to further subfolders)
path_admin_data = #path to admin boundaries data (GADM 3.6 files are read from its 'raw' subfolder)
path_figs = #path to figures (directory in which the plots are saved)

In [None]:
# Figure style: figures span the full width tw_inch; the default height is
# the width multiplied by (sqrt(5) - 1) / 2.
tw_pt = 483.69
tw_inch = tw_pt/72.27

# colours used for the default colour cycle and for the bar charts
colour_rng = [
    'teal',
    'salmon',
    'lightblue',
    'silver',
    'darkseagreen',
    'palevioletred',
    'slategray',
]

# font sizes, default figure size and colour cycle, set in one go
mpl.rcParams.update({
    'figure.titlesize': 11,
    'figure.labelsize': 10,
    'figure.figsize': [tw_inch,tw_inch*(5**.5 - 1) / 2],
    'axes.titlesize': 11,
    'axes.labelsize': 10,
    'xtick.labelsize': 8,
    'ytick.labelsize': 8,
    'legend.fontsize': 8,
    'axes.prop_cycle': cycler(color=colour_rng),
})

## Import Data

In [None]:
# Read the UCDP-GED event records (start/end dates parsed as datetimes) and the
# UCDP non-state dataset, which informs on the type of non-state conflict.
df_ucdp_ged = pd.read_csv(
    os.path.join(path_conflict_data, 'raw', 'GEDEvent_v22_1.csv'),
    parse_dates=['date_start', 'date_end'],
)
# the non-state dataset is indexed by dyad so it can be joined onto the events
df_com_organization = pd.read_csv(
    os.path.join(path_conflict_data, 'raw', 'ucdp-nonstate-221.csv'),
    index_col='dyad_id',
)

In [None]:
# Attach the non-state dyad information to every UCDP-GED event: the event's
# dyad_new_id is matched against the dyad_id index of the non-state dataset.
# Columns that exist in both tables get the suffix 'com' on the joined side.
df_ucdp_ged_comorg = df_ucdp_ged.join(
    df_com_organization,
    on='dyad_new_id',
    how='left',
    rsuffix='com',
)

## Filtering Data

While the Kenyan administrative units that are statically included in the GADM dataset have changed over time from 2nd-level to 1st-level administrative units, in the remaining region the administrative units that the UCDP dataset reports as having seen non-state conflict are still the same as in the GADM dataset. This is even the case for South Sudan, which has become independent from Sudan; its administrative borders, however, have not changed. For Uganda, UCDP-GED and GADM differ in what is considered the 1st-level administrative unit. However, as UCDP-GED defines the adm-1 level at the finer level of districts, conflict events with a sufficient precision code are also certain to lie within the GADM adm-1 units. Therefore, the spatial precision filtering can be restricted here to the condition of a precision code of 4 or lower. The temporal filtering remains the same. Only for Kenya is the additional condition of a higher spatial precision (a precision code below 4) applied to events prior to March 2013.

In [None]:
# Scope of retrieval: first year of the analysis and the countries to keep
year_start = 2004
adm_0 = [
    'Kenya',
    'Ethiopia',
    'South Sudan',
    'Sudan',
    'Somalia',
    'Uganda',
]

In [None]:
# Keep only the events in the selected countries from year_start onwards
df_HoA = df_ucdp_ged_comorg.loc[
    df_ucdp_ged_comorg['country'].isin(adm_0)
    & df_ucdp_ged_comorg['year'].ge(year_start)
]

In [None]:
# Communal conflict only: records with type_of_violence equal to 2 and an
# organisation level (org) equal to 3
df_HoA_com = df_HoA.loc[
    df_HoA['type_of_violence'].eq(2)
    & df_HoA['org'].eq(3)
]

In [None]:
#apply spatial and temporal precision requirements

# Temporal requirement: the event has to start and end within the same calendar
# month. Monthly periods (year + month) are compared instead of the bare month
# number, so that an event starting and ending in the same month of two
# different years is not mistaken for a precisely dated one.
same_month = df_HoA_com.date_start.dt.to_period('M') == df_HoA_com.date_end.dt.to_period('M')

# Kenyan events starting on or before 2013-03-03 with a spatial precision code
# of exactly 4 are not precise enough and are excluded.
kenya_early_coarse = (df_HoA_com.country=='Kenya') & (df_HoA_com.date_start<='2013-03-03') & (df_HoA_com.where_prec==4)

# Spatial requirement: precision code of 4 or lower.
satprec_mask = (df_HoA_com.where_prec<=4) & same_month & ~kenya_early_coarse

# The unsatisfactory set is the exact complement of the satisfactory one, so
# every record ends up in exactly one of the two dataframes.
df_HoA_com_satprec = df_HoA_com[satprec_mask]
df_HoA_com_ucomatprec = df_HoA_com[~satprec_mask]

In [None]:
#figure of conflict records and precision

# Count the records per country. groupby().size() returns the counts sorted by
# country, so the bar labels are taken from the index of the counts rather
# than from unique(), which lists the countries in order of appearance.
n_satprec = df_HoA_com_satprec.groupby("country").size()
n_unsatprec = df_HoA_com_ucomatprec.groupby("country").size()

# Align both series on the same countries; a country missing from one of the
# two sets gets a count of 0, so the stacked bars always match up.
plot_countries = n_satprec.index.union(n_unsatprec.index)
n_satprec = n_satprec.reindex(plot_countries, fill_value=0)
n_unsatprec = n_unsatprec.reindex(plot_countries, fill_value=0)

plt.bar(list(plot_countries), n_satprec.values, color=colour_rng, edgecolor=colour_rng, label='satisfactory precision')
# records of unsatisfactory precision are stacked on top as hatched bars
plt.bar(list(plot_countries), height=n_unsatprec.values, bottom=n_satprec.values, color='white', edgecolor=colour_rng, hatch='////', label='unsatisfactory precision')
plt.title("Communal conflict in different Horn of Africa countries")
plt.xlabel('country')
plt.ylabel('number of records')
plt.legend()

## Clipping data

Data is clipped to GADM 3.6 admin-1.

In [None]:
# Country codes of the countries whose admin-1 units are derived from the GADM
# geopackage. Ethiopia is not listed here because its GADM information is
# stored in a different way.
gadm_countries = [
    'KEN',
    'SSD',
    'SOM',
    'UGA',
]

In [None]:
# Turn the communal conflict records of satisfactory precision into
# georeferenced points (longitude/latitude in EPSG:4326)
gdf_HoA_com_satprec = gpd.GeoDataFrame(
    df_HoA_com_satprec,
    geometry=gpd.points_from_xy(
        x=df_HoA_com_satprec['longitude'],
        y=df_HoA_com_satprec['latitude'],
    ),
    crs='EPSG:4326',
)

In [None]:
#loop through all countries and admin-1 levels to clip all conflict events of sufficient precision to the respective admin-1 level
for country in gadm_countries:
    # the path is built from separate components so that it also resolves on
    # systems that do not use '\' as path separator
    adm_bound = gpd.read_file(os.path.join(path_admin_data, 'raw', 'GADM3.6', country, f'gadm36_{country}.gpkg'))
    #dissolve GADM information to one polygon per admin-1 unit, indexed by the unit name
    adm1_bound = adm_bound.dissolve(by='NAME_1').loc[:, ['GID_1', 'geometry']]
    for county in adm1_bound.index:
        #clip conflict events to the polygon of this admin-1 unit
        gdf_county_com_satprec = gpd.clip(gdf_HoA_com_satprec, adm1_bound.loc[[county]])
        #save county information and GID_1 to conflict dataframe
        #(the last two characters of the GID_1 code are dropped -- presumably the '_1' suffix, TODO confirm)
        gdf_HoA_com_satprec.loc[gdf_county_com_satprec.index, ['county', 'GID_1']] = [county, adm1_bound.loc[county, 'GID_1'][:-2]]
        if len(gdf_county_com_satprec)>0:
            #save county-specific dataframe (only for units with at least one event)
            gdf_county_com_satprec.to_file(os.path.join(path_conflict_data, 'adm_com_conflict', f'{country}_{county}_com_conflict.gpkg'))

In [None]:
country = 'ETH' #for Ethiopia the level 1 polygons are missing in the gpkg file, so the level-1 shapefile is used instead
# the path is built from separate components so that it also resolves on
# systems that do not use '\' as path separator
adm1_bound = gpd.read_file(os.path.join(path_admin_data, 'raw', 'GADM3.6', country, f'gadm36_{country}_1.shp')).loc[:, ['NAME_1', 'GID_1', 'geometry']].set_index('NAME_1')
for county in adm1_bound.index:
    #clip conflict events to the polygon of this admin-1 unit
    gdf_county_com_satprec = gpd.clip(gdf_HoA_com_satprec, adm1_bound.loc[[county]])
    #save county information and GID_1 to conflict dataframe
    #(the last two characters of the GID_1 code are dropped -- presumably the '_1' suffix, TODO confirm)
    gdf_HoA_com_satprec.loc[gdf_county_com_satprec.index, ['county', 'GID_1']] = [county, adm1_bound.loc[county, 'GID_1'][:-2]]
    if len(gdf_county_com_satprec)>0:
        #save county-specific dataframe (only for units with at least one event)
        gdf_county_com_satprec.to_file(os.path.join(path_conflict_data, 'adm_com_conflict', f'{country}_{county}_com_conflict.gpkg'))

## Conflict count and boolean at monthly time scales per administrative unit

In [None]:
#list of all counties in the entire dataframe
#(events that were not assigned to any admin-1 unit have a missing county and are left out)
counties = [x for x in gdf_HoA_com_satprec.county.unique() if pd.notna(x)]

#monthly eventcount per administrative unit; the per-unit series are collected
#in a list and combined once after the loop
county_eventcounts = []
for county in counties:

    #group conflict events by month and count them
    countyspec_df_com_county_eventcount = gdf_HoA_com_satprec[gdf_HoA_com_satprec.county==county].groupby([pd.Grouper(key='date_start', freq='M'), 'county', 'GID_1']).size().unstack(['county', 'GID_1'])

    #fill eventcounts to first to last month of time period of analysis (2004-2021):
    #empty rows from January 2004 up to the month before the first event month ...
    first_months = pd.DataFrame(index=pd.date_range('2004-01-31', str(countyspec_df_com_county_eventcount.index[0] - relativedelta(days=1)), freq='M'), columns=countyspec_df_com_county_eventcount.columns)
    #... and from the month after the last event month up to December 2021
    last_months = pd.DataFrame(index=pd.date_range(str(countyspec_df_com_county_eventcount.index[-1] + relativedelta(months=1)), '2021-12-31', freq='M'), columns=countyspec_df_com_county_eventcount.columns)
    countyspec_df_com_county_eventcount = pd.concat([first_months, countyspec_df_com_county_eventcount, last_months])

    #store as frequency: one row per month, months without events get a count of 0
    countyspec_df_com_county_eventcount = countyspec_df_com_county_eventcount.resample('M').asfreq().fillna(0).stack(['county', 'GID_1'])

    county_eventcounts.append(countyspec_df_com_county_eventcount)

#combine dataframes for all administrative units
df_com_county_eventcount = pd.concat(county_eventcounts)

#derive conflict/no conflict dataframe from monthly eventcount
df_com_county_eventbool = (df_com_county_eventcount > 0)

#save dataframes
df_com_county_eventcount.to_csv(os.path.join(path_conflict_data, 'monthly_adm_com_conflict', 'HoA_eventcount.csv'))
df_com_county_eventbool.to_csv(os.path.join(path_conflict_data, 'monthly_adm_com_conflict', 'HoA_eventbool.csv'))

## Retrieval of dyads in study area

In [None]:
# Restrict the georeferenced events to the counties that make up the study
# area in North-Western Kenya
counties = [
    'Turkana',
    'Marsabit',
    'West Pokot',
]
gdf_SA_com_satprec = gdf_HoA_com_satprec.loc[
    gdf_HoA_com_satprec['county'].isin(counties)
]

In [None]:
#retrieve all dyads in study area
dyads = gdf_SA_com_satprec.dyad_name.unique()

#loop through dyads; the per-dyad series are collected in a list and combined
#once after the loop
dyad_eventcounts = []
for dyad in dyads:

    #count monthly events per dyad
    dyadspec_df_com_dyad_eventcount = gdf_SA_com_satprec[gdf_SA_com_satprec.dyad_name==dyad].groupby([pd.Grouper(key='date_start', freq='M'), 'dyad_name']).size().unstack(['dyad_name'])

    #fill with first and last months (January 2004 up to the month before the first
    #event month, and the month after the last event month up to December 2021)
    first_months = pd.DataFrame(index=pd.date_range('2004-01-31', str(dyadspec_df_com_dyad_eventcount.index[0] - relativedelta(days=1)), freq='M'), columns=dyadspec_df_com_dyad_eventcount.columns)
    last_months = pd.DataFrame(index=pd.date_range(str(dyadspec_df_com_dyad_eventcount.index[-1] + relativedelta(months=1)), '2021-12-31', freq='M'), columns=dyadspec_df_com_dyad_eventcount.columns)

    #save to frequency: one row per month, months without events get a count of 0
    dyadspec_df_com_dyad_eventcount = pd.concat([first_months, dyadspec_df_com_dyad_eventcount, last_months])
    dyadspec_df_com_dyad_eventcount = dyadspec_df_com_dyad_eventcount.resample('M').asfreq().fillna(0).stack(['dyad_name'])

    dyad_eventcounts.append(dyadspec_df_com_dyad_eventcount)

#combine dataframes for all dyads
df_com_dyad_eventcount = pd.concat(dyad_eventcounts)

#derive conflict/no conflict dataframe from monthly eventcount
df_com_dyad_eventbool = (df_com_dyad_eventcount > 0)

#save dataframes
#NOTE(review): the original referenced `path_data`, which is not defined anywhere in
#this notebook; the conflict data directory is used instead, in line with the
#administrative-unit outputs -- confirm that this is the intended location
df_com_dyad_eventcount.to_csv(os.path.join(path_conflict_data, 'monthly_dyad_com_conflict', 'SA_eventcount.csv'))
df_com_dyad_eventbool.to_csv(os.path.join(path_conflict_data, 'monthly_dyad_com_conflict', 'SA_eventbool.csv'))

In [None]:
# Plot of the monthly event count per dyad: one panel per dyad, laid out in
# two columns.

# move the second index level into the columns so that each dyad is a column
df_com_dyad_eventcount = df_com_dyad_eventcount.unstack(level=1)

plt.figure(figsize=[tw_inch, tw_inch*1.4])
n_panel_rows = math.ceil(len(dyads)/2)
for i, dyad in enumerate(dyads, start=1):
    plt.subplot(n_panel_rows, 2, i)
    plt.plot(df_com_dyad_eventcount[dyad])
    plt.title(dyad)
    plt.xlabel('year')
    plt.ylabel('event count')

plt.suptitle('Number of events per month for each dyad', y=0.999)
plt.tight_layout()
plt.savefig(os.path.join(path_figs, 'eventcount_dyad.svg'))

In [None]:
# Plot of the monthly binary conflict variable per dyad: one panel per dyad,
# stacked vertically, all sharing the x-axis of the first panel.

# move the second index level into the columns so that each dyad is a column
df_com_dyad_eventbool = df_com_dyad_eventbool.unstack(level=1)

plt.figure(figsize=[tw_inch, tw_inch*1.4])
n_panels = len(dyads)
for i, dyad in enumerate(dyads, start=1):
    if i > 1:
        plt.subplot(n_panels, 1, i, sharex=ax)
    else:
        # the first panel provides the shared x-axis
        ax = plt.subplot(n_panels, 1, i)
    plt.plot(df_com_dyad_eventbool[dyad], label=dyad)
    plt.legend()
    plt.ylabel('conflict')
    plt.yticks(ticks=[0,1])
    # only the bottom panel keeps its x tick labels
    if i < n_panels:
        plt.tick_params('x', labelbottom=False)

plt.suptitle('Binary conflict variable per month for each dyad', y=0.999)
plt.tight_layout()
plt.savefig(os.path.join(path_figs, 'eventbool_dyad.svg'))

In [None]:
# dyads of interest: all dyads with conflict events over most of the time
print(
    "The dyads of interest are: Dassanetch - Turkana, Toposa - Turkana, Pokot - Turkana, Borana - Gabra"
)

## Conflict count and boolean per ethnic group

In [None]:
def df_to_grouped_ts(df, ct, freq):
    """
    Transform a DataFrame or GeoDataFrame of events into an event count per
    temporal unit.

    Only the rows whose type_of_violence is contained in ct (a list-like of
    numeric conflict type codes) are considered. These rows are counted per
    period of frequency freq, based on their date_start. Periods between the
    first and the last period with events that contain no event themselves
    are added with a count of 0.

    Returns the event counts as a Series indexed by period.
    """
    # restrict to the requested conflict types
    selected = df.loc[df['type_of_violence'].isin(ct)]

    # number of events per period of the start date
    start_period = selected['date_start'].dt.to_period(freq)
    counts = selected.groupby(start_period).size()

    # insert the periods without any event and set their count to 0
    gap_free = counts.resample(freq, convention='end').asfreq()
    return gap_free.fillna(0)

In [None]:
# Ethnic groups of interest: groups with a sufficient number of conflict
# events over the time period of analysis
eth_groups = [
    "Toposa",
    "Dassanetch",
    "Pokot",
    "Borana",
    "Gabra",
    "Turkana",
]

In [None]:
#count conflict events in UCDP-GED dataset per ethnic group

#DataFrame storing the fraction of data kept
perc_sat = pd.DataFrame(index=['Fraction of data kept'])

#per-group results are collected first and combined once after the loop
eth_records = []      #conflict records of satisfactory temporal precision per group
eth_eventcounts = {}  #monthly event count series per group
for eth in eth_groups:

    #retrieve all records for an ethnic group from year_start onwards
    df_conflict_eth = df_ucdp_ged[((df_ucdp_ged.side_a == eth) | (df_ucdp_ged.side_b == eth)) & (df_ucdp_ged.year>=year_start)]

    #filtering data --> temporal precision requirement the same as for administrative unit:
    #start and end have to lie in the same calendar month (year and month are compared,
    #so the same month number in two different years does not count)
    df_conflict_eth_sat = df_conflict_eth[df_conflict_eth.date_start.dt.to_period('M') == df_conflict_eth.date_end.dt.to_period('M')]

    #fraction of data that is of satisfactory temporal precision
    perc_sat[eth] = len(df_conflict_eth_sat)/len(df_conflict_eth)

    eth_records.append(df_conflict_eth_sat)

    #grouping by month --> eventcount per ethnic group
    eth_eventcounts[eth] = df_to_grouped_ts(df_conflict_eth_sat, ct=[2], freq='M')

#combine dataframes for different ethnic groups
df_eth = pd.concat(eth_records) if eth_records else pd.DataFrame()

#Build the eventcount table from all series at once, so that its index is the
#union of the months of all groups. Assigning the series one by one as columns
#of an initially empty DataFrame would conform every later group to the months
#of the first group and silently drop its events outside that range.
df_eth_eventcount = pd.DataFrame(eth_eventcounts)
if not df_eth_eventcount.empty:
    #keep the monthly index gap-free between the first and the last month with events
    df_eth_eventcount = df_eth_eventcount.reindex(pd.period_range(df_eth_eventcount.index.min(), df_eth_eventcount.index.max(), freq='M'))

In [None]:
#save conflict records for different ethnic groups
#NOTE(review): the original built this path from `path_home`, which is not defined
#anywhere in this notebook, and from a hard-coded '\'-separated subpath ending in
#'04_Conflict_Data'. The conflict data directory is used instead -- confirm that
#path_conflict_data points to that directory.
df_eth.to_csv(os.path.join(path_conflict_data, 'ethgroup_com_conflict', 'ethgroup_com_conflict.csv'))

In [None]:
#adjusting time period to January 2004 - December 2021 --> fill conflict count dataset to include all months from 2004 - 2021
#The monthly periods are reindexed on the complete range in one step; months without
#data become rows of missing values. Padding by subtracting one month from the first
#month-end can skip the month directly before the first data month (a month-end such
#as April 30 minus one month is March 30, which lies before the March month-end).
df_eth_eventcount = df_eth_eventcount.reindex(pd.period_range('2004-01', '2021-12', freq='M'))
#label every month by its last day (at midnight), so that all rows share one index type
df_eth_eventcount.index = df_eth_eventcount.index.to_timestamp(how='end').normalize()

In [None]:
#save conflict count for ethnic groups
#months without events get a count of 0; stacking turns the table into one row per month and ethnic group
df_eth_eventcount = df_eth_eventcount.fillna(0).stack()
#NOTE(review): the original referenced `path_data`, which is not defined anywhere in
#this notebook; the conflict data directory is used instead -- confirm the location
df_eth_eventcount.to_csv(os.path.join(path_conflict_data, "monthly_ethgroup_com_conflict", "eventcount.csv"))

In [None]:
#derive conflict/no conflict variable (True where at least one event was counted) and save
df_eth_eventbool = df_eth_eventcount > 0
#NOTE(review): the original referenced `path_data`, which is not defined anywhere in
#this notebook; the conflict data directory is used instead -- confirm the location
df_eth_eventbool.to_csv(os.path.join(path_conflict_data, "monthly_ethgroup_com_conflict", "eventbool.csv"))