# UCDP_GED data preprocessing - Further conflict input variables for Random Forest

The Random Forest model requires two further input variables: one measuring whether there has been any conflict in the adjacent administrative units, and one measuring the time since the last communal conflict in the same region.

## Settings

In [None]:
#load packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import geopandas as gpd
import matplotlib as mpl
from cycler import cycler
from dateutil.relativedelta import relativedelta
from datetime import datetime as dt
import math

In [None]:
# Project folders, all resolved relative to the `path_home` directory
path_figs = os.path.join(path_home, '04_Figures')  # figures
path_admin = os.path.join(path_home, '02_Data', '03_Admin_Boundaries')  # administrative units
path_conflict = os.path.join(path_home, '02_Data', '04_Conflict_data')  # conflict data

In [None]:
# Figure style
# Figure width in points, converted to inches (72.27 pt per inch); the height is
# the width scaled by (sqrt(5) - 1) / 2.
tw_pt = 483.69
tw_inch = tw_pt / 72.27
mpl.rcParams.update({
    'figure.titlesize': 11,
    'figure.labelsize': 10,
    'figure.figsize': [tw_inch, tw_inch * (5 ** .5 - 1) / 2],
    'axes.titlesize': 11,
    'axes.labelsize': 10,
    'xtick.labelsize': 8,
    'ytick.labelsize': 8,
    'legend.fontsize': 8,
})
# Colours that plots cycle through by default
colour_rng = ['teal', 'salmon', 'lightblue', 'silver', 'darkseagreen', 'palevioletred', 'slategray']
mpl.rc('axes', prop_cycle=cycler(color=colour_rng))

## Import Data

In [None]:
# Load the raw UCDP GED event records with start and end dates parsed as timestamps
df_ucdp_ged = pd.read_csv(
    os.path.join(path_conflict, 'raw', 'GEDEvent_v22_1.csv'),
    parse_dates=['date_start', 'date_end'],
)
# Load the UCDP non-state conflict table, indexed by dyad id; it informs on the
# type of non-state conflict
df_com_organization = pd.read_csv(os.path.join(path_conflict, 'raw', 'ucdp-nonstate-221.csv'))
df_com_organization = df_com_organization.set_index('dyad_id')

## Spatial lag of conflict events of different types

In [None]:
# Shapes of the administrative regions adjacent to each admin-1 unit
# (prepared beforehand with QGIS)
adm1_adjacent = gpd.read_file(
    os.path.join(path_admin, 'Adm1_adjacent.gpkg')
)

In [None]:
# First year of records to keep: the temporal lag requires data from before 2004
year_start = 2003
# All countries of the wider study area plus the adjacent countries
adm_0 = [
    'Kenya',
    'Ethiopia',
    'South Sudan',
    'Sudan',
    'Somalia',
    'Uganda',
    'Djibouti',
    'Eritrea',
    'Tanzania',
    'DR Congo (Zaire)',
    'Central African Republic',
    'Rwanda',
]

In [None]:
# Keep only GED events in the listed countries from `year_start` onwards
df_plus_neighbours = df_ucdp_ged[(df_ucdp_ged.country.isin(adm_0)) & (df_ucdp_ged.year >= year_start)]

# Attach the non-state conflict attributes (e.g. `org`, used further down to
# identify communal conflict) to each event via its dyad id.
# The lookup table is first reduced to one row per dyad id: if it holds several
# rows per dyad, a plain join repeats every event once per matching row and
# inflates the event counts.
# NOTE(review): the UCDP non-state file appears to be organised by dyad-year -
# confirm; with an already unique index this step changes nothing.
dyad_lookup = df_com_organization[~df_com_organization.index.duplicated(keep='first')]
df_plus_neighbours = df_plus_neighbours.join(dyad_lookup, on=['dyad_new_id'], rsuffix='com')

In [None]:
# Filter by the required spatial and temporal precision. An event is kept if
#   * its where_prec is at most 4,
#   * it starts and ends in the same calendar month (year AND month must match;
#     comparing the month number alone would also accept events that start and
#     end in the same month of different years), and
#   * it is not a Kenyan event with where_prec == 4 starting on or before 2013-03-03.
same_calendar_month = (
    (df_plus_neighbours.date_start.dt.year == df_plus_neighbours.date_end.dt.year)
    & (df_plus_neighbours.date_start.dt.month == df_plus_neighbours.date_end.dt.month)
)
kenya_early_prec4 = (
    (df_plus_neighbours.country == 'Kenya')
    & (df_plus_neighbours.date_start <= '2013-03-03')
    & (df_plus_neighbours.where_prec == 4)
)
df_plus_neighbours_satprec = df_plus_neighbours[
    (df_plus_neighbours.where_prec <= 4) & same_calendar_month & ~kenya_early_prec4
]

In [None]:
# Remaining conflict records as georeferenced point locations built from the
# longitude/latitude columns (EPSG:4326). A single GeoDataFrame constructor is
# enough; wrapping the result in a second one adds nothing.
gdf_plus_neighbours_satprec = gpd.GeoDataFrame(
    df_plus_neighbours_satprec,
    geometry=gpd.points_from_xy(df_plus_neighbours_satprec.longitude, df_plus_neighbours_satprec.latitude),
    crs='EPSG:4326',
)

In [None]:
# All communal conflict records (type_of_violence equal to 2 and org equal to 3)
# in the study area and the adjacent areas
gdf_plus_neighbours_com_satprec = gdf_plus_neighbours_satprec.loc[
    gdf_plus_neighbours_satprec.type_of_violence.eq(2)
    & gdf_plus_neighbours_satprec.org.eq(3)
]

In [None]:
# All other conflict records: type_of_violence 1 or 3, or any record whose org
# differs from 3 (this includes records without an org value)
gdf_plus_neighbours_oc_satprec = gdf_plus_neighbours_satprec.loc[
    gdf_plus_neighbours_satprec.type_of_violence.isin([1, 3])
    | gdf_plus_neighbours_satprec.org.ne(3)
]

In [None]:
# Monthly event count variables for the areas adjacent to each admin-1 unit.
#
# List of counties for which there has been communal conflict (compare to prior
# script on communal conflict preprocessing in wider SA)
counties_SA = ['Afar', 'Gambela Peoples', 'Oromia', 'Southern Nations, Nationalities and Peoples', 'Benshangul-Gumaz', 'Somali', 'Dire Dawa', 'Amhara', 'Addis Abeba', 'Harari People', 'Eastern Equatoria', 'Mandera',
 'Turkana', 'Nakuru', 'Laikipia', 'Uasin Gishu', 'Trans Nzoia', 'Kericho', 'Mombasa', 'Nairobi', 'Kisumu', 'Bungoma', 'West Pokot', 'Samburu', 'Baringo', 'Wajir', 'Isiolo', 'Narok', 'Elgeyo-Marakwet', 'Marsabit',
 'Meru', 'Tana River', 'Shabeellaha Dhexe', 'Hiiraan', 'Galguduud', 'Bay', 'Banaadir', 'Mudug', 'Gedo', 'Jubbada Hoose', 'Togdheer', 'Sool', 'Sanaag', 'Jungoli', 'Lakes', 'Unity', 'Warap', 'Upper Nile', 'Central Equatoria',
 'North Bahr-al-Ghazal', 'West Bahr-al-Ghazal', 'West Equatoria', 'Moroto', 'Katakwi', 'Kotido', 'Kapchorwa', 'Nakapiripirit', 'Lira', 'Bundibugyo']

# Month-end stamps from January 2003 to December 2021. Every unit is aligned to
# this calendar so that months without events are present with a count of 0,
# also for units without any adjacent event of a given type.
# NOTE(review): events dated outside this range are not kept - confirm that the
# loaded GED release does not extend beyond 2021.
all_months = pd.date_range('2003-01-31', '2021-12-31', freq='M')


def _monthly_event_count(gdf_events, area, months):
    """Return the number of events per month that lie inside `area`.

    Parameters
    ----------
    gdf_events : geopandas.GeoDataFrame
        Point events with a datetime column `date_start`.
    area : geopandas.GeoDataFrame
        Polygon(s) the events are clipped to.
    months : pandas.DatetimeIndex
        Month-end stamps the result is aligned to.

    Returns
    -------
    pandas.Series
        Event count (stored as float) for every entry of `months`; 0 for
        months without events, including the case that no event lies inside
        `area` at all. The index carries no name.
    """
    events_in_area = gpd.clip(gdf_events, area)
    if events_in_area.empty:
        return pd.Series(0.0, index=months)
    counts = events_in_area.groupby(pd.Grouper(key='date_start', freq='M')).size()
    return counts.reindex(months, fill_value=0).astype(float).rename_axis(None)


# Loop through the admin-1 units of the wider study area and collect one
# dataframe per unit; they are concatenated once after the loop.
county_frames = []
for county in counties_SA:
    # shape of the areas adjacent to this unit (selected once, used for both conflict types)
    neighbourhood = adm1_adjacent[adm1_adjacent.NAME_1 == county]

    # combine communal ('com') and other conflict ('oc') counts into one dataframe
    df_eventcount_countyneighb = pd.DataFrame({
        'com': _monthly_event_count(gdf_plus_neighbours_com_satprec, neighbourhood, all_months),
        'oc': _monthly_event_count(gdf_plus_neighbours_oc_satprec, neighbourhood, all_months),
    })
    df_eventcount_countyneighb['county'] = county
    county_frames.append(df_eventcount_countyneighb)

# Resulting count dataframes of the different counties stacked below each other
df_eventcount_neighb = pd.concat(county_frames)

In [None]:
# Create the binary conflict / no conflict variables for the spatial lag.
# `assign` returns a new dataframe, so the monthly counts in df_eventcount_neighb
# stay untouched (a plain `=` would only give the same object a second name),
# and the two columns are replaced outright so that they hold booleans.
df_eventbool_neighb = df_eventcount_neighb.assign(
    com=df_eventcount_neighb['com'] > 0,
    oc=df_eventcount_neighb['oc'] > 0,
)

In [None]:
# Lag the spatial conflict variable by one month: every index stamp is moved one
# month end forward, so a given month carries the status of the preceding month
df_eventbool_neighb_lagged = df_eventbool_neighb.shift(periods=1, freq='M')

In [None]:
# Drop all rows dated before 2004
df_eventbool_neighb_lagged = df_eventbool_neighb_lagged.loc[
    df_eventbool_neighb_lagged.index.year >= 2004
]

In [None]:
# Check for null values: spread the counties into columns and count the empty
# (month, county) cells across all columns
(
    df_eventbool_neighb_lagged
    .reset_index()
    .set_index(['index', 'county'])
    .unstack()
    .isnull()
    .sum()
    .sum()
)

In [None]:
# Write the lagged spatial conflict variable to the RF input folder
df_eventbool_neighb_lagged.to_csv(
    os.path.join(path_conflict, 'RF_input', 'spilloverconflict_t-1.csv')
)

## Time since last communal conflict (input for RF model)

In [None]:
# Scope of retrieval for the time-since-conflict variable: countries to keep
# and first year of records
adm_0 = [
    'Kenya',
    'Ethiopia',
    'South Sudan',
    'Sudan',
    'Somalia',
    'Uganda',
]
year_start = 1989

In [None]:
# GED events combined with the non-state conflict attributes (e.g. `org`).
# NOTE(review): df_ucdp_ged_comorg is not created in the visible part of this
# notebook, so it is built here in the same way as the join used for the
# spatial lag - confirm that no other definition is intended.
# The lookup is reduced to one row per dyad id so that the join cannot repeat
# events if the non-state table holds several rows per dyad.
dyad_lookup = df_com_organization[~df_com_organization.index.duplicated(keep='first')]
df_ucdp_ged_comorg = df_ucdp_ged.join(dyad_lookup, on=['dyad_new_id'], rsuffix='com')

# filter dataframe according to the countries and starting year set above
df_HoA = df_ucdp_ged_comorg[(df_ucdp_ged_comorg.country.isin(adm_0)) & (df_ucdp_ged_comorg.year >= year_start)]

# only retrieve communal conflict
df_HoA_com = df_HoA[(df_HoA.type_of_violence == 2) & (df_HoA.org == 3)]

# Spatial and temporal precision requirements.
# Start and end must lie in the same calendar month: year AND month are compared,
# because the month number alone would also match the same month of another year.
same_calendar_month = (
    (df_HoA_com.date_start.dt.year == df_HoA_com.date_end.dt.year)
    & (df_HoA_com.date_start.dt.month == df_HoA_com.date_end.dt.month)
)
# Kenyan events with where_prec == 4 starting on or before 2013-03-03 are excluded
kenya_early_prec4 = (
    (df_HoA_com.country == 'Kenya')
    & (df_HoA_com.date_start <= '2013-03-03')
    & (df_HoA_com.where_prec == 4)
)
# records meeting the requirements ...
df_HoA_com_satprec = df_HoA_com[(df_HoA_com.where_prec <= 4) & same_calendar_month & ~kenya_early_prec4]
# ... and records failing at least one of them
df_HoA_com_ucomatprec = df_HoA_com[(df_HoA_com.where_prec > 4) | ~same_calendar_month | kenya_early_prec4]

In [None]:
# Clipping data: assign every sufficiently precise communal conflict event to
# the admin-1 unit it is located in.
gadm_countries = ['KEN', 'SSD', 'SOM', 'UGA'] #countries except for Ethiopia because of different way the GADM information is stored for Ethiopia

# transform communal conflict information into georeferenced points (EPSG:4326)
gdf_HoA_com_satprec = gpd.GeoDataFrame(df_HoA_com_satprec, geometry=gpd.points_from_xy(df_HoA_com_satprec.longitude, df_HoA_com_satprec.latitude), crs='EPSG:4326')

# loop through all countries and admin-1 units and clip the conflict events to each unit
for country in gadm_countries:
    # Every folder is passed to os.path.join as its own component, so the path
    # does not depend on the Windows-only backslash separator.
    adm_bound = gpd.read_file(os.path.join(path_admin, 'raw', 'GADM3.6', country, 'gadm36_%s.gpkg' % country))
    # dissolve the GADM information into one shape per admin-1 unit (indexed by NAME_1)
    adm1_bound = adm_bound.dissolve(by='NAME_1').loc[:, ['GID_1', 'geometry']]
    for county in adm1_bound.index:

        # clip conflict events to the shape of this unit
        gdf_county_com_satprec = gpd.clip(gdf_HoA_com_satprec, adm1_bound.loc[[county]])

        # save unit name and GID_1 to the conflict dataframe; the last two
        # characters of the GADM id are dropped (presumably a suffix such as '_1')
        gdf_HoA_com_satprec.loc[gdf_county_com_satprec.index, ['county', 'GID_1']] = [county, adm1_bound.loc[county, 'GID_1'][:-2]]

In [None]:
# For Ethiopia the level-1 polygons are missing in the gpkg file, so the
# admin-1 units are read from the separate level-1 shapefile instead.
country = 'ETH'
# Every folder is passed to os.path.join as its own component, so the path does
# not depend on the Windows-only backslash separator.
adm1_bound = gpd.read_file(os.path.join(path_admin, 'raw', 'GADM3.6', country, 'gadm36_%s_1.shp' % country)).loc[:, ['NAME_1', 'GID_1', 'geometry']].set_index('NAME_1')

for county in adm1_bound.index:
    # clip conflict events to the shape of this unit
    gdf_county_com_satprec = gpd.clip(gdf_HoA_com_satprec, adm1_bound.loc[[county]])

    # save unit name and GID_1 to the conflict dataframe; the last two characters
    # of the GADM id are dropped (presumably a suffix such as '_1')
    gdf_HoA_com_satprec.loc[gdf_county_com_satprec.index, ['county', 'GID_1']] = [county, adm1_bound.loc[county, 'GID_1'][:-2]]

In [None]:
# Conflict count and conflict boolean per month and administrative unit

# Names of all units that received at least one event (missing names removed)
counties = [name for name in gdf_HoA_com_satprec.county.unique() if str(name) != 'nan']

# Monthly event count, one entry per administrative unit
monthly_counts_per_unit = []
for county in counties:

    # events of this unit, counted per month and spread into (county, GID_1) columns
    unit_events = gdf_HoA_com_satprec[gdf_HoA_com_satprec.county == county]
    countyspec_df_com_county_eventcount = (
        unit_events
        .groupby([pd.Grouper(key='date_start', freq='M'), 'county', 'GID_1'])
        .size()
        .unstack(['county', 'GID_1'])
    )

    # empty rows for the months from January 1989 up to the first recorded month ...
    first_recorded = countyspec_df_com_county_eventcount.index[0]
    first_months = pd.DataFrame(
        index=pd.date_range('1989-01-31', str(first_recorded - relativedelta(days=1)), freq='M'),
        columns=countyspec_df_com_county_eventcount.columns,
    )
    # ... and for the months after the last recorded month up to December 2021
    last_recorded = countyspec_df_com_county_eventcount.index[-1]
    last_months = pd.DataFrame(
        index=pd.date_range(str(last_recorded + relativedelta(months=1)), '2021-12-31', freq='M'),
        columns=countyspec_df_com_county_eventcount.columns,
    )
    countyspec_df_com_county_eventcount = pd.concat([first_months, countyspec_df_com_county_eventcount, last_months])

    # regular monthly frequency, months without events set to 0, back to long format
    countyspec_df_com_county_eventcount = countyspec_df_com_county_eventcount.resample('M').asfreq()
    countyspec_df_com_county_eventcount = countyspec_df_com_county_eventcount.fillna(0).stack(['county', 'GID_1'])

    monthly_counts_per_unit.append(countyspec_df_com_county_eventcount)

# Counts of all administrative units below each other
df_com_county_eventcount = pd.concat(monthly_counts_per_unit)

# Conflict / no conflict indicator derived from the monthly event count
df_com_county_eventbool = df_com_county_eventcount > 0

In [None]:
# Conflict indicator as a DataFrame ...
df_com_county_eventbool = pd.DataFrame(df_com_county_eventbool)

# ... in wide format: one column for each (county, GID_1) pair
df_com_county_eventbool_us = df_com_county_eventbool.unstack(['county', 'GID_1'])

# Container for the time since the last communal conflict: same rows and
# columns as the wide conflict indicator, filled with 0.0
df_timesinceconflict = pd.DataFrame(
    0.0,
    index=df_com_county_eventbool_us.index,
    columns=df_com_county_eventbool_us.columns,
)

In [None]:
# Time since the last communal conflict, computed per county column.


def _months_since_last_event(event_flags):
    """Return, per month, the number of months since the latest earlier event.

    The value of a month only looks at the months before it: the month right
    after an event gets 1, the next one 2, and so on. A month with an event
    still gets the distance to the previous event; the counter restarts for the
    months that follow. Months without any earlier event are NaN.

    Parameters
    ----------
    event_flags : sequence of bool
        True where an event was recorded, in chronological order.

    Returns
    -------
    numpy.ndarray
        Float array with the same length as `event_flags`.
    """
    elapsed = np.full(len(event_flags), np.nan)
    counter = None  # stays None until the first event has been seen
    for position, has_event in enumerate(event_flags):
        if counter is not None:
            elapsed[position] = counter
        if has_event:
            counter = 1
        elif counter is not None:
            counter += 1
    return elapsed


# Loop through each county column; one pass over the months per column replaces
# the re-summing of all earlier months for every single cell
for col in range(len(df_com_county_eventbool_us.columns)):
    event_flags = df_com_county_eventbool_us.iloc[:, col].eq(True).to_numpy()
    df_timesinceconflict.iloc[:, col] = _months_since_last_event(event_flags)


In [None]:
# Keep only the rows dated 2004 or later
df_timesinceconflict = df_timesinceconflict.loc[
    df_timesinceconflict.index.year >= 2004
]

In [None]:
# Months without any earlier conflict on record are still null; fill them with
# the largest value found anywhere in the table (sufficient for the Random
# Forest application)
df_timesinceconflict = df_timesinceconflict.fillna(
    value=df_timesinceconflict.max().max()
)

In [None]:
# Back to long format (GID_1 and county moved into the index) and written to
# the RF input folder as the time-since-last-communal-conflict variable
df_timesinceconflict.stack(['GID_1', 'county']).to_csv(
    os.path.join(path_conflict, 'RF_input', 'timesinceconflict.csv')
)