<a href="https://colab.research.google.com/github/abroniewski/Child-Wasting-Prediction/blob/main/notebooks/acled/Data_investigation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Overview

This notebook presents an analysis of how much of prevalence levels in child wasting can be explained using conflict data provided by ACLED

Author: Luiz Fonseca

In [1]:
import pandas as pd
import plotly.express as px
import numpy as np

## Prepare data

In [2]:
acled_data_path = "https://raw.githubusercontent.com/abroniewski/Child-Wasting-Prediction/main/data/acled/acled.csv"

acled_df = pd.read_csv(acled_data_path, parse_dates=['event_date'])  # 37782 rows

all_districts = acled_df['admin2'].unique()
# Filter the data to comply with the dates in the prevalence dataset.
# Considering the last 18 months prior to the first date in the prevalence data
acled_df = acled_df.query('event_date > "2016-01-01" and event_date < "2021-07-01"') # 12471 rows

acled_df.head(1)

Unnamed: 0,event_date,year,event_type,sub_event_type,actor1,assoc_actor_1,inter1,actor2,assoc_actor_2,inter2,...,admin1,admin2,location,latitude,longitude,source,source_scale,notes,fatalities,timestamp
3661,2021-06-30,2021,Violence against civilians,Attack,Al Shabaab,,2,Civilians (Somalia),,7,...,Middle Juba,Jilib,Kuunyo-Barrow,0.7946,43.3787,Caasimada,National,"On 30 June 2021, Al Shabaab militants shot and...",8,1625510720


In [3]:
prevalence_data_path = "https://raw.githubusercontent.com/abroniewski/Child-Wasting-Prediction/main/data/acled/prevalence.csv"
prevalence_df = pd.read_csv(prevalence_data_path, parse_dates=['date'])[['date', 'district', 'GAM Prevalence']]
prevalence_df.head(5)

Unnamed: 0,date,district,GAM Prevalence
0,2021-07-01,Adan Yabaal,0.286795
1,2021-07-01,Afgooye,0.463764
2,2021-07-01,Afmadow,0.391617
3,2021-07-01,Baardheere,0.386899
4,2021-07-01,Badhaadhe,0.405672


## Feature Engineering

### Fixing issue in previous model
The issue is: we are trying to predict the prevalence for the next semester and sometimes there are gaps between two measurements (see examples below). The way the model is right now, it is disconsidering the gaps between measurements and it might interpret that the prevalence for the next six months is X when in reality this value X is the prevalence observed 1 year later and not 6 months later.

In [4]:
# These are all the districts that don't have data for all the dates (they have gaps)
districts_missing_dates = prevalence_df.groupby('district')['date'].count().reset_index().query("date < 9")['district'].unique()
districts_missing_dates

array(['Abudwak', 'Adado', 'Afmadow', 'Afmadow/Xagar', 'Badhan', 'Baidoa',
       'Banadir', 'Baydhaba', 'Baydhaba/Bardaale',
       'Belet Weyne (Mataban)', 'Belet Xaawo', 'Belethawa', 'Burao',
       'Burco', 'Cabudwaaq', 'Cadaado', 'Ceel Barde', 'Ceel barde',
       'Laasqoray', 'Laasqoray/Badhan', 'Mogadishu', 'Rab Dhuure',
       'Saakow', 'Saakow/Salagle'], dtype=object)

In [5]:
# Plotting the district we can see that some of them have non-consecutive date. See the example of Baidoa
px.line(prevalence_df[prevalence_df.district.isin(districts_missing_dates)], x="date", y="GAM Prevalence", color='district').show()

In [6]:
# This is the way they're doing currently. They don't consider that the gap between two dates might be bigger than 6 months (see the example of Baydhaba)
df = prevalence_df.query("district == 'Baydhaba'").copy().sort_values('date')
df['next_prevalence'] = df['GAM Prevalence'].shift(-1)
increase = [False if x[1]<x[0] else True for x in list(zip(df['GAM Prevalence'], df['GAM Prevalence'][1:]))]
increase.append(False)
df['increase'] = increase
df.iloc[-1, df.columns.get_loc('increase')] = np.nan #No info on next month

In [7]:
df.query("district == 'Baydhaba'")

Unnamed: 0,date,district,GAM Prevalence,next_prevalence,increase
463,2018-07-01,Baydhaba,0.633388,0.472313,False
385,2019-01-01,Baydhaba,0.472313,0.573735,True
307,2019-07-01,Baydhaba,0.573735,0.480974,False
164,2020-07-01,Baydhaba,0.480974,,


In [8]:
# This is my proposed method
# This way, if the gap between 2 dates is bigger than 6 months than we consider that we don't know if the prevalence increased or 
# decreased in the next semester. See same example as above.
from pandas.tseries.offsets import MonthEnd

prevalence_df.sort_values('date', inplace=True)
prevalence_df['next_date'] = prevalence_df.groupby('district')['date'].shift(-1)
prevalence_df['next_prevalence'] = prevalence_df.groupby('district')['GAM Prevalence'].shift(-1)
prevalence_df['next_prevalence'] = np.where(prevalence_df['next_date'].dt.to_period('M') - prevalence_df['date'].dt.to_period('M') != MonthEnd(6),
                                     pd.NA,
                                     prevalence_df['next_prevalence']
                                     )

prevalence_df['prevalence_diff'] = np.where(
    prevalence_df['next_date'].dt.to_period('M') - prevalence_df['date'].dt.to_period('M') == MonthEnd(6),
    prevalence_df['next_prevalence'] - prevalence_df['GAM Prevalence'],
    pd.NA
)

In [9]:
prevalence_df.query("district == 'Baydhaba'")

Unnamed: 0,date,district,GAM Prevalence,next_date,next_prevalence,prevalence_diff
463,2018-07-01,Baydhaba,0.633388,2019-01-01,0.472313,-0.161075
385,2019-01-01,Baydhaba,0.472313,2019-07-01,0.573735,0.101423
307,2019-07-01,Baydhaba,0.573735,2020-07-01,,
164,2020-07-01,Baydhaba,0.480974,NaT,,


In [10]:
# don't need this column anymore
prevalence_df.drop(columns=['next_date'], inplace=True)

base_path = '/content/drive/MyDrive/Data Challenge 3/Data/'
#prevalence_df.to_csv(base_path+'prevalence_v2.csv', index=False)

### Creating new features out of conflict data

In [None]:
# Features to consider:
# number of events per type of event in those six months
# number of events per subtype in those six months
# number of events per actor in the semester
# number of events per interaction in the semester
# total number of conflicts ---- faltou
# The number of conflicts per scale
# the number of fatalities
# The previous prevalence
# The number of conflicts per number of actors (1,2,3,4)
# The encoded district name

In [11]:
# Drop columns that are not useful
acled_df.drop(columns=['year', 'latitude', 'longitude', 'source', 'notes', 'timestamp', 'inter1', 'inter2', 'location', 'admin1'], inplace=True)
acled_df.head(3)

Unnamed: 0,event_date,event_type,sub_event_type,actor1,assoc_actor_1,actor2,assoc_actor_2,interaction,admin2,source_scale,fatalities
3661,2021-06-30,Violence against civilians,Attack,Al Shabaab,,Civilians (Somalia),,27,Jilib,National,8
3662,2021-06-30,Violence against civilians,Abduction/forced disappearance,Al Shabaab,,Civilians (Somalia),,27,Jowhar,National,0
3663,2021-06-30,Violence against civilians,Attack,Unidentified Armed Group (Somalia),,Civilians (Somalia),Muslim Group (Somalia),37,Baydhaba,National,2


In [12]:
# Add the number of actors involved in the conflict
acled_df['count_actors'] = acled_df[['actor1', 'assoc_actor_1', 'actor2', 'assoc_actor_2']].count(axis=1)

# Add dummies variables per categorical variable
features_df = pd.get_dummies(acled_df, columns=['event_type', 'sub_event_type', 'actor1', 'actor2', 'assoc_actor_1', 'assoc_actor_2', 'interaction', 'source_scale', 'count_actors'])

# Add the count of conflicts to be summarized later
features_df['number_conflicts'] = 1

features_df.head(2)

Unnamed: 0,event_date,admin2,fatalities,event_type_Battles,event_type_Explosions/Remote violence,event_type_Protests,event_type_Riots,event_type_Strategic developments,event_type_Violence against civilians,sub_event_type_Abduction/forced disappearance,...,source_scale_Other-Regional,source_scale_Other-Subnational,source_scale_Regional,source_scale_Subnational,source_scale_Subnational-National,count_actors_1,count_actors_2,count_actors_3,count_actors_4,number_conflicts
3661,2021-06-30,Jilib,8,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,1
3662,2021-06-30,Jowhar,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,1,0,0,1


### Summarizing events for each semester

In [13]:
first_day = pd.to_datetime('2017-01-01')
features_df = features_df.groupby([pd.Grouper(key="event_date", freq='6MS', origin=first_day), 'admin2']).sum().reset_index()
features_df['district_encoded'] = features_df['admin2'].astype('category').cat.codes # Add codes for each district
features_df.head()

Unnamed: 0,event_date,admin2,fatalities,event_type_Battles,event_type_Explosions/Remote violence,event_type_Protests,event_type_Riots,event_type_Strategic developments,event_type_Violence against civilians,sub_event_type_Abduction/forced disappearance,...,source_scale_Other-Subnational,source_scale_Regional,source_scale_Subnational,source_scale_Subnational-National,count_actors_1,count_actors_2,count_actors_3,count_actors_4,number_conflicts,district_encoded
0,2016-01-01,Adan Yabaal,33,7.0,0.0,0.0,0.0,5.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,9.0,4.0,0.0,13,0
1,2016-01-01,Afgooye,149,40.0,29.0,1.0,0.0,3.0,11.0,0.0,...,0.0,0.0,3.0,0.0,1.0,63.0,20.0,0.0,84,1
2,2016-01-01,Afmadow,91,11.0,7.0,0.0,0.0,2.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,17.0,2.0,0.0,20,2
3,2016-01-01,Baardheere,64,14.0,10.0,0.0,0.0,6.0,5.0,1.0,...,0.0,0.0,0.0,0.0,1.0,24.0,10.0,0.0,35,3
4,2016-01-01,Badhaadhe,39,9.0,5.0,0.0,0.0,2.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,17.0,3.0,0.0,20,4


In [14]:
len(features_df)

707

In [15]:
len(prevalence_df)

677

In [16]:
from pandas.tseries.offsets import DateOffset

# merge the dataframes taking into account the dates
features_df['date_to_join'] = features_df['event_date'] +  DateOffset(months=6)

merged_df = prevalence_df.merge(features_df, left_on=['district', 'date'], right_on=['admin2', 'date_to_join'])
final_6m_df = merged_df.drop(columns=['date_to_join', 'prevalence_diff', 'event_date', 'admin2'])

# Remove the rows where the target variable is null
final_6m_df = final_6m_df[final_6m_df['next_prevalence'].notnull()]

final_6m_df.head()

Unnamed: 0,date,district,GAM Prevalence,next_prevalence,fatalities,event_type_Battles,event_type_Explosions/Remote violence,event_type_Protests,event_type_Riots,event_type_Strategic developments,...,source_scale_Other-Subnational,source_scale_Regional,source_scale_Subnational,source_scale_Subnational-National,count_actors_1,count_actors_2,count_actors_3,count_actors_4,number_conflicts,district_encoded
0,2017-07-01,Zeylac,0.3796,0.169,0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,2,73
1,2017-07-01,Buuhoodle,0.3406,0.2028,8,4.0,0.0,2.0,0.0,2.0,...,0.0,0.0,0.0,0.0,3.0,6.0,1.0,1.0,11,20
2,2017-07-01,Buur Hakaba,0.3588,0.2886,25,13.0,10.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,1.0,25.0,2.0,0.0,28,21
3,2017-07-01,Cadale,0.3692,0.351,24,3.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,6.0,1.0,0.0,8,24
4,2017-07-01,Caluula,0.4056,0.3432,0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1,25


In [None]:
len(final_6m_df)

483

### Summarizing events in 1 year

In [None]:
features_1y_df = features_df[features_df.columns.drop('district_encoded')].groupby('admin2').rolling(window='365D', on='event_date').sum().reset_index().drop(columns=['level_1'])
features_1y_df['district_encoded'] = features_1y_df['admin2'].astype('category').cat.codes # Add codes for each district
features_1y_df.head()

Unnamed: 0,admin2,event_date,actor1_AMISOM: African Union Mission in Somalia (2007-),actor1_AMISOM: African Union Mission in Somalia (2007-) (Burundi),actor1_AMISOM: African Union Mission in Somalia (2007-) (Djibouti),actor1_AMISOM: African Union Mission in Somalia (2007-) (Ethiopia),actor1_AMISOM: African Union Mission in Somalia (2007-) (Kenya),actor1_AMISOM: African Union Mission in Somalia (2007-) (Uganda),actor1_ASWJ: Ahlu Sunna Wal Jamaa,actor1_Abdalla-Arab Sub-Clan Militia (Somalia),...,sub_event_type_Non-violent transfer of territory,sub_event_type_Other,sub_event_type_Peaceful protest,sub_event_type_Protest with intervention,sub_event_type_Remote explosive/landmine/IED,sub_event_type_Sexual violence,sub_event_type_Shelling/artillery/missile attack,sub_event_type_Suicide bomb,sub_event_type_Violent demonstration,district_encoded
0,Adan Yabaal,2016-01-01,2.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,Adan Yabaal,2019-01-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,Adan Yabaal,2019-07-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0,0
3,Adan Yabaal,2020-01-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,4.0,0.0,0.0,0
4,Adan Yabaal,2020-07-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0


In [None]:
# merge the dataframes take into account the dates
features_1y_df['date_to_join'] = features_1y_df['event_date'] +  DateOffset(months=6)

merged_df = prevalence_df.merge(features_1y_df, left_on=['district', 'date'], right_on=['admin2', 'date_to_join'])
final_1y_df = merged_df.drop(columns=['date_to_join', 'prevalence_diff', 'event_date', 'admin2'])

# Remove the rows where the target variable is null
final_1y_df = final_1y_df[final_1y_df['next_prevalence'].notnull()]

len(final_1y_df)

483

### Summarizing events in 1.5 year

In [None]:
features_1y6m_df = features_df[features_df.columns.drop('district_encoded')].groupby('admin2').rolling(window='547D', on='event_date').sum().reset_index().drop(columns=['level_1'])
features_1y6m_df['district_encoded'] = features_1y6m_df['admin2'].astype('category').cat.codes # Add codes for each district
features_1y6m_df.head()

Unnamed: 0,admin2,event_date,actor1_AMISOM: African Union Mission in Somalia (2007-),actor1_AMISOM: African Union Mission in Somalia (2007-) (Burundi),actor1_AMISOM: African Union Mission in Somalia (2007-) (Djibouti),actor1_AMISOM: African Union Mission in Somalia (2007-) (Ethiopia),actor1_AMISOM: African Union Mission in Somalia (2007-) (Kenya),actor1_AMISOM: African Union Mission in Somalia (2007-) (Uganda),actor1_ASWJ: Ahlu Sunna Wal Jamaa,actor1_Abdalla-Arab Sub-Clan Militia (Somalia),...,sub_event_type_Non-violent transfer of territory,sub_event_type_Other,sub_event_type_Peaceful protest,sub_event_type_Protest with intervention,sub_event_type_Remote explosive/landmine/IED,sub_event_type_Sexual violence,sub_event_type_Shelling/artillery/missile attack,sub_event_type_Suicide bomb,sub_event_type_Violent demonstration,district_encoded
0,Adan Yabaal,2016-01-01,2.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,Adan Yabaal,2019-01-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,Adan Yabaal,2019-07-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0,0
3,Adan Yabaal,2020-01-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,4.0,0.0,0.0,0
4,Adan Yabaal,2020-07-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,5.0,0.0,0.0,0


In [None]:
# merge the dataframes take into account the dates
features_1y6m_df['date_to_join'] = features_1y6m_df['event_date'] +  DateOffset(months=6)

merged_df = prevalence_df.merge(features_1y6m_df, left_on=['district', 'date'], right_on=['admin2', 'date_to_join'])
final_1y6m_df = merged_df.drop(columns=['date_to_join', 'prevalence_diff', 'event_date', 'admin2'])

# Remove the rows where the target variable is null
final_1y6m_df = final_1y6m_df[final_1y6m_df['next_prevalence'].notnull()]

len(final_1y6m_df)

483

### Random forest as feature selector

In [None]:
# Let's use the data after July 2020 as the test set, which corresponds to approximately 25% of the data
len(final_6m_df.query("date < '2020-07-01'")), len(final_6m_df.query("date >= '2020-07-01'"))

(365, 118)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, accuracy_score

def train_and_evaluate(final_df):
  # Train and test split
  X_train = final_df.query("date < '2020-07-01'")[final_df.columns.drop(['next_prevalence', 'date', 'district'])]
  y_train = final_df.query("date < '2020-07-01'")['next_prevalence']
  X_test = final_df.query("date >= '2020-07-01'")[final_df.columns.drop(['next_prevalence', 'date', 'district'])]
  y_test = final_df.query("date >= '2020-07-01'")['next_prevalence']

  # Create the parameter grid. Let's use the default parameters instead
  # param_grid = {
  #     'max_depth': [10, 50, 100, 500, 1000],
  #     'n_estimators': [10, 25, 50, 100, 300]
  # }

  # # Create a based model
  # rf = RandomForestRegressor(random_state=0)
  # grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, cv = 3, n_jobs = -1, verbose = 2)
  # grid_search.fit(X_train,y_train)

  rf = RandomForestRegressor(random_state=0)
  rf.fit(X_train,y_train)

  print("Parameters:", "max_depth:", rf.get_params()['max_depth'], "n_estimators:", rf.get_params()['n_estimators'])

  def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    mae = mean_absolute_error(test_labels, predictions)
    increase = np.where(test_labels > test_features['GAM Prevalence'], True, False)
    predicted_increase = np.where(predictions > test_features['GAM Prevalence'], True, False)
    acc = accuracy_score(increase, predicted_increase)
    print('MAE:', mae)
    print('Accuracy:', acc)

  grid_accuracy = evaluate(rf, X_test, y_test)

  return rf

In [None]:
model_6m = train_and_evaluate(final_6m_df)

Parameters: max_depth: None n_estimators: 100
MAE: 0.05040635232846461
Accuracy: 0.5847457627118644


In [None]:
model_1y = train_and_evaluate(final_1y_df)

Parameters: max_depth: None n_estimators: 100
MAE: 0.04964466829025728
Accuracy: 0.6186440677966102


In [None]:
model_1y6m = train_and_evaluate(final_1y6m_df)

Parameters: max_depth: None n_estimators: 100
MAE: 0.04629710139760963
Accuracy: 0.6779661016949152


In [None]:
# Seeing feature importance (0.5 year)
importance_6m = list(zip(model_6m.feature_names_in_,model_6m.feature_importances_))
importance_6m.sort(key = lambda x : x[1])
px.bar(importance_6m[-10:], x=1, y=0).show()

In [None]:
# Seeing feature importance (1 year)
importance_1y = list(zip(model_1y.feature_names_in_,model_1y.feature_importances_))
importance_1y.sort(key = lambda x : x[1])
px.bar(importance_1y[-10:], x=1, y=0).show()

In [None]:
# Seeing feature importance (1.5 year)
importance_1y6m = list(zip(model_1y6m.feature_names_in_,model_1y6m.feature_importances_))
importance_1y6m.sort(key = lambda x : x[1])
px.bar(importance_1y6m[-10:], x=1, y=0).show()

In [None]:
# mount drive first and change the path
base_path = '/content/drive/MyDrive/Data Challenge 3/Data/'
#final_6m_df[['date', 'next_prevalence', 'district'] + [x[0] for x in importance_6m[-10:]][::-1]].to_csv(base_path+'features_6m.csv', index=False)
#final_1y_df[['date', 'next_prevalence', 'district'] + [x[0] for x in importance_1y[-10:]][::-1]].to_csv(base_path+'features_1y.csv', index=False)
#final_1y6m_df[['date', 'next_prevalence', 'district'] + [x[0] for x in importance_1y6m[-10:]][::-1]].to_csv(base_path+'features_1y6m.csv', index=False)

## How does prevalence behave over time?

In [None]:
# GAM prevalence variation over time
px.line(prevalence_df, x="date", y="GAM Prevalence", color='district').show()

## Al shabaab attacks

In [None]:
most_important_features = [x[0] for x in importance_6m[-10:]] # ten most important
df = merged_df[['date', 'next_prevalence', 'district'] + most_important_features[::-1]]
df = df[df['next_prevalence'].notnull()]
df['next_prevalence'] = df['next_prevalence'].astype(float)
df.head(3)

Unnamed: 0,date,next_prevalence,district,GAM Prevalence,district_encoded,actor1_Al Shabaab,actor2_Al Shabaab,fatalities,interaction_28,actor2_AMISOM: African Union Mission in Somalia (2007-) (Ethiopia),interaction_27,number_conflicts,interaction_12
0,2017-07-01,0.169,Zeylac,0.3796,73,0.0,0.0,1.0,0.0,0.0,0.0,10.0,0.0
1,2017-07-01,0.2028,Buuhoodle,0.3406,20,1.0,1.0,41.0,0.0,0.0,1.0,70.0,1.0
2,2017-07-01,0.2886,Buur Hakaba,0.3588,21,8.0,35.0,104.0,13.0,0.0,7.0,69.0,21.0


In [None]:
px.scatter(df, x="actor2_Al Shabaab", y="next_prevalence").show()

In [None]:
df.corr()

Unnamed: 0,next_prevalence,GAM Prevalence,district_encoded,actor1_Al Shabaab,actor2_Al Shabaab,sub_event_type_Armed clash,actor2_Majeerteen Clan Militia (Somalia),number_conflicts,interaction_12,interaction_28,actor2_AMISOM: African Union Mission in Somalia (2007-) (Ethiopia)
next_prevalence,1.0,0.653154,0.145433,0.21181,0.150367,0.197688,-0.0153,0.18057,0.216898,0.179448,0.130138
GAM Prevalence,0.653154,1.0,0.14519,0.190452,0.119408,0.17032,-0.002609,0.155016,0.185298,0.133052,0.082912
district_encoded,0.145433,0.14519,1.0,-0.211572,-0.131934,-0.233101,0.030082,-0.24484,-0.216911,-0.128445,-0.09197
actor1_Al Shabaab,0.21181,0.190452,-0.211572,1.0,0.221684,0.891257,0.007381,0.847289,0.868357,0.355837,0.107356
actor2_Al Shabaab,0.150367,0.119408,-0.131934,0.221684,1.0,0.558226,0.047015,0.586722,0.607453,0.683594,-0.014101
sub_event_type_Armed clash,0.197688,0.17032,-0.233101,0.891257,0.558226,1.0,0.063165,0.966195,0.956299,0.455821,0.04992
actor2_Majeerteen Clan Militia (Somalia),-0.0153,-0.002609,0.030082,0.007381,0.047015,0.063165,1.0,0.060958,0.03306,0.02473,0.023791
number_conflicts,0.18057,0.155016,-0.24484,0.847289,0.586722,0.966195,0.060958,1.0,0.920744,0.421077,0.045667
interaction_12,0.216898,0.185298,-0.216911,0.868357,0.607453,0.956299,0.03306,0.920744,1.0,0.45701,0.049506
interaction_28,0.179448,0.133052,-0.128445,0.355837,0.683594,0.455821,0.02473,0.421077,0.45701,1.0,0.079622


## investigating conflict over time

In [None]:
acled_df.head(1)

Unnamed: 0,event_date,year,event_type,sub_event_type,actor1,assoc_actor_1,inter1,actor2,assoc_actor_2,inter2,...,admin1,admin2,location,latitude,longitude,source,source_scale,notes,fatalities,timestamp
3661,2021-06-30,2021,Violence against civilians,Attack,Al Shabaab,,2,Civilians (Somalia),,7,...,Middle Juba,Jilib,Kuunyo-Barrow,0.7946,43.3787,Caasimada,National,"On 30 June 2021, Al Shabaab militants shot and...",8,1625510720


In [None]:
#let's see number of fatalities per district over time
px.line(acled_df, x="event_date", y="fatalities", color='admin2').show()

In [None]:
# you can't get much information just through the visualization

In [None]:
px.bar(acled_df.groupby('year')['fatalities'].sum().reset_index(), x='year', y='fatalities').show()

In [None]:
px.bar(acled_df.groupby('event_type')['fatalities'].sum().reset_index(), x='event_type', y='fatalities').show()

In [None]:
px.bar(acled_df.groupby('sub_event_type')['fatalities'].sum().reset_index(), x='sub_event_type', y='fatalities').show()

In [None]:
px.bar(acled_df.groupby('actor1')['fatalities'].sum().reset_index().sort_values('fatalities', ascending=False).head(10), x='actor1', y='fatalities').show()

## For Ethical Essay

In [17]:
merged_df = prevalence_df.merge(features_df, left_on=['district', 'date'], right_on=['admin2', 'date_to_join'])
merged_df = merged_df[merged_df['next_prevalence'].notnull()]
len(merged_df)

483

### Districts representation in the data

In [44]:
# which districts are under-represented in the data
districts_count = merged_df.groupby('district')['district'].count().sort_values() #if you want to sort values
px.bar(districts_count).show()

In [42]:
# How many districts from all districts were dropped and aren't seen in the chart
set(all_districts) - set(districts_count.index)

set()

### Events subtype

In [76]:
#acled_df.groupby(['event_type', 'sub_event_type']).size().reset_index(name='count')
acled_df.groupby(['event_type', 'sub_event_type']).size().reset_index(name='count').sort_values(['event_type', 'count'], ascending=[True, False])

Unnamed: 0,event_type,sub_event_type,count
0,Battles,Armed clash,6461
1,Battles,Government regains territory,236
2,Battles,Non-state actor overtakes territory,140
5,Explosions/Remote violence,Remote explosive/landmine/IED,1835
3,Explosions/Remote violence,Air/drone strike,675
4,Explosions/Remote violence,Grenade,491
6,Explosions/Remote violence,Shelling/artillery/missile attack,445
7,Explosions/Remote violence,Suicide bomb,83
9,Protests,Peaceful protest,502
10,Protests,Protest with intervention,72


### Events with or without fatalities

In [86]:
# Count of events with fatalities or no fatalities
no_fatalities = acled_df.query('fatalities == 0').groupby('sub_event_type').size().reset_index(name='count')
no_fatalities['fatalities'] = 'no fatalities'
with_fatalities = acled_df.query('fatalities > 0').groupby('sub_event_type').size().reset_index(name='count')
with_fatalities['fatalities'] = 'with fatalities'

all_events = pd.concat([no_fatalities, with_fatalities])
px.bar(all_events, x='sub_event_type', y='count', color='fatalities', barmode='group').show()

## The end