# Violent Events Data Collection and Preprocessing
### Violence Events Data
The data are collected from ACLED; see the
[ACLED](https://acleddata.com/resources/quick-guide-to-acled-data/#s11) quick guide for an overview
and the [data definition](https://acleddata.com/knowledge-base/codebook/) (codebook) for the meaning of each column.

### Import Libraries

In [3]:
import pandas as pd
import os
from pandas import read_csv
import geopandas as gpd
from geopandas import read_file
import matplotlib.pyplot as plt
%matplotlib inline

## Helper Function

In [4]:
def aggregate_violence_events(df, group_by_cols, agg_cols):
    '''
    Aggregate violence events per location within user-defined groups.

    Every input row is counted as one event. Rows are grouped by
    ``group_by_cols`` plus ``latitude`` and ``longitude`` (so each output row
    remains a point), then the event counter and every column listed in
    ``agg_cols`` are summed within each group.

    Parameters:
    - df (pd.DataFrame): DataFrame containing violence event data. It must
      contain the grouping columns, 'latitude', 'longitude' and every column
      in ``agg_cols``. The DataFrame is not modified.
    - group_by_cols (list): List of columns to group by.
    - agg_cols (list): List of columns to sum within each group.

    Returns:
    - pd.DataFrame: One row per group and coordinate pair, holding the
      grouping columns, 'no_of_events' and the summed ``agg_cols``.
    '''
    group_by_cols = list(group_by_cols)
    agg_cols = list(agg_cols)

    # Add the per-row counter on a copy so the caller's DataFrame is left untouched
    events = df.assign(no_of_events=1)

    # Include latitude and longitude in the group by columns to keep as point,
    # without repeating them when the caller already groups by coordinates
    group_keys = group_by_cols + [
        coord for coord in ('latitude', 'longitude') if coord not in group_by_cols
    ]

    # Sum the counter first, then every requested column (not just 'fatalities')
    value_cols = ['no_of_events'] + [col for col in agg_cols if col != 'no_of_events']

    aggregated_df = events.groupby(group_keys)[value_cols].sum().reset_index()

    return aggregated_df

## Visualization Function

In [5]:
def visualize_events_trends_by_region(df, column_value, group_by_column, save_path=None):
    '''
    Visualize the trends of violence events and fatalities by year and by a specified column value (region or district).

    Only rows with 'year' >= 2008 are plotted. The DataFrame must contain the
    columns 'year', 'no_of_events', 'fatalities' and ``group_by_column``.

    Parameters:
    - df (pd.DataFrame): DataFrame containing event data.
    - column_value (str): Value of the specified column to filter the data.
      A falsy value (None or '') disables the filter.
    - group_by_column (str): Name of the column to group by (region or district).
    - save_path (str, optional): Path to save the plot. Default is None.

    Returns:
    - None
    '''
    # Filter data from 2008 to present
    df_filtered = df[df['year'] >= 2008]

    if column_value:
        df_filtered = df_filtered[df_filtered[group_by_column] == column_value]
        # Only one group is left after filtering, so index by year alone; this
        # gives a plain year axis instead of (year, group) tuples
        group_keys = 'year'
        title = f'Trends of Number of Events and Fatalities by Year in {column_value} {group_by_column.capitalize()}'
    else:
        # No filter: keep one total per (year, group) pair
        group_keys = ['year', group_by_column]
        # Avoid rendering "in None ..." in the title when no value was selected
        title = f'Trends of Number of Events and Fatalities by Year and {group_by_column.capitalize()}'

    # Sum the number of events and fatalities within each group
    events_trends = df_filtered.groupby(group_keys)[['no_of_events', 'fatalities']].sum()

    # Plot the trends
    plt.figure(figsize=(10, 6))
    events_trends['no_of_events'].plot(marker='o', linestyle='-', label='Number of Violence Events')
    events_trends['fatalities'].plot(marker='x', linestyle='-', label='Number of Fatalities')
    plt.title(title)
    plt.xlabel('Year')
    plt.ylabel('Count')
    plt.legend()

    # Save the plot if save_path is provided
    if save_path:
        plt.savefig(save_path)

    plt.show()

In [6]:
def visualize_events_trends(df, save_path=None):
    '''
    Plot the yearly totals of 'no_of_events' and 'fatalities' from 2008 onwards.

    Parameters:
    - df (pd.DataFrame): DataFrame containing event data; it must provide the
      columns 'year', 'no_of_events' and 'fatalities'.
    - save_path (str, optional): Path to save the plot. Default is None.

    Returns:
    - None
    '''
    # Keep 2008 and later, then total both measures per year
    recent = df.loc[df['year'] >= 2008]
    yearly_totals = recent.groupby('year')[['no_of_events', 'fatalities']].sum()

    # One line per measure: (column, marker, legend label)
    line_specs = (
        ('no_of_events', 'o', 'Number of Violence Events'),
        ('fatalities', 'x', 'Number of Fatalities'),
    )

    fig, ax = plt.subplots(figsize=(10, 6))
    for column, marker, label in line_specs:
        yearly_totals[column].plot(ax=ax, marker=marker, linestyle='-', label=label)

    ax.set_title('Trends of Number of Events and Fatalities per Year')
    ax.set_xlabel('Year')
    ax.set_ylabel('Count')
    ax.legend()

    # Write the figure to disk only when a path was given
    if save_path:
        fig.savefig(save_path)

    plt.show()

## Processing of Violent Events Data

In [7]:
def process_and_save_files(data, shapefile_path, save_directory, prefix, region_col=None):
    """
    Expand the event data to every district/year combination and save it as CSV and GeoJSON.

    The district list is read from the shapefile, crossed with every year found
    in ``data``, and left-joined with ``data`` on ('district', 'year').
    District/years without any event get 0 for 'no_of_events' and 'fatalities';
    their 'latitude' and 'longitude' stay null (nothing is interpolated).

    Parameters:
    - data: pandas DataFrame with the columns 'year', 'district', 'no_of_events',
      'fatalities', 'latitude' and 'longitude'.
    - shapefile_path: str, file path to the shapefile to be used. It must have a
      'district' column and a first-level administrative column.
    - save_directory: str, directory path to save the CSV and GeoJSON files
      (created if missing).
    - prefix: str, prefix to be used for the saved files.
    - region_col: str, optional. Name of the first-level administrative column
      in the shapefile. When None, 'province' is used if present, otherwise
      'region'.

    Raises:
    - KeyError: if ``region_col`` is None and the shapefile has neither a
      'province' nor a 'region' column.
    """

    # Ensure the save directory exists (exist_ok avoids a check-then-create race)
    os.makedirs(save_directory, exist_ok=True)

    # Read the shapefile
    shapefile_gdf = gpd.read_file(shapefile_path)

    # Work out which administrative column this country's shapefile uses
    if region_col is None:
        region_col = next(
            (name for name in ('province', 'region') if name in shapefile_gdf.columns),
            None,
        )
        if region_col is None:
            raise KeyError(
                "shapefile has neither a 'province' nor a 'region' column; "
                "pass region_col explicitly"
            )

    # Keep one row per (region, district); repeated shapefile rows would
    # otherwise multiply the output rows
    admin_units = shapefile_gdf[[region_col, 'district']].drop_duplicates()

    # Get the unique years from the DataFrame
    unique_years = data['year'].unique()

    # One row per district and year, carrying the district's region along
    expanded_gdf = pd.DataFrame(
        [
            (district, year, region)
            for region, district in admin_units.itertuples(index=False, name=None)
            for year in unique_years
        ],
        columns=['district', 'year', region_col],
    )

    # Attach the event data; district/years without events get nulls here
    merged_df = expanded_gdf.merge(
        data[['year', 'district', 'no_of_events', 'fatalities', 'latitude', 'longitude']],
        how='left',
        on=['district', 'year'],
    )

    # No matching event row means zero events and zero fatalities
    count_columns = ['no_of_events', 'fatalities']
    merged_df[count_columns] = merged_df[count_columns].fillna(0)

    # Generate filenames based on the provided prefix
    csv_filename = f'{prefix}_violence_events.csv'
    geojson_filename = f'{prefix}_violence_events.geojson'

    # Save the merged data as a new CSV file
    csv_path = os.path.join(save_directory, csv_filename)
    merged_df.to_csv(csv_path, index=False)
    print(f"CSV file saved at: {csv_path}")

    # Save the merged DataFrame as GeoJSON, one point per row.
    # NOTE(review): rows without events have null coordinates and no CRS is
    # set on the GeoDataFrame - confirm downstream readers accept both.
    geojson_path = os.path.join(save_directory, geojson_filename)
    merged_gdf = gpd.GeoDataFrame(merged_df, geometry=gpd.points_from_xy(merged_df.longitude, merged_df.latitude))
    merged_gdf.to_file(geojson_path, driver='GeoJSON')
    print(f"GeoJSON file saved at: {geojson_path}")

### Tanzania Violence Events Data Processing

In [8]:
# Root directory of the Tanzania inputs. Keep the trailing slash: the cells
# below build file paths from it by plain string concatenation.
data_dir = 'tanzania_data/'

In [9]:
# Read the original Tanzania violence-events CSV into a DataFrame
data = read_csv(data_dir+'violence_events_data/original/data_violence_events.csv', delimiter=',')
# Preview the first rows to check which columns were loaded
data.head()

Unnamed: 0,event_date,month,year,event_type,civilian_targeting,country,region,district,ward,location,latitude,longitude,fatalities
0,24 January 2024,January,2024,Protests,,Tanzania,Dar es Salaam,Ilala,Mchafukoge,Dar es Salaam,-6.8167,39.2833,0
1,24 January 2024,January,2024,Riots,Civilian targeting,Tanzania,Dar es Salaam,Ilala,Mchafukoge,Dar es Salaam,-6.8167,39.2833,0
2,06 January 2024,January,2024,Violence against civilians,Civilian targeting,Tanzania,Geita,Geita,Mtakuja,Geita,-2.8666,32.1667,1
3,31 December 2023,December,2023,Strategic developments,,Tanzania,Dar es Salaam,Ilala,Mchafukoge,Dar es Salaam,-6.8167,39.2833,0
4,03 December 2023,December,2023,Protests,,Tanzania,Manyara,Babati Urban,Babati,Babati,-4.2167,35.75,0


#### Aggregate the Number of Events & Fatalities

In [10]:
# Count events and sum fatalities per year, region and district; the helper
# also groups by latitude/longitude, so each distinct point gets its own row
data_violence = aggregate_violence_events(data,['year','region','district'], ['fatalities'])
# Preview the aggregated result
data_violence.head()

Unnamed: 0,year,region,district,latitude,longitude,no_of_events,fatalities
0,1997,Dar es Salaam,Ilala,-6.8167,39.2833,1,0
1,1997,Kigoma,Kigoma Urban,-4.8769,29.6267,3,6
2,1997,Kigoma,Kigoma Urban,-4.8572,29.6439,1,20
3,1998,Arusha,Ngorongoro,-3.25,35.5167,2,10
4,1998,Dar es Salaam,Ilala,-6.8167,39.2833,3,10


#### Processing the data

In [12]:
# Arguments for process_and_save_files: path to the Tanzania district
# shapefile, the output directory and the prefix used in the output file names
shapefile = data_dir + 'shapefiles/tz_districts.shp'
save_directory = data_dir+ 'violence_events_data/processed/'
prefix = 'tz'
# The export step is disabled here; uncomment the call to write the CSV and GeoJSON files
#process_and_save_files(data_violence, shapefile, save_directory, prefix)

### Rwanda Violence Events Data Processing

In [13]:
# Root directory of the Rwanda inputs; this rebinds the data_dir set for
# Tanzania above. Keep the trailing slash: paths are built by string concatenation.
data_dir = 'rwanda_data/'

In [14]:
# Read the original Rwanda violence-events CSV, replacing the Tanzania DataFrame in `data`
data = read_csv(data_dir+'violence_events_data/original/data_violence_events.csv', delimiter=',')
# Preview the first rows to check which columns were loaded
data.head()

Unnamed: 0,event_date,month,year,event_type,civilian_targeting,country,province,province2,district,sector,location,latitude,longitude,fatalities
0,29 January 2024,January,2024,Violence against civilians,Civilian targeting,Rwanda,Iburengerazuba,West,Nyabihu,Bigogwe,Kijote,-1.6243,29.4341,1
1,25 January 2024,January,2024,Strategic developments,,Rwanda,Amajyepfo,South,Huye,Ngoma,Butare,-2.5966,29.7394,0
2,16 January 2024,January,2024,Battles,,Rwanda,Iburengerazuba,West,Rubavu,Rubavu,Gafuku,-1.6613,29.2816,1
3,10 January 2024,January,2024,Violence against civilians,Civilian targeting,Rwanda,Umujyi wa Kigali,Kigali City,Nyarugenge,Nyamirambo,Nyamirambo,-1.9804,30.0489,0
4,05 December 2023,December,2023,Strategic developments,,Rwanda,Iburasirazuba,East,Kirehe,Mahama,Mahama,-2.274,30.8052,0


In [15]:
#visualize_events_trends_by_region(data_violence,'Temeke','district')

#### Aggregate the Number of Events & Fatalities

In [16]:
# Count events and sum fatalities per year, province and district; the helper
# also groups by latitude/longitude, so each distinct point gets its own row
data_violence = aggregate_violence_events(data,['year','province','district'], ['fatalities'])
# Preview the aggregated result
data_violence.head()

Unnamed: 0,year,province,district,latitude,longitude,no_of_events,fatalities
0,1997,Amajyaruguru,Burera,-1.5711,29.8702,2,86
1,1997,Amajyaruguru,Burera,-1.5355,29.8394,1,24
2,1997,Amajyaruguru,Burera,-1.5288,29.8855,1,2
3,1997,Amajyaruguru,Burera,-1.4405,29.7094,2,5
4,1997,Amajyaruguru,Burera,-1.435,29.7267,2,18


#### Processing The Data

In [17]:
# Arguments for process_and_save_files: path to the Rwanda district
# shapefile, the output directory and the prefix used in the output file names
shapefile = data_dir + 'shapefiles/rw_district.shp'
save_directory = data_dir+ 'violence_events_data/processed/'
prefix = 'rw'
# The export step is disabled here; uncomment the call to write the CSV and GeoJSON files
#process_and_save_files(data_violence, shapefile, save_directory, prefix)