In [17]:
import pandas as pd
import numpy as np
import matplotlib
import sklearn


from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [2]:
#Importing Data
data = pd.read_csv("data/ACLED2021-2024.csv")

38130 rows × 32 columns - Original Dataset Dimensions

In [3]:
#Reformatting / Cleaning

data.drop(columns=['time_precision', 'assoc_actor_1', 'assoc_actor_2', 'iso', 'region', 'admin3', 'location', 
                  'latitude', 'longitude', 'geo_precision', 'source_scale', 'timestamp', 'tags', 'population_best', 'event_id_cnty'], 
          inplace=True)

data['event_date'] = pd.to_datetime(data['event_date'], errors = 'coerce') #changing to datetime
data = data.dropna(subset=['event_date'])

#Addressing NA values - Categorical, NUmerical and date
categorical_columns = ['disorder_type', 'event_type', 'sub_event_type', 'actor1', 'actor2', 'civilian_targeting', 
                       'country', 'admin1', 'admin2', 'source', 'notes']

for column in categorical_columns:
    data[column] = data[column].fillna('Not specified')
    
numerical_columns = ['fatalities', 'inter1', 'inter2', 'interaction']

for column in numerical_columns:
    data[column] = data[column].fillna(data[column].median()) #using median to fill
    
print(data.isna().sum())
#data.loc[:, 'actor2'] = data['actor2'].fillna('Not specified')
#data.loc[:, 'civilian_targeting'] = data['civilian_targeting'].fillna('Not specified')

event_date            0
year                  0
disorder_type         0
event_type            0
sub_event_type        0
actor1                0
inter1                0
actor2                0
inter2                0
interaction           0
civilian_targeting    0
country               0
admin1                0
admin2                0
source                0
notes                 0
fatalities            0
dtype: int64


0 Missing Values within Dataset :)

In [4]:
# Dropping Dupes
initial_row_count = data.shape[0]
data = data.drop_duplicates()
final_row_count = data.shape[0]
print(f"Removed {initial_row_count - final_row_count} duplicates")

Removed 40 duplicates


38090 rows × 17 columns - After cleaning

In [5]:
data

Unnamed: 0,event_date,year,disorder_type,event_type,sub_event_type,actor1,inter1,actor2,inter2,interaction,civilian_targeting,country,admin1,admin2,source,notes,fatalities
0,2024-04-19,2024,Political violence,Explosions/Remote violence,Shelling/artillery/missile attack,Hamas Movement,3,Not specified,0,30,Not specified,Israel,HaDarom,Ashqelon,N12; Times of Israel,"Interception: On 19 April 2024, Hamas militant...",0
1,2024-04-19,2024,Demonstrations,Protests,Peaceful protest,Protesters (Israel),6,Not specified,0,60,Not specified,Israel,Haifa,Hadera,Ynet,"On 19 April 2024, thousands formed a human cha...",0
2,2024-04-19,2024,Demonstrations,Riots,Violent demonstration,Rioters (Israel),5,Rioters (Israel),5,55,Not specified,Israel,HaMerkaz,Ramla,Haaretz; Jerusalem Post; N12; Times of Israel;...,"On 19 April 2024, dozens of relatives of hosta...",0
3,2024-04-19,2024,Demonstrations,Protests,Peaceful protest,Protesters (Israel),6,Not specified,0,60,Not specified,Israel,HaMerkaz,Rehovot,Ynet,"On 19 April 2024, thousands formed a human cha...",0
4,2024-04-19,2024,Political violence,Battles,Armed clash,Hezbollah,3,Military Forces of Israel (2022-),1,13,Not specified,Israel,HaZafon,Zefat,Jerusalem Post; Ma'ariv; Times of Israel,"On 19 April 2024, Hezbollah forces in Lebanon ...",0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38125,2021-04-20,2021,Strategic developments,Strategic developments,Other,Military Forces of Israel (2009-2021),8,Civilians (Palestine),7,78,Not specified,Palestine,Gaza Strip,North Gaza,PLO Negotiations Affairs Department,"Other: On 20 April 2021, Israeli military forc...",0
38126,2021-04-20,2021,Political violence,Riots,Mob violence,Rioters (Israel),5,Not specified,0,50,Not specified,Palestine,West Bank,Hebron,PLO Negotiations Affairs Department,"On 20 April 2021, Israeli settlers uprooted 20...",0
38127,2021-04-20,2021,Demonstrations,Protests,Peaceful protest,Protesters (Palestine),6,Not specified,0,60,Not specified,Palestine,Gaza Strip,Deir El Balah,Ma'an News Agency,"On 20 April 2021, a large protest organized by...",0
38128,2021-04-20,2021,Political violence,Riots,Mob violence,Rioters (Israel),5,Civilians (Palestine),7,57,Civilian targeting,Palestine,West Bank,Ramallah and Al Bireh,PLO Negotiations Affairs Department,"On 20 April 2021, Israeli settlers from the Ba...",0


In [6]:
#Grouping Together Actor1 and 2
def consolidate_names(name):
    if 'Military Forces of Israel' in name:
        return 'Military Forces of Israel'
    elif 'Police Forces of Israel' in name:
        return 'Police Forces of Israel'
    elif 'Hamas Movement' in name:
        return 'Hamas Movement'
    elif 'Police Forces of Israel' in name or 'Government of Israel' in name:
        return 'Government and Police Forces of Israel'
    elif 'Police Forces of Palestine' in name or 'Government of Palestine' in name:
        return 'Government and Police Forces of Palestine'
    elif 'PIJ:' in name or 'Islamic Jihad' in name:
        return 'Palestinian Islamic Jihad'
    elif 'Hezbollah' in name:
        return 'Hezbollah'
    elif 'Al Aqsa' in name:
        return 'Al Aqsa Martyrs Brigade'
    elif 'Katibat' in name:
        return 'Katibat Groups (Palestine)'
    elif 'PFLP:' in name:
        return 'Popular Front for the Liberation of Palestine'
    elif 'DFLP:' in name:
        return 'Democratic Front for the Liberation of Palestine'
    elif 'Military Forces of Iran' in name:
        return 'Iranian Revolutionary Guard Corps'
    elif 'Islamic State' in name:
        return 'Islamic State'
#civilians
    elif 'Civilians' in name:
        if 'Israel' in name or 'Palestine' in name:
            return name  #Keeping isr and pal civilians
        else:
            return 'Civilians (International)'  # grouping others as int.
#armed groups
    elif 'Unidentified Armed Group' in name:
        if 'Israel' in name or 'Palestine' in name:
            return name  
        else:
            return 'Unidentified Armed Group (International)'
#military forces
    elif 'Military Forces of' in name:
        if 'Israel' in name or 'Palestine' in name:
            return name  
        else:
            return 'Military Forces of International Forces'
    elif 'Settlers' in name:
        return 'Settlers (Israel)'
    elif 'Protesters' in name or 'Rioters' in name:
        return name  # Retains specific categories due to their distinct contexts
    else:
        return name #'Other Groups' 

# Apply the consolidation function to both actor1 and actor2
data['actor1_grouped'] = data['actor1'].apply(consolidate_names)
data['actor2_grouped'] = data['actor2'].apply(consolidate_names)

In [7]:
actor1_counts = data['actor1_grouped'].value_counts()
actor2_counts = data['actor2_grouped'].value_counts()

def consolidate_small_groups(name, counts):
    # Check if the group count is less than 10
    if counts[name] < 10:
        # Check if "Israel" or "Palestine" is in the original name for appropriate categorization
        if 'Israel' in name:
            return 'Other (Israel)'
        elif 'Palestine' in name:
            return 'Other (Palestine)'
        else:
            return name #'Other Group'
    else:
        # Return the name if the count is 10 or more
        return name

# Apply the consolidation function to both actor1_grouped and actor2_grouped
data['actor1_grouped'] = data['actor1_grouped'].apply(lambda x: consolidate_small_groups(x, actor1_counts))
data['actor2_grouped'] = data['actor2_grouped'].apply(lambda x: consolidate_small_groups(x, actor2_counts))


# Print the new value counts to confirm re-categorization
#print(data['actor1_grouped'].value_counts())
#print(data['actor2_grouped'].value_counts())


In [8]:
data['actor1'] = data['actor1_grouped']
data['actor2'] = data['actor2_grouped']

data.drop(['actor1_grouped', 'actor2_grouped'], axis=1, inplace=True)

In [9]:
#Encoding

#One-Hot for Categoricals
categorical_cols = ['disorder_type', 'event_type', 'actor1', 'actor2', 'civilian_targeting',
                    'country', 'admin1', 'admin2']
data_encoded = pd.get_dummies(data, columns=categorical_cols)
data_encoded

#Label Encoder
label_encoders = {}

for col in ['inter1', 'inter2', 'interaction']:
    le = LabelEncoder()
    data_encoded[col] = le.fit_transform(data_encoded[col])
    label_encoders[col] = le  # storing the encoder

In [10]:
for col in categorical_cols:
    print(f"{col}: {data[col].nunique()} unique values")
data_encoded

disorder_type: 4 unique values
event_type: 6 unique values
actor1: 50 unique values
actor2: 33 unique values
civilian_targeting: 2 unique values
country: 2 unique values
admin1: 9 unique values
admin2: 31 unique values


Unnamed: 0,event_date,year,sub_event_type,inter1,inter2,interaction,source,notes,fatalities,disorder_type_Demonstrations,...,admin2_Ramallah and Al Bireh,admin2_Ramla,admin2_Rehovot,admin2_Salfit,admin2_Sharon,admin2_Tel Aviv,admin2_Tubas,admin2_Tulkarm,admin2_Yizreel,admin2_Zefat
0,2024-04-19,2024,Shelling/artillery/missile attack,2,0,9,N12; Times of Israel,"Interception: On 19 April 2024, Hamas militant...",0,0,...,0,0,0,0,0,0,0,0,0,0
1,2024-04-19,2024,Peaceful protest,5,0,21,Ynet,"On 19 April 2024, thousands formed a human cha...",0,1,...,0,0,0,0,0,0,0,0,0,0
2,2024-04-19,2024,Violent demonstration,4,4,17,Haaretz; Jerusalem Post; N12; Times of Israel;...,"On 19 April 2024, dozens of relatives of hosta...",0,1,...,0,1,0,0,0,0,0,0,0,0
3,2024-04-19,2024,Peaceful protest,5,0,21,Ynet,"On 19 April 2024, thousands formed a human cha...",0,1,...,0,0,1,0,0,0,0,0,0,0
4,2024-04-19,2024,Armed clash,2,1,3,Jerusalem Post; Ma'ariv; Times of Israel,"On 19 April 2024, Hezbollah forces in Lebanon ...",0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38125,2021-04-20,2021,Other,7,6,26,PLO Negotiations Affairs Department,"Other: On 20 April 2021, Israeli military forc...",0,0,...,0,0,0,0,0,0,0,0,0,0
38126,2021-04-20,2021,Mob violence,4,0,16,PLO Negotiations Affairs Department,"On 20 April 2021, Israeli settlers uprooted 20...",0,0,...,0,0,0,0,0,0,0,0,0,0
38127,2021-04-20,2021,Peaceful protest,5,0,21,Ma'an News Agency,"On 20 April 2021, a large protest organized by...",0,1,...,0,0,0,0,0,0,0,0,0,0
38128,2021-04-20,2021,Mob violence,4,6,19,PLO Negotiations Affairs Department,"On 20 April 2021, Israeli settlers from the Ba...",0,0,...,1,0,0,0,0,0,0,0,0,0


In [11]:
# Convert 'event_date' to the number of days since the start of the dataset
data_encoded['days_since_start'] = (data_encoded['event_date'] - data_encoded['event_date'].min()).dt.days

In [12]:
data_encoded['year'] = data_encoded['event_date'].dt.year
data_encoded['month'] = data_encoded['event_date'].dt.month
data_encoded['day'] = data_encoded['event_date'].dt.day
data_encoded

Unnamed: 0,event_date,year,sub_event_type,inter1,inter2,interaction,source,notes,fatalities,disorder_type_Demonstrations,...,admin2_Salfit,admin2_Sharon,admin2_Tel Aviv,admin2_Tubas,admin2_Tulkarm,admin2_Yizreel,admin2_Zefat,days_since_start,month,day
0,2024-04-19,2024,Shelling/artillery/missile attack,2,0,9,N12; Times of Israel,"Interception: On 19 April 2024, Hamas militant...",0,0,...,0,0,0,0,0,0,0,1095,4,19
1,2024-04-19,2024,Peaceful protest,5,0,21,Ynet,"On 19 April 2024, thousands formed a human cha...",0,1,...,0,0,0,0,0,0,0,1095,4,19
2,2024-04-19,2024,Violent demonstration,4,4,17,Haaretz; Jerusalem Post; N12; Times of Israel;...,"On 19 April 2024, dozens of relatives of hosta...",0,1,...,0,0,0,0,0,0,0,1095,4,19
3,2024-04-19,2024,Peaceful protest,5,0,21,Ynet,"On 19 April 2024, thousands formed a human cha...",0,1,...,0,0,0,0,0,0,0,1095,4,19
4,2024-04-19,2024,Armed clash,2,1,3,Jerusalem Post; Ma'ariv; Times of Israel,"On 19 April 2024, Hezbollah forces in Lebanon ...",0,0,...,0,0,0,0,0,0,1,1095,4,19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38125,2021-04-20,2021,Other,7,6,26,PLO Negotiations Affairs Department,"Other: On 20 April 2021, Israeli military forc...",0,0,...,0,0,0,0,0,0,0,0,4,20
38126,2021-04-20,2021,Mob violence,4,0,16,PLO Negotiations Affairs Department,"On 20 April 2021, Israeli settlers uprooted 20...",0,0,...,0,0,0,0,0,0,0,0,4,20
38127,2021-04-20,2021,Peaceful protest,5,0,21,Ma'an News Agency,"On 20 April 2021, a large protest organized by...",0,1,...,0,0,0,0,0,0,0,0,4,20
38128,2021-04-20,2021,Mob violence,4,6,19,PLO Negotiations Affairs Department,"On 20 April 2021, Israeli settlers from the Ba...",0,0,...,0,0,0,0,0,0,0,0,4,20


In [14]:
data_encoded_numeric = data_encoded.drop('event_date', axis=1, inplace=True)
data_encoded_numeric = data_encoded.drop('source', axis=1, inplace=True)
data_encoded_numeric = data_encoded.drop('notes', axis=1, inplace=True)

X = data_encoded.drop('sub_event_type', axis=1)
y = data_encoded['sub_event_type']

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train_numeric = X_train.select_dtypes(include=[np.number])
X_test_numeric = X_test.select_dtypes(include=[np.number])

# Fit a Random Forest to get feature importances
forest = RandomForestClassifier()
forest.fit(X_train_numeric, y_train)

# Get feature importances
importances = forest.feature_importances_
features = X_train_numeric.columns
indices = np.argsort(importances)[::-1]

# Print the feature rankings
print("Feature ranking:")
num_features = len(features)
for f in range(num_features):
    print(f"{f + 1}. {features[indices[f]]} ({importances[indices[f]]})")

Feature ranking:
1. event_type_Riots (0.07118375869080258)
2. interaction (0.06592534456225263)
3. disorder_type_Demonstrations (0.0625630162094425)
4. disorder_type_Political violence (0.061748472668947965)
5. event_type_Explosions/Remote violence (0.05770530710901605)
6. inter1 (0.05569358999427826)
7. event_type_Protests (0.0527333866545305)
8. event_type_Battles (0.048724588124481084)
9. inter2 (0.04389263849639899)
10. days_since_start (0.04338107490156639)
11. admin1_West Bank (0.035324967891359645)
12. event_type_Violence against civilians (0.028615786078861647)
13. actor1_Rioters (Palestine) (0.02681743045044884)
14. admin1_Gaza Strip (0.025929011561359627)
15. disorder_type_Strategic developments (0.024249176802877387)
16. day (0.023604069568377826)
17. actor1_Military Forces of Israel (0.023273923849387184)
18. actor2_Not specified (0.022308667401829196)
19. event_type_Strategic developments (0.02209249293079767)
20. civilian_targeting_Not specified (0.01737014347846997)
21. 

In [18]:
# Select only the top N important features for simplicity
top_features = ['event_type_Riots', 'disorder_type_Political violence', 'interaction', 'disorder_type_Demonstrations', 
                'event_type_Explosions/Remote violence', 'inter1', 'event_type_Battles', 'inter2', 'days_since_start',
               'event_type_Protests']

X = data_encoded[top_features]
y = data_encoded['sub_event_type']  # Make sure this is aligned with your full dataset

# Splitting the dataset into training and testing sets
X_train_top, X_test_top, y_train_top, y_test_top = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the RandomForestClassifier with a focus on top features
forest_top = RandomForestClassifier()
forest_top.fit(X_train_top, y_train_top)

# Predicting and evaluating the model
y_pred_top = forest_top.predict(X_test_top)
print(classification_report(y_test_top, y_pred_top))


                                     precision    recall  f1-score   support

     Abduction/forced disappearance       0.00      0.00      0.00         3
                          Agreement       0.00      0.00      0.00         0
                   Air/drone strike       0.79      0.79      0.79       786
                        Armed clash       1.00      1.00      1.00       696
                            Arrests       0.33      0.33      0.33        18
                             Attack       0.99      0.99      0.99       400
           Change to group/activity       0.85      0.90      0.88        39
              Disrupted weapons use       0.95      0.95      0.95       197
 Excessive force against protesters       1.00      1.00      1.00        32
                            Grenade       0.55      0.46      0.50        39
       Looting/property destruction       0.72      0.72      0.72       452
                       Mob violence       1.00      1.00      1.00      200

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
