In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import seaborn as sns

# Assuming 'mi_df' is already created and contains the MI scores

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, cross_val_score, train_test_split, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.feature_selection import RFECV, mutual_info_classif
from sklearn.linear_model import LogisticRegression

In [2]:
#Importing Data
data = pd.read_csv("data/ACLED2021-2024.csv")


data.drop(columns=['time_precision', 'assoc_actor_1', 'assoc_actor_2', 'iso', 'region', 'admin3', 'location', 
                  'latitude', 'longitude', 'geo_precision', 'source_scale', 'timestamp', 'tags', 'population_best', 'event_id_cnty'], 
          inplace=True)
data['event_date'] = pd.to_datetime(data['event_date'], errors = 'coerce') #changing to datetime
#data = data.dropna(subset=['event_date'])

38130 rows × 32 columns - Original Dataset Dimensions

In [3]:
# Dropping Dupes
initial_row_count = data.shape[0]
data = data.drop_duplicates()
final_row_count = data.shape[0]
print(f"Removed {initial_row_count - final_row_count} duplicates")
data

Removed 40 duplicates


Unnamed: 0,event_date,year,disorder_type,event_type,sub_event_type,actor1,inter1,actor2,inter2,interaction,civilian_targeting,country,admin1,admin2,source,notes,fatalities
0,2024-04-19,2024,Political violence,Explosions/Remote violence,Shelling/artillery/missile attack,Hamas Movement,3,,0,30,,Israel,HaDarom,Ashqelon,N12; Times of Israel,"Interception: On 19 April 2024, Hamas militant...",0
1,2024-04-19,2024,Demonstrations,Protests,Peaceful protest,Protesters (Israel),6,,0,60,,Israel,Haifa,Hadera,Ynet,"On 19 April 2024, thousands formed a human cha...",0
2,2024-04-19,2024,Demonstrations,Riots,Violent demonstration,Rioters (Israel),5,Rioters (Israel),5,55,,Israel,HaMerkaz,Ramla,Haaretz; Jerusalem Post; N12; Times of Israel;...,"On 19 April 2024, dozens of relatives of hosta...",0
3,2024-04-19,2024,Demonstrations,Protests,Peaceful protest,Protesters (Israel),6,,0,60,,Israel,HaMerkaz,Rehovot,Ynet,"On 19 April 2024, thousands formed a human cha...",0
4,2024-04-19,2024,Political violence,Battles,Armed clash,Hezbollah,3,Military Forces of Israel (2022-),1,13,,Israel,HaZafon,Zefat,Jerusalem Post; Ma'ariv; Times of Israel,"On 19 April 2024, Hezbollah forces in Lebanon ...",0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38125,2021-04-20,2021,Strategic developments,Strategic developments,Other,Military Forces of Israel (2009-2021),8,Civilians (Palestine),7,78,,Palestine,Gaza Strip,North Gaza,PLO Negotiations Affairs Department,"Other: On 20 April 2021, Israeli military forc...",0
38126,2021-04-20,2021,Political violence,Riots,Mob violence,Rioters (Israel),5,,0,50,,Palestine,West Bank,Hebron,PLO Negotiations Affairs Department,"On 20 April 2021, Israeli settlers uprooted 20...",0
38127,2021-04-20,2021,Demonstrations,Protests,Peaceful protest,Protesters (Palestine),6,,0,60,,Palestine,Gaza Strip,Deir El Balah,Ma'an News Agency,"On 20 April 2021, a large protest organized by...",0
38128,2021-04-20,2021,Political violence,Riots,Mob violence,Rioters (Israel),5,Civilians (Palestine),7,57,Civilian targeting,Palestine,West Bank,Ramallah and Al Bireh,PLO Negotiations Affairs Department,"On 20 April 2021, Israeli settlers from the Ba...",0


In [4]:
#Reformatting / Cleaning

#Addressing NA values - Categorical, NUmerical and date
categorical_columns = ['disorder_type', 'event_type', 'sub_event_type', 'actor1', 'actor2', 'civilian_targeting', 
                       'country', 'admin1', 'admin2', 'source', 'notes']
categorical_columns = data[categorical_columns]


for column in categorical_columns:
    data[column] = data[column].fillna('Not specified')

numerical_columns = ['fatalities', 'inter1', 'inter2', 'interaction']

for column in numerical_columns:
    data[column] = data[column].fillna(data[column].median()) #using median to fill
    
print(data.isna().sum())
#data.loc[:, 'actor2'] = data['actor2'].fillna('Not specified')
#data.loc[:, 'civilian_targeting'] = data['civilian_targeting'].fillna('Not specified')

event_date            0
year                  0
disorder_type         0
event_type            0
sub_event_type        0
actor1                0
inter1                0
actor2                0
inter2                0
interaction           0
civilian_targeting    0
country               0
admin1                0
admin2                0
source                0
notes                 0
fatalities            0
dtype: int64


In [5]:
#Grouping Together Actor1 and 2
def consolidate_names(name):
    if 'Military Forces of Israel' in name:
        return 'Military Forces of Israel'
    elif 'Police Forces of Israel' in name:
        return 'Police Forces of Israel'
    elif 'Hamas Movement' in name:
        return 'Hamas Movement'
    elif 'Police Forces of Israel' in name or 'Government of Israel' in name:
        return 'Government and Police Forces of Israel'
    elif 'Police Forces of Palestine' in name or 'Government of Palestine' in name:
        return 'Government and Police Forces of Palestine'
    elif 'PIJ:' in name or 'Islamic Jihad' in name:
        return 'Palestinian Islamic Jihad'
    elif 'Hezbollah' in name:
        return 'Hezbollah'
    elif 'Al Aqsa' in name:
        return 'Al Aqsa Martyrs Brigade'
    elif 'Katibat' in name:
        return 'Katibat Groups (Palestine)'
    elif 'PFLP:' in name:
        return 'Popular Front for the Liberation of Palestine'
    elif 'DFLP:' in name:
        return 'Democratic Front for the Liberation of Palestine'
    elif 'Military Forces of Iran' in name:
        return 'Iranian Revolutionary Guard Corps'
    elif 'Islamic State' in name:
        return 'Islamic State'
#civilians
    elif 'Civilians' in name:
        if 'Israel' in name or 'Palestine' in name:
            return name  #Keeping isr and pal civilians
        else:
            return 'Civilians (International)'  # grouping others as int.
#armed groups
    elif 'Unidentified Armed Group' in name:
        if 'Israel' in name or 'Palestine' in name:
            return name  
        else:
            return 'Unidentified Armed Group (International)'
#military forces
    elif 'Military Forces of' in name:
        if 'Israel' in name or 'Palestine' in name:
            return name  
        else:
            return 'Military Forces of International Forces'
    elif 'Settlers' in name:
        return 'Settlers (Israel)'
    elif 'Protesters' in name or 'Rioters' in name:
        return name  # Retains specific categories due to their distinct contexts
    else:
        return name #'Other Groups' 

# Apply the consolidation function to both actor1 and actor2
data['actor1_grouped'] = data['actor1'].apply(consolidate_names)
data['actor2_grouped'] = data['actor2'].apply(consolidate_names)

In [6]:
#Grouping smaller entities
actor1_counts = data['actor1_grouped'].value_counts()
actor2_counts = data['actor2_grouped'].value_counts()

def consolidate_small_groups(name, counts): #Check if Isr or Pal if not 'name'
    if counts[name] < 10:
        if 'Israel' in name:
            return 'Other (Israel)'
        elif 'Palestine' in name:
            return 'Other (Palestine)'
        else:
            return'Other Group'
    else:
        # Return the name if the count is 10 or more
        return name

# Apply the consolidation function to both actor1_grouped and actor2_grouped
data['actor1_grouped'] = data['actor1_grouped'].apply(lambda x: consolidate_small_groups(x, actor1_counts))
data['actor2_grouped'] = data['actor2_grouped'].apply(lambda x: consolidate_small_groups(x, actor2_counts))


# Print the new value counts to confirm re-categorization
#print(data['actor1_grouped'].value_counts())
#print(data['actor2_grouped'].value_counts())

data['actor1'] = data['actor1_grouped']
data['actor2'] = data['actor2_grouped']

data.drop(['actor1_grouped', 'actor2_grouped'], axis=1, inplace=True)


In [7]:
print(data.isna().sum())


event_date            0
year                  0
disorder_type         0
event_type            0
sub_event_type        0
actor1                0
inter1                0
actor2                0
inter2                0
interaction           0
civilian_targeting    0
country               0
admin1                0
admin2                0
source                0
notes                 0
fatalities            0
dtype: int64


In [8]:
# Calculate the percentage of zero values per column
zero_counts = (data == 0).astype(int).sum(axis=0)
zero_percentage = 100 * zero_counts / len(data)

# Display columns with high percentages of zeros
print("Percentage of zeros in each column:")
print(zero_percentage[zero_percentage > 0])  # Adjust the threshold as necessary


Percentage of zeros in each column:
inter2        27.274875
fatalities    91.047519
dtype: float64


In [9]:
#Creating Temporal Features for T-S

#Date related
data['year'] = data['event_date'].dt.year
data['month'] = data['event_date'].dt.month
data['day'] = data['event_date'].dt.day
data['day_of_week'] = data['event_date'].dt.day_name()
data['days_since_start'] = (data['event_date'] - data['event_date'].min()).dt.days

# time since last event of the same type 
data['time_since_last_event'] = data.groupby(
    'event_type')['event_date'].diff().apply(lambda x: x.days)

# time since last disorder of the same type
data['time_since_last_disorder'] = data.groupby(
    'disorder_type')['event_date'].diff().apply(lambda x: x.days)

# rolling avg for fatalities
data['rolling_avg_fatalities_7d'] = data.groupby(
    'event_type')['fatalities'].transform(lambda x: x.rolling(window=7, min_periods=1).mean())

# cumulative counts of events and fatalities by specific features
data['cumulative_events'] = data.groupby(['event_type']).cumcount() + 1
data['cumulative_fatalities'] = data.groupby(['event_type'])['fatalities'].cumsum()

log_data = data.copy()
# log tranformations
log_data['log_fatalities'] = np.log1p(log_data['fatalities'])
log_data['log_cumulative_events'] = np.log1p(log_data['cumulative_events'])
log_data['log_cumulative_fatalities'] = np.log1p(log_data['cumulative_fatalities'])
log_data['log_rolling_avg_fatalities_7d'] = np.log1p(log_data['rolling_avg_fatalities_7d'])
log_data['log_time_since_last_event'] = np.log1p(log_data['time_since_last_event'])
log_data['log_time_since_last_disorder'] = np.log1p(log_data['time_since_last_disorder'])
log_data['log_days_since_start'] = np.log1p(log_data['days_since_start'])

# Creating lagged features 
log_data['lag1_log_fatalities'] = log_data['log_fatalities'].shift(1)
log_data['lag2_log_fatalities'] = log_data['log_fatalities'].shift(2)
log_data['lag3_log_fatalities'] = log_data['log_fatalities'].shift(3)

log_data['lag1_log_cumulative_events'] = log_data['log_cumulative_events'].shift(1)
log_data['lag2_log_cumulative_events'] = log_data['log_cumulative_events'].shift(2)
log_data['lag3_log_cumulative_events'] = log_data['log_cumulative_events'].shift(3)

log_data['lag1_log_cumulative_fatalities'] = log_data['log_cumulative_fatalities'].shift(1)
log_data['lag2_log_cumulative_fatalities'] = log_data['log_cumulative_fatalities'].shift(2)
log_data['lag3_log_cumulative_fatalities'] = log_data['log_cumulative_fatalities'].shift(3)

log_data['lag1_log_rolling_avg'] = log_data['log_rolling_avg_fatalities_7d'].shift(1)
log_data['lag2_log_rolling_avg'] = log_data['log_rolling_avg_fatalities_7d'].shift(2)
log_data['lag3_log_rolling_avg'] = log_data['log_rolling_avg_fatalities_7d'].shift(3)

log_data['lag1_log_time_since_last_event'] = log_data['log_time_since_last_event'].shift(1)
log_data['lag2_log_time_since_last_event'] = log_data['log_time_since_last_event'].shift(2)
log_data['lag3_log_time_since_last_event'] = log_data['log_time_since_last_event'].shift(3)

log_data['lag1_log_time_since_last_disorder'] = log_data['log_time_since_last_disorder'].shift(1)
log_data['lag2_log_time_since_last_disorder'] = log_data['log_time_since_last_disorder'].shift(2)
log_data['lag3_log_time_since_last_disorder'] = log_data['log_time_since_last_disorder'].shift(3)

log_data['lag1_log_days_since_start'] = log_data['log_days_since_start'].shift(1)
log_data['lag2_log_days_since_start'] = log_data['log_days_since_start'].shift(2)
log_data['lag3_log_days_since_start'] = log_data['log_days_since_start'].shift(3)


  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


In [10]:
#print(log_data)
#print(log_data.isna().sum())

# percentage of zero values
zero_counts = (log_data == 0).astype(int).sum(axis=0)
zero_percentage = 100 * zero_counts / len(log_data)
#print(zero_percentage[zero_percentage > 0])


In [11]:
#Cleaning log data

#replacing infinities w median
log_data.replace([np.inf, -np.inf], np.nan, inplace=True)
log_data.fillna(log_data.median(), inplace=True)

#forward + backward prop to fill 0's 
log_data['time_since_last_event'].fillna(method='ffill', inplace=True)
log_data['time_since_last_event'].fillna(method='bfill', inplace=True)

lag_cols = [col for col in log_data.columns if 'log_time_since_last_event' in col or 'lag' in col]
log_data[lag_cols] = log_data[lag_cols].fillna(method='ffill').fillna(method='bfill')

  log_data.fillna(log_data.median(), inplace=True)


In [12]:
#print(log_data)
#print(log_data.isna().sum())

# percentage of zero values
zero_counts = (log_data == 0).astype(int).sum(axis=0)
zero_percentage = 100 * zero_counts / len(log_data)
print(zero_percentage[zero_percentage > 0])

inter2                                27.274875
fatalities                            91.047519
days_since_start                       0.034130
time_since_last_event                 86.823313
time_since_last_disorder              91.548963
rolling_avg_fatalities_7d             70.551326
cumulative_fatalities                 17.574166
log_fatalities                        91.047519
log_cumulative_fatalities             17.574166
log_rolling_avg_fatalities_7d         70.551326
log_time_since_last_event            100.000000
log_time_since_last_disorder         100.000000
log_days_since_start                   0.034130
lag1_log_fatalities                   91.047519
lag2_log_fatalities                   91.047519
lag3_log_fatalities                   91.047519
lag1_log_cumulative_fatalities        17.574166
lag2_log_cumulative_fatalities        17.574166
lag3_log_cumulative_fatalities        17.574166
lag1_log_rolling_avg                  70.551326
lag2_log_rolling_avg                  70

In [13]:
#Encoding

#One-Hot for Categoricals
categorical_cols = ['disorder_type', 'event_type', 'actor1', 'actor2', 'civilian_targeting',
                    'country', 'admin1', 'admin2', 'day_of_week']
log_data_encoded = pd.get_dummies(log_data, columns=categorical_cols)
log_data_encoded

#Label Encoder
label_encoders = {}

for col in ['inter1', 'inter2', 'interaction', 'sub_event_type']:
    le = LabelEncoder()
    log_data_encoded[col] = le.fit_transform(log_data_encoded[col])
    label_encoders[col] = le  # storing the encoder
#print(log_data_encoded.isna().sum())
log_data_encoded.to_csv('data/log_data_encoded.csv', index=False)

In [14]:
log_data_encoded

Unnamed: 0,event_date,year,sub_event_type,inter1,inter2,interaction,source,notes,fatalities,month,...,admin2_Tulkarm,admin2_Yizreel,admin2_Zefat,day_of_week_Friday,day_of_week_Monday,day_of_week_Saturday,day_of_week_Sunday,day_of_week_Thursday,day_of_week_Tuesday,day_of_week_Wednesday
0,2024-04-19,2024,18,2,0,9,N12; Times of Israel,"Interception: On 19 April 2024, Hamas militant...",0,4,...,0,0,0,1,0,0,0,0,0,0
1,2024-04-19,2024,15,5,0,21,Ynet,"On 19 April 2024, thousands formed a human cha...",0,4,...,0,0,0,1,0,0,0,0,0,0
2,2024-04-19,2024,19,4,4,17,Haaretz; Jerusalem Post; N12; Times of Israel;...,"On 19 April 2024, dozens of relatives of hosta...",0,4,...,0,0,0,1,0,0,0,0,0,0
3,2024-04-19,2024,15,5,0,21,Ynet,"On 19 April 2024, thousands formed a human cha...",0,4,...,0,0,0,1,0,0,0,0,0,0
4,2024-04-19,2024,3,2,1,3,Jerusalem Post; Ma'ariv; Times of Israel,"On 19 April 2024, Hezbollah forces in Lebanon ...",0,4,...,0,0,1,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38125,2021-04-20,2021,14,7,6,26,PLO Negotiations Affairs Department,"Other: On 20 April 2021, Israeli military forc...",0,4,...,0,0,0,0,0,0,0,0,1,0
38126,2021-04-20,2021,12,4,0,16,PLO Negotiations Affairs Department,"On 20 April 2021, Israeli settlers uprooted 20...",0,4,...,0,0,0,0,0,0,0,0,1,0
38127,2021-04-20,2021,15,5,0,21,Ma'an News Agency,"On 20 April 2021, a large protest organized by...",0,4,...,0,0,0,0,0,0,0,0,1,0
38128,2021-04-20,2021,12,4,6,19,PLO Negotiations Affairs Department,"On 20 April 2021, Israeli settlers from the Ba...",0,4,...,0,0,0,0,0,0,0,0,1,0


In [15]:
# Correlation Analysis for numerical cols
numerical_cols = log_data_encoded.select_dtypes(include=['int64', 'float64']).columns
numerical_data = log_data_encoded[numerical_cols]
correlation_matrix = numerical_data.corr()

threshold = 0.85
upper = correlation_matrix.where(np.triu(np.ones(correlation_matrix.shape), k=1).astype(np.bool))
to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
to_drop

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  upper = correlation_matrix.where(np.triu(np.ones(correlation_matrix.shape), k=1).astype(np.bool))


['log_rolling_avg_fatalities_7d',
 'lag1_log_days_since_start',
 'lag2_log_days_since_start',
 'lag3_log_days_since_start']

In [None]:
#Mututal Information features

target = 'sub_event_type'

# taking out the text + datetime
features = log_data_encoded.drop(columns=[target, 'event_date', 'source', 'notes'])

# actual mi score calculation
mi_scores = mutual_info_classif(features, log_data_encoded[target], discrete_features='auto')

# putting in df
mi_df = pd.DataFrame({'Feature': features.columns, 'MI_Score': mi_scores})
mi_df.sort_values('MI_Score', ascending=False, inplace=True)

plt.figure(figsize=(12, 8))
sns.barplot(x='MI_Score', y='Feature', data=mi_df.sort_values('MI_Score', ascending=False).head(20))
plt.title('Top 20 Features by Mutual Information')
plt.xlabel('Mutual Information Score')
plt.ylabel('Features')
plt.show()

In [None]:
#Random Forest Importance
#numeric_encoded_log_data
#Dropping numeric + T/T Splitting
X = log_data_encoded.drop(['sub_event_type', 'event_date', 'source', 'notes'], axis=1)
y = log_data_encoded['sub_event_type']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

forest = RandomForestClassifier(random_state=42)
forest.fit(X_train, y_train)

# feature importances into df
importances = forest.feature_importances_

rf_df = pd.DataFrame({'Feature': X_train.columns, 'RF_Importance': importances})
rf_df.sort_values('RF_Importance', ascending=False, inplace=True)

plt.figure(figsize=(12, 8))
sns.barplot(x='RF_Importance', y='Feature', data=rf_df.sort_values('RF_Importance', ascending=False).head(20))
plt.title('Top 20 Features by Random Forest Importance')
plt.xlabel('Feature Importance')
plt.ylabel('Features')
plt.show()

In [None]:
#setting some threshold to see the "small" values
mi_threshold = 0.2
rf_threshold = 0.01

#mi scores
low_mi_features = mi_df[mi_df['MI_Score'] <= mi_threshold]

#rf importances
low_rf_features = rf_df[rf_df['RF_Importance'] <= rf_threshold]

# Find the intersection of low importance features from both methods
low_importance_features = pd.merge(low_mi_features, low_rf_features, on='Feature', how='inner')

print("Low importance features by both MI and RF methods:\n", low_importance_features)


In [None]:
rf_df = pd.DataFrame({'Feature': X_train.columns,
                                    'RF_Importance': forest.feature_importances_})

#merging datasets
combined_importances = pd.merge(mi_df, rf_df, on='Feature', how='outer')

combined_importances.sort_values(by='MI_Score', ascending=False, inplace=True)

combined_importances = pd.merge(mi_df, rf_df, on='Feature', how='outer')

#set the amount of feaatures
sorted_idx = combined_importances.sort_values(by='MI_Score', ascending=False)['Feature'].head(30)


plt.figure(figsize=(12, 8))
sns.barplot(x='MI_Score', y='Feature', data=combined_importances[combined_importances['Feature'].isin(sorted_idx)], 
            color='blue', label='MI Score')
sns.barplot(x='RF_Importance', y='Feature', data=combined_importances[combined_importances['Feature'].isin(sorted_idx)], 
            color='red', alpha=0.6, label='Random Forest Importance')
plt.title('Comparison of Feature Importance by MI and Random Forest')
plt.xlabel('Importance Score')
plt.ylabel('Features')
plt.legend()
plt.show()
