In [None]:
#!pip uninstall tensorflow -y
#!pip install tensorflow

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import seaborn as sns
import tensorflow as tf


from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, cross_val_score, train_test_split, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_selection import RFECV, mutual_info_classif
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

#from statsmodels.tsa.stattools import adfuller


#from tensorflow.keras.models import Sequential
#from tensorflow.keras.layers import LSTM, Dense, Dropout, Embedding

Found existing installation: tensorflow 2.13.1
Uninstalling tensorflow-2.13.1:
  Successfully uninstalled tensorflow-2.13.1
Collecting tensorflow
  Downloading tensorflow-2.13.1-cp38-cp38-macosx_10_15_x86_64.whl.metadata (3.2 kB)
Downloading tensorflow-2.13.1-cp38-cp38-macosx_10_15_x86_64.whl (216.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m216.2/216.2 MB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25h[33mDEPRECATION: pyodbc 4.0.0-unsupported has a non-standard version number. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pyodbc or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0mInstalling collected packages: tensorflow
Successfully installed tensorflow-2.13.1


In [None]:
# Importing data
data = pd.read_csv("data/ACLED2021-2024.csv")

# Columns that are not used anywhere in the analysis below.
data.drop(columns=['time_precision', 'assoc_actor_1', 'assoc_actor_2', 'iso', 'region', 'admin3', 'location',
                   'latitude', 'longitude', 'geo_precision', 'source_scale', 'timestamp', 'tags', 'population_best',
                   'event_id_cnty'],
          inplace=True)
print(data.shape)

# Parse the event date; values that cannot be parsed become NaT.
data['event_date'] = pd.to_datetime(data['event_date'], errors='coerce')

# Put the rows in chronological order. The previous `sort_index()` was a no-op
# on the default integer index, while the per-group diffs, rolling/cumulative
# features and the `shuffle=False` train/test splits further down all rely on
# rows being ordered by date. A stable sort keeps the file order within a day;
# NaT dates are placed last.
data = data.sort_values('event_date', kind='stable').reset_index(drop=True)

print(data.shape)

38130 rows × 32 columns - Original Dataset Dimensions

In [None]:
# Drop exact duplicate rows and report how many were discarded.
initial_row_count = len(data)
data = data.drop_duplicates()
final_row_count = len(data)
duplicates_removed = initial_row_count - final_row_count
print(f"Removed {duplicates_removed} duplicates")
print(data.shape)

In [None]:
# Reformatting / cleaning: fill missing values in categorical and numerical columns.

# Text-like columns: a missing value becomes the explicit label 'Not specified'.
# `categorical_columns` stays a plain list of names (it used to be rebound to a
# DataFrame slice, which only worked because iterating a frame yields its
# column names).
categorical_columns = ['disorder_type', 'event_type', 'sub_event_type', 'actor1', 'actor2', 'civilian_targeting',
                       'country', 'admin1', 'admin2', 'source', 'notes']
data[categorical_columns] = data[categorical_columns].fillna('Not specified')

# Numeric columns: a missing value is replaced by that column's median.
numerical_columns = ['fatalities', 'inter1', 'inter2', 'interaction']
data[numerical_columns] = data[numerical_columns].fillna(data[numerical_columns].median())

# Remaining missing values per column, and the frame size after cleaning.
print(data.isna().sum())
print(data.shape)

In [None]:
# Bare expression: shows the cleaned frame as this cell's output.
data

In [None]:
#Grouping Together Actor1 and 2
# Ordered (substrings, label) rules: the first rule with any substring present
# in the actor name wins, so more specific rules must come first.
_ACTOR_RULES = (
    (('Military Forces of Israel',), 'Military Forces of Israel'),
    (('Police Forces of Israel',), 'Police Forces of Israel'),
    (('Hamas Movement',), 'Hamas Movement'),
    # NOTE(review): this rule used to also test 'Police Forces of Israel', but
    # that substring is always caught by the rule two lines above, so only
    # 'Government of Israel' ever reached it. Confirm whether Israeli police
    # were meant to be merged into this label like the Palestinian rule below.
    (('Government of Israel',), 'Government and Police Forces of Israel'),
    (('Police Forces of Palestine', 'Government of Palestine'), 'Government and Police Forces of Palestine'),
    (('PIJ:', 'Islamic Jihad'), 'Palestinian Islamic Jihad'),
    (('Hezbollah',), 'Hezbollah'),
    (('Al Aqsa',), 'Al Aqsa Martyrs Brigade'),
    (('Katibat',), 'Katibat Groups (Palestine)'),
    (('PFLP:',), 'Popular Front for the Liberation of Palestine'),
    (('DFLP:',), 'Democratic Front for the Liberation of Palestine'),
    (('Military Forces of Iran',), 'Iranian Revolutionary Guard Corps'),
    (('Islamic State',), 'Islamic State'),
)

# Generic actor families: names mentioning Israel or Palestine are kept as-is,
# everything else in the family is folded into a single "international" label.
_INTERNATIONAL_RULES = (
    ('Civilians', 'Civilians (International)'),
    ('Unidentified Armed Group', 'Unidentified Armed Group (International)'),
    ('Military Forces of', 'Military Forces of International Forces'),
)


def consolidate_names(name):
    """Map a raw actor name onto a consolidated actor group.

    Matching is by substring and order-dependent: the named-organisation rules
    in ``_ACTOR_RULES`` are tried first, then the generic families in
    ``_INTERNATIONAL_RULES``, then settlers. Names that match nothing
    (including protesters and rioters) are returned unchanged.

    Parameters
    ----------
    name : str
        Raw actor name. Non-string values (e.g. NaN) are returned unchanged
        instead of raising ``TypeError`` on the substring tests.

    Returns
    -------
    str
        The consolidated group label, or ``name`` itself when no rule applies.
    """
    if not isinstance(name, str):
        return name

    for needles, label in _ACTOR_RULES:
        if any(needle in name for needle in needles):
            return label

    is_local = 'Israel' in name or 'Palestine' in name
    for family, international_label in _INTERNATIONAL_RULES:
        if family in name:
            return name if is_local else international_label

    if 'Settlers' in name:
        return 'Settlers (Israel)'

    # Protesters / rioters keep their specific names, as does anything else.
    return name

# Consolidate both actor columns, writing the result to '<column>_grouped'.
for actor_col in ('actor1', 'actor2'):
    data[f'{actor_col}_grouped'] = data[actor_col].apply(consolidate_names)

In [None]:
# Grouping smaller entities: count how often each consolidated actor appears.
actor1_counts, actor2_counts = (
    data[grouped_col].value_counts()
    for grouped_col in ('actor1_grouped', 'actor2_grouped')
)

def consolidate_small_groups(name, counts, min_count=10):
    """Fold rarely occurring actor groups into a catch-all label.

    Parameters
    ----------
    name : str
        Consolidated actor name.
    counts : mapping of str -> int
        Occurrences per actor name (e.g. a ``value_counts()`` Series or a
        dict). A name missing from ``counts`` is treated as occurring 0 times
        instead of raising ``KeyError``.
    min_count : int, default 10
        Names with fewer than ``min_count`` occurrences are folded.

    Returns
    -------
    str
        ``name`` when it occurs at least ``min_count`` times; otherwise
        'Other (Israel)' or 'Other (Palestine)' if the name mentions that
        country (Israel is checked first), else 'Other Group'.
    """
    if counts.get(name, 0) >= min_count:
        return name
    if 'Israel' in name:
        return 'Other (Israel)'
    if 'Palestine' in name:
        return 'Other (Palestine)'
    return 'Other Group'

# Fold rare groups in each '<actor>_grouped' column using that column's own
# counts, then overwrite the original actor column with the grouped version.
for actor_col, actor_counts in (('actor1', actor1_counts), ('actor2', actor2_counts)):
    grouped_col = f'{actor_col}_grouped'
    data[grouped_col] = data[grouped_col].apply(consolidate_small_groups, args=(actor_counts,))

# To confirm the re-categorisation, inspect e.g.
# data['actor1_grouped'].value_counts() before the columns are dropped.

for actor_col in ('actor1', 'actor2'):
    data[actor_col] = data[f'{actor_col}_grouped']

# The temporary '<actor>_grouped' helper columns are no longer needed.
data.drop(columns=['actor1_grouped', 'actor2_grouped'], inplace=True)


In [None]:
# Number of missing values per column after the actor consolidation.
print(data.isna().sum())


In [None]:
# Share of cells equal to 0 in every column, as a percentage of all rows.
zero_counts = data.eq(0).sum()
zero_percentage = zero_counts.mul(100).div(len(data))

# Only columns that contain at least one zero are listed; raise the cut-off
# below to focus on columns dominated by zeros.
print("Percentage of zeros in each column:")
has_zeros = zero_percentage > 0
print(zero_percentage[has_zeros])


In [None]:
# Temporal features for the time-series models.
# NOTE(review): the diff / rolling / cumulative features below follow the
# frame's current row order, so they assume rows are chronological - confirm
# upstream that `data` is sorted by 'event_date'.

# Calendar components of the event date
event_dates = data['event_date']
data['year'] = event_dates.dt.year
data['month'] = event_dates.dt.month
data['day'] = event_dates.dt.day
data['day_of_week'] = event_dates.dt.day_name()
data['days_since_start'] = (event_dates - event_dates.min()).dt.days

# Whole days between a row and the previous row sharing the same event type /
# disorder type (NaN for the first row of each group).
for gap_col, group_key in (('time_since_last_event', 'event_type'),
                           ('time_since_last_disorder', 'disorder_type')):
    data[gap_col] = data.groupby(group_key)['event_date'].diff().dt.days


def _trailing_mean_7(fatalities):
    """Mean over the current and up to six preceding rows (rows, not calendar days)."""
    return fatalities.rolling(window=7, min_periods=1).mean()


# Rolling average of fatalities within each event type
data['rolling_avg_fatalities_7d'] = data.groupby('event_type')['fatalities'].transform(_trailing_mean_7)

# Running event number (1-based) and running fatality total per event type
data['cumulative_events'] = data.groupby('event_type').cumcount() + 1
data['cumulative_fatalities'] = data.groupby('event_type')['fatalities'].cumsum()

# Independent copy that receives the log / lag features.
log_data = data.copy()


In [None]:
# Log transformations: log(1 + x) of each listed column, stored as 'log_<column>'.
log_columns = ['fatalities', 'cumulative_events', 'cumulative_fatalities', 'rolling_avg_fatalities_7d',
               'time_since_last_event', 'time_since_last_disorder', 'days_since_start']
log_names = [f'log_{source_col}' for source_col in log_columns]
for source_col, log_col in zip(log_columns, log_names):
    log_data[log_col] = np.log1p(log_data[source_col])

# Lagged features: each log column shifted down by 1, 2 and 3 rows
# (the first `lag` rows of every lag column are NaN).
for log_col in log_names:
    for lag in (1, 2, 3):
        log_data[f'{log_col}_lag{lag}'] = log_data[log_col].shift(lag)

In [None]:
# Bare expression: shows the frame with the log and lag features as this cell's output.
log_data

In [None]:
#print(log_data)
#print(log_data.isna().sum())

# Percentage of zero-valued cells per column of the log-feature frame
# (computed but not printed here; print zero_percentage[zero_percentage > 0] to inspect).
zero_counts = log_data.eq(0).sum()
zero_percentage = zero_counts.mul(100).div(len(log_data))


In [None]:
# Cleaning log data

# Replace infinities with NaN, then fill every numeric NaN with the column
# median. The work is restricted to numeric columns: `DataFrame.median()` on a
# frame that still holds text columns raises in pandas >= 2.0, where
# `numeric_only` defaults to False.
numeric_log_cols = log_data.select_dtypes(include='number').columns
log_data[numeric_log_cols] = log_data[numeric_log_cols].replace([np.inf, -np.inf], np.nan)
log_data[numeric_log_cols] = log_data[numeric_log_cols].fillna(log_data[numeric_log_cols].median())

# Forward / backward fill of any value still missing. The result is assigned
# back explicitly: `fillna(method=..., inplace=True)` on a column selection is
# deprecated and does not reliably update the parent frame.
# NOTE(review): after the median fill above these columns normally have no
# NaN left, so the fills below are a safety net. If ffill/bfill was meant to
# take precedence over the median, move these lines above the median fill.
log_data['time_since_last_event'] = log_data['time_since_last_event'].ffill().bfill()

lag_cols = [col for col in log_data.columns if 'log_time_since_last_event' in col or 'lag' in col]
log_data[lag_cols] = log_data[lag_cols].ffill().bfill()

In [None]:
# Missing values per column after cleaning
missing_per_column = log_data.isna().sum()
print(missing_per_column)

# Percentage of zero-valued cells per column (only columns containing zeros)
zero_counts = log_data.eq(0).sum()
zero_percentage = zero_counts.mul(100).div(len(log_data))
print(zero_percentage[zero_percentage > 0])

In [None]:
# Encoding

# One-hot encode the categorical columns (each is replaced by indicator columns).
categorical_cols = ['disorder_type', 'event_type', 'actor1', 'actor2', 'civilian_targeting',
                    'country', 'admin1', 'admin2', 'day_of_week']
log_data_encoded = pd.get_dummies(data=log_data, columns=categorical_cols)

# Integer-encode the remaining code-like columns and the target. The fitted
# encoders are kept by column name so codes can be mapped back to labels.
label_encoders = {
    col: LabelEncoder().fit(log_data_encoded[col])
    for col in ('inter1', 'inter2', 'interaction', 'sub_event_type')
}
for col, encoder in label_encoders.items():
    log_data_encoded[col] = encoder.transform(log_data_encoded[col])

# Persist the fully encoded frame.
log_data_encoded.to_csv('data/log_data_encoded.csv', index=False)

In [None]:
# Bare expression: shows the encoded frame as this cell's output.
log_data_encoded

In [None]:
# Correlation analysis for numerical columns
# `include='number'` also picks up int32 columns (pandas >= 2.0 returns int32
# from `.dt.year` / `.dt.month` / `.dt.day`), which the previous
# ['int64', 'float64'] filter silently skipped. Boolean dummy columns are
# still excluded.
numerical_cols = log_data_encoded.select_dtypes(include='number').columns
numerical_data = log_data_encoded[numerical_cols]
correlation_matrix = numerical_data.corr()

threshold = 0.85
# Strict upper triangle so each pair is considered once and the diagonal is
# ignored. The builtin `bool` replaces `np.bool`, which was removed in
# NumPy 1.24.
upper_mask = np.triu(np.ones(correlation_matrix.shape, dtype=bool), k=1)
# Absolute values: a strong negative correlation is as redundant as a strong
# positive one.
upper = correlation_matrix.abs().where(upper_mask)
to_drop = [column for column in upper.columns if (upper[column] > threshold).any()]
to_drop

In [None]:
# Mutual information features

target = 'sub_event_type'

# Free text and the raw datetime cannot be scored; the target is not a feature.
features = log_data_encoded.drop(columns=[target, 'event_date', 'source', 'notes'])

# Mutual information between every feature and the target. The estimator is
# randomised, so a fixed seed (the same 42 used by the models in this
# notebook) keeps the scores - and the feature selection derived from them -
# reproducible between runs.
mi_scores = mutual_info_classif(features, log_data_encoded[target],
                                discrete_features='auto', random_state=42)

# Scores as a frame, highest first
mi_df = (pd.DataFrame({'Feature': features.columns, 'MI_Score': mi_scores})
         .sort_values('MI_Score', ascending=False))

plt.figure(figsize=(12, 8))
sns.barplot(x='MI_Score', y='Feature', data=mi_df.head(20))
plt.title('Top 20 Features by Mutual Information')
plt.xlabel('Mutual Information Score')
plt.ylabel('Features')
plt.show()

In [None]:
# Random forest importance
# Features are everything except the target, the raw date and the text columns.
X = log_data_encoded.drop(columns=['sub_event_type', 'event_date', 'source', 'notes'])
y = log_data_encoded['sub_event_type']

# shuffle=False: the first 80% of rows train, the last 20% test.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=False)

forest = RandomForestClassifier(random_state=42)
forest.fit(X_train, y_train)

# Impurity-based importances of the fitted forest, highest first
importances = forest.feature_importances_
rf_df = pd.DataFrame({'Feature': X_train.columns, 'RF_Importance': importances})
rf_df.sort_values('RF_Importance', ascending=False, inplace=True)

fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x='RF_Importance', y='Feature',
            data=rf_df.sort_values('RF_Importance', ascending=False).head(20), ax=ax)
ax.set_title('Top 20 Features by Random Forest Importance')
ax.set_xlabel('Feature Importance')
ax.set_ylabel('Features')
plt.show()

In [None]:
# Sanity check: both importance tables should have one row per feature.
print(mi_df.shape)
print(rf_df.shape)

In [None]:
# Random forest importances in the original feature order
rf_df = pd.DataFrame({'Feature': X_train.columns,
                      'RF_Importance': forest.feature_importances_})

# One row per feature with both scores, ordered by mutual information.
# (The merge used to be done twice, and the second merge discarded the sort,
# so the bars were drawn in merge order rather than by score.)
combined_importances = (pd.merge(mi_df, rf_df, on='Feature', how='outer')
                        .sort_values(by='MI_Score', ascending=False))

# Number of features shown in the comparison plot
sorted_idx = combined_importances['Feature'].head(30)
top_importances = combined_importances[combined_importances['Feature'].isin(sorted_idx)]

# Overlay both scores per feature. The two measures are on different scales,
# so compare rankings rather than bar lengths.
plt.figure(figsize=(12, 8))
sns.barplot(x='MI_Score', y='Feature', data=top_importances,
            color='blue', label='MI Score')
sns.barplot(x='RF_Importance', y='Feature', data=top_importances,
            color='red', alpha=0.6, label='Random Forest Importance')
plt.title('Comparison of Feature Importance by MI and Random Forest')
plt.xlabel('Importance Score')
plt.ylabel('Features')
plt.legend()
plt.show()


In [None]:
# Arbitrary cut-offs used to look at the "small" scores
test_thresh1 = 0.2   # mutual information
test_thresh2 = 0.01  # random forest importance

# Features at or below each cut-off
low_mi_features = mi_df.loc[mi_df['MI_Score'].le(test_thresh1)]
low_rf_features = rf_df.loc[rf_df['RF_Importance'].le(test_thresh2)]

# Features that score low under both measures
test_features = low_mi_features.merge(low_rf_features, on='Feature', how='inner')

print(test_features)


In [None]:
# MI score: keep features at or above the chosen percentile
mi_percentile = 50  # i.e. keep the top 50%
mi_threshold = np.percentile(mi_df['MI_Score'], mi_percentile)
selected_mi_features = mi_df[mi_df['MI_Score'] >= mi_threshold]['Feature'].tolist()

# Random forest importance: same rule
rf_percentile = 50  # i.e. keep the top 50%
rf_threshold = np.percentile(rf_df['RF_Importance'], rf_percentile)
selected_rf_features = rf_df[rf_df['RF_Importance'] >= rf_threshold]['Feature'].tolist()

# Features selected by both measures, in the order of `selected_mi_features`.
# A plain `list(set_a & set_b)` has a hash-dependent order that can change
# between kernel sessions, which changes the column order of the model inputs
# and makes the downstream results non-reproducible.
rf_feature_set = set(selected_rf_features)
selected_features = [feature for feature in selected_mi_features if feature in rf_feature_set]


In [None]:
#top_features_combined = pd.merge(top_mi_features, top_rf_features, on='Feature', how='inner')
#print(top_features_combined)
#top_features_combined.to_csv('data/top_features_combined.csv', index=False)

In [None]:
# Bare expression: shows the encoded frame as this cell's output.
log_data_encoded

In [None]:
# Two feature sets over the same target:
#   X_selected - only the features picked by the MI / random-forest selection
#   X          - every encoded column except the target, the raw date and the text columns
X_selected = log_data_encoded[selected_features]
X = log_data_encoded.drop(columns=['sub_event_type', 'event_date', 'source', 'notes'])
y = log_data_encoded['sub_event_type']

# One unshuffled 80/20 split applied to all three objects, so both feature
# sets share exactly the same train / test rows and the same y_train / y_test.
(X_selected_train, X_selected_test,
 X_train, X_test,
 y_train, y_test) = train_test_split(X_selected, X, y, test_size=0.2, random_state=42, shuffle=False)

In [None]:
# Bare expression: shows the held-out target values as this cell's output.
y_test

In [None]:
def naive_forecast_last(train_data):
    """Return the final observation of ``train_data``, selected by position via ``.iloc``."""
    last_position = len(train_data) - 1
    return train_data.iloc[last_position]

# Naive baseline: predict the last training label for every test row.
last_value = naive_forecast_last(y_train)
predictions = [last_value for _ in range(len(y_test))]

# Evaluate the prediction (accuracy_score is already imported at the top of the notebook).
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy of Naïve Forecast: {accuracy:.2f}")

In [None]:
# NOTE(review): this cell is a verbatim repeat of the previous cell. It
# redefines the same function and recomputes the same accuracy; consider
# deleting it.
def naive_forecast_last(train_data):
    """Return the last element of ``train_data`` (positional ``.iloc[-1]``)."""
    return train_data.iloc[-1]

last_value = naive_forecast_last(y_train)  # Get the last value from the training target
predictions = [last_value] * len(y_test)  # Create a list of predictions for the test set

# Evaluate the prediction
# (accuracy_score is also imported in the notebook's first cell)
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy of Naïve Forecast: {accuracy:.2f}")


In [None]:
# Decision tree trained on the selected feature subset
tree_classifier = DecisionTreeClassifier(random_state=42).fit(X_selected_train, y_train)
tree_predictions = tree_classifier.predict(X_selected_test)

# Overall accuracy plus per-class precision / recall / F1 on the test rows
tree_accuracy = accuracy_score(y_test, tree_predictions)
tree_classification_report = classification_report(y_test, tree_predictions)
print(f"Accuracy of Decision Tree Selected: {tree_accuracy:.2f}")
print(tree_classification_report)

In [None]:
# Decision tree baseline: trained on every feature (no feature selection)
tree_classifier = DecisionTreeClassifier(random_state=42)

tree_classifier.fit(X_train, y_train)

tree_predictions = tree_classifier.predict(X_test)

tree_accuracy = accuracy_score(y_test, tree_predictions)
tree_classification_report = classification_report(y_test, tree_predictions)
# The label previously read "Selected" (copied from the cell above), which
# made the two decision-tree results indistinguishable in the output.
print(f"Accuracy of Decision Tree Baseline: {tree_accuracy:.2f}")
print(tree_classification_report)


In [None]:
# Logistic regression baseline: trained on every feature (X_train / X_test),
# matching the "Baseline" label printed below.
logistic_regressor = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)
logistic_predictions = logistic_regressor.predict(X_test)

# Overall accuracy plus per-class precision / recall / F1 on the test rows
logistic_accuracy = accuracy_score(y_test, logistic_predictions)
logistic_classification_report = classification_report(y_test, logistic_predictions)

print(f"Accuracy of Logistic Regression Baseline: {logistic_accuracy:.2f}")
print(logistic_classification_report)

In [None]:
# Logistic regression trained on the selected feature subset

logistic_regressor = LogisticRegression(max_iter=1000, random_state=42)

logistic_regressor.fit(X_selected_train, y_train)

logistic_predictions = logistic_regressor.predict(X_selected_test)

logistic_accuracy = accuracy_score(y_test, logistic_predictions)
logistic_classification_report = classification_report(y_test, logistic_predictions)

# This model uses the selected features; the header and label previously said
# "Baseline", duplicating the label of the all-features model.
print(f"Accuracy of Logistic Regression Selected: {logistic_accuracy:.2f}")
print(logistic_classification_report)

In [None]:

# Frame used for the stationarity checks: the encoded data minus the target
# and the two free-text columns.
# NOTE(review): 'event_date' is not dropped here (the feature-selection cells
# drop it explicitly), so every remaining column, including the datetime and
# the one-hot columns, is passed to the test below - confirm this is intended.
numeric_log_data = log_data_encoded.drop(columns=[target,'source', 'notes'])

def test_stationarity(series):
    """Run `adfuller` on `series` (NaNs dropped, autolag='AIC') and summarise the result.

    Returns a dict labelling elements 0, 1 and 4 of the `adfuller` result as
    "Test Statistic", "p-value" and "Critical Values".

    NOTE(review): `adfuller` is not imported in the visible source - the line
    `from statsmodels.tsa.stattools import adfuller` in the first cell is
    commented out, so this raises NameError on a fresh kernel until it is restored.
    """
    result = adfuller(series.dropna(), autolag='AIC') 
    return {"Test Statistic": result[0], "p-value": result[1], "Critical Values": result[4]}

# One test per column, keyed by column name.
results = {column: test_stationarity(numeric_log_data[column]) for column in numeric_log_data.columns}

# Bare expression: shows the dict of test results as this cell's output.
results


In [None]:
# First-difference 'year', 'month' and 'days_since_start' (row-to-row change),
# stored as '<column>_diff'; the first row of each differenced column is NaN.
differenced_sources = ('year', 'month', 'days_since_start')
for source_col in differenced_sources:
    numeric_log_data[f'{source_col}_diff'] = numeric_log_data[source_col].diff()

# Re-run the stationarity test on each differenced column.
new = {
    f'{source_col}_diff': test_stationarity(numeric_log_data[f'{source_col}_diff'])
    for source_col in differenced_sources
}

new


Everything is now stationary

In [None]:
# One-hot encode the targets. The number of classes is taken from the full
# label vector so the train and test matrices always have the same width, even
# if the highest class code is absent from one of the splits.
num_classes = int(np.max(y)) + 1
y_train_encoded = tf.keras.utils.to_categorical(y_train, num_classes=num_classes)
y_test_encoded = tf.keras.utils.to_categorical(y_test, num_classes=num_classes)

# The features are real-valued columns, not integer token ids, so they cannot
# be passed through an Embedding layer (it would treat each value as a
# vocabulary index and fail or produce meaningless lookups). Instead, each row
# is fed to the LSTM as a sequence of length 1: (samples, timesteps=1, features).
# NOTE(review): the features are not scaled; standardising them before
# training will likely help the network converge.
X_train_seq = X_train.to_numpy(dtype='float32').reshape(len(X_train), 1, -1)
X_test_seq = X_test.to_numpy(dtype='float32').reshape(len(X_test), 1, -1)

# LSTM model. Layers are referenced through `tf.keras` because the direct
# `from tensorflow.keras...` imports in the first cell are commented out.
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=X_train_seq.shape[1:]),
    tf.keras.layers.LSTM(50, return_sequences=True),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.LSTM(50),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(num_classes, activation='softmax'),
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# validation_split holds out the last 10% of the training rows.
model.fit(X_train_seq, y_train_encoded, epochs=10, batch_size=32, validation_split=0.1)

# Evaluation on the held-out test rows
loss, accuracy = model.evaluate(X_test_seq, y_test_encoded)
print(f'Test Accuracy: {accuracy:.2f}')
