In [1]:
from functools import reduce

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.manifold import TSNE
from scipy.stats.mstats import winsorize
import keras
from keras.layers import Dense
from keras.models import Sequential
from keras.callbacks import EarlyStopping

Using TensorFlow backend.


In [None]:
"""
A deep learning neural network is used to classify terrorist attacks between 1970-2017 with 
unknown terrorist group affiliations. The model is trained on data provided by the Global Terrorism Database.
The feature space includes categorical variables such as type of weapon used, victim type, suicide attacks, as well 
as numerical variables such as hostages taken, number of attackers, and number of fatalities.

The model's accuracy on the validation test is approximately 60%.

-Adam Wu

Data:
National Consortium for the Study of Terrorism and Responses to Terrorism (START). (2018). 
Global Terrorism Database [Data file]. Retrieved from https://www.start.umd.edu/gtd
"""

In [2]:
# Reads the data and sets the index to 'eventid'. Missing values are encoded as -9,-99 according to database codebook.
df = pd.read_csv('Data/globalterrorismdb_0718dist.csv', header=0, na_values = ['-9','-99']).set_index('eventid')


  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# There are lots of missing data in the dataset and some variables only started recording in 1997.
# Dropping all NaNs would result in significant loss of data, so impute missing values and reduce dimension of feature space.

# However, for variables with significant amount of NaNs, imputing/aggregation may impose unrealistic assumptions about
# the distributions of the variables and introduce bias. So variables with over 75% values NaN are excluded.
pct_na = df.isna().sum()/len(df)
ext_na = pct_na[~(pct_na > 0.75)]
df = df.loc[:,ext_na.index.values]

# For categorical variables, impute NaN with most common value (mode)
cat = df.select_dtypes('int64')
cat = cat.fillna(cat.mode())

# For numerical variables, impute NaN with the mean. Since the mean is sensitive to large values,
# winsorize (1%) to limit effect of extreme outliers
num = df.select_dtypes('float64')
num_no_nan = num.dropna()
num_winsor = winsorize(num_no_nan, limits = [0.01, 0.01])
num_mean = pd.DataFrame(num_winsor, index = num_no_nan.index, columns = num_no_nan.columns).mean()
num = num.fillna(num_mean)
num = pd.DataFrame(winsorize(num, limits = [0.01, 0.01]), index = num.index, columns = num.columns)



In [4]:
# The model will be trained on a mixed feature space with both categorical and numerical variables.
# Normalizing the feature variances may be useful and make the model more robust.

# Since categorical variables have varied scaling (1-k), they are split into k-vectors with binary components.
# In other words, N categorical variables with ranges of (k_1,...,k_N) are projected 
# onto an N*k-dimensional space with unit norm.
cat_list = []
for col in cat:
    cat_list.append(pd.get_dummies(cat[col], prefix=str(col)))

cat_df = reduce(lambda x,y: pd.merge(x,y,on='eventid'), cat_list)

# Numerical variables are normalized to [0,1] so that they have similar scaling with the categorical variables
num_scaled = (num - num.min(axis=0))/(num.max(axis=0) - num.min(axis=0))
num_df = pd.DataFrame(num_scaled, index = num.index, columns = num.columns)

df_clean = cat_df.merge(num_df, how='inner', on='eventid')

In [5]:
# Indices of attacks with unknown terrorist affiliations. These are the attacks that will be classified
# and predicted by the model.
unknown = df[df['gname']=='Unknown'].index.values

# Processed dataset of unknown attacks to classify
X_to_classify = df_clean.loc[unknown,:]

# Processed dataset of known attacks to train/test the model
X_model = df_clean.drop(labels=unknown, axis=0)

In [6]:
# Converts terrorist group names into dummy variables
terrorist_groups = df.loc[:,'gname']
terrorist_groups_dummy = pd.get_dummies(terrorist_groups)

# Unknown terrorist groups (in dummy form) to classify
Y_to_classify = terrorist_groups_dummy.loc[unknown,:]

# Known terrorist groups (in dummy form) to train/test the model
Y_model = terrorist_groups_dummy.drop(labels=unknown, axis=0)

In [11]:
# Model specifications

# Dimension of feature space
n_features = X_model.shape[1]
n_groups = Y_model.shape[1]

# Sets up a sequential neural network
model = Sequential()

# Input Layer
model.add(Dense(500, activation='relu', input_shape=(n_features,)))

# Hidden Layers
model.add(Dense(500, activation='relu'))
model.add(Dense(500, activation='relu'))

# Output Layer. Activation function is softmax so it returns probabilities for classification.
model.add(Dense(n_groups, activation='softmax'))

# Compilation
model.compile(
            optimizer='adam',
            loss='categorical_crossentropy',
            metrics=['accuracy']
)

# Stop the training process early if performance does not improve 
early_stop = EarlyStopping(patience=3)

# Fit the model
model.fit(X_model, Y_model,
          epochs=20,
          validation_split=0.2,
          callbacks=[early_stop,]
         )


Train on 79127 samples, validate on 19782 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20


<keras.callbacks.History at 0x1e07d074b38>

In [15]:
# Model predictions
predictions = model.predict(X_to_classify)

predict_df = pd.DataFrame(predictions, index=X_to_classify.index)


In [35]:
# Makes predictions for terrorist group affiliation based on maximum likelihood
predicted_group = predict_df.idxmax(axis=1)
predicted_group.head()

reconstruct_group = []
for row in predicted_group:
    reconstruct_group.append(terrorist_groups_dummy.columns[row])
    
final_predictions = pd.DataFrame(reconstruct_group, index=predicted_group.index)
final_predictions.columns = ['Predicted Terrorist Group Affiliation']

final_predictions.tail(20)

Unnamed: 0_level_0,Predicted Terrorist Group Affiliation
eventid,Unnamed: 1_level_1
201712290025,Muslim extremists
201712290026,Fulani extremists
201712300002,Sunni Muslim extremists
201712300003,Sunni Muslim extremists
201712300004,Muslim extremists
201712300005,Sunni Muslim extremists
201712300006,Guerrillas
201712300007,Muslim extremists
201712300010,Muslim Guerrillas
201712300018,Donetsk People's Republic
