In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import time
from pandas import scatter_matrix
import matplotlib.pyplot as plt
from collections import Counter

import sklearn
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from sklearn import linear_model
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the Global Terrorism Database (GTD) extract used throughout the notebook.
# NOTE(review): the original comment describes this as the full 1970-2016 GTD,
# but the previewed rows start in 1990 -- confirm the file's actual coverage.
df = pd.read_csv("qfactors_terrorism_unknowns.csv")

# Rows whose perpetrator group is recorded as the literal string 'Unknown'.
is_unknown = df['gname'] == 'Unknown'

# Training set: observations with a known group.
# .copy() makes each subset an independent frame, so a column can later be
# assigned on it without relying on pandas' chained-assignment behaviour
# (the SettingWithCopyWarning it raises is hidden by the global warnings filter).
df_train = df[~is_unknown].copy()
# Prediction set: observations whose group is 'Unknown'.
df_test = df[is_unknown].copy()

df_test.head(3)

Unnamed: 0,eventid,gname,iyear,extended,country,country_txt,city,latitude,longitude,success,...,education,freedom,gni_per_cap,gov_conf,immigrants,life_exp,pop_den,pov_gap,refugees,primary_enroll
0,199001000001,Unknown,1990,0,110,Lebanon,Beirut,33.888629,35.495479,1,...,0.631297,6.0,6190.0,0.0,523693.0,70.221146,264.224438,0.0,304599.0,105.793974
1,199001010001,Unknown,1990,0,92,India,Srinagar,34.083658,74.797368,1,...,0.31067,2.0,1120.0,47.0,7493204.0,57.926537,292.659897,6.520666,212743.0,91.833908
2,199001010002,Unknown,1990,0,92,India,Srinagar,34.093665,74.787832,1,...,0.31067,2.0,1120.0,47.0,7493204.0,57.926537,292.659897,6.520666,212743.0,91.833908


In [3]:
# Candidate predictor columns available in the data:
#   iyear, education, freedom, gni_per_cap, gov_conf, immigrants,
#   life_exp, pop_den, pov_gap, refugees, primary_enroll
# Subset actually used as model inputs:
features = [
    'education',
    'pop_den',
    'freedom',
    'life_exp',
    'pov_gap',
    'primary_enroll',
]

# Label column. It is kept as a one-element list, so the Y_* objects below are
# single-column DataFrames rather than Series.
target = ['gname']

# Predictor matrices for the known-group rows and the unknown-group rows.
X_train, X_test = df_train[features], df_test[features]

# Matching label frames for the same two row sets.
Y_train, Y_test = df_train[target], df_test[target]

# Optional correlation check, left disabled:
#   df[['country', 'iyear', 'freedom', 'life_exp', 'pov_gap', 'primary_enroll']].corr()

In [4]:
# Project the predictors onto three principal components.
# The projection is fitted on the training rows only and then applied unchanged
# to the unknown-group rows. Calling fit_transform on X_test as well would
# re-estimate the components from the test rows, so the two outputs would be
# expressed in different bases and could not be fed to the same model.
pca = PCA(n_components=3)
pca_train_features = pca.fit_transform(X_train)
pca_test_features = pca.transform(X_test)
# NOTE(review): the classifier in the visible cells is fitted on X_train /
# X_test, not on these PCA arrays -- confirm whether they are used elsewhere.

In [5]:
# Random forest fitted on X_train.
# random_state pins the forest's internal randomness so that a fresh
# Restart & Run All reproduces the same predictions (and the same output CSV).
time_ind_model = RandomForestClassifier(random_state=42)

# Y_train is a single-column frame; scikit-learn expects a 1-D label array and
# otherwise emits a DataConversionWarning, so flatten it before fitting.
time_ind_model.fit(X_train, Y_train.values.ravel())

# One predicted group name per row of X_test.
predictions = time_ind_model.predict(X_test)

In [6]:
# Frequency table: predicted group name -> number of rows assigned to it.
count = Counter(predictions)
print('Number of terrorist groups: {}'.format(len(count)))

# most_common(1) yields a one-element list holding a (name, frequency) pair.
high = count.most_common(1)[0]
top_name, top_freq = high
top_share = 100 * top_freq / predictions.size  # percentage of all predictions
print('Most common group observed: {} = {:.2f}%'.format(top_name, top_share))

# Leave the ten most frequent predictions as the cell's displayed value.
count.most_common(10)

Number of terrorist groups: 560
Most common group observed: Islamic State of Iraq and the Levant (ISIL) = 14.97%


[('Islamic State of Iraq and the Levant (ISIL)', 9400),
 ('Tehrik-i-Taliban Pakistan (TTP)', 8373),
 ('Taliban', 4200),
 ('Al-Qaida in Iraq', 3611),
 ('Islamic State of Iraq (ISI)', 2115),
 ("New People's Army (NPA)", 1832),
 ('Maoists', 1608),
 ('Separatists', 1535),
 ('Gunmen', 1346),
 ("Kurdistan Workers' Party (PKK)", 890)]

In [7]:
# Preview the first five rows of the prediction set (head() defaults to n=5).
df_test.head()

Unnamed: 0,eventid,gname,iyear,extended,country,country_txt,city,latitude,longitude,success,...,education,freedom,gni_per_cap,gov_conf,immigrants,life_exp,pop_den,pov_gap,refugees,primary_enroll
0,199001000001,Unknown,1990,0,110,Lebanon,Beirut,33.888629,35.495479,1,...,0.631297,6.0,6190.0,0.0,523693.0,70.221146,264.224438,0.0,304599.0,105.793974
1,199001010001,Unknown,1990,0,92,India,Srinagar,34.083658,74.797368,1,...,0.31067,2.0,1120.0,47.0,7493204.0,57.926537,292.659897,6.520666,212743.0,91.833908
2,199001010002,Unknown,1990,0,92,India,Srinagar,34.093665,74.787832,1,...,0.31067,2.0,1120.0,47.0,7493204.0,57.926537,292.659897,6.520666,212743.0,91.833908
3,199001010003,Unknown,1990,0,92,India,Srinagar,34.069935,74.793683,1,...,0.31067,2.0,1120.0,47.0,7493204.0,57.926537,292.659897,6.520666,212743.0,91.833908
6,199001010006,Unknown,1990,0,97,Israel,Tel Aviv,32.085688,34.783684,0,...,0.713644,2.0,14510.0,0.0,1632704.0,76.607317,215.341959,0.0,22039.927193,95.370171


In [8]:
# Replace the 'Unknown' placeholder labels with the model's predictions.
# assign() builds a new frame with `gname` overwritten in its existing column
# position, instead of writing into a frame that was obtained by filtering
# `df` (the chained-assignment pattern pandas warns about).
df_test = df_test.assign(gname=predictions)

# index=False keeps the DataFrame's row index out of the exported file.
df_test.to_csv("gname_predictions.csv", encoding='utf-8', index=False)