# Questions
## 1. Which attack type would be used in the next global terrorist attack?
## 2. Which variables best predict attack type?

### Import Packages and Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import auc
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score

In [None]:
# Read the raw incident table. The file is ISO-8859-1 encoded, and low_memory is
# switched off so pandas parses the file in one pass instead of chunk by chunk.
# NOTE(review): the path points into a personal Desktop folder; adjust per machine.
data = pd.read_csv(
    "~/Desktop/Stats_131_Project/globalterrorism.csv",
    low_memory=False,
    encoding='ISO-8859-1',
)

In [None]:
# Preview the first rows of the raw table to check that the load worked.
data.head()

In [None]:
# Summarise the raw table's columns, dtypes and memory footprint.
data.info()

In [None]:
# Number of missing values in each column of the raw table.
data.isna().sum()

### Data Cleaning

#### Remove Columns That Are Not Relevant in Predicting Attack Type Based On Our Intuitions

In [None]:
# Keep only the columns judged relevant for predicting attack type.
data2 = data.loc[:, [
    "eventid", "iyear", "imonth",
    "crit1", "crit2", "crit3",
    "doubtterr", "multiple",
    "country", "country_txt",
    "region", "region_txt",
    "attacktype1", "attacktype1_txt",
    "success",
    "weaptype1", "weaptype1_txt",
    "suicide",
    "targtype1", "targtype1_txt", "target1",
    "natlty1", "gname",
    "INT_LOG", "INT_IDEO",
    "latitude", "longitude",
]]

#### Remove Observations Whose Exact Month of Incident is Unknown (imonth = 0)

In [None]:
# Keep only incidents whose month is recorded (imonth is 0 when it is unknown).
data2 = data2.loc[data2["imonth"].ne(0)]

In [None]:
# List the month codes that remain after filtering.
print(data2["imonth"].unique())

#### Remove Observations Whose Country is Unknown (country = Unknown)

In [None]:
# Keep only incidents with a known country.
data2 = data2.loc[data2["country_txt"].ne("Unknown")]

In [None]:
# Number of missing values per column in the reduced table.
data2.isna().sum()

In [None]:
# Summarise columns, dtypes and non-null counts of the reduced table.
data2.info()

#### Remove Observations Whose Attack Type is Unknown (attacktype1_txt = Unknown)

In [None]:
# Discard incidents whose attack type is labelled "Unknown".
is_unknown_attack = data2["attacktype1_txt"].eq("Unknown")
data2 = data2[~is_unknown_attack]

#### Remove Observations Whose Weapon Type is Unknown (weaptype1_txt = Unknown)

In [None]:
# Discard incidents whose weapon type is labelled "Unknown".
is_unknown_weapon = data2["weaptype1_txt"].eq("Unknown")
data2 = data2[~is_unknown_weapon]

### Exploratory Analysis

#### Checking If the Categories With Smaller Counts Have Sufficient Observations for Modeling

In [None]:
# Print the number of incidents for each of the smaller-count attack types.
for rare_type in ("Unarmed Assault",
                  "Hijacking",
                  "Hostage Taking (Barricade Incident)"):
    print(len(data2[data2["attacktype1_txt"] == rare_type]))

#### Examining the Distribution of Attack Type

In [None]:
# Bar chart of incident counts per attack type, most frequent first.
plt.subplots(figsize=(15, 6))
# The column is passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally, so a positional Series is no longer read as the x variable.
sns.countplot(x="attacktype1_txt", data=data2,
              order=data2["attacktype1_txt"].value_counts().index)
plt.xticks(rotation=90)
plt.title("Barplot of Attack Type")
plt.xlabel("Attack Type")
plt.show()

From the barplot, terrorist attacks in the form of bombing/explosion happened with the highest frequency, followed by armed assault and assassination. Terrorist attacks in the form of facility/infrastructure attack and hostage taking (kidnapping) also occurred with moderate frequencies. Hijacking, unarmed assault, and hostage taking (barricade incident) happened relatively less frequently.

#### Examining the Trend of Terrorist Attack

In [None]:
# Bar chart of the number of incidents recorded in each year.
plt.subplots(figsize=(15, 6))
# The column is passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally, so a positional Series is no longer read as the x variable.
sns.countplot(x="iyear", data=data2)
plt.xticks(rotation=90)
plt.title("Number of Terrorist Attacks Each Year (1970 - 2016)")
plt.xlabel("Year")
plt.show()

The plot shows a clear increasing trend in the number of terrorist attacks, peaking in 2014.

According to the plot below, the distribution of terrorist attacks by month is relatively uniform.

In [None]:
# Bar chart of the number of incidents recorded in each calendar month.
plt.subplots(figsize=(15, 6))
# The column is passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally, so a positional Series is no longer read as the x variable.
sns.countplot(x="imonth", data=data2)
plt.xticks(rotation=90)
plt.title("Number of Terrorist Attacks Each Month")
plt.xlabel("Month")
plt.show()

#### Examining Distribution of Terrorist Attacks By Year and Regions

In [None]:
# Incident counts with regions as rows and years as columns; empty cells show 0.
pd.pivot_table(
    data2,
    values="eventid",
    index="region_txt",
    columns="iyear",
    aggfunc="count",
    fill_value=0,
)

In [None]:
# Incidents per region, split by year, restricted to 2004-2016.
# The year filter is computed once instead of three times.
attacks_2004_2016 = data2[data2["iyear"].isin(np.arange(2004, 2017, 1))]

plt.subplots(figsize=(15, 6))
# Columns are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally, so a positional Series is no longer read as the x variable.
sns.countplot(x="region_txt", hue="iyear", data=attacks_2004_2016,
              # Regions ordered by their incident count within the window.
              order=attacks_2004_2016["region_txt"].value_counts().index)
plt.xticks(rotation=90)
plt.legend(loc="upper right")
plt.title("Distribution of Terrorist Attacks by Year and Region")
plt.xlabel("Region")
plt.show()

#### Examining Distribution of Attack Types By Year

In [None]:
# Incident counts with attack types as rows and years as columns; empty cells show 0.
pd.pivot_table(
    data2,
    values="eventid",
    index="attacktype1_txt",
    columns="iyear",
    aggfunc="count",
    fill_value=0,
)

In [None]:
# Incidents per attack type, split by year, restricted to 2004-2016.
# The year filter is computed once instead of twice.
attacks_2004_2016 = data2[data2["iyear"].isin(np.arange(2004, 2017, 1))]

plt.subplots(figsize=(15, 6))
# Columns are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally, so a positional Series is no longer read as the x variable.
sns.countplot(x="attacktype1_txt", hue="iyear", data=attacks_2004_2016)
plt.xticks(rotation=90)
plt.legend(loc="upper right")
plt.title("Distribution of Attack Type By Year")
plt.xlabel("Attack Type")
plt.show()

#### Examining Distribution of Attack Types By Month

In [None]:
# Incidents per attack type, split by calendar month.
plt.subplots(figsize=(15, 6))
# Columns are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally, so a positional Series is no longer read as the x variable.
sns.countplot(x="attacktype1_txt", hue="imonth", data=data2)
plt.xticks(rotation=90)
plt.legend(loc="upper right")
plt.title("Distribution of Attack Type By Month")
plt.xlabel("Attack Type")
plt.show()

#### Examining Distribution of Terrorist Attacks by Month and Regions

In [None]:
# Incidents per region, split by calendar month.
plt.subplots(figsize=(15, 6))
# Columns are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally, so a positional Series is no longer read as the x variable.
sns.countplot(x="region_txt", hue="imonth", data=data2)
plt.xticks(rotation=90)
plt.legend(loc="upper right")
plt.title("Distribution of Terrorist Attacks By Month and Regions")
plt.xlabel("Region")
plt.show()

The distribution is relatively uniform for most of the regions, but there may be some trends for regions such as South America, Middle East and North Africa, and South Asia.

Also, there are usually spikes every other month starting from January, suggesting possible patterns in the timing of terrorist attacks.

### Multinomial Logistic Regression

#### Creating X and y Variables

In [None]:
# Gather the predictor columns together with the response so that rows with a
# missing value in any of them are dropped from features and target alike.
X_temp = data2.loc[:, [
    "imonth", "crit1", "crit2", "crit3", "multiple",
    "country_txt", "region_txt", "weaptype1_txt",
    "suicide", "targtype1_txt", "target1",
    "gname", "attacktype1_txt",
]]
X_temp2 = X_temp.dropna()
# The feature matrix is every gathered column except the response.
X = X_temp2.drop(columns=["attacktype1_txt"])

In [None]:
# Force the two free-text columns to plain strings.
for text_col in ("target1", "gname"):
    X[text_col] = X[text_col].astype(str)

In [None]:
# Response variable: the attack-type label of every row that survived dropna().
y_temp = X_temp2["attacktype1_txt"]

In [None]:
# Distinct attack-type labels, in order of first appearance.
group = pd.unique(y_temp)
print(group)

In [None]:
# Turn the label column into an indicator matrix with one column per entry of `group`.
y = label_binarize(y_temp, classes=group)
# The number of classes is the width of that matrix.
n_classes = np.shape(y)[1]

In [None]:
# Hold out 40% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=40,
    test_size=0.4,
)

In [None]:
# Show the dimensions of the training target and training features, in that order.
for split in (y_train, X_train):
    print(split.shape)

In [None]:
# Compare the first five raw labels with the binarized target matrix.
print(y_temp.iloc[:5])
print(y)

In [None]:
# Integer-encode every text column of the train and test feature frames.
le = LabelEncoder()

# Work on explicit copies so the column assignments below write to independent
# frames rather than to slices of X (avoids SettingWithCopyWarning).
X_train = X_train.copy()
X_test = X_test.copy()

# Iterating over all the common columns in train and test
for col in X_test.columns.values:
    # Encoding only categorical variables
    if X_test[col].dtypes == 'object':
        # Fit on train + test together so every level seen in either split gets
        # a code. pd.concat replaces Series.append, which pandas 2.0 removed.
        data3 = pd.concat([X_train[col], X_test[col]])
        le.fit(data3.values)
        X_train[col] = le.transform(X_train[col])
        X_test[col] = le.transform(X_test[col])

In [None]:
# --- One-vs-rest linear SVM: per-class and micro-averaged ROC ----------------
# Learn to predict each class against the other; every column of the binarized
# target is treated as its own binary problem.
# NOTE(review): probability=True is kept as-is, but only decision_function is
# used here, and a linear SVC may be very slow on a large table.
classifier = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True))
y_score = classifier.fit(X_train, y_train).decision_function(X_test)

# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Compute micro-average ROC curve and ROC area (all class columns pooled)
fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

# --- One-vs-rest logistic regression -----------------------------------------
# LogisticRegression.fit and confusion_matrix need 1-D class labels, not the
# indicator matrix. The position of the 1 in each row is used as the class id.
# Assumes more than two classes, so that each row has exactly one 1 — TODO confirm.
y_train_labels = y_train.argmax(axis=1)
y_test_labels = y_test.argmax(axis=1)

# NOTE(review): recent scikit-learn releases deprecate `multi_class`; confirm
# against the installed version.
logreg = LogisticRegression(multi_class='ovr')

logreg.fit(X_train, y_train_labels)

y_pred = logreg.predict(X_test)

# Explicit labels keep the matrix square and in indicator-column order even if
# a class is absent from the test split.
print(confusion_matrix(y_test_labels, y_pred, labels=np.arange(n_classes)))

# Last expression of the cell: overall accuracy, reusing the predictions above.
accuracy_score(y_test_labels, y_pred)

In [None]:
# Macro-averaged one-vs-rest AUC of the logistic regression on the test split.
# y_test holds one indicator column per class, so roc_auc_score needs one score
# column per class as well; a single probability column would not match its shape.
# Assumes column k of predict_proba corresponds to column k of y_test —
# TODO confirm against logreg.classes_.
y_pred_prob = logreg.predict_proba(X_test)
print("AUC: {}".format(roc_auc_score(y_test, y_pred_prob)))