# Global Terrorism Data

# Predictive Modeling -> Decision Tree

In [1]:
import warnings
import os
warnings.filterwarnings( 'ignore' )
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

### Read Downsampled Data


In [2]:
def convert_text_to_sequences(df_train, df_test):
    # Convert text to sequences for each dataset
    y_train = np.array(df_train['enc_group'])
    X_train = df_train.drop(columns=["enc_group"])

    #y_val =np.array(df_val['enc_group'])
    #X_val = df_val.drop(columns=["enc_group"])

    y_test = np.array(df_test['enc_group'])
    X_test = df_test.drop(columns=["enc_group"])

    return X_train, y_train, X_test, y_test

def read_and_split_data(partitions, paths):
    train_datasets = []
    test_datasets= []
    X_trains = []
    y_trains = []
    X_tests = []
    y_tests = []
    # Loop through each partition to process data
    for i, partition in enumerate(partitions):
        print("ITERATION: ", i)
        # Load train and validation data
        data = pd.read_csv(paths[i], encoding='ISO-8859-1')
        data = data.drop(columns=['attack_date'])
        y = np.array(data["enc_group"])
        x = data.drop(columns=["enc_group"])
        X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, shuffle=True)
        #val_data = pd.read_csv(val_paths[i], encoding='ISO-8859-1')
        #combined = pd.concat([train_data, val_data])
        #print(combined.isnull().sum())
        
        # Concatenate train and validation data
        #train_data = combined

        # Load and use the test data directly
        #test_data = pd.read_csv(test_paths[i], encoding='ISO-8859-1')
        #combined = combined.drop(columns=["enc_weapon_subtype"])
        #test_data = test_data.drop(columns=["enc_weapon_subtype"])

        #train_datasets.append(train_data)
        #test_datasets.append(test_data)
        #X_train, y_train, X_test, y_test = convert_text_to_sequences(combined, test_data)

        if 'entity' in X_train.columns:
            X_train.drop(columns='entity', inplace=True)
        if 'entity' in X_test.columns:
            X_test.drop(columns='entity', inplace=True)       
        median_train = X_train.median()
        median_test = X_test.median()
        X_train = X_train.fillna(median_train)
        X_test = X_test.fillna(median_test)
        
        X_trains.append(X_train)
        y_trains.append(y_train)
        X_tests.append(X_test)
        y_tests.append(y_test)

        print(f'Data for {partition} processed.')

    return X_trains, y_trains, X_tests, y_tests
    #print(train_datasets[2].shape)


In [3]:
partitions = ["data_1970_80", "data_1981_95", "data_2013_14", "data_2015_17"]
#partitions = ["data_1970_80"]
train_paths = []
test_paths = []
val_paths = []
paths = []
for partition in partitions:
    #train_paths.append(f'../original/{partition}/down_sampled/train/{partition}.csv')
    #test_paths.append(f'../original/{partition}/down_sampled/test/{partition}.csv')
    #val_paths.append(f'../original/{partition}/down_sampled/val/{partition}.csv')
    paths.append(f'../test/{partition}/down_sampled/new_downsampled_{partition}.csv')


X_trains, y_trains, X_tests, y_tests = read_and_split_data(partitions, paths)

ITERATION:  0
Data for data_1970_80 processed.
ITERATION:  1
Data for data_1981_95 processed.
ITERATION:  2
Data for data_2013_14 processed.
ITERATION:  3
Data for data_2015_17 processed.


### Get the label in integers

In [4]:
def manipulate_data(ytrains, ytests):
    
    # Factorize the current column in the training data
    codes, uniques = pd.factorize(ytrains)
    ytrains = codes

    # Create a mapping from string values to their corresponding codes for the training data
    mapping = {value: code for code, value in enumerate(uniques)}

    # Factorize the current column in the training data
    codes, uniques = pd.factorize(ytests)
    ytests = codes

    # Create a mapping from string values to their corresponding codes for the training data
    mapping = {value: code for code, value in enumerate(uniques)}
    
    return ytrains, ytests


for i, partition in enumerate(y_trains):
    y_trains[i], y_tests[i] = manipulate_data(partition, y_tests[i])



## XGBoost

In [5]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import GradientBoostingClassifier

def find_best_xgboost(input, truth):
    n_estimators = [5, 10, 20, 50, 100, 150, 200, 300, 500] #[int(x) for x in np.linspace(start=10, stop=2000, num=10)]
    learning_rate = [0.0001, 0.001, 0.01, 0.1]
    subsample = [0.5, 0.7, 1.0]
    max_depth = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

    param_grid_gb = {
        "n_estimators": n_estimators,
        "learning_rate": learning_rate,
        "subsample": subsample,
        "max_depth": max_depth,
    }



    gbc = GradientBoostingClassifier(random_state=42)

    rs_gb = RandomizedSearchCV(
        estimator=gbc,
        param_distributions=param_grid_gb,
        scoring=None,
        refit='f1',
        n_iter=10,
        return_train_score=True,
        cv=None,
        n_jobs=-1,
        verbose=1
    )

    # Fit
    gb_train = rs_gb.fit(input, truth)

    best_gb = rs_gb.best_estimator_
    best_gb_index = rs_gb.best_index_
    print("Best params: ", best_gb)
    return best_gb

In [6]:
def predict_accuracy(best_gb, year, X, y):
    y_pred_gbc = best_gb.predict(X)
    accuracy_gbc = accuracy_score(y, y_pred_gbc)
    print(f"Accuracy: {accuracy_gbc * 100:.2f}% for year {year}")
    return accuracy_gbc

In [7]:
best_gb_models = []
test_accs = []

for i, year in enumerate(partitions):
    best_gb_model = find_best_xgboost(X_trains[i], y_trains[i])
    best_gb_models.append(best_gb_model)
    
    test_acc = predict_accuracy(best_gb_model, year, X_tests[i], y_tests[i])
    test_accs.append(test_acc)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best params:  GradientBoostingClassifier(learning_rate=0.01, max_depth=9, n_estimators=500,
                           random_state=42, subsample=0.7)
Accuracy: 4.04% for year data_1970_80
Fitting 5 folds for each of 10 candidates, totalling 50 fits


KeyboardInterrupt: 

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

def find_best_rf(X_train, y_train):
     params = {
          'criterion': ["gini", "entropy"],
          'n_estimators': [5, 10, 20, 50, 100, 150, 200, 300, 500],
          'max_depth': [1, 2,3, 4,5,6,7,8,9,10,11, 12],
          'max_features': ['sqrt', 'log2']
          }

     rf_classifier = RandomForestClassifier(random_state=42)

     grid_search =GridSearchCV(estimator=rf_classifier, param_grid=params, cv=None)

     grid_search.fit(X_train, y_train)
     best_dt = grid_search.best_estimator_
     return best_dt

In [None]:
def predict_rf(best_dt, X_test, y_test):
    y_pred_rf = best_dt.predict(X_test)
    acc_rf = accuracy_score(y_test, y_pred_rf)
    print(f"Accuracy: {acc_rf * 100:.2f}%")
    return acc_rf

In [None]:
best_rf_models = []
test_accs_rf = []

for i, year in enumerate(partitions):
    best_rf_model = find_best_rf(X_trains[i], y_trains[i])
    best_rf_models.append(best_rf_model)
    
    test_acc_rf = predict_accuracy(best_rf_model, year, X_tests[i], y_tests[i])
    test_accs_rf.append(test_acc_rf)

In [None]:
working_path = "../down_sampled/new_downsampled_data_1970_80_1.csv"
hej = pd.read_csv(working_path, encoding='ISO-8859-1')
print(hej.columns)

In [None]:
working_path = "../test/data_1970_80/down_sampled/new_downsampled_data_1970_80.csv"
t_data = pd.read_csv(working_path, encoding='ISO-8859-1')
print(t_data.columns)

In [None]:
working_path = "../down_sampled/new_downsampled_data_1970_80_1.csv"
t_data = pd.read_csv(working_path, encoding='ISO-8859-1')
t_size = int(np.floor(len(t_data) * 0.8))
t_df_train = t_data.iloc[:t_size]
t_df_test = t_data.iloc[int(np.floor(t_size+ len(t_data) * 0.1)):]

t_df_train, t_df_test = manipulate_data(t_df_train, t_df_test)

t_df_train_y = np.array(t_df_train['enc_group'])
t_df_train_X = t_df_train.drop(columns=["enc_group"])

t_df_test_y =np.array(t_df_test['enc_group'])
t_df_test_X = t_df_test.drop(columns=["enc_group"])

In [None]:
t_best_rf = find_best_rf(t_df_train_X, t_df_train_y)

In [None]:
acccc = predict_accuracy(t_best_rf, "70 to 80", t_df_test_X, t_df_test_y)

## Decision Tree

In [None]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

dt_classifier = DecisionTreeClassifier(max_depth=5, random_state=42)

# Fit the model to the training data
dt_classifier.fit(X_train, y_train)

# Predict on the training and validation data
y_train_pred = dt_classifier.predict(X_train)
y_val_pred = dt_classifier.predict(X_val)
y_test_pred = dt_classifier.predict(X_test)

# Evaluate the model on the training data
train_accuracy = accuracy_score(y_train, y_train_pred)
print(f"Training Accuracy: {train_accuracy * 100:.2f}%")
# print("\nTraining Classification Report:\n", classification_report(y_train, y_train_pred))

# Evaluate the model on the validation data
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy: {val_accuracy * 100:.2f}%")
# print("\nValidation Classification Report:\n", classification_report(y_val, y_val_pred))


# Evaluate the model on the test data
val_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Accuracy: {val_accuracy * 100:.2f}%")
# print("\nTest Classification Report:\n", classification_report(y_val, y_val_pred))

In [None]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the training dataset
# train_df = pd.read_csv('train_dataset.csv')  # Replace with your actual file path for the training data
# Assuming the last column is the target variable for the training dataset
# X_train = train_df.iloc[:, :-1]
# y_train = train_df.iloc[:, -1]

# Load the validation dataset
# val_df = pd.read_csv('validation_dataset.csv')  # Replace with your actual file path for the validation data
# Assuming the last column is the target variable for the validation dataset
# X_val = val_df.iloc[:, :-1]
# y_val = val_df.iloc[:, -1]


# Initialize the Decision Tree Classifier with max_depth to prevent overfitting
dt_classifier = DecisionTreeClassifier(max_depth=5, random_state=42)

# # Initialize the Decision Tree Classifier with max_depth
# dt_classifier = DecisionTreeClassifier(max_depth=5, random_state=42)


# Fit the model to the training data
dt_classifier.fit(X_train, y_train)

# Predict on the training and validation data
y_train_pred = dt_classifier.predict(X_train)
y_val_pred = dt_classifier.predict(X_val)
y_test_pred = dt_classifier.predict(X_test)

# Evaluate the model on the training data
train_accuracy = accuracy_score(y_train, y_train_pred)
print(f"Training Accuracy: {train_accuracy * 100:.2f}%")
# print("\nTraining Classification Report:\n", classification_report(y_train, y_train_pred))

# Evaluate the model on the validation data
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy: {val_accuracy * 100:.2f}%")
# print("\nValidation Classification Report:\n", classification_report(y_val, y_val_pred))


# Evaluate the model on the validation data
val_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Accuracy: {val_accuracy * 100:.2f}%")
# print("\nTest Classification Report:\n", classification_report(y_val, y_val_pred))


In [None]:
def show_confusion_matrix(confusion_matrix):
  hmap = sns.heatmap(confusion_matrix, annot=True, fmt="d", cmap="Blues")
  hmap.yaxis.set_ticklabels(hmap.yaxis.get_ticklabels(), rotation=0, ha='right')
  hmap.xaxis.set_ticklabels(hmap.xaxis.get_ticklabels(), rotation=30, ha='right')
  plt.ylabel('True')
  plt.xlabel('Predicted')
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

def count_matches(labels, preds):
    #labels = labels.argmax(axis=1)
    from sklearn.metrics import roc_auc_score
    from sklearn.metrics import average_precision_score,accuracy_score
    import pandas as pd
    label = pd.DataFrame(labels)
    pred = pd.DataFrame(preds)
    # map_label = {'negative':1, 'positive':0}
    # label[0] = label[0].apply(lambda x: map_label[x])
    # pred[0] = pred[0].apply(lambda x: map_label[x])
    # print('ROC-AUC', roc_auc_score(label, pred))
    # print('precision_recall_curve', average_precision_score(label, pred))
    from sklearn.metrics import f1_score
    print('macro f1_score', f1_score(labels, preds, average='macro'))
    print('micro f1_score', f1_score(labels, preds, average='micro'))
    print('accuracy', accuracy_score(labels, preds))
    print('f1_score', f1_score(labels, preds, average='weighted'))
    print(classification_report(labels, preds))
    cm = confusion_matrix(labels, preds)
    df_cm = pd.DataFrame(cm)
    show_confusion_matrix(df_cm)
    return sum([1 if label == pred else 0 for label, pred in zip(labels, preds)])
num_matches = count_matches( y_test, y_test_pred)

In [None]:
macro f1_score 0.014814814814814814
micro f1_score 0.07537688442211055
accuracy 0.07537688442211055
f1_score 0.01675041876046901