In [1]:
import warnings
warnings.filterwarnings( 'ignore' )
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, TimeSeriesSplit
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.impute import SimpleImputer

In [2]:
# Base names (without the .csv extension) of the pre-split partition files.
# Train and test names are paired by position.
trains = ['trainp1', 'trainp2', 'trainp3', 'trainp4']
tests = ['testp1', 'testp2', 'testp3', 'testp4']

# Named DATA_DIR rather than `dir` so the builtin dir() is not shadowed.
DATA_DIR = '../CleanPartitions/'


def load_partition(name):
    """Read one partition CSV from DATA_DIR, without its 'attack_date' column.

    Parameters
    ----------
    name : str
        File name without the '.csv' extension (e.g. 'trainp1').

    Returns
    -------
    pandas.DataFrame
        The file contents, minus 'attack_date' when that column exists.
    """
    frame = pd.read_csv(f'{DATA_DIR}{name}.csv', encoding='ISO-8859-1')
    # errors='ignore' turns the drop into a no-op when the column is absent,
    # replacing the explicit "if column in frame.columns" check.
    return frame.drop(columns=['attack_date'], errors='ignore')


traindatas = []
testdatas = []

for train_name, test_name in zip(trains, tests):
    traindata = load_partition(train_name)
    testdata = load_partition(test_name)

    traindatas.append(traindata)
    testdatas.append(testdata)

    print(f'shape {train_name}', traindata.shape)
    print(f'shape {test_name}', testdata.shape)

shape trainp1 (783, 25)
shape testp1 (338, 25)
shape trainp2 (6647, 25)
shape testp2 (2847, 25)
shape trainp3 (1061, 25)
shape testp3 (452, 25)
shape trainp4 (4585, 25)
shape testp4 (1964, 25)


In [3]:
# Display the column labels of the second loaded training partition (index 1).
traindatas[1].columns

Index(['iyear', 'imonth', 'iday', 'extended', 'country', 'region', 'provstate',
       'city', 'latitude', 'longitude', 'specificity', 'vicinity', 'multiple',
       'success', 'suicide', 'attacktype1', 'targtype1', 'target1', 'natlty1',
       'gname', 'individual', 'weaptype1', 'nkill', 'property', 'ishostkid'],
      dtype='object')

In [4]:
#traindata['gname'].value_counts()

In [5]:
#testdata['gname'].value_counts()

In [6]:
def split_data(dftrain, dftest):
    """Separate the 'gname' label column from the remaining feature columns.

    Parameters
    ----------
    dftrain, dftest : pandas.DataFrame
        Frames that each contain a 'gname' column.

    Returns
    -------
    tuple
        (Xtrain, Ytrain, Xtest, Ytest): each X is the frame without 'gname',
        each Y is the 'gname' column. The input frames are not modified.
    """
    label_col = 'gname'

    def _separate(frame):
        # Pull the labels first, then build a feature frame without them.
        labels = frame[label_col]
        features = frame.drop(columns=[label_col])
        return features, labels

    Xtrain, Ytrain = _separate(dftrain)
    Xtest, Ytest = _separate(dftest)
    return Xtrain, Ytrain, Xtest, Ytest

def find_best_rfc(Xtrain, Ytrain, param_grid=None, n_splits=5):
    """Grid-search a RandomForestClassifier and return the best estimator.

    Parameters
    ----------
    Xtrain : array-like or pandas.DataFrame
        Training features, passed straight to ``GridSearchCV.fit``.
    Ytrain : array-like or pandas.Series
        Training labels.
    param_grid : dict, optional
        Hyper-parameter grid for ``GridSearchCV``. When omitted, the default
        grid below is used (criterion x n_estimators x max_depth x
        max_features, 60 combinations).
    n_splits : int, default 5
        Number of splits for the ``TimeSeriesSplit`` cross-validator.

    Returns
    -------
    RandomForestClassifier
        ``best_estimator_`` of the fitted search.

    Notes
    -----
    NOTE(review): TimeSeriesSplit validates on rows that come *after* the
    training rows of each fold, so it is only meaningful if the rows of
    Xtrain are in chronological order -- confirm that the partitions are
    sorted by date before relying on these scores.
    """
    if param_grid is None:
        param_grid = {
            'criterion': ["gini", "entropy"],
            'n_estimators': [200, 300, 500],
            'max_depth': [2, 5, 10, 15, 20],
            'max_features': ['sqrt', 'log2'],
        }

    # Fixed seed so repeated runs build the same forests.
    rfc = RandomForestClassifier(random_state=42)
    tscv = TimeSeriesSplit(n_splits=n_splits)

    grid_search = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=tscv)
    grid_search.fit(Xtrain, Ytrain)
    return grid_search.best_estimator_

In [7]:
#y_pred_rfc = best_dt.predict(Xtestp1)
#accuracy_gbc = accuracy_score(Ytestp1, y_pred_rfc)
#print(f"Accuracy: {accuracy_gbc * 100:.2f}%")

In [8]:
# Tune one random forest per partition. The test features and labels of each
# partition are stored at the same index as its fitted model for later scoring.
best_rfcs, truths, Xtests = [], [], []

for i, (dftrain, dftest) in enumerate(zip(traindatas, testdatas)):
    Xtrain, Ytrain, Xtest, Ytest = split_data(dftrain, dftest)

    print(f'Finding best rfc for parition {i+1}')
    best_rfc = find_best_rfc(Xtrain, Ytrain)
    print('---------------------------------')

    best_rfcs.append(best_rfc)
    truths.append(Ytest)
    Xtests.append(Xtest)

Finding best rfc for parition 1
---------------------------------
Finding best rfc for parition 2
---------------------------------
Finding best rfc for parition 3
---------------------------------
Finding best rfc for parition 4
---------------------------------


In [9]:
# Score each tuned model on the test split stored at the same index and
# collect the accuracies (fractions in [0, 1]) in partition order.
accuracies = []

for i, (model, features, labels) in enumerate(zip(best_rfcs, Xtests, truths)):
    print(f'partition {i+1}:')
    y_pred_rfc = model.predict(features)
    accuracy_rfc = accuracy_score(labels, y_pred_rfc)
    accuracies.append(accuracy_rfc)
    print(f"Accuracy: {accuracy_rfc * 100:.2f}%")
    print('-------------------------------------------------')

partition 1:
Accuracy: 72.78%
-------------------------------------------------
partition 2:
Accuracy: 99.82%
-------------------------------------------------
partition 3:
Accuracy: 100.00%
-------------------------------------------------
partition 4:
Accuracy: 100.00%
-------------------------------------------------


In [10]:
#confusion_matrix(Ytestp1, y_pred_rfc)

In [11]:
#classification_report(Ytestp1, y_pred_rfc)