In [1]:
#Ignore 'Future Warnings'
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# Set Base for Algorithms

In [2]:
#Import Necessary Packages
from tqdm import tqdm
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
import sklearn
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import plot_confusion_matrix
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold

In [3]:
#Set Psudeorandom Seed
seed = 42

In [4]:
splits = 5
kf = StratifiedKFold(n_splits=splits, random_state=seed, shuffle=True)

In [5]:
pca_target = .8

In [6]:
#Models to be used
models = []
models.append(('LR', LogisticRegression(solver='liblinear',multi_class='ovr')))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('RTREE', RandomForestClassifier(n_estimators=500, max_depth=2, random_state=seed)))
models.append(('XTREE', xgb.XGBRFClassifier(random_state=seed)))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC(random_state=seed)))

methods = []
for name, model in models:
    methods.append(name)

# Naive Assessments

In [7]:
df = pd.read_excel ('../data/1y_offset.xlsx',index_col=0,na_values=['..'])

In [8]:
df_naive=df.loc[:,{'con1_next','total_ev'}]
df_naive['simple'] = 0
df_naive['con_in'] = [1 if x > 0 else 0 for x in df['total_ev']]

In [9]:
def true_pos_s(s):
    if (s['con1_next']==1) and (s['simple']==1):
        return 1
    else:
        return 0

def true_neg_s(s):
    if (s['con1_next']==0) and (s['simple']==0):
        return 1
    else:
        return 0

def fal_pos_s(s):
    if (s['con1_next']==0) and (s['simple']==1):
        return 1
    else:
        return 0
    
def fal_neg_s(s):
    if (s['con1_next']==1) and (s['simple']==0):
        return 1
    else:
        return 0
    
df_naive['s_tp'] = df_naive.apply(true_pos_s, axis=1)
df_naive['s_tn'] = df_naive.apply(true_neg_s, axis=1)
df_naive['s_fp'] = df_naive.apply(fal_pos_s, axis=1)
df_naive['s_fn'] = df_naive.apply(fal_neg_s, axis=1)

def true_pos_l(s):
    if (s['con1_next']==1) and (s['con_in']==1):
        return 1
    else:
        return 0

def true_neg_l(s):
    if (s['con1_next']==0) and (s['con_in']==0):
        return 1
    else:
        return 0

def fal_pos_l(s):
    if (s['con1_next']==0) and (s['con_in']==1):
        return 1
    else:
        return 0
    
def fal_neg_l(s):
    if (s['con1_next']==1) and (s['con_in']==0):
        return 1
    else:
        return 0
    
df_naive['l_tp'] = df_naive.apply(true_pos_l, axis=1)
df_naive['l_tn'] = df_naive.apply(true_neg_l, axis=1)
df_naive['l_fp'] = df_naive.apply(fal_pos_l, axis=1)
df_naive['l_fn'] = df_naive.apply(fal_neg_l, axis=1)

tot_obs=len(df_naive.index)

tot_s_fp=sum(df_naive['s_fp'])
tot_s_tp=sum(df_naive['s_tp'])
tot_s_fn=sum(df_naive['s_fn'])
tot_s_tn=sum(df_naive['s_tn'])

tot_l_fp=sum(df_naive['l_fp'])
tot_l_tp=sum(df_naive['l_tp'])
tot_l_fn=sum(df_naive['l_fn'])
tot_l_tn=sum(df_naive['l_tn'])

## Functions for determining performance
### Accuracy = TruePositives + TrueNegatives / Total Obsersvations
### Precision = TruePositives / (TruePositives + FalsePositives)
### Recall = TruePositives / (TruePositives + FalseNegatives)
### F-Measure = (2 * Precision * Recall) / (Precision + Recall)

In [10]:
accuracy_l = (tot_l_tp + tot_l_tn)/tot_obs
precision_l = tot_l_tp/(tot_l_tp+tot_l_fp)
recall_l = tot_l_tp/(tot_l_tp+tot_l_fn)
f1_l=(2*precision_l*recall_l)/(precision_l+recall_l)

print('recall score is '+str(round(recall_l,2)))
print('accuracy score is '+str(round(accuracy_l,2)))
print('f1 score is '+str(round(f1_l,2)))

recall score is 0.83
accuracy score is 0.92
f1 score is 0.83


# Determining the accuracy of the simple classification.
### Note that since there are no true positives in this case, precision and recall are zero and f1 is undefined.

In [11]:
accuracy_s = (tot_s_tp + tot_l_tn)/tot_obs
precision_s = tot_s_tp/(tot_l_tp+tot_s_fp)
recall_s = tot_s_tp/(tot_l_tp+tot_s_fn)
#f1_s=(2*precision_s*recall_s)/(precision_s+recall_s)

print('recall score is '+str(round(recall_s,2)))
print('accuracy score is '+str(round(accuracy_s,2)))
#print('f1 score is '+str(round(f1_l,2)))

recall score is 0.0
accuracy score is 0.71


# Assessment of Base Specification

In [12]:
# Load dataset
files = []

files.append(('One Year Base','../data/1y_offset.xlsx'))
files.append(('Two Year Base','../data/2y_offset.xlsx'))
files.append(('Three Year Base','../data/3y_offset.xlsx'))
files.append(('Five Year Base','../data/5y_offset.xlsx'))
files.append(('Ten Year Base','../data/10y_offset.xlsx'))

## Base Specification

In [13]:
df_b_a = pd.DataFrame(methods, columns = ['Algorithm'])
df_b_r = pd.DataFrame(methods, columns = ['Algorithm'])
df_b_f = pd.DataFrame(methods, columns = ['Algorithm'])

In [14]:
for name, file in tqdm(files):
    #print('Fitting for '+name)
    df = pd.read_excel (file,index_col=0,na_values=['..'])
    
    algo_a_mean = []
    algo_a_std = []
    algo_r_mean = []
    algo_r_std = []
    algo_f_mean = []
    algo_f_std = []

   
    #Create testing and training sets
    array = df.values

    #Create X array
    X= array[:,4:]

    # Standardizing the features
    X = StandardScaler().fit_transform(X)

    #Create Y array
    Y= array[:,3]
    Y=Y.astype('int')
   
    for algo, model in models:   
        accuracy = np.array([])
        recall = np.array([])
        f1 = np.array([])
        for fold, (train_index, test_index) in enumerate(kf.split(X, Y), 1):
            X_train = X[train_index]
            Y_train = Y[train_index]  
            X_test = X[test_index]
            Y_test = Y[test_index]  
            
            #train/test split
            sm = SMOTE()
            X_train_oversampled, Y_train_oversampled = sm.fit_sample(X_train, Y_train)
           
            #fit model
            model = model
            model.fit(X_train_oversampled, Y_train_oversampled)  
            Y_pred = model.predict(X_test)
            
            #generate reports
            acc = accuracy_score(Y_test, Y_pred)
            rec = recall_score(Y_test, Y_pred)
            f=  f1_score(Y_test, Y_pred)
            accuracy = np.append(accuracy, acc)
            recall = np.append(recall, rec)
            f1 = np.append(f1, f)
        a_mean = accuracy.mean().tolist()
        a_std = accuracy.std().tolist()
        r_mean = recall.mean().tolist()
        r_std = accuracy.std().tolist()
        f_mean = f1.mean().tolist()
        f_std = accuracy.std().tolist()
        algo_a_mean.append(a_mean)
        algo_a_std.append(a_std)
        algo_r_mean.append(r_mean)
        algo_r_std.append(r_std)
        algo_f_mean.append(f_mean)
        algo_f_std.append(f_std)
    df_b_a[name+' mean'] = algo_a_mean
    df_b_a[name+' std'] = algo_a_std
    df_b_r[name+' mean'] = algo_r_mean
    df_b_r[name+' std'] = algo_r_std
    df_b_f[name+' mean'] = algo_f_mean
    df_b_f[name+' std'] = algo_f_std

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [03:04<00:00, 36.89s/it]


In [15]:
df_b_f

Unnamed: 0,Algorithm,One Year Base mean,One Year Base std,Two Year Base mean,Two Year Base std,Three Year Base mean,Three Year Base std,Five Year Base mean,Five Year Base std,Ten Year Base mean,Ten Year Base std
0,LR,0.829998,0.004501,0.803826,0.011439,0.779846,0.005903,0.755183,0.007909,0.731935,0.010281
1,LDA,0.833729,0.003522,0.80747,0.009284,0.785548,0.004734,0.758843,0.00617,0.737609,0.007546
2,KNN,0.81296,0.01233,0.803595,0.009146,0.796016,0.00665,0.795395,0.004981,0.764842,0.008754
3,CART,0.776717,0.010499,0.764761,0.007539,0.768884,0.003811,0.758006,0.008191,0.758042,0.004554
4,RTREE,0.830046,0.006542,0.803825,0.007218,0.781994,0.006673,0.761804,0.003685,0.740143,0.009463
5,XTREE,0.832614,0.003255,0.810425,0.007828,0.77643,0.009525,0.75559,0.007335,0.745996,0.010966
6,NB,0.79083,0.003013,0.765357,0.008503,0.74056,0.007622,0.725309,0.007686,0.691712,0.016707
7,SVM,0.82559,0.004267,0.819229,0.01027,0.799904,0.007777,0.80211,0.008109,0.772665,0.005842


## Base Specification w/ PCA

In [16]:
df_bp_a = pd.DataFrame(methods, columns = ['Algorithm'])
df_bp_r = pd.DataFrame(methods, columns = ['Algorithm'])
df_bp_f = pd.DataFrame(methods, columns = ['Algorithm'])

In [17]:
for name, file in tqdm(files):
    #print('Fitting for '+name)
    df = pd.read_excel (file,index_col=0,na_values=['..'])

    algo_a_mean = []
    algo_a_std = []
    algo_r_mean = []
    algo_r_std = []
    algo_f_mean = []
    algo_f_std = []

   
    #Create testing and training sets
    array = df.values

    #Create X array
    X= array[:,4:]

    # Standardizing the features
    X = StandardScaler().fit_transform(X)

    #Create Y array
    Y= array[:,3]
    Y=Y.astype('int')

    pca = PCA(pca_target)
    principalComponents = pca.fit_transform(X)
    
    X = pca.fit_transform(X)
     
    for algo, model in models:   
        accuracy = np.array([])
        recall = np.array([])
        f1 = np.array([])
        for fold, (train_index, test_index) in enumerate(kf.split(X, Y), 1):
            X_train = X[train_index]
            Y_train = Y[train_index]  
            X_test = X[test_index]
            Y_test = Y[test_index]  
            
            #train/test split
            sm = SMOTE()
            X_train_oversampled, Y_train_oversampled = sm.fit_sample(X_train, Y_train)
           
            #fit model
            model = model
            model.fit(X_train_oversampled, Y_train_oversampled)  
            Y_pred = model.predict(X_test)
            
            #generate reports
            acc = accuracy_score(Y_test, Y_pred)
            rec = recall_score(Y_test, Y_pred)
            f=  f1_score(Y_test, Y_pred)
            accuracy = np.append(accuracy, acc)
            recall = np.append(recall, rec)
            f1 = np.append(f1, f)
        a_mean = accuracy.mean().tolist()
        a_std = accuracy.std().tolist()
        r_mean = recall.mean().tolist()
        r_std = accuracy.std().tolist()
        f_mean = f1.mean().tolist()
        f_std = accuracy.std().tolist()
        algo_a_mean.append(a_mean)
        algo_a_std.append(a_std)
        algo_r_mean.append(r_mean)
        algo_r_std.append(r_std)
        algo_f_mean.append(f_mean)
        algo_f_std.append(f_std)
    df_bp_a[name+' mean'] = algo_a_mean
    df_bp_a[name+' std'] = algo_a_std
    df_bp_r[name+' mean'] = algo_r_mean
    df_bp_r[name+' std'] = algo_r_std
    df_bp_f[name+' mean'] = algo_f_mean
    df_bp_f[name+' std'] = algo_f_std

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [02:24<00:00, 28.86s/it]


## Base Specification w/ Y/R/C dummies

In [18]:
df_bd_a = pd.DataFrame(methods, columns = ['Algorithm'])
df_bd_r = pd.DataFrame(methods, columns = ['Algorithm'])
df_bd_f = pd.DataFrame(methods, columns = ['Algorithm'])

In [19]:
for name, file in tqdm(files):
    #print('Fitting for '+name)
    df = pd.read_excel (file,index_col=0,na_values=['..'])
    
    #Creation of Year/Region/Country Dummies
    year_dummies = pd.get_dummies(df.year, prefix='year').iloc[:,1:]
    region_dummies = pd.get_dummies(df.region, prefix='region').iloc[:,1:]
    country_dummies = pd.get_dummies(df.ccode, prefix='country: ').iloc[:,1:]
    
    df = pd.concat([df, year_dummies, region_dummies, country_dummies], axis = 1)
    
    algo_a_mean = []
    algo_a_std = []
    algo_r_mean = []
    algo_r_std = []
    algo_f_mean = []
    algo_f_std = []

   
    #Create testing and training sets
    array = df.values

    #Create X array
    X= array[:,4:]

    # Standardizing the features
    X = StandardScaler().fit_transform(X)

    #Create Y array
    Y= array[:,3]
    Y=Y.astype('int')
       
    for algo, model in models:   
        accuracy = np.array([])
        recall = np.array([])
        f1 = np.array([])
        for fold, (train_index, test_index) in enumerate(kf.split(X, Y), 1):
            X_train = X[train_index]
            Y_train = Y[train_index]  
            X_test = X[test_index]
            Y_test = Y[test_index]  
            
            #train/test split
            sm = SMOTE()
            X_train_oversampled, Y_train_oversampled = sm.fit_sample(X_train, Y_train)
           
            #fit model
            model = model
            model.fit(X_train_oversampled, Y_train_oversampled)  
            Y_pred = model.predict(X_test)
            
            #generate reports
            acc = accuracy_score(Y_test, Y_pred)
            rec = recall_score(Y_test, Y_pred)
            f=  f1_score(Y_test, Y_pred)
            accuracy = np.append(accuracy, acc)
            recall = np.append(recall, rec)
            f1 = np.append(f1, f)
        a_mean = accuracy.mean().tolist()
        a_std = accuracy.std().tolist()
        r_mean = recall.mean().tolist()
        r_std = accuracy.std().tolist()
        f_mean = f1.mean().tolist()
        f_std = accuracy.std().tolist()
        algo_a_mean.append(a_mean)
        algo_a_std.append(a_std)
        algo_r_mean.append(r_mean)
        algo_r_std.append(r_std)
        algo_f_mean.append(f_mean)
        algo_f_std.append(f_std)
    df_bd_a[name+' mean'] = algo_a_mean
    df_bd_a[name+' std'] = algo_a_std
    df_bd_r[name+' mean'] = algo_r_mean
    df_bd_r[name+' std'] = algo_r_std
    df_bd_f[name+' mean'] = algo_f_mean
    df_bd_f[name+' std'] = algo_f_std

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [07:56<00:00, 95.23s/it]


## Base Specification w/ PCA and Y/R/C dummies

In [20]:
df_bpd_a = pd.DataFrame(methods, columns = ['Algorithm'])
df_bpd_r = pd.DataFrame(methods, columns = ['Algorithm'])
df_bpd_f = pd.DataFrame(methods, columns = ['Algorithm'])

In [21]:
for name, file in tqdm(files):
    #print('Fitting for '+name)
    df = pd.read_excel (file,index_col=0,na_values=['..'])
    
    #Creation of Year/Region/Country Dummies
    year_dummies = pd.get_dummies(df.year, prefix='year').iloc[:,1:]
    region_dummies = pd.get_dummies(df.region, prefix='region').iloc[:,1:]
    country_dummies = pd.get_dummies(df.ccode, prefix='country: ').iloc[:,1:]
    
    df = pd.concat([df, year_dummies, region_dummies, country_dummies], axis = 1)
    
    algo_a_mean = []
    algo_a_std = []
    algo_r_mean = []
    algo_r_std = []
    algo_f_mean = []
    algo_f_std = []

   
    #Create testing and training sets
    array = df.values

    #Create X array
    X= array[:,4:]

    # Standardizing the features
    X = StandardScaler().fit_transform(X)

    #Create Y array
    Y= array[:,3]
    Y=Y.astype('int')

    pca = PCA(pca_target)
    principalComponents = pca.fit_transform(X)
    
    X = pca.fit_transform(X)
        
    for algo, model in models:   
        accuracy = np.array([])
        recall = np.array([])
        f1 = np.array([])
        for fold, (train_index, test_index) in enumerate(kf.split(X, Y), 1):
            X_train = X[train_index]
            Y_train = Y[train_index]  
            X_test = X[test_index]
            Y_test = Y[test_index]  
            
            #train/test split
            sm = SMOTE()
            X_train_oversampled, Y_train_oversampled = sm.fit_sample(X_train, Y_train)
           
            #fit model
            model = model
            model.fit(X_train_oversampled, Y_train_oversampled)  
            Y_pred = model.predict(X_test)
            
            #generate reports
            acc = accuracy_score(Y_test, Y_pred)
            rec = recall_score(Y_test, Y_pred)
            f=  f1_score(Y_test, Y_pred)
            accuracy = np.append(accuracy, acc)
            recall = np.append(recall, rec)
            f1 = np.append(f1, f)
        a_mean = accuracy.mean().tolist()
        a_std = accuracy.std().tolist()
        r_mean = recall.mean().tolist()
        r_std = accuracy.std().tolist()
        f_mean = f1.mean().tolist()
        f_std = accuracy.std().tolist()
        algo_a_mean.append(a_mean)
        algo_a_std.append(a_std)
        algo_r_mean.append(r_mean)
        algo_r_std.append(r_std)
        algo_f_mean.append(f_mean)
        algo_f_std.append(f_std)
    df_bpd_a[name+' mean'] = algo_a_mean
    df_bpd_a[name+' std'] = algo_a_std
    df_bpd_r[name+' mean'] = algo_r_mean
    df_bpd_r[name+' std'] = algo_r_std
    df_bpd_f[name+' mean'] = algo_f_mean
    df_bpd_f[name+' std'] = algo_f_std

100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [08:52<00:00, 106.43s/it]


# Assessment of Alternative Specification

In [22]:
# Load dataset
files = []

files.append(('One Year Alt','../data/1a_offset.xlsx'))
files.append(('Two Year Alt','../data/2a_offset.xlsx'))
files.append(('Three Year Alt','../data/3a_offset.xlsx'))
files.append(('Five Year Alt','../data/5a_offset.xlsx'))
files.append(('Ten Year Alt','../data/10a_offset.xlsx'))

## Alternative Specification

In [23]:
df_a_a = pd.DataFrame(methods, columns = ['Algorithm'])
df_a_r = pd.DataFrame(methods, columns = ['Algorithm'])
df_a_f = pd.DataFrame(methods, columns = ['Algorithm'])

In [24]:
for name, file in tqdm(files):
    #print('Fitting for '+name)
    df = pd.read_excel (file,index_col=0,na_values=['..'])
 
    algo_a_mean = []
    algo_a_std = []
    algo_r_mean = []
    algo_r_std = []
    algo_f_mean = []
    algo_f_std = []

   
    #Create testing and training sets
    array = df.values

    #Create X array
    X= array[:,4:]

    # Standardizing the features
    X = StandardScaler().fit_transform(X)

    #Create Y array
    Y= array[:,3]
    Y=Y.astype('int')
        
    for algo, model in models:   
        accuracy = np.array([])
        recall = np.array([])
        f1 = np.array([])
        for fold, (train_index, test_index) in enumerate(kf.split(X, Y), 1):
            X_train = X[train_index]
            Y_train = Y[train_index]  
            X_test = X[test_index]
            Y_test = Y[test_index]  
            
            #train/test split
            sm = SMOTE()
            X_train_oversampled, Y_train_oversampled = sm.fit_sample(X_train, Y_train)
           
            #fit model
            model = model
            model.fit(X_train_oversampled, Y_train_oversampled)  
            Y_pred = model.predict(X_test)
            
            #generate reports
            acc = accuracy_score(Y_test, Y_pred)
            rec = recall_score(Y_test, Y_pred)
            f=  f1_score(Y_test, Y_pred)
            accuracy = np.append(accuracy, acc)
            recall = np.append(recall, rec)
            f1 = np.append(f1, f)
        a_mean = accuracy.mean().tolist()
        a_std = accuracy.std().tolist()
        r_mean = recall.mean().tolist()
        r_std = accuracy.std().tolist()
        f_mean = f1.mean().tolist()
        f_std = accuracy.std().tolist()
        algo_a_mean.append(a_mean)
        algo_a_std.append(a_std)
        algo_r_mean.append(r_mean)
        algo_r_std.append(r_std)
        algo_f_mean.append(f_mean)
        algo_f_std.append(f_std)
    df_a_a[name+' mean'] = algo_a_mean
    df_a_a[name+' std'] = algo_a_std
    df_a_r[name+' mean'] = algo_r_mean
    df_a_r[name+' std'] = algo_r_std
    df_a_f[name+' mean'] = algo_f_mean
    df_a_f[name+' std'] = algo_f_std

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [02:49<00:00, 33.86s/it]


## Alternative Specification w/ PCA

In [25]:
df_ap_a = pd.DataFrame(methods, columns = ['Algorithm'])
df_ap_r = pd.DataFrame(methods, columns = ['Algorithm'])
df_ap_f = pd.DataFrame(methods, columns = ['Algorithm'])

In [26]:
for name, file in tqdm(files):
    #print('Fitting for '+name)
    df = pd.read_excel (file,index_col=0,na_values=['..'])

    algo_a_mean = []
    algo_a_std = []
    algo_r_mean = []
    algo_r_std = []
    algo_f_mean = []
    algo_f_std = []

   
    #Create testing and training sets
    array = df.values

    #Create X array
    X= array[:,4:]

    # Standardizing the features
    X = StandardScaler().fit_transform(X)

    #Create Y array
    Y= array[:,3]
    Y=Y.astype('int')

    pca = PCA(pca_target)
    principalComponents = pca.fit_transform(X)
    
    X = pca.fit_transform(X)
        
    for algo, model in models:   
        accuracy = np.array([])
        recall = np.array([])
        f1 = np.array([])
        for fold, (train_index, test_index) in enumerate(kf.split(X, Y), 1):
            X_train = X[train_index]
            Y_train = Y[train_index]  
            X_test = X[test_index]
            Y_test = Y[test_index]  
            
            #train/test split
            sm = SMOTE()
            X_train_oversampled, Y_train_oversampled = sm.fit_sample(X_train, Y_train)
           
            #fit model
            model = model
            model.fit(X_train_oversampled, Y_train_oversampled)  
            Y_pred = model.predict(X_test)
            
            #generate reports
            acc = accuracy_score(Y_test, Y_pred)
            rec = recall_score(Y_test, Y_pred)
            f=  f1_score(Y_test, Y_pred)
            accuracy = np.append(accuracy, acc)
            recall = np.append(recall, rec)
            f1 = np.append(f1, f)
        a_mean = accuracy.mean().tolist()
        a_std = accuracy.std().tolist()
        r_mean = recall.mean().tolist()
        r_std = accuracy.std().tolist()
        f_mean = f1.mean().tolist()
        f_std = accuracy.std().tolist()
        algo_a_mean.append(a_mean)
        algo_a_std.append(a_std)
        algo_r_mean.append(r_mean)
        algo_r_std.append(r_std)
        algo_f_mean.append(f_mean)
        algo_f_std.append(f_std)
    df_ap_a[name+' mean'] = algo_a_mean
    df_ap_a[name+' std'] = algo_a_std
    df_ap_r[name+' mean'] = algo_r_mean
    df_ap_r[name+' std'] = algo_r_std
    df_ap_f[name+' mean'] = algo_f_mean
    df_ap_f[name+' std'] = algo_f_std

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [02:10<00:00, 26.11s/it]


## Alternative Specification w/ Y/R/C dummies

In [27]:
df_ad_a = pd.DataFrame(methods, columns = ['Algorithm'])
df_ad_r = pd.DataFrame(methods, columns = ['Algorithm'])
df_ad_f = pd.DataFrame(methods, columns = ['Algorithm'])

In [28]:
for name, file in tqdm(files):
    #print('Fitting for '+name)
    df = pd.read_excel (file,index_col=0,na_values=['..'])
    
    #Creation of Year/Region/Country Dummies
    year_dummies = pd.get_dummies(df.year, prefix='year').iloc[:,1:]
    region_dummies = pd.get_dummies(df.region, prefix='region').iloc[:,1:]
    country_dummies = pd.get_dummies(df.ccode, prefix='country: ').iloc[:,1:]
    
    df = pd.concat([df, year_dummies, region_dummies, country_dummies], axis = 1)
    
    algo_a_mean = []
    algo_a_std = []
    algo_r_mean = []
    algo_r_std = []
    algo_f_mean = []
    algo_f_std = []

   
    #Create testing and training sets
    array = df.values

    #Create X array
    X= array[:,4:]

    # Standardizing the features
    X = StandardScaler().fit_transform(X)

    #Create Y array
    Y= array[:,3]
    Y=Y.astype('int')
    
    for algo, model in models:   
        accuracy = np.array([])
        recall = np.array([])
        f1 = np.array([])
        for fold, (train_index, test_index) in enumerate(kf.split(X, Y), 1):
            X_train = X[train_index]
            Y_train = Y[train_index]  
            X_test = X[test_index]
            Y_test = Y[test_index]  
            
            #train/test split
            sm = SMOTE()
            X_train_oversampled, Y_train_oversampled = sm.fit_sample(X_train, Y_train)
           
            #fit model
            model = model
            model.fit(X_train_oversampled, Y_train_oversampled)  
            Y_pred = model.predict(X_test)
            
            #generate reports
            acc = accuracy_score(Y_test, Y_pred)
            rec = recall_score(Y_test, Y_pred)
            f=  f1_score(Y_test, Y_pred)
            accuracy = np.append(accuracy, acc)
            recall = np.append(recall, rec)
            f1 = np.append(f1, f)
        a_mean = accuracy.mean().tolist()
        a_std = accuracy.std().tolist()
        r_mean = recall.mean().tolist()
        r_std = accuracy.std().tolist()
        f_mean = f1.mean().tolist()
        f_std = accuracy.std().tolist()
        algo_a_mean.append(a_mean)
        algo_a_std.append(a_std)
        algo_r_mean.append(r_mean)
        algo_r_std.append(r_std)
        algo_f_mean.append(f_mean)
        algo_f_std.append(f_std)
    df_ad_a[name+' mean'] = algo_a_mean
    df_ad_a[name+' std'] = algo_a_std
    df_ad_r[name+' mean'] = algo_r_mean
    df_ad_r[name+' std'] = algo_r_std
    df_ad_f[name+' mean'] = algo_f_mean
    df_ad_f[name+' std'] = algo_f_std

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [08:09<00:00, 97.97s/it]


## Alternative Specification w/ Y/R/C dummies and PCA

In [29]:
df_apd_a = pd.DataFrame(methods, columns = ['Algorithm'])
df_apd_r = pd.DataFrame(methods, columns = ['Algorithm'])
df_apd_f = pd.DataFrame(methods, columns = ['Algorithm'])

In [30]:
for name, file in tqdm(files):
    #print('Fitting for '+name)
    df = pd.read_excel (file,index_col=0,na_values=['..'])
    
    #Creation of Year/Region/Country Dummies
    year_dummies = pd.get_dummies(df.year, prefix='year').iloc[:,1:]
    region_dummies = pd.get_dummies(df.region, prefix='region').iloc[:,1:]
    country_dummies = pd.get_dummies(df.ccode, prefix='country: ').iloc[:,1:]
    
    df = pd.concat([df, year_dummies, region_dummies, country_dummies], axis = 1)
    
    algo_a_mean = []
    algo_a_std = []
    algo_r_mean = []
    algo_r_std = []
    algo_f_mean = []
    algo_f_std = []

   
    #Create testing and training sets
    array = df.values

    #Create X array
    X= array[:,4:]

    # Standardizing the features
    X = StandardScaler().fit_transform(X)

    #Create Y array
    Y= array[:,3]
    Y=Y.astype('int')

    pca = PCA(pca_target)
    principalComponents = pca.fit_transform(X)
    
    X = pca.fit_transform(X)
        
    for algo, model in models:   
        accuracy = np.array([])
        recall = np.array([])
        f1 = np.array([])
        for fold, (train_index, test_index) in enumerate(kf.split(X, Y), 1):
            X_train = X[train_index]
            Y_train = Y[train_index]  
            X_test = X[test_index]
            Y_test = Y[test_index]  
            
            #train/test split
            sm = SMOTE()
            X_train_oversampled, Y_train_oversampled = sm.fit_sample(X_train, Y_train)
           
            #fit model
            model = model
            model.fit(X_train_oversampled, Y_train_oversampled)  
            Y_pred = model.predict(X_test)
            
            #generate reports
            acc = accuracy_score(Y_test, Y_pred)
            rec = recall_score(Y_test, Y_pred)
            f=  f1_score(Y_test, Y_pred)
            accuracy = np.append(accuracy, acc)
            recall = np.append(recall, rec)
            f1 = np.append(f1, f)
        a_mean = accuracy.mean().tolist()
        a_std = accuracy.std().tolist()
        r_mean = recall.mean().tolist()
        r_std = accuracy.std().tolist()
        f_mean = f1.mean().tolist()
        f_std = accuracy.std().tolist()
        algo_a_mean.append(a_mean)
        algo_a_std.append(a_std)
        algo_r_mean.append(r_mean)
        algo_r_std.append(r_std)
        algo_f_mean.append(f_mean)
        algo_f_std.append(f_std)
    df_apd_a[name+' mean'] = algo_a_mean
    df_apd_a[name+' std'] = algo_a_std
    df_apd_r[name+' mean'] = algo_r_mean
    df_apd_r[name+' std'] = algo_r_std
    df_apd_f[name+' mean'] = algo_f_mean
    df_apd_f[name+' std'] = algo_f_std

100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [08:49<00:00, 105.88s/it]


# Assessment of Conflict Type on Alternative Specification

In [31]:
files = []

files.append(('State Based','../data/con_1.xlsx'))
files.append(('Non State','../data/con_2.xlsx'))
files.append(('One Sided','../data/con_3.xlsx'))

## Con Type Specification

In [32]:
df_c_a = pd.DataFrame(methods, columns = ['Algorithm'])
df_c_r = pd.DataFrame(methods, columns = ['Algorithm'])
df_c_f = pd.DataFrame(methods, columns = ['Algorithm'])

In [33]:
for name, file in tqdm(files):
    #print('Fitting for '+name)
    df = pd.read_excel (file,index_col=0,na_values=['..'])
    
    algo_a_mean = []
    algo_a_std = []
    algo_r_mean = []
    algo_r_std = []
    algo_f_mean = []
    algo_f_std = []

   
    #Create testing and training sets
    array = df.values

    #Create X array
    X= array[:,4:]

    # Standardizing the features
    X = StandardScaler().fit_transform(X)

    #Create Y array
    Y= array[:,3]
    Y=Y.astype('int')
      
    for algo, model in models:   
        accuracy = np.array([])
        recall = np.array([])
        f1 = np.array([])
        for fold, (train_index, test_index) in enumerate(kf.split(X, Y), 1):
            X_train = X[train_index]
            Y_train = Y[train_index]  
            X_test = X[test_index]
            Y_test = Y[test_index]  
            
            #train/test split
            sm = SMOTE()
            X_train_oversampled, Y_train_oversampled = sm.fit_sample(X_train, Y_train)
           
            #fit model
            model = model
            model.fit(X_train_oversampled, Y_train_oversampled)  
            Y_pred = model.predict(X_test)
            
            #generate reports
            acc = accuracy_score(Y_test, Y_pred)
            rec = recall_score(Y_test, Y_pred)
            f=  f1_score(Y_test, Y_pred)
            accuracy = np.append(accuracy, acc)
            recall = np.append(recall, rec)
            f1 = np.append(f1, f)
        a_mean = accuracy.mean().tolist()
        a_std = accuracy.std().tolist()
        r_mean = recall.mean().tolist()
        r_std = accuracy.std().tolist()
        f_mean = f1.mean().tolist()
        f_std = accuracy.std().tolist()
        algo_a_mean.append(a_mean)
        algo_a_std.append(a_std)
        algo_r_mean.append(r_mean)
        algo_r_std.append(r_std)
        algo_f_mean.append(f_mean)
        algo_f_std.append(f_std)
    df_c_a[name+' mean'] = algo_a_mean
    df_c_a[name+' std'] = algo_a_std
    df_c_r[name+' mean'] = algo_r_mean
    df_c_r[name+' std'] = algo_r_std
    df_c_f[name+' mean'] = algo_f_mean
    df_c_f[name+' std'] = algo_f_std

100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [01:57<00:00, 39.26s/it]


## Con Type Specification w/  PCA

In [34]:
df_cp_a = pd.DataFrame(methods, columns = ['Algorithm'])
df_cp_r = pd.DataFrame(methods, columns = ['Algorithm'])
df_cp_f = pd.DataFrame(methods, columns = ['Algorithm'])

In [35]:
for name, file in tqdm(files):
    #print('Fitting for '+name)
    df = pd.read_excel (file,index_col=0,na_values=['..'])

    algo_a_mean = []
    algo_a_std = []
    algo_r_mean = []
    algo_r_std = []
    algo_f_mean = []
    algo_f_std = []

   
    #Create testing and training sets
    array = df.values

    #Create X array
    X= array[:,4:]

    # Standardizing the features
    X = StandardScaler().fit_transform(X)

    #Create Y array
    Y= array[:,3]
    Y=Y.astype('int')

    pca = PCA(pca_target)
    principalComponents = pca.fit_transform(X)
    
    X = pca.fit_transform(X)
        
    for algo, model in models:   
        accuracy = np.array([])
        recall = np.array([])
        f1 = np.array([])
        for fold, (train_index, test_index) in enumerate(kf.split(X, Y), 1):
            X_train = X[train_index]
            Y_train = Y[train_index]  
            X_test = X[test_index]
            Y_test = Y[test_index]  
            
            #train/test split
            sm = SMOTE()
            X_train_oversampled, Y_train_oversampled = sm.fit_sample(X_train, Y_train)
           
            #fit model
            model = model
            model.fit(X_train_oversampled, Y_train_oversampled)  
            Y_pred = model.predict(X_test)
            
            #generate reports
            acc = accuracy_score(Y_test, Y_pred)
            rec = recall_score(Y_test, Y_pred)
            f=  f1_score(Y_test, Y_pred)
            accuracy = np.append(accuracy, acc)
            recall = np.append(recall, rec)
            f1 = np.append(f1, f)
        a_mean = accuracy.mean().tolist()
        a_std = accuracy.std().tolist()
        r_mean = recall.mean().tolist()
        r_std = accuracy.std().tolist()
        f_mean = f1.mean().tolist()
        f_std = accuracy.std().tolist()
        algo_a_mean.append(a_mean)
        algo_a_std.append(a_std)
        algo_r_mean.append(r_mean)
        algo_r_std.append(r_std)
        algo_f_mean.append(f_mean)
        algo_f_std.append(f_std)
    df_cp_a[name+' mean'] = algo_a_mean
    df_cp_a[name+' std'] = algo_a_std
    df_cp_r[name+' mean'] = algo_r_mean
    df_cp_r[name+' std'] = algo_r_std
    df_cp_f[name+' mean'] = algo_f_mean
    df_cp_f[name+' std'] = algo_f_std

100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [01:32<00:00, 30.89s/it]


## Con Type Specification w/ Y/R/C

In [36]:
df_cd_a = pd.DataFrame(methods, columns = ['Algorithm'])
df_cd_r = pd.DataFrame(methods, columns = ['Algorithm'])
df_cd_f = pd.DataFrame(methods, columns = ['Algorithm'])

In [37]:
for name, file in tqdm(files):
    #print('Fitting for '+name)
    df = pd.read_excel (file,index_col=0,na_values=['..'])
    
    #Creation of Year/Region/Country Dummies
    year_dummies = pd.get_dummies(df.year, prefix='year').iloc[:,1:]
    region_dummies = pd.get_dummies(df.region, prefix='region').iloc[:,1:]
    country_dummies = pd.get_dummies(df.ccode, prefix='country: ').iloc[:,1:]
    
    df = pd.concat([df, year_dummies, region_dummies, country_dummies], axis = 1)
    
    algo_a_mean = []
    algo_a_std = []
    algo_r_mean = []
    algo_r_std = []
    algo_f_mean = []
    algo_f_std = []

   
    #Create testing and training sets
    array = df.values

    #Create X array
    X= array[:,4:]

    # Standardizing the features
    X = StandardScaler().fit_transform(X)

    #Create Y array
    Y= array[:,3]
    Y=Y.astype('int')
        
    for algo, model in models:   
        accuracy = np.array([])
        recall = np.array([])
        f1 = np.array([])
        for fold, (train_index, test_index) in enumerate(kf.split(X, Y), 1):
            X_train = X[train_index]
            Y_train = Y[train_index]  
            X_test = X[test_index]
            Y_test = Y[test_index]  
            
            #train/test split
            sm = SMOTE()
            X_train_oversampled, Y_train_oversampled = sm.fit_sample(X_train, Y_train)
           
            #fit model
            model = model
            model.fit(X_train_oversampled, Y_train_oversampled)  
            Y_pred = model.predict(X_test)
            
            #generate reports
            acc = accuracy_score(Y_test, Y_pred)
            rec = recall_score(Y_test, Y_pred)
            f=  f1_score(Y_test, Y_pred)
            accuracy = np.append(accuracy, acc)
            recall = np.append(recall, rec)
            f1 = np.append(f1, f)
        a_mean = accuracy.mean().tolist()
        a_std = accuracy.std().tolist()
        r_mean = recall.mean().tolist()
        r_std = accuracy.std().tolist()
        f_mean = f1.mean().tolist()
        f_std = accuracy.std().tolist()
        algo_a_mean.append(a_mean)
        algo_a_std.append(a_std)
        algo_r_mean.append(r_mean)
        algo_r_std.append(r_std)
        algo_f_mean.append(f_mean)
        algo_f_std.append(f_std)
    df_cd_a[name+' mean'] = algo_a_mean
    df_cd_a[name+' std'] = algo_a_std
    df_cd_r[name+' mean'] = algo_r_mean
    df_cd_r[name+' std'] = algo_r_std
    df_cd_f[name+' mean'] = algo_f_mean
    df_cd_f[name+' std'] = algo_f_std

100%|███████████████████████████████████████████████████████████████████████████████████| 3/3 [05:41<00:00, 113.77s/it]


## Con Type Specification w/ Y/R/C dummies and PCA

In [38]:
df_cpd_a = pd.DataFrame(methods, columns = ['Algorithm'])
df_cpd_r = pd.DataFrame(methods, columns = ['Algorithm'])
df_cpd_f = pd.DataFrame(methods, columns = ['Algorithm'])

In [39]:
for name, file in tqdm(files):
    #print('Fitting for '+name)
    df = pd.read_excel (file,index_col=0,na_values=['..'])
    
    #Creation of Year/Region/Country Dummies
    year_dummies = pd.get_dummies(df.year, prefix='year').iloc[:,1:]
    region_dummies = pd.get_dummies(df.region, prefix='region').iloc[:,1:]
    country_dummies = pd.get_dummies(df.ccode, prefix='country: ').iloc[:,1:]
    
    df = pd.concat([df, year_dummies, region_dummies, country_dummies], axis = 1)
    
    algo_a_mean = []
    algo_a_std = []
    algo_r_mean = []
    algo_r_std = []
    algo_f_mean = []
    algo_f_std = []

   
    #Create testing and training sets
    array = df.values

    #Create X array
    X= array[:,4:]

    # Standardizing the features
    X = StandardScaler().fit_transform(X)

    #Create Y array
    Y= array[:,3]
    Y=Y.astype('int')

    pca = PCA(pca_target)
    principalComponents = pca.fit_transform(X)
    
    X = pca.fit_transform(X)
        
    for algo, model in models:   
        accuracy = np.array([])
        recall = np.array([])
        f1 = np.array([])
        for fold, (train_index, test_index) in enumerate(kf.split(X, Y), 1):
            X_train = X[train_index]
            Y_train = Y[train_index]  
            X_test = X[test_index]
            Y_test = Y[test_index]  
            
            #train/test split
            sm = SMOTE()
            X_train_oversampled, Y_train_oversampled = sm.fit_sample(X_train, Y_train)
           
            #fit model
            model = model
            model.fit(X_train_oversampled, Y_train_oversampled)  
            Y_pred = model.predict(X_test)
            
            #generate reports
            acc = accuracy_score(Y_test, Y_pred)
            rec = recall_score(Y_test, Y_pred)
            f=  f1_score(Y_test, Y_pred)
            accuracy = np.append(accuracy, acc)
            recall = np.append(recall, rec)
            f1 = np.append(f1, f)
        a_mean = accuracy.mean().tolist()
        a_std = accuracy.std().tolist()
        r_mean = recall.mean().tolist()
        r_std = accuracy.std().tolist()
        f_mean = f1.mean().tolist()
        f_std = accuracy.std().tolist()
        algo_a_mean.append(a_mean)
        algo_a_std.append(a_std)
        algo_r_mean.append(r_mean)
        algo_r_std.append(r_std)
        algo_f_mean.append(f_mean)
        algo_f_std.append(f_std)
    df_cpd_a[name+' mean'] = algo_a_mean
    df_cpd_a[name+' std'] = algo_a_std
    df_cpd_r[name+' mean'] = algo_r_mean
    df_cpd_r[name+' std'] = algo_r_std
    df_cpd_f[name+' mean'] = algo_f_mean
    df_cpd_f[name+' std'] = algo_f_std

100%|███████████████████████████████████████████████████████████████████████████████████| 3/3 [06:25<00:00, 128.41s/it]


In [41]:
#save to excel file

df_b_a.to_excel('../data/b_acv.xlsx')
df_b_r.to_excel('../data/b_rcv.xlsx')
df_b_f.to_excel('../data/b_fcv.xlsx')

df_a_a.to_excel('../data/a_acv.xlsx')
df_a_r.to_excel('../data/a_rcv.xlsx')
df_a_f.to_excel('../data/a_fcv.xlsx')

df_c_a.to_excel('../data/c_acv.xlsx')
df_c_r.to_excel('../data/c_rcv.xlsx')
df_c_f.to_excel('../data/c_fcv.xlsx')

df_bp_a.to_excel('../data/bp_acv.xlsx')
df_bp_r.to_excel('../data/bp_rcv.xlsx')
df_bp_f.to_excel('../data/bp_fcv.xlsx')

df_ap_a.to_excel('../data/ap_acv.xlsx')
df_ap_r.to_excel('../data/ap_rcv.xlsx')
df_ap_f.to_excel('../data/ap_fcv.xlsx')

df_cp_a.to_excel('../data/cp_acv.xlsx')
df_cp_r.to_excel('../data/cp_rcv.xlsx')
df_cp_f.to_excel('../data/cp_fcv.xlsx')

df_bd_a.to_excel('../data/bd_acv.xlsx')
df_bd_r.to_excel('../data/bd_rcv.xlsx')
df_bd_f.to_excel('../data/bd_fcv.xlsx')

df_ad_a.to_excel('../data/ad_acv.xlsx')
df_ad_r.to_excel('../data/ad_rcv.xlsx')
df_ad_f.to_excel('../data/ad_fcv.xlsx')

df_cd_a.to_excel('../data/cd_acv.xlsx')
df_cd_r.to_excel('../data/cd_rcv.xlsx')
df_cd_f.to_excel('../data/cd_fcv.xlsx')

df_bpd_a.to_excel('../data/bpd_acv.xlsx')
df_bpd_r.to_excel('../data/bpd_rcv.xlsx')
df_bpd_f.to_excel('../data/bpd_fcv.xlsx')

df_apd_a.to_excel('../data/apd_acv.xlsx')
df_apd_r.to_excel('../data/apd_rcv.xlsx')
df_apd_f.to_excel('../data/apd_fcv.xlsx')

df_cpd_a.to_excel('../data/cpd_acv.xlsx')
df_cpd_r.to_excel('../data/cpd_rcv.xlsx')
df_cpd_f.to_excel('../data/cpd_fcv.xlsx')