In [1]:
#Ignore 'Future Warnings'
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# Set Base for Algorithms

In [2]:
#Import Necessary Packages
from tqdm import tqdm
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
import sklearn
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import plot_confusion_matrix
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold

In [3]:
#Set Psudeorandom Seed
seed = 42

In [4]:
splits = 5
kf = StratifiedKFold(n_splits=splits, random_state=seed, shuffle=True)

In [5]:
pca_target = .8

In [13]:
#Models to be used
models = []
models.append(('LR', LogisticRegression(solver='liblinear',multi_class='ovr')))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('RTREE', RandomForestClassifier(n_estimators=500, max_depth=2, random_state=seed)))
models.append(('XTREE', xgb.XGBRFClassifier(random_state=seed)))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC(random_state=seed)))

methods = []
for name, model in models:
    methods.append(name)

# Assessment of Base Specification

In [14]:
# Load dataset
files = []

files.append(('One Year Base','1y_offset.xlsx'))
files.append(('Two Year Base','2y_offset.xlsx'))
files.append(('Three Year Base','3y_offset.xlsx'))
files.append(('Five Year Base','5y_offset.xlsx'))
files.append(('Ten Year Base','10y_offset.xlsx'))

## Base Specification

In [15]:
df_b_a = pd.DataFrame(methods, columns = ['Algorithm'])
df_b_r = pd.DataFrame(methods, columns = ['Algorithm'])
df_b_f = pd.DataFrame(methods, columns = ['Algorithm'])

In [16]:
for name, file in tqdm(files):
    #print('Fitting for '+name)
    df = pd.read_excel (file,index_col=0,na_values=['..'])
    
    algo_a_mean = []
    algo_a_std = []
    algo_r_mean = []
    algo_r_std = []
    algo_f_mean = []
    algo_f_std = []

   
    #Create testing and training sets
    array = df.values

    #Create X array
    X= array[:,4:]

    # Standardizing the features
    X = StandardScaler().fit_transform(X)

    #Create Y array
    Y= array[:,3]
    Y=Y.astype('int')
   
    for algo, model in models:   
        accuracy = np.array([])
        recall = np.array([])
        f1 = np.array([])
        for fold, (train_index, test_index) in enumerate(kf.split(X, Y), 1):
            X_train = X[train_index]
            Y_train = Y[train_index]  
            X_test = X[test_index]
            Y_test = Y[test_index]  
            
            #train/test split
            sm = SMOTE()
            X_train_oversampled, Y_train_oversampled = sm.fit_sample(X_train, Y_train)
           
            #fit model
            model = model
            model.fit(X_train_oversampled, Y_train_oversampled)  
            Y_pred = model.predict(X_test)
            
            #generate reports
            acc = accuracy_score(Y_test, Y_pred)
            rec = recall_score(Y_test, Y_pred)
            f=  f1_score(Y_test, Y_pred)
            accuracy = np.append(accuracy, acc)
            recall = np.append(recall, rec)
            f1 = np.append(f1, f)
        a_mean = accuracy.mean().tolist()
        a_std = accuracy.std().tolist()
        r_mean = recall.mean().tolist()
        r_std = accuracy.std().tolist()
        f_mean = f1.mean().tolist()
        f_std = accuracy.std().tolist()
        algo_a_mean.append(a_mean)
        algo_a_std.append(a_std)
        algo_r_mean.append(r_mean)
        algo_r_std.append(r_std)
        algo_f_mean.append(f_mean)
        algo_f_std.append(f_std)
    df_b_a[name+' mean'] = algo_a_mean
    df_b_a[name+' std'] = algo_a_std
    df_b_r[name+' mean'] = algo_r_mean
    df_b_r[name+' std'] = algo_r_std
    df_b_f[name+' mean'] = algo_f_mean
    df_b_f[name+' std'] = algo_f_std


  0%|                                                                                            | 0/1 [00:00<?, ?it/s][A
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:37<00:00, 37.70s/it][A


In [19]:
df_b_f

Unnamed: 0,Algorithm,One Year Base mean,One Year Base std
0,RTREE,0.817751,0.007772
1,RTREE,0.830799,0.00358


## Base Specification w/ PCA

In [10]:
df_bp_a = pd.DataFrame(methods, columns = ['Algorithm'])
df_bp_r = pd.DataFrame(methods, columns = ['Algorithm'])
df_bp_f = pd.DataFrame(methods, columns = ['Algorithm'])

In [11]:
for name, file in tqdm(files):
    #print('Fitting for '+name)
    df = pd.read_excel (file,index_col=0,na_values=['..'])

    algo_a_mean = []
    algo_a_std = []
    algo_r_mean = []
    algo_r_std = []
    algo_f_mean = []
    algo_f_std = []

   
    #Create testing and training sets
    array = df.values

    #Create X array
    X= array[:,4:]

    # Standardizing the features
    X = StandardScaler().fit_transform(X)

    #Create Y array
    Y= array[:,3]
    Y=Y.astype('int')

    pca = PCA(pca_target)
    principalComponents = pca.fit_transform(X)
    
    X = pca.fit_transform(X)
     
    for algo, model in models:   
        accuracy = np.array([])
        recall = np.array([])
        f1 = np.array([])
        for fold, (train_index, test_index) in enumerate(kf.split(X, Y), 1):
            X_train = X[train_index]
            Y_train = Y[train_index]  
            X_test = X[test_index]
            Y_test = Y[test_index]  
            
            #train/test split
            sm = SMOTE()
            X_train_oversampled, Y_train_oversampled = sm.fit_sample(X_train, Y_train)
           
            #fit model
            model = model
            model.fit(X_train_oversampled, Y_train_oversampled)  
            Y_pred = model.predict(X_test)
            
            #generate reports
            acc = accuracy_score(Y_test, Y_pred)
            rec = recall_score(Y_test, Y_pred)
            f=  f1_score(Y_test, Y_pred)
            accuracy = np.append(accuracy, acc)
            recall = np.append(recall, rec)
            f1 = np.append(f1, f)
        a_mean = accuracy.mean().tolist()
        a_std = accuracy.std().tolist()
        r_mean = recall.mean().tolist()
        r_std = accuracy.std().tolist()
        f_mean = f1.mean().tolist()
        f_std = accuracy.std().tolist()
        algo_a_mean.append(a_mean)
        algo_a_std.append(a_std)
        algo_r_mean.append(r_mean)
        algo_r_std.append(r_std)
        algo_f_mean.append(f_mean)
        algo_f_std.append(f_std)
    df_bp_a[name+' mean'] = algo_a_mean
    df_bp_a[name+' std'] = algo_a_std
    df_bp_r[name+' mean'] = algo_r_mean
    df_bp_r[name+' std'] = algo_r_std
    df_bp_f[name+' mean'] = algo_f_mean
    df_bp_f[name+' std'] = algo_f_std

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [02:03<00:00, 24.77s/it]


## Base Specification w/ Y/R/C dummies

In [12]:
df_bd_a = pd.DataFrame(methods, columns = ['Algorithm'])
df_bd_r = pd.DataFrame(methods, columns = ['Algorithm'])
df_bd_f = pd.DataFrame(methods, columns = ['Algorithm'])

In [13]:
for name, file in tqdm(files):
    #print('Fitting for '+name)
    df = pd.read_excel (file,index_col=0,na_values=['..'])
    
    #Creation of Year/Region/Country Dummies
    year_dummies = pd.get_dummies(df.year, prefix='year').iloc[:,1:]
    region_dummies = pd.get_dummies(df.region, prefix='region').iloc[:,1:]
    country_dummies = pd.get_dummies(df.ccode, prefix='country: ').iloc[:,1:]
    
    df = pd.concat([df, year_dummies, region_dummies, country_dummies], axis = 1)
    
    algo_a_mean = []
    algo_a_std = []
    algo_r_mean = []
    algo_r_std = []
    algo_f_mean = []
    algo_f_std = []

   
    #Create testing and training sets
    array = df.values

    #Create X array
    X= array[:,4:]

    # Standardizing the features
    X = StandardScaler().fit_transform(X)

    #Create Y array
    Y= array[:,3]
    Y=Y.astype('int')
       
    for algo, model in models:   
        accuracy = np.array([])
        recall = np.array([])
        f1 = np.array([])
        for fold, (train_index, test_index) in enumerate(kf.split(X, Y), 1):
            X_train = X[train_index]
            Y_train = Y[train_index]  
            X_test = X[test_index]
            Y_test = Y[test_index]  
            
            #train/test split
            sm = SMOTE()
            X_train_oversampled, Y_train_oversampled = sm.fit_sample(X_train, Y_train)
           
            #fit model
            model = model
            model.fit(X_train_oversampled, Y_train_oversampled)  
            Y_pred = model.predict(X_test)
            
            #generate reports
            acc = accuracy_score(Y_test, Y_pred)
            rec = recall_score(Y_test, Y_pred)
            f=  f1_score(Y_test, Y_pred)
            accuracy = np.append(accuracy, acc)
            recall = np.append(recall, rec)
            f1 = np.append(f1, f)
        a_mean = accuracy.mean().tolist()
        a_std = accuracy.std().tolist()
        r_mean = recall.mean().tolist()
        r_std = accuracy.std().tolist()
        f_mean = f1.mean().tolist()
        f_std = accuracy.std().tolist()
        algo_a_mean.append(a_mean)
        algo_a_std.append(a_std)
        algo_r_mean.append(r_mean)
        algo_r_std.append(r_std)
        algo_f_mean.append(f_mean)
        algo_f_std.append(f_std)
    df_bd_a[name+' mean'] = algo_a_mean
    df_bd_a[name+' std'] = algo_a_std
    df_bd_r[name+' mean'] = algo_r_mean
    df_bd_r[name+' std'] = algo_r_std
    df_bd_f[name+' mean'] = algo_f_mean
    df_bd_f[name+' std'] = algo_f_std

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [08:09<00:00, 97.99s/it]


## Base Specification w/ PCA and Y/R/C dummies

In [14]:
df_bpd_a = pd.DataFrame(methods, columns = ['Algorithm'])
df_bpd_r = pd.DataFrame(methods, columns = ['Algorithm'])
df_bpd_f = pd.DataFrame(methods, columns = ['Algorithm'])

In [15]:
for name, file in tqdm(files):
    #print('Fitting for '+name)
    df = pd.read_excel (file,index_col=0,na_values=['..'])
    
    #Creation of Year/Region/Country Dummies
    year_dummies = pd.get_dummies(df.year, prefix='year').iloc[:,1:]
    region_dummies = pd.get_dummies(df.region, prefix='region').iloc[:,1:]
    country_dummies = pd.get_dummies(df.ccode, prefix='country: ').iloc[:,1:]
    
    df = pd.concat([df, year_dummies, region_dummies, country_dummies], axis = 1)
    
    algo_a_mean = []
    algo_a_std = []
    algo_r_mean = []
    algo_r_std = []
    algo_f_mean = []
    algo_f_std = []

   
    #Create testing and training sets
    array = df.values

    #Create X array
    X= array[:,4:]

    # Standardizing the features
    X = StandardScaler().fit_transform(X)

    #Create Y array
    Y= array[:,3]
    Y=Y.astype('int')

    pca = PCA(pca_target)
    principalComponents = pca.fit_transform(X)
    
    X = pca.fit_transform(X)
        
    for algo, model in models:   
        accuracy = np.array([])
        recall = np.array([])
        f1 = np.array([])
        for fold, (train_index, test_index) in enumerate(kf.split(X, Y), 1):
            X_train = X[train_index]
            Y_train = Y[train_index]  
            X_test = X[test_index]
            Y_test = Y[test_index]  
            
            #train/test split
            sm = SMOTE()
            X_train_oversampled, Y_train_oversampled = sm.fit_sample(X_train, Y_train)
           
            #fit model
            model = model
            model.fit(X_train_oversampled, Y_train_oversampled)  
            Y_pred = model.predict(X_test)
            
            #generate reports
            acc = accuracy_score(Y_test, Y_pred)
            rec = recall_score(Y_test, Y_pred)
            f=  f1_score(Y_test, Y_pred)
            accuracy = np.append(accuracy, acc)
            recall = np.append(recall, rec)
            f1 = np.append(f1, f)
        a_mean = accuracy.mean().tolist()
        a_std = accuracy.std().tolist()
        r_mean = recall.mean().tolist()
        r_std = accuracy.std().tolist()
        f_mean = f1.mean().tolist()
        f_std = accuracy.std().tolist()
        algo_a_mean.append(a_mean)
        algo_a_std.append(a_std)
        algo_r_mean.append(r_mean)
        algo_r_std.append(r_std)
        algo_f_mean.append(f_mean)
        algo_f_std.append(f_std)
    df_bpd_a[name+' mean'] = algo_a_mean
    df_bpd_a[name+' std'] = algo_a_std
    df_bpd_r[name+' mean'] = algo_r_mean
    df_bpd_r[name+' std'] = algo_r_std
    df_bpd_f[name+' mean'] = algo_f_mean
    df_bpd_f[name+' std'] = algo_f_std

100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [08:51<00:00, 106.22s/it]


# Assessment of Alternative Specification

In [16]:
# Load dataset
files = []

files.append(('One Year Alt','1a_offset.xlsx'))
files.append(('Two Year Alt','2a_offset.xlsx'))
files.append(('Three Year Alt','3a_offset.xlsx'))
files.append(('Five Year Alt','5a_offset.xlsx'))
files.append(('Ten Year Alt','10a_offset.xlsx'))

## Alternative Specification

In [17]:
df_a_a = pd.DataFrame(methods, columns = ['Algorithm'])
df_a_r = pd.DataFrame(methods, columns = ['Algorithm'])
df_a_f = pd.DataFrame(methods, columns = ['Algorithm'])

In [18]:
for name, file in tqdm(files):
    #print('Fitting for '+name)
    df = pd.read_excel (file,index_col=0,na_values=['..'])
 
    algo_a_mean = []
    algo_a_std = []
    algo_r_mean = []
    algo_r_std = []
    algo_f_mean = []
    algo_f_std = []

   
    #Create testing and training sets
    array = df.values

    #Create X array
    X= array[:,4:]

    # Standardizing the features
    X = StandardScaler().fit_transform(X)

    #Create Y array
    Y= array[:,3]
    Y=Y.astype('int')
        
    for algo, model in models:   
        accuracy = np.array([])
        recall = np.array([])
        f1 = np.array([])
        for fold, (train_index, test_index) in enumerate(kf.split(X, Y), 1):
            X_train = X[train_index]
            Y_train = Y[train_index]  
            X_test = X[test_index]
            Y_test = Y[test_index]  
            
            #train/test split
            sm = SMOTE()
            X_train_oversampled, Y_train_oversampled = sm.fit_sample(X_train, Y_train)
           
            #fit model
            model = model
            model.fit(X_train_oversampled, Y_train_oversampled)  
            Y_pred = model.predict(X_test)
            
            #generate reports
            acc = accuracy_score(Y_test, Y_pred)
            rec = recall_score(Y_test, Y_pred)
            f=  f1_score(Y_test, Y_pred)
            accuracy = np.append(accuracy, acc)
            recall = np.append(recall, rec)
            f1 = np.append(f1, f)
        a_mean = accuracy.mean().tolist()
        a_std = accuracy.std().tolist()
        r_mean = recall.mean().tolist()
        r_std = accuracy.std().tolist()
        f_mean = f1.mean().tolist()
        f_std = accuracy.std().tolist()
        algo_a_mean.append(a_mean)
        algo_a_std.append(a_std)
        algo_r_mean.append(r_mean)
        algo_r_std.append(r_std)
        algo_f_mean.append(f_mean)
        algo_f_std.append(f_std)
    df_a_a[name+' mean'] = algo_a_mean
    df_a_a[name+' std'] = algo_a_std
    df_a_r[name+' mean'] = algo_r_mean
    df_a_r[name+' std'] = algo_r_std
    df_a_f[name+' mean'] = algo_f_mean
    df_a_f[name+' std'] = algo_f_std

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [02:51<00:00, 34.28s/it]


## Alternative Specification w/ PCA

In [19]:
df_ap_a = pd.DataFrame(methods, columns = ['Algorithm'])
df_ap_r = pd.DataFrame(methods, columns = ['Algorithm'])
df_ap_f = pd.DataFrame(methods, columns = ['Algorithm'])

In [20]:
for name, file in tqdm(files):
    #print('Fitting for '+name)
    df = pd.read_excel (file,index_col=0,na_values=['..'])

    algo_a_mean = []
    algo_a_std = []
    algo_r_mean = []
    algo_r_std = []
    algo_f_mean = []
    algo_f_std = []

   
    #Create testing and training sets
    array = df.values

    #Create X array
    X= array[:,4:]

    # Standardizing the features
    X = StandardScaler().fit_transform(X)

    #Create Y array
    Y= array[:,3]
    Y=Y.astype('int')

    pca = PCA(pca_target)
    principalComponents = pca.fit_transform(X)
    
    X = pca.fit_transform(X)
        
    for algo, model in models:   
        accuracy = np.array([])
        recall = np.array([])
        f1 = np.array([])
        for fold, (train_index, test_index) in enumerate(kf.split(X, Y), 1):
            X_train = X[train_index]
            Y_train = Y[train_index]  
            X_test = X[test_index]
            Y_test = Y[test_index]  
            
            #train/test split
            sm = SMOTE()
            X_train_oversampled, Y_train_oversampled = sm.fit_sample(X_train, Y_train)
           
            #fit model
            model = model
            model.fit(X_train_oversampled, Y_train_oversampled)  
            Y_pred = model.predict(X_test)
            
            #generate reports
            acc = accuracy_score(Y_test, Y_pred)
            rec = recall_score(Y_test, Y_pred)
            f=  f1_score(Y_test, Y_pred)
            accuracy = np.append(accuracy, acc)
            recall = np.append(recall, rec)
            f1 = np.append(f1, f)
        a_mean = accuracy.mean().tolist()
        a_std = accuracy.std().tolist()
        r_mean = recall.mean().tolist()
        r_std = accuracy.std().tolist()
        f_mean = f1.mean().tolist()
        f_std = accuracy.std().tolist()
        algo_a_mean.append(a_mean)
        algo_a_std.append(a_std)
        algo_r_mean.append(r_mean)
        algo_r_std.append(r_std)
        algo_f_mean.append(f_mean)
        algo_f_std.append(f_std)
    df_ap_a[name+' mean'] = algo_a_mean
    df_ap_a[name+' std'] = algo_a_std
    df_ap_r[name+' mean'] = algo_r_mean
    df_ap_r[name+' std'] = algo_r_std
    df_ap_f[name+' mean'] = algo_f_mean
    df_ap_f[name+' std'] = algo_f_std

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [02:12<00:00, 26.46s/it]


## Alternative Specification w/ Y/R/C dummies

In [21]:
df_ad_a = pd.DataFrame(methods, columns = ['Algorithm'])
df_ad_r = pd.DataFrame(methods, columns = ['Algorithm'])
df_ad_f = pd.DataFrame(methods, columns = ['Algorithm'])

In [22]:
for name, file in tqdm(files):
    #print('Fitting for '+name)
    df = pd.read_excel (file,index_col=0,na_values=['..'])
    
    #Creation of Year/Region/Country Dummies
    year_dummies = pd.get_dummies(df.year, prefix='year').iloc[:,1:]
    region_dummies = pd.get_dummies(df.region, prefix='region').iloc[:,1:]
    country_dummies = pd.get_dummies(df.ccode, prefix='country: ').iloc[:,1:]
    
    df = pd.concat([df, year_dummies, region_dummies, country_dummies], axis = 1)
    
    algo_a_mean = []
    algo_a_std = []
    algo_r_mean = []
    algo_r_std = []
    algo_f_mean = []
    algo_f_std = []

   
    #Create testing and training sets
    array = df.values

    #Create X array
    X= array[:,4:]

    # Standardizing the features
    X = StandardScaler().fit_transform(X)

    #Create Y array
    Y= array[:,3]
    Y=Y.astype('int')
    
    for algo, model in models:   
        accuracy = np.array([])
        recall = np.array([])
        f1 = np.array([])
        for fold, (train_index, test_index) in enumerate(kf.split(X, Y), 1):
            X_train = X[train_index]
            Y_train = Y[train_index]  
            X_test = X[test_index]
            Y_test = Y[test_index]  
            
            #train/test split
            sm = SMOTE()
            X_train_oversampled, Y_train_oversampled = sm.fit_sample(X_train, Y_train)
           
            #fit model
            model = model
            model.fit(X_train_oversampled, Y_train_oversampled)  
            Y_pred = model.predict(X_test)
            
            #generate reports
            acc = accuracy_score(Y_test, Y_pred)
            rec = recall_score(Y_test, Y_pred)
            f=  f1_score(Y_test, Y_pred)
            accuracy = np.append(accuracy, acc)
            recall = np.append(recall, rec)
            f1 = np.append(f1, f)
        a_mean = accuracy.mean().tolist()
        a_std = accuracy.std().tolist()
        r_mean = recall.mean().tolist()
        r_std = accuracy.std().tolist()
        f_mean = f1.mean().tolist()
        f_std = accuracy.std().tolist()
        algo_a_mean.append(a_mean)
        algo_a_std.append(a_std)
        algo_r_mean.append(r_mean)
        algo_r_std.append(r_std)
        algo_f_mean.append(f_mean)
        algo_f_std.append(f_std)
    df_ad_a[name+' mean'] = algo_a_mean
    df_ad_a[name+' std'] = algo_a_std
    df_ad_r[name+' mean'] = algo_r_mean
    df_ad_r[name+' std'] = algo_r_std
    df_ad_f[name+' mean'] = algo_f_mean
    df_ad_f[name+' std'] = algo_f_std

100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [08:31<00:00, 102.37s/it]


## Alternative Specification w/ Y/R/C dummies and PCA

In [23]:
df_apd_a = pd.DataFrame(methods, columns = ['Algorithm'])
df_apd_r = pd.DataFrame(methods, columns = ['Algorithm'])
df_apd_f = pd.DataFrame(methods, columns = ['Algorithm'])

In [24]:
for name, file in tqdm(files):
    #print('Fitting for '+name)
    df = pd.read_excel (file,index_col=0,na_values=['..'])
    
    #Creation of Year/Region/Country Dummies
    year_dummies = pd.get_dummies(df.year, prefix='year').iloc[:,1:]
    region_dummies = pd.get_dummies(df.region, prefix='region').iloc[:,1:]
    country_dummies = pd.get_dummies(df.ccode, prefix='country: ').iloc[:,1:]
    
    df = pd.concat([df, year_dummies, region_dummies, country_dummies], axis = 1)
    
    algo_a_mean = []
    algo_a_std = []
    algo_r_mean = []
    algo_r_std = []
    algo_f_mean = []
    algo_f_std = []

   
    #Create testing and training sets
    array = df.values

    #Create X array
    X= array[:,4:]

    # Standardizing the features
    X = StandardScaler().fit_transform(X)

    #Create Y array
    Y= array[:,3]
    Y=Y.astype('int')

    pca = PCA(pca_target)
    principalComponents = pca.fit_transform(X)
    
    X = pca.fit_transform(X)
        
    for algo, model in models:   
        accuracy = np.array([])
        recall = np.array([])
        f1 = np.array([])
        for fold, (train_index, test_index) in enumerate(kf.split(X, Y), 1):
            X_train = X[train_index]
            Y_train = Y[train_index]  
            X_test = X[test_index]
            Y_test = Y[test_index]  
            
            #train/test split
            sm = SMOTE()
            X_train_oversampled, Y_train_oversampled = sm.fit_sample(X_train, Y_train)
           
            #fit model
            model = model
            model.fit(X_train_oversampled, Y_train_oversampled)  
            Y_pred = model.predict(X_test)
            
            #generate reports
            acc = accuracy_score(Y_test, Y_pred)
            rec = recall_score(Y_test, Y_pred)
            f=  f1_score(Y_test, Y_pred)
            accuracy = np.append(accuracy, acc)
            recall = np.append(recall, rec)
            f1 = np.append(f1, f)
        a_mean = accuracy.mean().tolist()
        a_std = accuracy.std().tolist()
        r_mean = recall.mean().tolist()
        r_std = accuracy.std().tolist()
        f_mean = f1.mean().tolist()
        f_std = accuracy.std().tolist()
        algo_a_mean.append(a_mean)
        algo_a_std.append(a_std)
        algo_r_mean.append(r_mean)
        algo_r_std.append(r_std)
        algo_f_mean.append(f_mean)
        algo_f_std.append(f_std)
    df_apd_a[name+' mean'] = algo_a_mean
    df_apd_a[name+' std'] = algo_a_std
    df_apd_r[name+' mean'] = algo_r_mean
    df_apd_r[name+' std'] = algo_r_std
    df_apd_f[name+' mean'] = algo_f_mean
    df_apd_f[name+' std'] = algo_f_std

100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [09:02<00:00, 108.57s/it]


# Assessment of Conflict Type on Alternative Specification

In [25]:
files = []

files.append(('State Based','con_1.xlsx'))
files.append(('Non State','con_2.xlsx'))
files.append(('One Sided','con_3.xlsx'))

## Con Type Specification

In [26]:
df_c_a = pd.DataFrame(methods, columns = ['Algorithm'])
df_c_r = pd.DataFrame(methods, columns = ['Algorithm'])
df_c_f = pd.DataFrame(methods, columns = ['Algorithm'])

In [27]:
for name, file in tqdm(files):
    #print('Fitting for '+name)
    df = pd.read_excel (file,index_col=0,na_values=['..'])
    
    algo_a_mean = []
    algo_a_std = []
    algo_r_mean = []
    algo_r_std = []
    algo_f_mean = []
    algo_f_std = []

   
    #Create testing and training sets
    array = df.values

    #Create X array
    X= array[:,4:]

    # Standardizing the features
    X = StandardScaler().fit_transform(X)

    #Create Y array
    Y= array[:,3]
    Y=Y.astype('int')
      
    for algo, model in models:   
        accuracy = np.array([])
        recall = np.array([])
        f1 = np.array([])
        for fold, (train_index, test_index) in enumerate(kf.split(X, Y), 1):
            X_train = X[train_index]
            Y_train = Y[train_index]  
            X_test = X[test_index]
            Y_test = Y[test_index]  
            
            #train/test split
            sm = SMOTE()
            X_train_oversampled, Y_train_oversampled = sm.fit_sample(X_train, Y_train)
           
            #fit model
            model = model
            model.fit(X_train_oversampled, Y_train_oversampled)  
            Y_pred = model.predict(X_test)
            
            #generate reports
            acc = accuracy_score(Y_test, Y_pred)
            rec = recall_score(Y_test, Y_pred)
            f=  f1_score(Y_test, Y_pred)
            accuracy = np.append(accuracy, acc)
            recall = np.append(recall, rec)
            f1 = np.append(f1, f)
        a_mean = accuracy.mean().tolist()
        a_std = accuracy.std().tolist()
        r_mean = recall.mean().tolist()
        r_std = accuracy.std().tolist()
        f_mean = f1.mean().tolist()
        f_std = accuracy.std().tolist()
        algo_a_mean.append(a_mean)
        algo_a_std.append(a_std)
        algo_r_mean.append(r_mean)
        algo_r_std.append(r_std)
        algo_f_mean.append(f_mean)
        algo_f_std.append(f_std)
    df_c_a[name+' mean'] = algo_a_mean
    df_c_a[name+' std'] = algo_a_std
    df_c_r[name+' mean'] = algo_r_mean
    df_c_r[name+' std'] = algo_r_std
    df_c_f[name+' mean'] = algo_f_mean
    df_c_f[name+' std'] = algo_f_std

100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [02:04<00:00, 41.40s/it]


## Con Type Specification w/  PCA

In [28]:
df_cp_a = pd.DataFrame(methods, columns = ['Algorithm'])
df_cp_r = pd.DataFrame(methods, columns = ['Algorithm'])
df_cp_f = pd.DataFrame(methods, columns = ['Algorithm'])

In [29]:
for name, file in tqdm(files):
    #print('Fitting for '+name)
    df = pd.read_excel (file,index_col=0,na_values=['..'])

    algo_a_mean = []
    algo_a_std = []
    algo_r_mean = []
    algo_r_std = []
    algo_f_mean = []
    algo_f_std = []

   
    #Create testing and training sets
    array = df.values

    #Create X array
    X= array[:,4:]

    # Standardizing the features
    X = StandardScaler().fit_transform(X)

    #Create Y array
    Y= array[:,3]
    Y=Y.astype('int')

    pca = PCA(pca_target)
    principalComponents = pca.fit_transform(X)
    
    X = pca.fit_transform(X)
        
    for algo, model in models:   
        accuracy = np.array([])
        recall = np.array([])
        f1 = np.array([])
        for fold, (train_index, test_index) in enumerate(kf.split(X, Y), 1):
            X_train = X[train_index]
            Y_train = Y[train_index]  
            X_test = X[test_index]
            Y_test = Y[test_index]  
            
            #train/test split
            sm = SMOTE()
            X_train_oversampled, Y_train_oversampled = sm.fit_sample(X_train, Y_train)
           
            #fit model
            model = model
            model.fit(X_train_oversampled, Y_train_oversampled)  
            Y_pred = model.predict(X_test)
            
            #generate reports
            acc = accuracy_score(Y_test, Y_pred)
            rec = recall_score(Y_test, Y_pred)
            f=  f1_score(Y_test, Y_pred)
            accuracy = np.append(accuracy, acc)
            recall = np.append(recall, rec)
            f1 = np.append(f1, f)
        a_mean = accuracy.mean().tolist()
        a_std = accuracy.std().tolist()
        r_mean = recall.mean().tolist()
        r_std = accuracy.std().tolist()
        f_mean = f1.mean().tolist()
        f_std = accuracy.std().tolist()
        algo_a_mean.append(a_mean)
        algo_a_std.append(a_std)
        algo_r_mean.append(r_mean)
        algo_r_std.append(r_std)
        algo_f_mean.append(f_mean)
        algo_f_std.append(f_std)
    df_cp_a[name+' mean'] = algo_a_mean
    df_cp_a[name+' std'] = algo_a_std
    df_cp_r[name+' mean'] = algo_r_mean
    df_cp_r[name+' std'] = algo_r_std
    df_cp_f[name+' mean'] = algo_f_mean
    df_cp_f[name+' std'] = algo_f_std

100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [01:38<00:00, 32.77s/it]


## Con Type Specification w/ Y/R/C

In [30]:
df_cd_a = pd.DataFrame(methods, columns = ['Algorithm'])
df_cd_r = pd.DataFrame(methods, columns = ['Algorithm'])
df_cd_f = pd.DataFrame(methods, columns = ['Algorithm'])

In [31]:
for name, file in tqdm(files):
    #print('Fitting for '+name)
    df = pd.read_excel (file,index_col=0,na_values=['..'])
    
    #Creation of Year/Region/Country Dummies
    year_dummies = pd.get_dummies(df.year, prefix='year').iloc[:,1:]
    region_dummies = pd.get_dummies(df.region, prefix='region').iloc[:,1:]
    country_dummies = pd.get_dummies(df.ccode, prefix='country: ').iloc[:,1:]
    
    df = pd.concat([df, year_dummies, region_dummies, country_dummies], axis = 1)
    
    algo_a_mean = []
    algo_a_std = []
    algo_r_mean = []
    algo_r_std = []
    algo_f_mean = []
    algo_f_std = []

   
    #Create testing and training sets
    array = df.values

    #Create X array
    X= array[:,4:]

    # Standardizing the features
    X = StandardScaler().fit_transform(X)

    #Create Y array
    Y= array[:,3]
    Y=Y.astype('int')
        
    for algo, model in models:   
        accuracy = np.array([])
        recall = np.array([])
        f1 = np.array([])
        for fold, (train_index, test_index) in enumerate(kf.split(X, Y), 1):
            X_train = X[train_index]
            Y_train = Y[train_index]  
            X_test = X[test_index]
            Y_test = Y[test_index]  
            
            #train/test split
            sm = SMOTE()
            X_train_oversampled, Y_train_oversampled = sm.fit_sample(X_train, Y_train)
           
            #fit model
            model = model
            model.fit(X_train_oversampled, Y_train_oversampled)  
            Y_pred = model.predict(X_test)
            
            #generate reports
            acc = accuracy_score(Y_test, Y_pred)
            rec = recall_score(Y_test, Y_pred)
            f=  f1_score(Y_test, Y_pred)
            accuracy = np.append(accuracy, acc)
            recall = np.append(recall, rec)
            f1 = np.append(f1, f)
        a_mean = accuracy.mean().tolist()
        a_std = accuracy.std().tolist()
        r_mean = recall.mean().tolist()
        r_std = accuracy.std().tolist()
        f_mean = f1.mean().tolist()
        f_std = accuracy.std().tolist()
        algo_a_mean.append(a_mean)
        algo_a_std.append(a_std)
        algo_r_mean.append(r_mean)
        algo_r_std.append(r_std)
        algo_f_mean.append(f_mean)
        algo_f_std.append(f_std)
    df_cd_a[name+' mean'] = algo_a_mean
    df_cd_a[name+' std'] = algo_a_std
    df_cd_r[name+' mean'] = algo_r_mean
    df_cd_r[name+' std'] = algo_r_std
    df_cd_f[name+' mean'] = algo_f_mean
    df_cd_f[name+' std'] = algo_f_std

100%|███████████████████████████████████████████████████████████████████████████████████| 3/3 [05:58<00:00, 119.49s/it]


## Con Type Specification w/ Y/R/C dummies and PCA

In [32]:
df_cpd_a = pd.DataFrame(methods, columns = ['Algorithm'])
df_cpd_r = pd.DataFrame(methods, columns = ['Algorithm'])
df_cpd_f = pd.DataFrame(methods, columns = ['Algorithm'])

In [33]:
for name, file in tqdm(files):
    #print('Fitting for '+name)
    df = pd.read_excel (file,index_col=0,na_values=['..'])
    
    #Creation of Year/Region/Country Dummies
    year_dummies = pd.get_dummies(df.year, prefix='year').iloc[:,1:]
    region_dummies = pd.get_dummies(df.region, prefix='region').iloc[:,1:]
    country_dummies = pd.get_dummies(df.ccode, prefix='country: ').iloc[:,1:]
    
    df = pd.concat([df, year_dummies, region_dummies, country_dummies], axis = 1)
    
    algo_a_mean = []
    algo_a_std = []
    algo_r_mean = []
    algo_r_std = []
    algo_f_mean = []
    algo_f_std = []

   
    #Create testing and training sets
    array = df.values

    #Create X array
    X= array[:,4:]

    # Standardizing the features
    X = StandardScaler().fit_transform(X)

    #Create Y array
    Y= array[:,3]
    Y=Y.astype('int')

    pca = PCA(pca_target)
    principalComponents = pca.fit_transform(X)
    
    X = pca.fit_transform(X)
        
    for algo, model in models:   
        accuracy = np.array([])
        recall = np.array([])
        f1 = np.array([])
        for fold, (train_index, test_index) in enumerate(kf.split(X, Y), 1):
            X_train = X[train_index]
            Y_train = Y[train_index]  
            X_test = X[test_index]
            Y_test = Y[test_index]  
            
            #train/test split
            sm = SMOTE()
            X_train_oversampled, Y_train_oversampled = sm.fit_sample(X_train, Y_train)
           
            #fit model
            model = model
            model.fit(X_train_oversampled, Y_train_oversampled)  
            Y_pred = model.predict(X_test)
            
            #generate reports
            acc = accuracy_score(Y_test, Y_pred)
            rec = recall_score(Y_test, Y_pred)
            f=  f1_score(Y_test, Y_pred)
            accuracy = np.append(accuracy, acc)
            recall = np.append(recall, rec)
            f1 = np.append(f1, f)
        a_mean = accuracy.mean().tolist()
        a_std = accuracy.std().tolist()
        r_mean = recall.mean().tolist()
        r_std = accuracy.std().tolist()
        f_mean = f1.mean().tolist()
        f_std = accuracy.std().tolist()
        algo_a_mean.append(a_mean)
        algo_a_std.append(a_std)
        algo_r_mean.append(r_mean)
        algo_r_std.append(r_std)
        algo_f_mean.append(f_mean)
        algo_f_std.append(f_std)
    df_cpd_a[name+' mean'] = algo_a_mean
    df_cpd_a[name+' std'] = algo_a_std
    df_cpd_r[name+' mean'] = algo_r_mean
    df_cpd_r[name+' std'] = algo_r_std
    df_cpd_f[name+' mean'] = algo_f_mean
    df_cpd_f[name+' std'] = algo_f_std

100%|███████████████████████████████████████████████████████████████████████████████████| 3/3 [06:18<00:00, 126.33s/it]


In [35]:
#save to excel file

df_b_a.to_excel('b_acv.xlsx')
df_b_r.to_excel('b_rcv.xlsx')
df_b_f.to_excel('b_fcv.xlsx')

df_a_a.to_excel('a_acv.xlsx')
df_a_r.to_excel('a_rcv.xlsx')
df_a_f.to_excel('a_fcv.xlsx')

df_c_a.to_excel('c_acv.xlsx')
df_c_r.to_excel('c_rcv.xlsx')
df_c_f.to_excel('c_fcv.xlsx')

df_bp_a.to_excel('bp_acv.xlsx')
df_bp_r.to_excel('bp_rcv.xlsx')
df_bp_f.to_excel('bp_fcv.xlsx')

df_ap_a.to_excel('ap_acv.xlsx')
df_ap_r.to_excel('ap_rcv.xlsx')
df_ap_f.to_excel('ap_fcv.xlsx')

df_cp_a.to_excel('cp_acv.xlsx')
df_cp_r.to_excel('cp_rcv.xlsx')
df_cp_f.to_excel('cp_fcv.xlsx')

df_bd_a.to_excel('bd_acv.xlsx')
df_bd_r.to_excel('bd_rcv.xlsx')
df_bd_f.to_excel('bd_fcv.xlsx')

df_ad_a.to_excel('ad_acv.xlsx')
df_ad_r.to_excel('ad_rcv.xlsx')
df_ad_f.to_excel('ad_fcv.xlsx')

df_cd_a.to_excel('cd_acv.xlsx')
df_cd_r.to_excel('cd_rcv.xlsx')
df_cd_f.to_excel('cd_fcv.xlsx')

df_bpd_a.to_excel('bpd_acv.xlsx')
df_bpd_r.to_excel('bpd_rcv.xlsx')
df_bpd_f.to_excel('bpd_fcv.xlsx')

df_apd_a.to_excel('apd_acv.xlsx')
df_apd_r.to_excel('apd_rcv.xlsx')
df_apd_f.to_excel('apd_fcv.xlsx')

df_cpd_a.to_excel('cpd_acv.xlsx')
df_cpd_r.to_excel('cpd_rcv.xlsx')
df_cpd_f.to_excel('cpd_fcv.xlsx')