In [11]:
#Ignore 'Future Warnings'
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [12]:
import os
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
import sklearn
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import plot_confusion_matrix
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [13]:
# Set the pseudorandom seed. It is passed as random_state to the models,
# the train/validation split and SMOTE further down.
seed = 42

In [14]:
# (label, estimator) pairs; every estimator is fitted once per dataset below.
# probability=True is needed on the SVC so that predict_proba is available.
models = [
    ('SVM', SVC(random_state=seed, probability=True)),
    ('RTREE', RandomForestClassifier(n_estimators=500, max_depth=2, random_state=seed)),
]

In [15]:
# (path, label) pairs: one training workbook per outcome; the label is used
# in the column names of the results table.
files = [
    ('../data/1a_offset.xlsx', 'Overall'),
    ('../data/con_1.xlsx', 'State Based'),
    ('../data/con_2.xlsx', 'Nonstate'),
    ('../data/con_3.xlsx', 'One-Sided'),
]

In [23]:
# Load the imputed workbook ('..' cells are read as missing values).
file2 = '../data/imputed_data.xlsx'
df_pred = pd.read_excel(file2, index_col=0, na_values=['..'])

# One-hot encode year, region and country code; the first dummy column of
# each group is dropped.
year_dummies, region_dummies, country_dummies = (
    pd.get_dummies(df_pred[column], prefix=prefix).iloc[:, 1:]
    for column, prefix in (
        ('year', 'year'),
        ('region', 'region'),
        ('ccode', 'country: '),
    )
)

In [27]:
# Build the 2018 prediction frame: raw 2018 rows plus the dummy columns
# computed above (aligned on the index), keeping only complete rows.
file3 = '../data/pred_2018.xlsx'
df_pred = pd.concat(
    [
        pd.read_excel(file3, index_col=0, na_values=['..']),
        year_dummies,
        region_dummies,
        country_dummies,
    ],
    axis=1,
).dropna()

In [29]:
# Feature matrix for the 2018 predictions: every column from position 4
# onwards, standardised.
# NOTE(review): this scaler is fitted on the prediction rows only, separately
# from the scaler fitted on each training set - confirm that is intended.
array2 = df_pred.values
X_pred = StandardScaler().fit_transform(array2[:, 4:])

In [30]:
# Start the results table from the 'ccode' column alone and remember its
# index so that per-model prediction columns can be aligned to it.
results = df_pred.loc[:, ['ccode']].copy()
index = results.index

In [44]:
# Train every model on every dataset, report validation confusion matrices,
# and collect class probabilities for the 2018 prediction rows (X_pred).
#
# For each (file, name) in `files`:
#   1. read the workbook and append year/region/country dummy columns,
#   2. standardise the features, split 80/20, and oversample the training
#      part with SMOTE,
#   3. fit each estimator in `models`, print its validation confusion matrix,
#      and store predict_proba(X_pred) as two columns
#      '<algo> No <name>' (class 0) and '<algo> <name>' (class 1).
# Finally all probability columns are joined onto `results` and written to
# Excel.
#
# Per-model frames are collected in a list and concatenated once, and no
# variable from the earlier cells (df_pred, year_dummies, ...) is reassigned,
# so the notebook cells stay re-runnable in order.

# (source column, dummy prefix) for the categorical encodings.
dummy_specs = (('year', 'year'), ('region', 'region'), ('ccode', 'country: '))

prediction_frames = []

for file, name in files:

    df = pd.read_excel(file, index_col=0, na_values=['..'])

    #Creation of Year/Region/Country Dummies (first dummy column of each group dropped)
    dummy_frames = [
        pd.get_dummies(df[column], prefix=prefix).iloc[:, 1:]
        for column, prefix in dummy_specs
    ]
    df = pd.concat([df] + dummy_frames, axis=1)

    #2018 Column added to ensure equal number of columns in training and test data.
    #Since all 2018 values were dropped for model evaluation, all 2018 values for training data are zero.
    # NOTE(review): this column is appended last; confirm that the column order
    # of the training matrix matches the column order of X_pred.
    df['year_2018'] = 0

    array = df.values

    # Features are every column from position 4 onwards, standardised.
    # NOTE(review): the scaler is fitted on the full dataset before the split
    # and independently of the scaler used for X_pred - confirm this is intended.
    X = StandardScaler().fit_transform(array[:, 4:])

    # The label is the column at position 3, cast to integer classes.
    Y = array[:, 3].astype('int')

    #Training/Test Split
    X_train, X_validation, Y_train, Y_validation = train_test_split(
        X, Y, test_size=0.2, random_state=seed
    )

    # Oversample the training part only with SMOTE.
    # fit_resample replaces the removed fit_sample alias.
    sm = SMOTE(random_state=seed)
    X_train, Y_train = sm.fit_resample(X_train, Y_train)

    for algo, model in models:
        model.fit(X_train, Y_train)

        # Report performance on the held-out validation split.
        predictions = model.predict(X_validation)
        print(confusion_matrix(Y_validation, predictions))
        print('')
        print('')

        # Class probabilities for the 2018 rows: column 0 -> class 0, column 1 -> class 1.
        pred = model.predict_proba(X_pred)
        prediction_frames.append(
            pd.DataFrame(
                {str(algo+' No '+name): pred[:, 0], str(algo+' '+name): pred[:, 1]},
                index=index,
            )
        )

# Rebuild the table from the 'ccode' column so that re-running this cell does
# not append duplicate probability columns.
results = pd.concat([results[['ccode']]] + prediction_frames, axis=1)

target = '../data/results.xlsx'
results.to_excel(target)
print('saved to '+target )

[[836  81]
 [ 34 293]]


[[816 101]
 [ 34 293]]


[[934  69]
 [ 19 222]]


[[871 132]
 [ 15 226]]


[[1041   77]
 [  17  109]]


[[910 208]
 [  7 119]]


[[926  85]
 [ 31 202]]


[[855 156]
 [ 26 207]]


saved to ../data/results.xlsx
