In [1]:
#Ignore 'Future Warnings'
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [2]:
import os
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
import sklearn
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import plot_confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier as xgb
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

Using TensorFlow backend.


In [3]:
#Set Psudeorandom Seed
seed = 42
pca_target = .8

In [4]:
models=[]
models.append(('SVM',SVC(random_state=seed, probability=True)))
models.append(('XTREE', xgb(random_state=seed)))

In [5]:
files = []

files.append(('../data/1a_offset.xlsx','Overall'))
files.append(('../data/con_1.xlsx','State Based'))
files.append(('../data/con_2.xlsx','Nonstate'))
files.append(('../data/con_3.xlsx','One-Sided'))

In [6]:
spec = '../data/a_spec.xlsx'
df_spec = pd.read_excel (spec,index_col=0,na_values=['..'])
df_spec = df_spec.rename(columns={df_spec.columns[3]: 'predictor'})


year_dummies = pd.get_dummies(df_spec.year, prefix='year').iloc[:,1:]
region_dummies = pd.get_dummies(df_spec.region, prefix='region').iloc[:,1:]
country_dummies = pd.get_dummies(df_spec.ccode, prefix='country: ').iloc[:,1:]

df_spec = pd.concat([df_spec,year_dummies, region_dummies, country_dummies], axis = 1)

In [7]:
file3 = '../data/pred_2018.xlsx'
df_pred = pd.read_excel (file3,index_col=0,na_values=['..'])
df_pred = pd.concat([df_pred,year_dummies, region_dummies, country_dummies], axis = 1)
df_pred=df_pred.dropna()
df_pred = df_pred.rename(columns={df_pred.columns[3]: 'predictor'})

In [8]:
array2 = df_pred.values
X_pred = array2[:,4:]
X_pred = StandardScaler().fit_transform(X_pred)

In [9]:
results = df_pred[['ccode']].copy()
index = results.index

In [10]:
for file, name in files:
    
    X=[]
    Y=[]
    df = pd.read_excel (file,index_col=0,na_values=['..'])
    
    #Creation of Year/Region/Country Dummies
    year_dummies = pd.get_dummies(df.year, prefix='year').iloc[:,1:]
    region_dummies = pd.get_dummies(df.region, prefix='region').iloc[:,1:]
    country_dummies = pd.get_dummies(df.ccode, prefix='country: ').iloc[:,1:]
    
    df = pd.concat([df, year_dummies, region_dummies, country_dummies], axis = 1)
    df = df.rename(columns={df.columns[3]: 'predictor'})
    
    #2018 Column added to ensure equal number of columns in training and test data. 
    #Since all 2018 values were dropped for model evaluation, all 2018 values for training data are zero.
    df['year_2018'] = 0
    df=df[df_spec.columns]

    #Create testing and training sets
    array = df.values

    #Create X array
    X= array[:,4:]

    # Standardizing the features
    X = StandardScaler().fit_transform(X)

    #Create Y array
    Y= array[:,3]
    Y=Y.astype('int') 
    
    #Training/Test Split
    X_train, X_validation, Y_train, Y_validation = train_test_split(X,Y,test_size=0.2,random_state=seed)
    
    #Impliment SMOTE
    sm = SMOTE(random_state = seed) 
    X_train, Y_train = sm.fit_sample(X_train, Y_train)

    # Make predictions on validation dataset
    for algo, model in models:
        algo_model=model
        algo_model.fit(X_train, Y_train)
        predictions = algo_model.predict(X_validation)
        print(confusion_matrix(Y_validation, predictions))
        print('')
        print('')
        pred = algo_model.predict_proba(X_pred)

        df_pred = pd.DataFrame({str(algo+' No '+name):pred[:,0],str(algo+' '+name): pred[:,1]}, index=index)
        results = pd.concat([results, df_pred], axis=1)

[[846  68]
 [ 35 281]]


[[870  44]
 [ 61 255]]


[[930  68]
 [ 23 209]]


[[956  42]
 [ 33 199]]


[[1032   67]
 [  15  116]]


[[1026   73]
 [  25  106]]


[[916  84]
 [ 21 209]]


[[939  61]
 [ 39 191]]




In [11]:
files = []

files.append(('../data/2a_offset.xlsx','Two Year Offset'))


In [12]:
for file, name in files:
    
    X=[]
    Y=[]
    df = pd.read_excel (file,index_col=0,na_values=['..'])
    
    #Creation of Year/Region/Country Dummies
    year_dummies = pd.get_dummies(df.year, prefix='year').iloc[:,1:]
    region_dummies = pd.get_dummies(df.region, prefix='region').iloc[:,1:]
    country_dummies = pd.get_dummies(df.ccode, prefix='country: ').iloc[:,1:]
    
    df = pd.concat([df, year_dummies, region_dummies, country_dummies], axis = 1)
    df = df.rename(columns={df.columns[3]: 'predictor'})
    
    #2018 Column added to ensure equal number of columns in training and test data. 
    #Since all 2018 values were dropped for model evaluation, all 2018 values for training data are zero.
    df['year_2018'] = 0
    df['year_2017'] = 0
    df=df[df_spec.columns]

    #Create testing and training sets
    array = df.values

    #Create X array
    X= array[:,4:]

    # Standardizing the features
    X = StandardScaler().fit_transform(X)

    #Create Y array
    Y= array[:,3]
    Y=Y.astype('int') 
    
    #Training/Test Split
    X_train, X_validation, Y_train, Y_validation = train_test_split(X,Y,test_size=0.2,random_state=seed)
    
    #Impliment SMOTE
    sm = SMOTE(random_state = seed) 
    X_train, Y_train = sm.fit_sample(X_train, Y_train)

    # Make predictions on validation dataset
    for algo, model in models:
        algo_model=model
        algo_model.fit(X_train, Y_train)
        predictions = algo_model.predict(X_validation)
        print(confusion_matrix(Y_validation, predictions))
        print('')
        print('')
        pred = algo_model.predict_proba(X_pred)

        df_pred = pd.DataFrame({str(algo+' No '+name):pred[:,0],str(algo+' '+name): pred[:,1]}, index=index)
        results = pd.concat([results, df_pred], axis=1)


[[812  74]
 [ 23 278]]


[[829  57]
 [ 48 253]]




In [13]:
files = []

files.append(('../data/3a_offset.xlsx','Three Year Offset'))


In [14]:
for file, name in files:
    
    X=[]
    Y=[]
    df = pd.read_excel (file,index_col=0,na_values=['..'])
    
    #Creation of Year/Region/Country Dummies
    year_dummies = pd.get_dummies(df.year, prefix='year').iloc[:,1:]
    region_dummies = pd.get_dummies(df.region, prefix='region').iloc[:,1:]
    country_dummies = pd.get_dummies(df.ccode, prefix='country: ').iloc[:,1:]
    
    df = pd.concat([df, year_dummies, region_dummies, country_dummies], axis = 1)
    df = df.rename(columns={df.columns[3]: 'predictor'})
    
    #2018 Column added to ensure equal number of columns in training and test data. 
    #Since all 2018 values were dropped for model evaluation, all 2018 values for training data are zero.
    df['year_2018'] = 0
    df['year_2017'] = 0
    df['year_2016'] = 0
    df=df[df_spec.columns]

    #Create testing and training sets
    array = df.values

    #Create X array
    X= array[:,4:]

    # Standardizing the features
    X = StandardScaler().fit_transform(X)

    #Create Y array
    Y= array[:,3]
    Y=Y.astype('int') 
    
    #Training/Test Split
    X_train, X_validation, Y_train, Y_validation = train_test_split(X,Y,test_size=0.2,random_state=seed)
    
    #Impliment SMOTE
    sm = SMOTE(random_state = seed) 
    X_train, Y_train = sm.fit_sample(X_train, Y_train)

    # Make predictions on validation dataset
    for algo, model in models:
        algo_model=model
        algo_model.fit(X_train, Y_train)
        predictions = algo_model.predict(X_validation)
        print(confusion_matrix(Y_validation, predictions))
        print('')
        print('')
        pred = algo_model.predict_proba(X_pred)

        df_pred = pd.DataFrame({str(algo+' No '+name):pred[:,0],str(algo+' '+name): pred[:,1]}, index=index)
        results = pd.concat([results, df_pred], axis=1)


[[750  87]
 [ 35 271]]


[[776  61]
 [ 61 245]]




In [15]:
files = []

files.append(('../data/5a_offset.xlsx','Five Year Offset'))


In [16]:
for file, name in files:
    
    X=[]
    Y=[]
    df = pd.read_excel (file,index_col=0,na_values=['..'])
    
    #Creation of Year/Region/Country Dummies
    year_dummies = pd.get_dummies(df.year, prefix='year').iloc[:,1:]
    region_dummies = pd.get_dummies(df.region, prefix='region').iloc[:,1:]
    country_dummies = pd.get_dummies(df.ccode, prefix='country: ').iloc[:,1:]
    
    df = pd.concat([df, year_dummies, region_dummies, country_dummies], axis = 1)
    df = df.rename(columns={df.columns[3]: 'predictor'})
    
    #2018 Column added to ensure equal number of columns in training and test data. 
    #Since all 2018 values were dropped for model evaluation, all 2018 values for training data are zero.
    df['year_2018'] = 0
    df['year_2017'] = 0
    df['year_2016'] = 0
    df['year_2015'] = 0
    df['year_2014'] = 0
    df=df[df_spec.columns]

    #Create testing and training sets
    array = df.values

    #Create X array
    X= array[:,4:]

    # Standardizing the features
    X = StandardScaler().fit_transform(X)

    #Create Y array
    Y= array[:,3]
    Y=Y.astype('int') 
    
    #Training/Test Split
    X_train, X_validation, Y_train, Y_validation = train_test_split(X,Y,test_size=0.2,random_state=seed)
    
    #Impliment SMOTE
    sm = SMOTE(random_state = seed) 
    X_train, Y_train = sm.fit_sample(X_train, Y_train)

    # Make predictions on validation dataset
    for algo, model in models:
        algo_model=model
        algo_model.fit(X_train, Y_train)
        predictions = algo_model.predict(X_validation)
        print(confusion_matrix(Y_validation, predictions))
        print('')
        print('')
        pred = algo_model.predict_proba(X_pred)

        df_pred = pd.DataFrame({str(algo+' No '+name):pred[:,0],str(algo+' '+name): pred[:,1]}, index=index)
        results = pd.concat([results, df_pred], axis=1)


[[722  85]
 [ 27 223]]


[[737  70]
 [ 41 209]]




In [17]:
files = []

files.append(('../data/10a_offset.xlsx','Ten Year Offset'))


In [18]:
for file, name in files:
    
    X=[]
    Y=[]
    df = pd.read_excel (file,index_col=0,na_values=['..'])
    
    #Creation of Year/Region/Country Dummies
    year_dummies = pd.get_dummies(df.year, prefix='year').iloc[:,1:]
    region_dummies = pd.get_dummies(df.region, prefix='region').iloc[:,1:]
    country_dummies = pd.get_dummies(df.ccode, prefix='country: ').iloc[:,1:]
    
    df = pd.concat([df, year_dummies, region_dummies, country_dummies], axis = 1)
    df = df.rename(columns={df.columns[3]: 'predictor'})
    
    #2018 Column added to ensure equal number of columns in training and test data. 
    #Since all 2018 values were dropped for model evaluation, all 2018 values for training data are zero.
    df['year_2018'] = 0
    df['year_2017'] = 0
    df['year_2016'] = 0
    df['year_2015'] = 0
    df['year_2014'] = 0
    df['year_2013'] = 0
    df['year_2012'] = 0
    df['year_2011'] = 0
    df['year_2010'] = 0
    df['year_2009'] = 0
    df['country: _SSD'] = 0
    df=df[df_spec.columns]

    #Create testing and training sets
    array = df.values

    #Create X array
    X= array[:,4:]

    # Standardizing the features
    X = StandardScaler().fit_transform(X)

    #Create Y array
    Y= array[:,3]
    Y=Y.astype('int') 
    
    #Training/Test Split
    X_train, X_validation, Y_train, Y_validation = train_test_split(X,Y,test_size=0.2,random_state=seed)
    
    #Impliment SMOTE
    sm = SMOTE(random_state = seed) 
    X_train, Y_train = sm.fit_sample(X_train, Y_train)

    # Make predictions on validation dataset
    for algo, model in models:
        algo_model=model
        algo_model.fit(X_train, Y_train)
        predictions = algo_model.predict(X_validation)
        print(confusion_matrix(Y_validation, predictions))
        print('')
        print('')
        pred = algo_model.predict_proba(X_pred)

        df_pred = pd.DataFrame({str(algo+' No '+name):pred[:,0],str(algo+' '+name): pred[:,1]}, index=index)
        results = pd.concat([results, df_pred], axis=1)


[[591  58]
 [ 19 173]]


[[589  60]
 [ 32 160]]




In [19]:
target = '../data/results.xlsx'
results.to_excel(target)
print('saved to '+target )

saved to ../data/results.xlsx
