In [1]:
import os
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
import sklearn
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import plot_confusion_matrix
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score



In [2]:
#Set Psudeorandom Seed
seed = 42

In [3]:
# Load dataset
files = []

files.append(('State Based','C:/Users/Brandon/Documents/thesis/data/con_1.xlsx'))
files.append(('Non State','C:/Users/Brandon/Documents/thesis/data/con_2.xlsx'))
files.append(('One Sided','C:/Users/Brandon/Documents/thesis/data/con_3.xlsx'))

models = []
models.append(('LR', LogisticRegression(solver='liblinear',multi_class='ovr')))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('RTREE', RandomForestClassifier(n_estimators=500, max_depth=2, random_state=seed)))
models.append(('XTREE', xgb.XGBRFClassifier(random_state=seed)))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC(gamma='auto')))

In [4]:
for name, file in files:
    print('Fitting for '+name)
    df = pd.read_excel (file,index_col=0,na_values=['..'])
    #print('dataframe created')
    
    algo = []
    recall = []
    f1 = []
    
    #Create testing and training sets
    array = df.values

    #Create X array
    X= array[:,4:45]

    # Standardizing the features
    X = StandardScaler().fit_transform(X)

    #Create Y array
    Y= array[:,3]
    Y=Y.astype('int')

    #Training/Test Split
    X_train, X_validation, Y_train, Y_validation = train_test_split(X,Y,test_size=0.2,random_state=1 )
    #print('Train/Test Split Complete')
    
    #Impliment SMOTE
    sm = SMOTE(random_state = 2) 
    X_train_smote, Y_train_smote = sm.fit_sample(X_train, Y_train)
    #print('oversampling complete')
    
    for algo, model in models:
        model.fit(X_train_smote, Y_train_smote)
        predictions = model.predict(X_validation)
        recall.append(round(recall_score(Y_validation, predictions),2))
        f1.append(round(f1_score(Y_validation, predictions),2))
        print('%s: %f / %f' % (algo, round(recall_score(Y_validation, predictions),2), round(f1_score(Y_validation, predictions),2)))
    print('')
    print('')

Fitting for State Based




LR: 0.900000 / 0.760000
LDA: 0.870000 / 0.750000
KNN: 0.910000 / 0.790000
CART: 0.810000 / 0.780000
RTREE: 0.930000 / 0.770000
XTREE: 0.890000 / 0.770000
NB: 0.920000 / 0.610000
SVM: 0.910000 / 0.800000


Fitting for Non State




LR: 0.920000 / 0.620000
LDA: 0.880000 / 0.570000
KNN: 0.870000 / 0.700000
CART: 0.670000 / 0.650000
RTREE: 0.900000 / 0.570000
XTREE: 0.920000 / 0.600000
NB: 0.920000 / 0.490000
SVM: 0.920000 / 0.710000


Fitting for One Sided




LR: 0.870000 / 0.720000
LDA: 0.830000 / 0.690000
KNN: 0.880000 / 0.720000
CART: 0.760000 / 0.720000
RTREE: 0.890000 / 0.700000
XTREE: 0.850000 / 0.700000
NB: 0.850000 / 0.630000
SVM: 0.880000 / 0.750000


