In [1]:
import pandas as pd
import seaborn as sns
import scipy.stats as ss
import matplotlib.pyplot as plt
import numpy as np
import pandas_profiling
from pandas_profiling import ProfileReport
%matplotlib inline

from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from googletrans import Translator
from sklearn.impute import SimpleImputer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.impute import KNNImputer
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_validate

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import roc_curve, precision_recall_curve, auc, make_scorer, recall_score, accuracy_score, precision_score, confusion_matrix

from imblearn.over_sampling import SMOTE

from xgboost import XGBClassifier
import xgboost as xgb
from xgboost import plot_importance

from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

from keras.models import Sequential 
from keras.layers import Dense, Activation, Dropout, Conv1D, MaxPooling1D, Flatten
from keras.utils import np_utils
from keras.optimizers import SGD
from sklearn.preprocessing import LabelEncoder
from keras.utils.np_utils import to_categorical
from sklearn.utils import shuffle

import warnings
warnings.filterwarnings('ignore')

import random
random.seed(2)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


### Modeling
- Use imputed and cleaned dataset to test out various binary classification models
- Split into models that can handle categorical and those that need encoding


- Encoding:
    - Logistic Regression
    - Decision Tree Classifier
    - Linear Discriminant
    - Quadratic Discriminant
    - Random Forest
    - Naive Bayes
    - XGBoost
    - Neural Net
- Categorical:
    - CatBoost
    
    

### Encoded Modeling

In [2]:
# Load cleaned data
df = pd.read_pickle('df_impute.pkl')

# Transform ordinal
## Ordinal2
df.ordinal2 = df.ordinal2.replace(0,1)
df['ordinal2']= ss.boxcox(df.ordinal2, -0.1911)

## Ordinal1
df['ordinal1']= ss.boxcox(df.ordinal1, -1)

# Specify categorical columns
cat_cols = [x for x in df.columns if x not in ['ordinal1', 'ordinal2', 'label']]

# Convert categorical to str
for col in cat_cols:
    df[col]=df[col].astype(str)
    
# Prepare for modeling
X = df.drop(columns = ['label'])
y = [int(x) for x in df.label]

# Create dummies
# Get dummies, append to new dataframe
# One hot encoding
X_dummies = X.copy()

for col in cat_cols:
    X_dummies = pd.concat([X_dummies,pd.get_dummies(X_dummies[col], prefix=col)],axis=1)
    
X_dummies = X_dummies.drop(columns=cat_cols)

# Convert y to integer
y = [int(x) for x in y]

# Train test split
X_train, X_test, y_train, y_test = train_test_split(X_dummies, y, test_size=0.33, random_state=0)

### Encoded, Non Neural Net

In [3]:
### List of models
LR = LogisticRegression()
decision_tree = DecisionTreeClassifier()
LDA = LinearDiscriminantAnalysis()
QDA = QuadraticDiscriminantAnalysis()
random_forest = RandomForestClassifier()
bayes = GaussianNB()
XG = XGBClassifier()

# Models
models = [LR, decision_tree, LDA, QDA, random_forest, bayes, XG]

# Create acc lists
acc = []
recs = []
precs = []

for classifier in models:
    # Instantiate model
    model = classifier
    model.fit(X_train, y_train)
    
    # Get preds
    y_pred = model.predict(X_test)
    
    acc.append(accuracy_score(y_test, y_pred))
    recs.append(recall_score(y_test, y_pred))
    precs.append(precision_score(y_test, y_pred))
    
    # History
    print('{} Trained'.format(classifier))

# Print
#scores_df = pd.DataFrame({'Model':models, 'Accuracies':acc, 'Recalls':recs, 'Precs': precs})
#print(scores_df)

LogisticRegression() Trained
DecisionTreeClassifier() Trained
LinearDiscriminantAnalysis() Trained
QuadraticDiscriminantAnalysis() Trained
RandomForestClassifier() Trained
GaussianNB() Trained
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None) Trained


### Neural Network

In [4]:
# Compile model
model = Sequential()
model.add(Dense(32, input_dim=len(X_train.columns), activation='relu', kernel_initializer='normal'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss = 'binary_crossentropy' , optimizer = 'adam' , metrics=['binary_accuracy'])
model.fit(X_train, y_train, epochs = 10, batch_size = 6)

# Evaluate 
y_pred_ = model.predict(X_test)
y_pred = [np.round(x)[0] for x in list(y_pred_)]

# Scores df append
models.append('Neural Net')
acc.append(accuracy_score(y_test, y_pred))
recs.append(recall_score(y_test, y_pred))
precs.append(precision_score(y_test, y_pred))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


### Categorical Specific Model

In [5]:
### Train CatBoost

# Convert categorical to str
for col in cat_cols:
    df[col]=df[col].astype(str)

# Prepare for modeling
X = df.drop(columns = ['label'])
y = [int(x) for x in df.label]

# Train, test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33)

#### Train CatBoost
classifier = CatBoostClassifier()
model = classifier
model.fit(X_train, y_train, cat_features=cat_cols)
    
# Get preds
y_pred = model.predict(X_test)
    
acc.append(accuracy_score(y_test, y_pred))
recs.append(recall_score(y_test, y_pred))
precs.append(precision_score(y_test, y_pred))
models.append(classifier)
    
print('{} Trained'.format(classifier))


Learning rate set to 0.059768
0:	learn: 0.5466926	total: 248ms	remaining: 4m 8s
1:	learn: 0.4427143	total: 348ms	remaining: 2m 53s
2:	learn: 0.3694877	total: 385ms	remaining: 2m 8s
3:	learn: 0.3109764	total: 431ms	remaining: 1m 47s
4:	learn: 0.2641189	total: 487ms	remaining: 1m 36s
5:	learn: 0.2184090	total: 600ms	remaining: 1m 39s
6:	learn: 0.1810816	total: 684ms	remaining: 1m 37s
7:	learn: 0.1565864	total: 768ms	remaining: 1m 35s
8:	learn: 0.1370935	total: 816ms	remaining: 1m 29s
9:	learn: 0.1215147	total: 871ms	remaining: 1m 26s
10:	learn: 0.1089872	total: 921ms	remaining: 1m 22s
11:	learn: 0.0967501	total: 1.07s	remaining: 1m 27s
12:	learn: 0.0873203	total: 1.14s	remaining: 1m 26s
13:	learn: 0.0811188	total: 1.19s	remaining: 1m 23s
14:	learn: 0.0741504	total: 1.31s	remaining: 1m 26s
15:	learn: 0.0688330	total: 1.45s	remaining: 1m 29s
16:	learn: 0.0645438	total: 1.61s	remaining: 1m 33s
17:	learn: 0.0613396	total: 1.74s	remaining: 1m 34s
18:	learn: 0.0584729	total: 1.89s	remaining: 1

157:	learn: 0.0391422	total: 25s	remaining: 2m 13s
158:	learn: 0.0391319	total: 25.2s	remaining: 2m 13s
159:	learn: 0.0391146	total: 25.4s	remaining: 2m 13s
160:	learn: 0.0391026	total: 25.6s	remaining: 2m 13s
161:	learn: 0.0390903	total: 25.7s	remaining: 2m 13s
162:	learn: 0.0390770	total: 25.9s	remaining: 2m 12s
163:	learn: 0.0390649	total: 26.1s	remaining: 2m 12s
164:	learn: 0.0390526	total: 26.2s	remaining: 2m 12s
165:	learn: 0.0390346	total: 26.5s	remaining: 2m 13s
166:	learn: 0.0390271	total: 26.7s	remaining: 2m 13s
167:	learn: 0.0390134	total: 27s	remaining: 2m 13s
168:	learn: 0.0390077	total: 27.2s	remaining: 2m 13s
169:	learn: 0.0389934	total: 27.4s	remaining: 2m 13s
170:	learn: 0.0389676	total: 27.6s	remaining: 2m 13s
171:	learn: 0.0389443	total: 27.8s	remaining: 2m 13s
172:	learn: 0.0389231	total: 28s	remaining: 2m 13s
173:	learn: 0.0389129	total: 28.2s	remaining: 2m 13s
174:	learn: 0.0389000	total: 28.4s	remaining: 2m 13s
175:	learn: 0.0388719	total: 28.6s	remaining: 2m 13s

315:	learn: 0.0368335	total: 52.5s	remaining: 1m 53s
316:	learn: 0.0368260	total: 52.7s	remaining: 1m 53s
317:	learn: 0.0368046	total: 52.8s	remaining: 1m 53s
318:	learn: 0.0367838	total: 53s	remaining: 1m 53s
319:	learn: 0.0367732	total: 53.2s	remaining: 1m 53s
320:	learn: 0.0367635	total: 53.4s	remaining: 1m 52s
321:	learn: 0.0367557	total: 53.6s	remaining: 1m 52s
322:	learn: 0.0367467	total: 53.8s	remaining: 1m 52s
323:	learn: 0.0367364	total: 53.9s	remaining: 1m 52s
324:	learn: 0.0367306	total: 54.1s	remaining: 1m 52s
325:	learn: 0.0367204	total: 54.3s	remaining: 1m 52s
326:	learn: 0.0367088	total: 54.5s	remaining: 1m 52s
327:	learn: 0.0366892	total: 54.7s	remaining: 1m 51s
328:	learn: 0.0366795	total: 54.9s	remaining: 1m 51s
329:	learn: 0.0366706	total: 55s	remaining: 1m 51s
330:	learn: 0.0366679	total: 55.2s	remaining: 1m 51s
331:	learn: 0.0366574	total: 55.4s	remaining: 1m 51s
332:	learn: 0.0366534	total: 55.6s	remaining: 1m 51s
333:	learn: 0.0366343	total: 55.8s	remaining: 1m 5

470:	learn: 0.0350635	total: 1m 24s	remaining: 1m 34s
471:	learn: 0.0350516	total: 1m 24s	remaining: 1m 34s
472:	learn: 0.0350210	total: 1m 24s	remaining: 1m 34s
473:	learn: 0.0350109	total: 1m 25s	remaining: 1m 34s
474:	learn: 0.0350062	total: 1m 25s	remaining: 1m 34s
475:	learn: 0.0349908	total: 1m 25s	remaining: 1m 34s
476:	learn: 0.0349758	total: 1m 25s	remaining: 1m 33s
477:	learn: 0.0349735	total: 1m 25s	remaining: 1m 33s
478:	learn: 0.0349592	total: 1m 26s	remaining: 1m 33s
479:	learn: 0.0349493	total: 1m 26s	remaining: 1m 33s
480:	learn: 0.0349379	total: 1m 26s	remaining: 1m 33s
481:	learn: 0.0349107	total: 1m 27s	remaining: 1m 33s
482:	learn: 0.0348883	total: 1m 27s	remaining: 1m 33s
483:	learn: 0.0348824	total: 1m 27s	remaining: 1m 33s
484:	learn: 0.0348624	total: 1m 27s	remaining: 1m 33s
485:	learn: 0.0348532	total: 1m 27s	remaining: 1m 33s
486:	learn: 0.0348411	total: 1m 28s	remaining: 1m 32s
487:	learn: 0.0348246	total: 1m 28s	remaining: 1m 32s
488:	learn: 0.0348084	total:

623:	learn: 0.0334034	total: 1m 55s	remaining: 1m 9s
624:	learn: 0.0334007	total: 1m 55s	remaining: 1m 9s
625:	learn: 0.0333797	total: 1m 55s	remaining: 1m 8s
626:	learn: 0.0333748	total: 1m 55s	remaining: 1m 8s
627:	learn: 0.0333686	total: 1m 55s	remaining: 1m 8s
628:	learn: 0.0333668	total: 1m 56s	remaining: 1m 8s
629:	learn: 0.0333611	total: 1m 56s	remaining: 1m 8s
630:	learn: 0.0333420	total: 1m 56s	remaining: 1m 8s
631:	learn: 0.0333243	total: 1m 56s	remaining: 1m 7s
632:	learn: 0.0333230	total: 1m 56s	remaining: 1m 7s
633:	learn: 0.0333191	total: 1m 57s	remaining: 1m 7s
634:	learn: 0.0333052	total: 1m 57s	remaining: 1m 7s
635:	learn: 0.0333028	total: 1m 57s	remaining: 1m 7s
636:	learn: 0.0332798	total: 1m 57s	remaining: 1m 7s
637:	learn: 0.0332624	total: 1m 57s	remaining: 1m 6s
638:	learn: 0.0332614	total: 1m 58s	remaining: 1m 6s
639:	learn: 0.0332564	total: 1m 58s	remaining: 1m 6s
640:	learn: 0.0332470	total: 1m 58s	remaining: 1m 6s
641:	learn: 0.0332448	total: 1m 58s	remaining:

780:	learn: 0.0319715	total: 2m 25s	remaining: 40.8s
781:	learn: 0.0319598	total: 2m 25s	remaining: 40.6s
782:	learn: 0.0319513	total: 2m 25s	remaining: 40.4s
783:	learn: 0.0319471	total: 2m 26s	remaining: 40.3s
784:	learn: 0.0319290	total: 2m 26s	remaining: 40.1s
785:	learn: 0.0319161	total: 2m 26s	remaining: 40s
786:	learn: 0.0319090	total: 2m 27s	remaining: 39.8s
787:	learn: 0.0318998	total: 2m 27s	remaining: 39.6s
788:	learn: 0.0318778	total: 2m 27s	remaining: 39.4s
789:	learn: 0.0318661	total: 2m 27s	remaining: 39.3s
790:	learn: 0.0318587	total: 2m 28s	remaining: 39.1s
791:	learn: 0.0318449	total: 2m 28s	remaining: 38.9s
792:	learn: 0.0318305	total: 2m 28s	remaining: 38.7s
793:	learn: 0.0318275	total: 2m 28s	remaining: 38.6s
794:	learn: 0.0318178	total: 2m 28s	remaining: 38.4s
795:	learn: 0.0318000	total: 2m 29s	remaining: 38.2s
796:	learn: 0.0317955	total: 2m 29s	remaining: 38s
797:	learn: 0.0317837	total: 2m 29s	remaining: 37.8s
798:	learn: 0.0317739	total: 2m 29s	remaining: 37.

936:	learn: 0.0307396	total: 3m 5s	remaining: 12.4s
937:	learn: 0.0307350	total: 3m 5s	remaining: 12.3s
938:	learn: 0.0307345	total: 3m 5s	remaining: 12.1s
939:	learn: 0.0307297	total: 3m 6s	remaining: 11.9s
940:	learn: 0.0307250	total: 3m 6s	remaining: 11.7s
941:	learn: 0.0307209	total: 3m 6s	remaining: 11.5s
942:	learn: 0.0307147	total: 3m 7s	remaining: 11.3s
943:	learn: 0.0307131	total: 3m 7s	remaining: 11.1s
944:	learn: 0.0307052	total: 3m 7s	remaining: 10.9s
945:	learn: 0.0306775	total: 3m 7s	remaining: 10.7s
946:	learn: 0.0306599	total: 3m 8s	remaining: 10.5s
947:	learn: 0.0306537	total: 3m 8s	remaining: 10.3s
948:	learn: 0.0306370	total: 3m 8s	remaining: 10.1s
949:	learn: 0.0306330	total: 3m 8s	remaining: 9.94s
950:	learn: 0.0306266	total: 3m 9s	remaining: 9.74s
951:	learn: 0.0306160	total: 3m 9s	remaining: 9.54s
952:	learn: 0.0305970	total: 3m 9s	remaining: 9.34s
953:	learn: 0.0305919	total: 3m 9s	remaining: 9.14s
954:	learn: 0.0305888	total: 3m 9s	remaining: 8.94s
955:	learn: 

In [6]:
## Review Scores
scores_df = pd.DataFrame({'Model':models, 'Accuracies':acc, 'Recalls':recs, 'Precs': precs})
scores_df

Unnamed: 0,Model,Accuracies,Recalls,Precs
0,LogisticRegression(),0.989947,0.176101,0.571429
1,DecisionTreeClassifier(),0.982705,0.210692,0.19764
2,LinearDiscriminantAnalysis(),0.986607,0.349057,0.359223
3,QuadraticDiscriminantAnalysis(),0.213029,0.902516,0.011931
4,"(DecisionTreeClassifier(max_features='auto', r...",0.989914,0.154088,0.576471
5,GaussianNB(),0.181283,0.91195,0.011591
6,"XGBClassifier(base_score=0.5, booster='gbtree'...",0.990146,0.22327,0.581967
7,Neural Net,0.989517,0.122642,0.506494
8,<catboost.core.CatBoostClassifier object at 0x...,0.988856,0.174286,0.559633


Given the various baseline results, I will proceed with XGBoost, Neural Networks, and CatBoost. Though the recall and precision values are lower for XGBoost and Neural Networks, there is a large amount of tuning that can be done. 