In [1]:
import pandas as pd
import seaborn as sns
import scipy.stats as ss
import matplotlib.pyplot as plt
import numpy as np
import pandas_profiling
from pandas_profiling import ProfileReport
%matplotlib inline

from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.metrics import roc_curve, precision_recall_curve, auc, make_scorer
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from googletrans import Translator
from sklearn.impute import SimpleImputer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.impute import KNNImputer

from imblearn.over_sampling import SMOTE

from xgboost import XGBClassifier
import xgboost as xgb

from catboost import CatBoostClassifier, Pool
from hypopt import GridSearch

import warnings
warnings.filterwarnings('ignore')

### SMOTE Modeling
- Use SMOTE dataset and unused validate set to analyze veracity of using SMOTE data
- Start by training a simple CatBoost model on the SMOTE set and running the model on validation set to see difference in performance 

In [2]:
# Load cleaned data
df = pd.read_pickle('df_smote.pkl')
print(len(df))

# Transform ordinal
## Ordinal2
df.ordinal2 = df.ordinal2.replace(0,1)
df['ordinal2']= ss.boxcox(df.ordinal2, -0.1911)

## Ordinal1
df['ordinal1']= ss.boxcox(df.ordinal1, -1)

# Specify categorical columns
cat_cols = [x for x in df.columns if x not in ['ordinal1', 'ordinal2', 'label']]

# Convert categorical to str
for col in cat_cols:
    df[col]=df[col].astype(str)
    
# Prepare for modeling
X = df.drop(columns = ['label'])
y = [int(x) for x in df.label]

# Train, test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33)

80715


In [4]:
# Review with validation set
df_validate = pd.read_pickle('df_validate.pkl')

# Transform continuous
## Employee Size 
df_validate.ordinal2 = df_validate.ordinal2.replace(0,1)
df_validate['ordinal2']= ss.boxcox(df_validate.ordinal2, -0.1911)

## Tenant
df_validate['ordinal1']= ss.boxcox(df_validate.ordinal1, -1)


# Convert categorical to str
for col in cat_cols:
    df_validate[col]=df_validate[col].astype(str)
    
# Prepare for modeling
X_validate = df_validate.drop(columns = ['label'])
y_validate = [int(x) for x in df_validate.label]

In [5]:
# Train model
model =  CatBoostClassifier(learning_rate=0.061045, iterations=200)

model.fit(X_train, y_train,
                 eval_set=(X_validate, y_validate),
                 cat_features=cat_cols,
                 use_best_model=True)

0:	learn: 0.6075293	test: 0.5910932	best: 0.5910932 (0)	total: 263ms	remaining: 52.3s
1:	learn: 0.5422852	test: 0.5138741	best: 0.5138741 (1)	total: 410ms	remaining: 40.6s
2:	learn: 0.4862892	test: 0.4486636	best: 0.4486636 (2)	total: 554ms	remaining: 36.4s
3:	learn: 0.4444780	test: 0.3992821	best: 0.3992821 (3)	total: 701ms	remaining: 34.3s
4:	learn: 0.4080005	test: 0.3563958	best: 0.3563958 (4)	total: 860ms	remaining: 33.5s
5:	learn: 0.3787299	test: 0.3226737	best: 0.3226737 (5)	total: 1s	remaining: 32.4s
6:	learn: 0.3555741	test: 0.2959349	best: 0.2959349 (6)	total: 1.15s	remaining: 31.6s
7:	learn: 0.3343776	test: 0.2713300	best: 0.2713300 (7)	total: 1.28s	remaining: 30.8s
8:	learn: 0.3186814	test: 0.2536560	best: 0.2536560 (8)	total: 1.42s	remaining: 30.1s
9:	learn: 0.3068461	test: 0.2403005	best: 0.2403005 (9)	total: 1.56s	remaining: 29.6s
10:	learn: 0.2958828	test: 0.2277378	best: 0.2277378 (10)	total: 1.7s	remaining: 29.2s
11:	learn: 0.2872959	test: 0.2200538	best: 0.2200538 (11

95:	learn: 0.1139438	test: 0.0926345	best: 0.0926345 (95)	total: 13.2s	remaining: 14.3s
96:	learn: 0.1135823	test: 0.0923780	best: 0.0923780 (96)	total: 13.3s	remaining: 14.2s
97:	learn: 0.1129588	test: 0.0922389	best: 0.0922389 (97)	total: 13.5s	remaining: 14s
98:	learn: 0.1127933	test: 0.0921912	best: 0.0921912 (98)	total: 13.6s	remaining: 13.9s
99:	learn: 0.1119309	test: 0.0918945	best: 0.0918945 (99)	total: 13.8s	remaining: 13.8s
100:	learn: 0.1114989	test: 0.0916159	best: 0.0916159 (100)	total: 13.9s	remaining: 13.6s
101:	learn: 0.1107743	test: 0.0914512	best: 0.0914512 (101)	total: 14s	remaining: 13.5s
102:	learn: 0.1104020	test: 0.0913219	best: 0.0913219 (102)	total: 14.1s	remaining: 13.3s
103:	learn: 0.1101285	test: 0.0911557	best: 0.0911557 (103)	total: 14.3s	remaining: 13.2s
104:	learn: 0.1094996	test: 0.0909762	best: 0.0909762 (104)	total: 14.4s	remaining: 13s
105:	learn: 0.1092578	test: 0.0908640	best: 0.0908640 (105)	total: 14.5s	remaining: 12.9s
106:	learn: 0.1088862	test

188:	learn: 0.0905896	test: 0.0829976	best: 0.0829976 (188)	total: 25.2s	remaining: 1.47s
189:	learn: 0.0904350	test: 0.0828966	best: 0.0828966 (189)	total: 25.4s	remaining: 1.33s
190:	learn: 0.0903367	test: 0.0828487	best: 0.0828487 (190)	total: 25.5s	remaining: 1.2s
191:	learn: 0.0902882	test: 0.0828231	best: 0.0828231 (191)	total: 25.6s	remaining: 1.07s
192:	learn: 0.0900758	test: 0.0826728	best: 0.0826728 (192)	total: 25.8s	remaining: 934ms
193:	learn: 0.0899084	test: 0.0825927	best: 0.0825927 (193)	total: 25.9s	remaining: 800ms
194:	learn: 0.0898230	test: 0.0825199	best: 0.0825199 (194)	total: 26s	remaining: 667ms
195:	learn: 0.0896130	test: 0.0824937	best: 0.0824937 (195)	total: 26.1s	remaining: 533ms
196:	learn: 0.0895691	test: 0.0824478	best: 0.0824478 (196)	total: 26.2s	remaining: 400ms
197:	learn: 0.0891717	test: 0.0823690	best: 0.0823690 (197)	total: 26.4s	remaining: 266ms
198:	learn: 0.0887279	test: 0.0822348	best: 0.0822348 (198)	total: 26.5s	remaining: 133ms
199:	learn: 0

<catboost.core.CatBoostClassifier at 0x12921f438>

In [6]:
### Review params

def quick_scores(X_test, y_test):
    y_pred = model.predict(X_test)
    print('Recall: {:.4f}'.format(recall_score(y_test, y_pred)))
    print('Precision: {:.4f}'.format(precision_score(y_test, y_pred)))
    print('Accuracy: {:.4f}'.format(accuracy_score(y_test, y_pred)))
    print('Confusion Matrix: \n{}'.format(confusion_matrix(y_test, y_pred)))

# Review test data
quick_scores(X_test, y_test)
quick_scores(X_validate, y_validate)

Recall: 0.9452
Precision: 0.9435
Accuracy: 0.9721
Confusion Matrix: 
[[19596   377]
 [  365  6298]]
Recall: 0.3103
Precision: 0.1525
Accuracy: 0.9722
Confusion Matrix: 
[[29292   600]
 [  240   108]]
