### Library

In [1]:
# starter libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# hide warnings
import warnings
warnings.filterwarnings("ignore")

# date time libraries
from datetime import date
from datetime import datetime
today = date.today()

# notebook-wide pandas display settings:
# show up to 1000 rows and 100 columns, and render floats with two decimals
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)

# pre-processing
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

# model metrics
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# model algorithms
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
import xgboost

# model explainer
import shap
import pickle

In [2]:
# developer's libraries (project-local modules, pulled in with wildcard imports)
# The Main cell calls the following helpers that are not defined in this notebook,
# so they are expected to be provided by the modules below:
#   data_gathering_sprout_employees_sample, data_processing_sprout_employees,
#   model_explainer_training, model_explanation_initialization,
#   model_explainer_transpose, get_model_performance, get_model_feature_importance
# NOTE(review): because of `import *` it is not visible here which module owns which
# helper, and a later module can silently shadow a name from an earlier one --
# consider explicit `from module import name` once the owners are confirmed.
from data_gathering_employees import *
from data_processing_employees import *
from model_explainer_employees import *
from model_performance_employees import *

### Main

In [3]:
# data gathering: sprout 1000 sample records
df_gathered_sprout_1000 = data_gathering_sprout_employees_sample()

# data processing: feature engineering and selection
df_processed_sprout_1000 = data_processing_sprout_employees(df_gathered_sprout_1000)

# split by class label: Status 1 = churn; Status 0 = active
df_churned = df_processed_sprout_1000.loc[df_processed_sprout_1000['Status'] == 1].copy()
df_active = df_processed_sprout_1000.loc[df_processed_sprout_1000['Status'] == 0].copy()

# holdout vs. ml split: the first `twenty_percent` rows of EACH class go to the holdout,
# the remaining rows of each class go to the ml (train/test) set.
# NOTE(review): `twenty_percent` is 20% of the churned rows only and is reused for the
# active rows, so the "200 holdout / 800 ml" sizes hold only when the sample has
# 500 churned and 500 active records -- confirm the class balance.
# NOTE(review): rows are taken in their existing order (no shuffle) -- confirm the
# processed frame is not sorted in a way that biases the holdout.
twenty_percent = round(len(df_churned) * 0.20)
df_holdout = pd.concat(
    [df_churned[:twenty_percent], df_active[:twenty_percent]],
    ignore_index=True,
)
df_ml = pd.concat(
    [df_churned[twenty_percent:], df_active[twenty_percent:]],
    ignore_index=True,
)

# model preparation: y = target label ("Status"); x = all remaining columns (features)
y = df_ml["Status"]
x = df_ml.drop(columns=["Status"])

# train and test split: 70% = training; 30% = testing (fixed random_state for repeatable splits)
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=30,
)

# model training: Decision Tree (gini criterion), fitted on the 70% training split
# NOTE(review): with min_samples_leaf=5 a node needs at least 10 samples before it can
# be split, so min_samples_split=4 looks effectively inactive -- confirm intent.
clf_dt = DecisionTreeClassifier(
    criterion="gini",
    max_depth=20,
    min_samples_split=4,
    min_samples_leaf=5,
    random_state=25,
).fit(x_train, y_train)

# model training: explainer for Decision Tree
# NOTE(review): the explainer is built from (x_train, y_train) only -- clf_dt is not
# passed in, so whether it explains this exact fitted tree depends on what
# model_explainer_training does internally; confirm against that helper.
dt_explainer_train = model_explainer_training(x_train,y_train)
# model explanation: initialization
# (explanations are computed for the rows of x_test, i.e. the 30% test split)
df_dt_init_explainer_train = model_explanation_initialization(x_test, dt_explainer_train)
# model explanation: transpose - factor vs score
df_dt_final_explainer_train = model_explainer_transpose(df_dt_init_explainer_train)

# model performance: clf_dt scored on the 30% test split (x_test / y_test).
# Despite the "_train" suffix on the result names, these metrics are NOT computed
# on the training rows.
y_pred_train, clf_accuracy_train, clf_confusion_matrix_train, df_classification_report_train = get_model_performance(x_test, y_test, clf_dt)

# model: feature importance - training
# (the helper receives the training features and the fitted tree clf_dt)
df_feature_importance_dt_train = get_model_feature_importance(x_train, clf_dt)

# save the feature importance - training
# The file name carries the current date (YYYY_MM_DD), so a re-run on the same day
# overwrites the previous file.
filename_feature_importance = "feature_importance_{}.sav".format(datetime.now().strftime('%Y_%m_%d'))
# Write through a context manager so the handle is flushed and closed
# deterministically, even if pickling raises.
with open(filename_feature_importance, 'wb') as feature_importance_file:
    pickle.dump(df_feature_importance_dt_train, feature_importance_file)

# model performance: holdout testing
# separate the holdout labels ("Status") from the holdout features, then score clf_dt on them
actual = df_holdout["Status"]
df_holdout_testing = df_holdout.drop(columns=["Status"])

(
    y_pred_h_test,
    clf_accuracy_h_test,
    clf_confusion_matrix_h_test,
    df_classification_report_h_test,
) = get_model_performance(df_holdout_testing, actual, clf_dt)

# save the model and the explainer: pickle
# The date stamp (YYYY_MM_DD) is computed once and shared, so both artifacts of a run
# carry the same suffix even if the run crosses midnight. A re-run on the same day
# overwrites the previous files.
artifact_date_stamp = datetime.now().strftime('%Y_%m_%d')

# save the model: pickle
# (context manager guarantees the file is flushed and closed, even if pickling raises)
filename_model = "model_{}.sav".format(artifact_date_stamp)
with open(filename_model, 'wb') as model_file:
    pickle.dump(clf_dt, model_file)

# save the explainer: pickle
filename_explainer = "explainer_{}.sav".format(artifact_date_stamp)
with open(filename_explainer, 'wb') as explainer_file:
    pickle.dump(dt_explainer_train, explainer_file)