In [6]:
from pandas import read_csv


# Preprocessed dataset, addressed relative to the notebook's working directory.
DATA_PATH = '../data/processed/data.csv'
data = read_csv(DATA_PATH)

In [8]:
# Fraction of rows held out for testing; the remaining fraction is used for training.
TEST_SPLIT = 0.2
TRAIN_SPLIT = 1 - TEST_SPLIT

# Name of the label column in the loaded dataset.
TARGET = 'Attrition_Yes'

In [9]:
# Separate the feature columns from the label column.
x = data.drop(columns=[TARGET])
y = data[TARGET]

# Number of leading rows that go to the training set.
train_data_len = int(len(x) * TRAIN_SPLIT)

# NOTE(review): the split is purely positional (first rows -> train, remaining rows -> test);
# nothing here shuffles or stratifies, so it relies on the file's row order - confirm that is intended.
x_train, x_test = x.iloc[:train_data_len], x.iloc[train_data_len:]
y_train, y_test = y.iloc[:train_data_len], y.iloc[train_data_len:]

In [21]:
import numpy as np

from catboost import CatBoostClassifier
from sklearn.impute import SimpleImputer
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline

MAX_DEPTH = 30
# Single seed shared by every estimator below so that the grid searches (and the
# metrics computed from their winners) are repeatable across kernel restarts.
RANDOM_STATE = 42

# Preprocessing applied to the feature matrix before it is handed to any model.
preprocessing_pipeline = Pipeline(steps=[
    (
        'Fill missing values',
        SimpleImputer(strategy='median')  # can use median as there are no missing categorical features
    ),
])

# One grid search per model family. The order matters: later cells address the
# CatBoost search as models[-1] and the scikit-learn searches as models[:-1].
models = [
    GridSearchCV(DecisionTreeClassifier(random_state=RANDOM_STATE), param_grid={
        'criterion': ['gini', 'entropy', 'log_loss'],
        'max_depth': list(range(5, MAX_DEPTH + 1, 5))
    }),
    GridSearchCV(AdaBoostClassifier(random_state=RANDOM_STATE), param_grid={
        'n_estimators': list(range(10, 101, 10)),
        'learning_rate': np.logspace(-3, -1, 5)
    }),
    GridSearchCV(RandomForestClassifier(n_jobs=-1, random_state=RANDOM_STATE), param_grid={
        'n_estimators': list(range(10, 100, 20)),
        'max_depth': list(range(3, MAX_DEPTH + 1, 5)),
        'criterion': ['gini', 'entropy', 'log_loss'],
    }),
    GridSearchCV(
        CatBoostClassifier(loss_function='Logloss', verbose=False, random_seed=RANDOM_STATE),
        param_grid={
            'iterations': list(range(10, 101, 10)),
            'depth': list(range(1, 5)),
            'learning_rate': np.logspace(-3, -1, 5)
        },
    )
]

In [22]:
# Fit the preprocessing pipeline on the training features and keep the transformed matrix
# for every model fit that follows.
x_train_transformed = preprocessing_pipeline.fit_transform(x_train)

# CatBoost doesn't wanna work with True/False, so the last entry is skipped here.
searches_without_catboost = models[:-1]
for search in searches_without_catboost:
    print(search)
    search.fit(x_train_transformed, y_train)

GridSearchCV(estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['gini', 'entropy', 'log_loss'],
                         'max_depth': [5, 10, 15, 20, 25, 30]})
GridSearchCV(estimator=AdaBoostClassifier(),
             param_grid={'learning_rate': array([0.001     , 0.00316228, 0.01      , 0.03162278, 0.1       ]),
                         'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90,
                                          100]})
GridSearchCV(estimator=RandomForestClassifier(n_jobs=-1),
             param_grid={'criterion': ['gini', 'entropy', 'log_loss'],
                         'max_depth': [3, 8, 13, 18, 23, 28],
                         'n_estimators': [10, 30, 50, 70, 90]})
GridSearchCV(estimator=<catboost.core.CatBoostClassifier object at 0x000001CACE347E10>,
             param_grid={'depth': [1, 2, 3, 4],
                         'iterations': [10, 20, 30, 40, 50, 60, 70, 80, 90,
                                        100],
                    

Traceback (most recent call last):
  File "C:\Users\tyoma\miniforge-pypy3\envs\magic-env\Lib\site-packages\sklearn\model_selection\_validation.py", line 813, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\tyoma\miniforge-pypy3\envs\magic-env\Lib\site-packages\sklearn\metrics\_scorer.py", line 527, in __call__
    return estimator.score(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\tyoma\miniforge-pypy3\envs\magic-env\Lib\site-packages\catboost\core.py", line 5418, in score
    raise CatBoostError('predicted classes have string type but specified y is boolean')
_catboost.CatBoostError: predicted classes have string type but specified y is boolean

 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan

In [93]:
# The CatBoost search is fitted separately on integer-cast labels: scoring it against the
# boolean target raised "predicted classes have string type but specified y is boolean".
catboost_search = models[-1]
catboost_search.fit(x_train_transformed, y_train.astype('int'))

In [94]:
from plotly.graph_objects import Scatter
from plotly.express import bar
from plotly.subplots import make_subplots

from sklearn.metrics import accuracy_score, recall_score, roc_auc_score, roc_curve

# Apply the already-fitted preprocessing pipeline to the held-out features.
x_test_ = preprocessing_pipeline.transform(x_test)

# 2x2 grid: ROC curves in the top-left panel, one bar chart per metric in the other three.
figure = make_subplots(rows=2, cols=2, subplot_titles=['ROC Curves', 'Accuracy', 'Recall', 'ROC AUC'])

# Metric value per classifier, keyed by the classifier's class name.
accuracies = dict()
recalls = dict()
rocauc = dict()

for clf in (search.best_estimator_ for search in models):
    # Use the class name directly instead of slicing the repr of the type, which
    # silently produces garbage for any type whose repr has no dotted module path.
    classifier_name = type(clf).__name__
    # The CatBoost search was fitted on integer labels, so predictions are cast to
    # bool to put every model's output in the same form before scoring.
    labels = clf.predict(x_test_).astype('bool')
    # Column 1 of predict_proba is used as the score for the ROC computations.
    probas = clf.predict_proba(x_test_)[:, 1]

    fpr, tpr, thresholds = roc_curve(y_test, probas)
    figure.add_trace(Scatter(x=fpr, y=tpr, name=classifier_name), row=1, col=1)
    accuracies[classifier_name] = accuracy_score(y_test, labels)
    recalls[classifier_name] = recall_score(y_test, labels)
    rocauc[classifier_name] = roc_auc_score(y_test, probas)

# plotly.express.bar builds a whole figure; only its first trace is moved into the subplot grid.
figure.add_trace(bar(x=list(accuracies.keys()), y=list(accuracies.values())).data[0], row=1, col=2)
figure.add_trace(bar(x=list(recalls.keys()), y=list(recalls.values())).data[0], row=2, col=1)
figure.add_trace(bar(x=list(rocauc.keys()), y=list(rocauc.values())).data[0], row=2, col=2)

figure.update_layout(height=1000, width=1000)
figure.show()

In [95]:
import os
import pickle

# Directory that receives one pickle per tuned classifier.
MODELS_DIR = '../models'
# Create the directory up front; otherwise open() raises FileNotFoundError on a fresh checkout.
os.makedirs(MODELS_DIR, exist_ok=True)

for clf in (search.best_estimator_ for search in models):
    # File name is the estimator's class name, e.g. DecisionTreeClassifier.pkl.
    classifier_name = type(clf).__name__
    with open(f'{MODELS_DIR}/{classifier_name}.pkl', 'wb') as file:
        pickle.dump(clf, file)