In [15]:
from pandas import read_csv

# Load the processed dataset that every later cell works from.
DATA_PATH = '../data/processed/data.csv'
data = read_csv(DATA_PATH)

In [16]:
# Name of the column to predict; it is dropped from the feature frame when splitting.
TARGET = 'Attrition_Yes'
# Fractions of the full dataset reserved for the two hold-out sets.
# In this notebook the "test" split scores hyper-parameter trials and the
# "val" split is used for the final model comparison.
TEST_SPLIT = 0.2
VAL_SPLIT = 0.1
# Whatever is left over is used for training.
TRAIN_SPLIT = 1 - TEST_SPLIT - VAL_SPLIT

In [17]:
from sklearn.model_selection import train_test_split

# Fixed seed so the three-way split (and every model tuned on it) is reproducible
# after a kernel restart.
SPLIT_SEED = 42

# First cut: training set vs. a temporary hold-out pool (test + validation).
# Stratifying keeps the target's class ratio the same in every subset.
x_train, x_test_, y_train, y_test_ = train_test_split(
    data.drop(columns=[TARGET]),
    data[TARGET],
    train_size=TRAIN_SPLIT,
    stratify=data[TARGET],
    random_state=SPLIT_SEED,
)
# Second cut: divide the pool into test and validation in the TEST_SPLIT : VAL_SPLIT ratio.
x_test, x_val, y_test, y_val = train_test_split(
    x_test_,
    y_test_,
    train_size=TEST_SPLIT / (TEST_SPLIT + VAL_SPLIT),
    stratify=y_test_,
    random_state=SPLIT_SEED,
)

In [18]:
from catboost import CatBoostClassifier
from optuna import Trial
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import precision_recall_curve, auc
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

# Imputation only. The median is adequate because there are no missing
# categorical features.
preprocessing_pipeline = Pipeline(steps=[
    ('Fill missing values', SimpleImputer(strategy='median')),
])
preprocessing_pipeline.fit(x_train, y_train)

# Same imputation followed by standardisation, for the models fitted on scaled features.
scaling_pipeline = Pipeline(steps=[
    ('Fill missing values', SimpleImputer(strategy='median')),
    ('Scaling', StandardScaler()),
])
scaling_pipeline.fit(x_train, y_train)

# Both pipelines are fitted on the training split only, then applied to train and test.
x_train_preprocessed, x_test_preprocessed = (
    preprocessing_pipeline.transform(split) for split in (x_train, x_test)
)
x_train_scaled, x_test_scaled = (
    scaling_pipeline.transform(split) for split in (x_train, x_test)
)


def pr_auc(precision, recall) -> float:
    """Return the area under a precision-recall curve.

    Recall is used as the x coordinate and precision as the y coordinate
    when integrating with `auc`.
    """
    area = auc(recall, precision)
    return area


def metric(probas) -> float:
    """Score predicted positive-class probabilities against the global `y_test`.

    Builds the precision-recall curve for `probas` and returns its area.
    """
    curve = precision_recall_curve(y_test, probas)
    # The first two entries of the curve are passed on as (precision, recall).
    precision, recall = curve[:2]
    return pr_auc(precision, recall)


def objective_logreg(trial: Trial) -> float:
    """Optuna objective for a liblinear logistic regression.

    Samples the penalty type and `C`, fits on the scaled training features
    and returns the PR-AUC of the positive-class probabilities on the scaled
    test features.
    """
    penalty = trial.suggest_categorical('penalty', ['l1', 'l2'])
    c_value = trial.suggest_float('C', low=0.1, high=10.)
    classifier = LogisticRegression(penalty=penalty, C=c_value, solver='liblinear')
    classifier.fit(x_train_scaled, y_train)
    positive_scores = classifier.predict_proba(x_test_scaled)[:, 1]
    return metric(positive_scores)


def objective_decision_tree(trial: Trial) -> float:
    """Optuna objective for a decision tree.

    Samples the split criterion and maximum depth, fits on the imputed
    training features and returns the PR-AUC of the positive-class
    probabilities on the imputed test features.
    """
    criterion = trial.suggest_categorical('criterion', ['gini', 'entropy', 'log_loss'])
    depth_limit = trial.suggest_int('max_depth', low=10, high=100)
    classifier = DecisionTreeClassifier(criterion=criterion, max_depth=depth_limit)
    classifier.fit(x_train_preprocessed, y_train)
    positive_scores = classifier.predict_proba(x_test_preprocessed)[:, 1]
    return metric(positive_scores)


def objective_random_forest(trial: Trial) -> float:
    """Optuna objective for a random forest.

    Samples the number of trees, maximum depth and split criterion, fits on
    the imputed training features and returns the PR-AUC of the
    positive-class probabilities on the imputed test features.
    """
    tree_count = trial.suggest_int('n_estimators', low=5, high=20)
    depth_limit = trial.suggest_int('max_depth', low=3, high=20)
    criterion = trial.suggest_categorical('criterion', ['gini', 'entropy', 'log_loss'])
    classifier = RandomForestClassifier(
        n_estimators=tree_count,
        max_depth=depth_limit,
        criterion=criterion,
    )
    classifier.fit(x_train_preprocessed, y_train)
    positive_scores = classifier.predict_proba(x_test_preprocessed)[:, 1]
    return metric(positive_scores)


def objective_adaboost(trial: Trial) -> float:
    """Optuna objective for AdaBoost.

    Samples the number of estimators and a log-uniform learning rate, fits
    on the imputed training features and returns the PR-AUC of the
    positive-class probabilities on the imputed test features.
    """
    estimator_count = trial.suggest_int('n_estimators', low=10, high=100)
    step_size = trial.suggest_float('learning_rate', low=0.001, high=1, log=True)
    classifier = AdaBoostClassifier(n_estimators=estimator_count, learning_rate=step_size)
    classifier.fit(x_train_preprocessed, y_train)
    positive_scores = classifier.predict_proba(x_test_preprocessed)[:, 1]
    return metric(positive_scores)


def objective_catboost(trial: Trial) -> float:
    """Optuna objective for CatBoost.

    Samples the iteration count, tree depth and a log-uniform learning rate,
    fits silently on the imputed training features (labels cast to int) and
    returns the PR-AUC of the positive-class probabilities on the imputed
    test features.
    """
    iteration_count = trial.suggest_int('iterations', low=10, high=100)
    tree_depth = trial.suggest_int('depth', low=1, high=5)
    step_size = trial.suggest_float('learning_rate', low=0.001, high=1, log=True)
    classifier = CatBoostClassifier(
        iterations=iteration_count,
        depth=tree_depth,
        learning_rate=step_size,
        silent=True,
    )
    classifier.fit(x_train_preprocessed, y_train.astype('int'))
    positive_scores = classifier.predict_proba(x_test_preprocessed)[:, 1]
    return metric(positive_scores)


def objective_mlp(trial: Trial) -> float:
    """Optuna objective for a two-hidden-layer MLP.

    Samples the width of each hidden layer, fits on the scaled training
    features (up to 2000 iterations) and returns the PR-AUC of the
    positive-class probabilities on the scaled test features.
    """
    first_width = trial.suggest_int('layer_1', 10, 100)
    second_width = trial.suggest_int('layer_2', 10, 100)
    classifier = MLPClassifier(max_iter=2000, hidden_layer_sizes=(first_width, second_width))
    classifier.fit(x_train_scaled, y_train)
    positive_scores = classifier.predict_proba(x_test_scaled)[:, 1]
    return metric(positive_scores)


In [19]:
from datetime import datetime

from optuna import create_study, logging

# Only show optuna messages at WARNING level or above.
logging.set_verbosity(logging.WARNING)

objectives = [
    objective_logreg, objective_decision_tree, objective_random_forest,
    objective_adaboost, objective_catboost, objective_mlp,
]

studies = []

for objective in objectives:
    # Everything after the first underscore, e.g. 'objective_decision_tree' -> 'decision_tree'.
    name = objective.__name__.partition('_')[2]
    print(f'[{datetime.now():%H:%M:%S}] Starting {name}')
    study = create_study(direction='maximize')
    # The MLP objective gets 100 trials, every other objective 1000.
    trial_budget = 100 if objective is objective_mlp else 1000
    study.optimize(objective, n_trials=trial_budget, n_jobs=-1)
    studies.append(study)
    print(f'[{datetime.now():%H:%M:%S}] Finished {name}')

[11:50:59] Starting logreg
[11:51:35] Finished logreg
[11:51:35] Starting decision_tree
[11:52:12] Finished decision_tree
[11:52:12] Starting random_forest
[11:54:01] Finished random_forest
[11:54:01] Starting adaboost
[11:59:17] Finished adaboost
[11:59:17] Starting catboost
[12:01:06] Finished catboost
[12:01:06] Starting mlp
[12:03:39] Finished mlp


In [20]:
from dataclasses import dataclass, field


@dataclass
class ModelData:
    """Describes how to rebuild one classifier, apart from its tuned hyper-parameters."""

    # Classifier class that gets instantiated with the merged parameters.
    clf_type: type
    # Pipeline whose `transform` output is fed to the classifier.
    preprocessing_pipeline: Pipeline
    # Fixed constructor arguments merged on top of the tuned parameters.
    additional_kwargs: dict = field(default_factory=dict)


def restore_classifier(params: dict, model_data: ModelData):
    """Instantiate and fit a classifier from tuned parameters.

    Args:
        params: Hyper-parameters to pass to the classifier constructor.
            The dict is not modified.
        model_data: Classifier class, the preprocessing pipeline to apply to
            the global `x_train`, and fixed extra constructor arguments
            (these win over `params` on key clashes).

    Returns:
        Whatever `fit` returns for the classifier, fitted on the transformed
        global `x_train` and `y_train`.
    """
    # Merge into a fresh dict so the caller's mapping is left untouched.
    merged_params = {**params, **model_data.additional_kwargs}
    print(merged_params)
    model = model_data.clf_type(**merged_params)
    features = model_data.preprocessing_pipeline.transform(x_train)
    # CatBoost is given integer labels; every other classifier gets y_train as-is.
    labels = y_train.astype('int') if model_data.clf_type == CatBoostClassifier else y_train
    return model.fit(features, labels)

In [21]:
# One entry per tuned classifier, in the same order as the optuna studies.
models_data = [
    ModelData(
        clf_type=LogisticRegression,
        preprocessing_pipeline=scaling_pipeline,
        additional_kwargs={'solver': 'liblinear'},
    ),
    ModelData(clf_type=DecisionTreeClassifier, preprocessing_pipeline=preprocessing_pipeline),
    ModelData(clf_type=RandomForestClassifier, preprocessing_pipeline=preprocessing_pipeline),
    ModelData(clf_type=AdaBoostClassifier, preprocessing_pipeline=preprocessing_pipeline),
    ModelData(
        clf_type=CatBoostClassifier,
        preprocessing_pipeline=preprocessing_pipeline,
        additional_kwargs={'silent': True},
    ),
    # MLP not included as it has list of params
]


In [22]:
model_pipelines = []

# zip stops at the shorter sequence, so any study without a ModelData entry is skipped here.
for study, model_data in zip(studies, models_data):
    fitted_classifier = restore_classifier(study.best_params, model_data)
    model_pipelines.append(Pipeline(steps=[
        ('Preprocessing', model_data.preprocessing_pipeline),
        ('Classifier', fitted_classifier),
    ]))

{'penalty': 'l2', 'C': 9.951831576785969, 'solver': 'liblinear'}
{'criterion': 'gini', 'max_depth': 79}
{'n_estimators': 19, 'max_depth': 16, 'criterion': 'entropy'}
{'n_estimators': 100, 'learning_rate': 0.9001844417424277}
{'iterations': 90, 'depth': 5, 'learning_rate': 0.8605101576987451, 'silent': True}


In [23]:
# The last study's best parameters hold the two hidden-layer widths, which
# have to be combined into a tuple before they can be passed to the MLP.
mlp_best_params = studies[-1].best_params
mlp_classifier = MLPClassifier(
    max_iter=2000,
    hidden_layer_sizes=(mlp_best_params['layer_1'], mlp_best_params['layer_2']),
)
mlp_classifier.fit(x_train_scaled, y_train)

model_pipelines.append(Pipeline(steps=[
    ('Preprocessing', scaling_pipeline),
    ('Classifier', mlp_classifier),
]))

In [24]:
# Show the best hyper-parameters found by each study, one dict per line,
# in the order the studies were run.
for study in studies:
    print(study.best_params)

{'penalty': 'l2', 'C': 9.951831576785969}
{'criterion': 'gini', 'max_depth': 79}
{'n_estimators': 19, 'max_depth': 16, 'criterion': 'entropy'}
{'n_estimators': 100, 'learning_rate': 0.9001844417424277}
{'iterations': 90, 'depth': 5, 'learning_rate': 0.8605101576987451}
{'layer_1': 59, 'layer_2': 42}


In [25]:
def get_model_name(model) -> str:
    """Return the bare class name of `model` (e.g. 'LogisticRegression').

    Uses the type's `__name__` instead of parsing `str(type(model))`, so it
    also works for types whose repr contains no module path.
    """
    return type(model).__name__

In [26]:
from plotly.express import bar
from plotly.graph_objects import Scatter
from plotly.subplots import make_subplots

# Left panel: one precision-recall curve per model on the validation split.
# Right panel: the area under each of those curves.
figure = make_subplots(rows=1, cols=2, subplot_titles=['PR Curves', 'Area under PR curve'])

pr_aucs = dict()

for pipeline in model_pipelines:
    probas = pipeline.predict_proba(x_val)[:, 1]
    clf_name = get_model_name(pipeline[-1])
    precision, recall, _ = precision_recall_curve(y_val.astype('int'), probas)
    pr_aucs[clf_name] = pr_auc(precision, recall)
    # Recall on x and precision on y, matching the orientation pr_auc integrates over.
    figure.add_trace(Scatter(x=recall, y=precision, name=clf_name), row=1, col=1)

figure.add_trace(bar(x=list(pr_aucs.keys()), y=list(pr_aucs.values())).data[0], row=1, col=2)

# Label the axes so each panel can be read on its own.
figure.update_xaxes(title_text='Recall', row=1, col=1)
figure.update_yaxes(title_text='Precision', row=1, col=1)
figure.update_yaxes(title_text='PR AUC', row=1, col=2)
figure

In [27]:
import numpy as np

from pandas import DataFrame
from plotly.express import bar


def plot_feature_importance(importances: np.ndarray, classifier: str) -> None:
    """Show a horizontal bar chart of per-feature importances.

    Args:
        importances: One value per column of the global `x_train`, in column order.
        classifier: Display name used as the prefix of the chart title.
    """
    # Explicitly named columns, so the axis titles describe what is actually plotted.
    importance_frame = DataFrame({
        'Feature name': x_train.columns,
        'Feature importance': importances,
    }).sort_values(by='Feature importance')
    bar(
        importance_frame,
        x='Feature importance',
        y='Feature name',
        orientation='h',
        height=1000,
        title=classifier + ' feature importance',
    ).show()

In [28]:
from sklearn.linear_model import LogisticRegression

# L1-penalised logistic regression with a small C, fitted on the scaled training features.
# NOTE(review): the `logreg.coef_` output recorded in this notebook is all zeros,
# presumably from this model — confirm whether C=0.005 is too small to be informative.
logreg = LogisticRegression(penalty='l1', solver='liblinear', C=0.005).fit(x_train_scaled, y_train)
coefficient_magnitudes = np.abs(logreg.coef_).ravel()
plot_feature_importance(coefficient_magnitudes, 'Logistic regression')

In [9]:
# Inspect the raw coefficients of the current `logreg`.
# NOTE(review): this cell's execution count is out of sequence with its
# neighbours, so the recorded output may be stale — re-run to confirm.
logreg.coef_

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.]])

In [29]:
from sklearn.linear_model import LogisticRegression

# Classifier (final step) of the first restored pipeline; bar heights are |coefficient|.
logreg = model_pipelines[0].steps[-1][1]
plot_feature_importance(np.abs(logreg.coef_).ravel(), 'Logistic regression')

In [30]:
# Classifier (final step) of the second restored pipeline.
decision_tree = model_pipelines[1].steps[-1][1]
tree_importances = decision_tree.feature_importances_
plot_feature_importance(tree_importances, 'Decision Tree')

In [31]:
# Classifier (final step) of the third restored pipeline.
random_forest = model_pipelines[2].steps[-1][1]
forest_importances = random_forest.feature_importances_
plot_feature_importance(forest_importances, 'Random forest')

In [33]:
from pathlib import Path
from pickle import dump

# Target directory for the exported pipelines. It is created on demand so the
# export does not fail when the folder is missing.
models_dir = Path('../models')
models_dir.mkdir(parents=True, exist_ok=True)

# One pickle per pipeline, named after the class of its final (classifier) step.
for ppl in model_pipelines:
    ppl_name = get_model_name(ppl[-1])
    with open(models_dir / f'{ppl_name}.pkl', 'wb') as f:
        dump(ppl, f)