In [1]:
from pandas import read_csv

data = read_csv('../data/processed/data.csv')

In [8]:
TARGET = 'Attrition_Yes'
TEST_SPLIT = 0.2
VAL_SPLIT = 0.1
TRAIN_SPLIT = 1 - TEST_SPLIT - VAL_SPLIT

In [11]:
from sklearn.model_selection import train_test_split

x_train, x_test_, y_train, y_test_ = train_test_split(data.drop(columns=[TARGET]), data[TARGET], train_size=TRAIN_SPLIT)
x_test, x_val, y_test, y_val = train_test_split(x_test_, y_test_, train_size=TEST_SPLIT / (TEST_SPLIT + VAL_SPLIT))

In [139]:
from catboost import CatBoostClassifier
from optuna import Trial
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import precision_recall_curve, auc
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

preprocessing_pipeline = Pipeline(steps=[
    (
        'Fill missing values',
        SimpleImputer(strategy='median')  # can use median as there are no missing categorical features
    ),
]).fit(x_train, y_train)

scaling_pipeline = Pipeline(steps=[
    (
        'Fill missing values',
        SimpleImputer(strategy='median')  # can use median as there are no missing categorical features
    ),
    (
        'Scaling',
        StandardScaler()
    )
]).fit(x_train, y_train)

x_train_preprocessed = preprocessing_pipeline.transform(x_train)
x_train_scaled = scaling_pipeline.transform(x_train)
x_test_preprocessed = preprocessing_pipeline.transform(x_test)
x_test_scaled = scaling_pipeline.transform(x_test)


def pr_auc(precision, recall) -> float:
    return auc(recall, precision)


def metric(probas) -> float:
    return pr_auc(*precision_recall_curve(y_test, probas)[:2])


def objective_logreg(trial: Trial) -> float:
    model = LogisticRegression(
        penalty=trial.suggest_categorical('penalty', ['l1', 'l2']),
        C=trial.suggest_float('C', low=0.00001, high=10., log=True),
        solver='liblinear'
    )
    model = model.fit(x_train_scaled, y_train)
    probas = model.predict_proba(x_test_scaled)[:, 1]
    return metric(probas)


def objective_decision_tree(trial: Trial) -> float:
    model = DecisionTreeClassifier(
        criterion=trial.suggest_categorical('criterion', ['gini', 'entropy', 'log_loss']),
        max_depth=trial.suggest_int('max_depth', low=10, high=100)
    )
    model = model.fit(x_train_preprocessed, y_train)
    probas = model.predict_proba(x_test_preprocessed)[:, 1]
    return metric(probas)


def objective_random_forest(trial: Trial) -> float:
    model = RandomForestClassifier(
        n_estimators=trial.suggest_int('n_estimators', low=5, high=20),
        max_depth=trial.suggest_int('max_depth', low=3, high=20),
        criterion=trial.suggest_categorical('criterion', ['gini', 'entropy', 'log_loss'])
    )
    model = model.fit(x_train_preprocessed, y_train)
    probas = model.predict_proba(x_test_preprocessed)[:, 1]
    return metric(probas)


def objective_adaboost(trial: Trial) -> float:
    model = AdaBoostClassifier(
        n_estimators=trial.suggest_int('n_estimators', low=10, high=100),
        learning_rate=trial.suggest_float('learning_rate', low=0.001, high=1, log=True)
    )
    model = model.fit(x_train_preprocessed, y_train)
    probas = model.predict_proba(x_test_preprocessed)[:, 1]
    return metric(probas)


def objective_catboost(trial: Trial) -> float:
    model = CatBoostClassifier(
        iterations=trial.suggest_int('iterations', low=10, high=100),
        depth=trial.suggest_int('depth', low=1, high=5),
        learning_rate=trial.suggest_float('learning_rate', low=0.001, high=1, log=True),
        silent=True
    )
    model = model.fit(x_train_preprocessed, y_train.astype('int'))
    probas = model.predict_proba(x_test_preprocessed)[:, 1]
    return metric(probas)


def objective_mlp(trial: Trial) -> float:
    model = MLPClassifier(
        max_iter=2000,
        hidden_layer_sizes=(trial.suggest_int('layer_1', 10, 100), trial.suggest_int('layer_2', 10, 100))
    )
    model = model.fit(x_train_scaled, y_train)
    probas = model.predict_proba(x_test_scaled)[:, 1]
    return metric(probas)


In [140]:
from datetime import datetime

from optuna import create_study, logging

logging.set_verbosity(logging.WARNING)

studies = []

for objective in [objective_logreg, objective_decision_tree, objective_random_forest, objective_adaboost,
                  objective_catboost, objective_mlp]:
    name = '_'.join(objective.__name__.split('_')[1:])
    print(f'[{datetime.now():%H:%M:%S}] Starting {name}')
    study = create_study(direction='maximize')
    study.optimize(objective, n_trials=1000)
    studies.append(study)
    print(f'[{datetime.now():%H:%M:%S}] Finished {name}')

[16:00:09] Starting logreg
[16:00:38] Finished logreg
[16:00:38] Starting decision_tree
[16:01:58] Finished decision_tree
[16:01:58] Starting random_forest
[16:05:08] Finished random_forest
[16:05:08] Starting adaboost
[16:16:47] Finished adaboost
[16:16:47] Starting catboost
[16:21:42] Finished catboost
[16:21:42] Starting mlp
[16:46:59] Finished mlp


In [148]:
from dataclasses import dataclass, field


@dataclass
class ModelData:
    clf_type: type
    preprocessing_pipeline: Pipeline
    additional_kwargs: dict = field(default_factory=lambda: dict())


def restore_classifier(params: dict, model_data: ModelData):
    params.update(model_data.additional_kwargs)
    print(params)
    model = model_data.clf_type(**params)
    if model_data.clf_type == CatBoostClassifier:
        return model.fit(model_data.preprocessing_pipeline.transform(x_train), y_train.astype('int'))
    return model.fit(model_data.preprocessing_pipeline.transform(x_train), y_train)

In [149]:
models_data = [
    ModelData(LogisticRegression, scaling_pipeline, {'solver': 'liblinear'}),
    ModelData(DecisionTreeClassifier, preprocessing_pipeline),
    ModelData(RandomForestClassifier, preprocessing_pipeline),
    ModelData(AdaBoostClassifier, preprocessing_pipeline),
    ModelData(CatBoostClassifier, preprocessing_pipeline, {'silent': True}),
    # MLP not included as it has list of params
]


In [150]:
model_pipelines = []

for study, model_data in zip(studies, models_data):
    model_pipelines.append(Pipeline(steps=[
        (
            'Preprocessing',
            model_data.preprocessing_pipeline
        ),
        (
            'Classifier',
            restore_classifier(study.best_params, model_data)
        )
    ]))

{'penalty': 'l1', 'C': 0.000788553066333969, 'solver': 'liblinear'}
{'criterion': 'entropy', 'max_depth': 50}
{'n_estimators': 20, 'max_depth': 20, 'criterion': 'log_loss'}
{'n_estimators': 88, 'learning_rate': 0.4273070296679926}
{'iterations': 97, 'depth': 5, 'learning_rate': 0.46851204579618916, 'silent': True}


In [151]:
model_pipelines.append(
    Pipeline(steps=[
        (
            'Preprocessing',
            scaling_pipeline
        ),
        (
            'Classifier',
            MLPClassifier(
                max_iter=2000,
                hidden_layer_sizes=(
                    studies[-1].best_params['layer_1'], studies[-1].best_params['layer_2']
                )
            ).fit(x_train_scaled, y_train)
        )
    ])
)

In [152]:
def get_model_name(model) -> str:
    return str(type(model)).split('.')[-1][:-2]

In [153]:
from plotly.express import bar
from plotly.graph_objects import Scatter
from plotly.subplots import make_subplots

figure = make_subplots(rows=1, cols=2, subplot_titles=['PR Curves', 'Area under PR curve'])

pr_aucs = dict()

for pipeline in model_pipelines:
    probas = pipeline.predict_proba(x_val)[:, 1]
    clf_name = get_model_name(pipeline[-1])
    precision, recall, _ = precision_recall_curve(y_val.astype('int'), probas)
    pr_aucs[clf_name] = pr_auc(precision, recall)
    figure.add_trace(Scatter(x=precision, y=recall, name=clf_name), row=1, col=1)

figure.add_trace(bar(x=list(pr_aucs.keys()), y=list(pr_aucs.values())).data[0], row=1, col=2)

In [166]:
import numpy as np

from pandas import DataFrame
from plotly.express import bar

logreg = model_pipelines[0][-1]

bar(DataFrame({
    'y': np.abs(logreg.coef_.reshape(-1))
}, index=x_train.columns).sort_values(by='y'), labels={
    'x': 'Feature importance',
    'y': 'Feature name'
}, orientation='h')

In [155]:
model_pipelines[0]

In [165]:
LogisticRegression(solver='liblinear').fit(x_train_scaled, y_train).coef_

array([[-0.01509738, -0.01509738, -0.27649034, -0.02760245,  0.00934667,
        -0.04466598,  0.30858811,  0.05013648, -0.10324724, -0.43405972,
        -0.17254635,  0.01282212,  0.50038373, -0.51305641,  0.05205367,
        -0.05655733, -0.07953274,  0.58206127,  0.34319307, -0.31020892,
        -0.44223681, -0.00712703,  0.05754153, -0.06399304, -0.03331566,
        -0.29119948, -0.11555304, -0.31782719, -0.15690918, -0.26621801,
         0.03127729, -0.0315029 ,  0.0323988 , -0.12062128, -0.15697214,
         0.13212425,  0.07354765,  0.08211557, -0.0449946 ,  0.0417018 ,
         0.5214989 , -0.16782198, -0.01045784,  0.01054508,  0.02688918,
         0.08638411,  0.02020007,  0.09694415,  0.10731476,  0.13742709,
         0.12781627,  0.10227174,  0.10976775,  0.18952709]])

In [133]:
preprocessing_pipeline.transform(x_train)

array([[4.254e+03, 4.254e+03, 2.000e+01, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       [2.042e+03, 2.042e+03, 4.100e+01, ..., 0.000e+00, 0.000e+00,
        1.000e+00],
       [1.222e+03, 1.222e+03, 3.500e+01, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       ...,
       [4.377e+03, 4.377e+03, 5.900e+01, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       [2.081e+03, 2.081e+03, 4.600e+01, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       [3.578e+03, 3.578e+03, 3.100e+01, ..., 0.000e+00, 1.000e+00,
        0.000e+00]])

In [132]:
models_data[0].preprocessing_pipeline.transform(x_train)

array([[ 1.62004231,  1.62004231, -1.86567507, ..., -0.34066415,
         0.81550509, -0.55102791],
       [-0.11645444, -0.11645444,  0.43687634, ..., -0.34066415,
        -1.22623392,  1.81479011],
       [-0.7601829 , -0.7601829 , -0.22099549, ..., -0.34066415,
         0.81550509, -0.55102791],
       ...,
       [ 1.71660158,  1.71660158,  2.41049183, ..., -0.34066415,
         0.81550509, -0.55102791],
       [-0.08583809, -0.08583809,  0.98510286, ..., -0.34066415,
         0.81550509, -0.55102791],
       [ 1.08935886,  1.08935886, -0.65957671, ..., -0.34066415,
         0.81550509, -0.55102791]])

In [128]:
LogisticRegression(**studies[0].best_params, solver='liblinear').fit(models_data[0].preprocessing_pipeline.transform(x_train_scaled), y_train).coef_

array([[ 3.27302962e-04, -3.52176681e-04, -2.86992748e-02,
         0.00000000e+00,  0.00000000e+00, -2.07487069e-06,
         1.39257614e-02,  0.00000000e+00,  0.00000000e+00,
        -2.96114111e-02,  0.00000000e+00, -3.85483034e-03,
         0.00000000e+00, -4.57874467e-02,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+0

In [113]:
logreg.coef_

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.06287193, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        ]])

In [86]:
studies[0].best_params

{'penalty': 'l1', 'C': 0.005242636176936646}

In [171]:
from pickle import dump

for ppl in model_pipelines:
    ppl_name = get_model_name(ppl[-1])
    with open(f'../models/{ppl_name}.pkl', 'wb') as f:
        dump(ppl, f)