In [11]:
from pandas import read_csv

# Load the interim table keyed by EmployeeID, then recode the 'Yes'/'No'
# target labels as 1/0 (any other label becomes NaN).
data = read_csv('../data/interim/data.csv', index_col='EmployeeID')
data = data.assign(Attrition=data['Attrition'].map({'Yes': 1, 'No': 0}))

In [12]:
# Name of the label column that is separated from the feature columns.
TARGET = 'Attrition'
# Fractions of the full dataset reserved for the test and validation sets.
TEST_SPLIT = 0.1
VAL_SPLIT = 0.2
# Everything that is not held out is used for training.
TRAIN_SPLIT = 1 - TEST_SPLIT - VAL_SPLIT

In [13]:
from sklearn.model_selection import train_test_split

# Fixed seed so the train/validation/test partition (and therefore every score
# reported further down) is identical after "Restart Kernel -> Run All".
RANDOM_STATE = 42

# Step 1: carve off the training share; the remainder is a temporary hold-out.
x_train, x_test_, y_train, y_test_ = train_test_split(
    data.drop(columns=[TARGET]),
    data[TARGET],
    train_size=TRAIN_SPLIT,
    random_state=RANDOM_STATE,
)
# Step 2: divide the hold-out between test and validation in proportion to
# TEST_SPLIT : VAL_SPLIT.
x_test, x_val, y_test, y_val = train_test_split(
    x_test_,
    y_test_,
    train_size=TEST_SPLIT / (TEST_SPLIT + VAL_SPLIT),
    random_state=RANDOM_STATE,
)

In [14]:
from collections.abc import Callable, Generator
from functools import wraps
from typing import TypeVar, Any

from numpy.typing import NDArray
from optuna import Trial
from pandas import DataFrame

XType = NDArray | DataFrame
YType = NDArray | DataFrame

TrialFunction = Callable[[Trial, NDArray | DataFrame, NDArray | DataFrame], float]
Function = TypeVar('Function', bound=Callable[..., Any])


class TrialDataSupplier:
    """Binds named data bundles to trial (objective) functions.

    A bundle is a tuple of positional arguments. Wrapping an objective with a
    bundle produces a callable that forwards its own arguments first (the
    trial) and then every item of the bundle, so the result can be handed to a
    study as a one-argument objective.
    """

    def __init__(self, bundles: dict[str, tuple[Any, ...]]) -> None:
        """Store the mapping from bundle name to its tuple of arguments."""
        self._bundles = bundles

    @staticmethod
    def _make_wrapper(trial_function: TrialFunction, bundle: tuple[Any, ...]) -> Callable[[Trial], float]:
        """Return ``trial_function`` with ``bundle`` appended to its positional arguments."""
        @wraps(trial_function)
        def wrapper(*args, **kwargs) -> float:
            # Caller-supplied positionals (the trial) come first, then the bundle.
            return trial_function(*args, *bundle, **kwargs)
        return wrapper

    def supply_data(self, *items: tuple[TrialFunction, str]) -> Generator[Callable[[Trial], float], None, None]:
        """Yield one wrapped objective per ``(function, bundle_name)`` pair.

        Every bundle name is checked before the first wrapper is yielded, so a
        typo is reported before the consumer has done any work with earlier
        items. Because this is a generator, the check runs when iteration
        starts, not when ``supply_data`` is called.

        Raises:
            KeyError: if any requested bundle name is not registered.
        """
        unknown = [data_name for _, data_name in items if data_name not in self._bundles]
        if unknown:
            raise KeyError(f'Unknown data bundle(s) {unknown}; available: {list(self._bundles)}')
        for func, data_name in items:
            yield self._make_wrapper(func, self._bundles[data_name])

In [15]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder


# Positional indices of the training columns, partitioned by whether the
# column's dtype is `object` (treated as categorical) or anything else
# (treated as numerical).
numerical_columns = [
    position for position, dtype in enumerate(x_train.dtypes) if dtype != object
]
categorical_columns = [
    position for position, dtype in enumerate(x_train.dtypes) if dtype == object
]


# Preprocessing for models that need no feature scaling: impute missing values
# with the most frequent value per column, then one-hot encode the categorical
# columns.
preprocessing_pipeline = Pipeline([
    (
        'Fill missing values',
        SimpleImputer(strategy='most_frequent')
    ),
    (
        'OneHot',
        ColumnTransformer(
            [
                ('OneHot', OneHotEncoder(handle_unknown='infrequent_if_exist'), categorical_columns)
            ],
            # ColumnTransformer drops every column not listed above by default,
            # which silently removed all numerical features from this pipeline.
            # Pass them through unchanged instead.
            remainder='passthrough',
        )
    )
]).fit(x_train)

# Preprocessing for scale-sensitive models: most-frequent imputation followed
# by standardising the numerical columns and one-hot encoding the categorical
# ones. Fitted on the training features only.
scaling_pipeline = Pipeline(steps=[
    ('Fill missing values', SimpleImputer(strategy='most_frequent')),
    (
        'OneHot&Scaling',
        ColumnTransformer(transformers=[
            ('Scaling', StandardScaler(), numerical_columns),
            ('OneHot', OneHotEncoder(handle_unknown='infrequent_if_exist'), categorical_columns),
        ]),
    ),
])
scaling_pipeline.fit(x_train)

In [33]:
# Each bundle is (x_train, y_train, x_val, y_val[, extras...]) as produced by
# one preprocessing variant:
#   'base'   - imputation + one-hot encoding
#   'scaled' - imputation + scaling + one-hot encoding
#   'no-oh'  - imputation only, plus the categorical column indices as a fifth item
supplier = TrialDataSupplier({
    bundle_name: (pipeline.transform(x_train), y_train, pipeline.transform(x_val), y_val, *extras)
    for bundle_name, pipeline, extras in (
        ('base', preprocessing_pipeline, ()),
        ('scaled', scaling_pipeline, ()),
        ('no-oh', scaling_pipeline[:-1], (categorical_columns,)),
    )
})

In [36]:
from catboost import CatBoostClassifier
from optuna import Trial
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import precision_recall_curve, auc
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier


def pr_auc(precision, recall) -> float:
    """Return the area under a precision-recall curve.

    Recall is used as the x-coordinate and precision as the y-coordinate of
    the curve passed to ``auc``.
    """
    area = auc(x=recall, y=precision)
    return area


def metric(probas, y_val_) -> float:
    """Score predicted probabilities against true labels by PR-curve area.

    Builds the precision-recall curve for ``y_val_`` vs ``probas`` and returns
    its area via ``pr_auc``; the thresholds returned by the curve are unused.
    """
    precision, recall, _thresholds = precision_recall_curve(y_val_, probas)
    return pr_auc(precision, recall)


def objective_logreg(trial: Trial, x_train_, y_train_, x_val_, y_val_) -> float:
    """Evaluate one logistic-regression configuration sampled from ``trial``.

    Samples the penalty ('l1' or 'l2') and ``C`` (log-scaled between 1e-6 and
    10), fits with the liblinear solver on the training data and returns
    ``metric`` of the second ``predict_proba`` column on the validation data.
    """
    sampled = {
        'penalty': trial.suggest_categorical('penalty', ['l1', 'l2']),
        'C': trial.suggest_float('C', low=0.000001, high=10., log=True),
    }
    classifier = LogisticRegression(solver='liblinear', **sampled).fit(x_train_, y_train_)
    val_probas = classifier.predict_proba(x_val_)[:, 1]
    return metric(val_probas, y_val_)


def objective_decision_tree(trial: Trial, x_train_, y_train_, x_val_, y_val_) -> float:
    """Evaluate one decision-tree configuration sampled from ``trial``.

    Samples the split criterion and ``max_depth`` (10 to 100), fits on the
    training data and returns ``metric`` of the second ``predict_proba``
    column on the validation data.
    """
    sampled = {
        'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy', 'log_loss']),
        'max_depth': trial.suggest_int('max_depth', low=10, high=100),
    }
    tree = DecisionTreeClassifier(**sampled).fit(x_train_, y_train_)
    val_probas = tree.predict_proba(x_val_)[:, 1]
    return metric(val_probas, y_val_)


def objective_random_forest(trial: Trial, x_train_, y_train_, x_val_, y_val_) -> float:
    """Evaluate one random-forest configuration sampled from ``trial``.

    Samples ``n_estimators`` (5 to 20), ``max_depth`` (3 to 20) and the split
    criterion, fits on the training data and returns ``metric`` of the second
    ``predict_proba`` column on the validation data.
    """
    sampled = {
        'n_estimators': trial.suggest_int('n_estimators', low=5, high=20),
        'max_depth': trial.suggest_int('max_depth', low=3, high=20),
        'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy', 'log_loss']),
    }
    forest = RandomForestClassifier(**sampled).fit(x_train_, y_train_)
    val_probas = forest.predict_proba(x_val_)[:, 1]
    return metric(val_probas, y_val_)


def objective_adaboost(trial: Trial, x_train_, y_train_, x_val_, y_val_) -> float:
    """Evaluate one AdaBoost configuration sampled from ``trial``.

    Samples ``n_estimators`` (10 to 100) and ``learning_rate`` (log-scaled
    between 0.001 and 1), fits on the training data and returns ``metric`` of
    the second ``predict_proba`` column on the validation data.
    """
    sampled = {
        'n_estimators': trial.suggest_int('n_estimators', low=10, high=100),
        'learning_rate': trial.suggest_float('learning_rate', low=0.001, high=1, log=True),
    }
    booster = AdaBoostClassifier(**sampled).fit(x_train_, y_train_)
    val_probas = booster.predict_proba(x_val_)[:, 1]
    return metric(val_probas, y_val_)


def objective_catboost(trial: Trial, x_train_, y_train_, x_val_, y_val_, cat_columns) -> float:
    """Evaluate one CatBoost configuration sampled from ``trial``.

    Samples ``iterations`` (10 to 100), ``depth`` (1 to 5) and
    ``learning_rate`` (log-scaled between 0.001 and 1). ``cat_columns`` is
    forwarded as ``cat_features``. The training target is cast to int before
    fitting; the return value is ``metric`` of the second ``predict_proba``
    column on the validation data.
    """
    sampled = {
        'iterations': trial.suggest_int('iterations', low=10, high=100),
        'depth': trial.suggest_int('depth', low=1, high=5),
        'learning_rate': trial.suggest_float('learning_rate', low=0.001, high=1, log=True),
    }
    booster = CatBoostClassifier(silent=True, cat_features=cat_columns, **sampled)
    booster = booster.fit(x_train_, y_train_.astype('int'))
    val_probas = booster.predict_proba(x_val_)[:, 1]
    return metric(val_probas, y_val_)


def objective_mlp(trial: Trial, x_train_, y_train_, x_val_, y_val_) -> float:
    """Evaluate one two-hidden-layer MLP configuration sampled from ``trial``.

    Samples the width of each hidden layer (10 to 100 units), fits with up to
    2000 iterations on the training data and returns ``metric`` of the second
    ``predict_proba`` column on the validation data.
    """
    first_width = trial.suggest_int('layer_1', 10, 100)
    second_width = trial.suggest_int('layer_2', 10, 100)
    network = MLPClassifier(max_iter=2000, hidden_layer_sizes=(first_width, second_width))
    network = network.fit(x_train_, y_train_)
    val_probas = network.predict_proba(x_val_)[:, 1]
    return metric(val_probas, y_val_)


In [37]:
from datetime import datetime

from optuna import create_study, logging

# Silence Optuna log output below WARNING level.
logging.set_verbosity(logging.WARNING)

# One finished study per objective, in the order the objectives are listed below.
studies = []

for objective in supplier.supply_data((objective_logreg, 'scaled'), (objective_decision_tree, 'base'),
                                      (objective_random_forest, 'base'), (objective_adaboost, 'base'),
                                      (objective_catboost, 'no-oh'), (objective_mlp, 'scaled')):
    # The wrapper keeps the wrapped function's __name__; drop the leading
    # 'objective' token to get a short label such as 'random_forest'.
    name = '_'.join(objective.__name__.split('_')[1:])
    print(f'[{datetime.now():%H:%M:%S}] Starting {name}')
    study = create_study(direction='maximize')
    # Every objective uses the same budget. The former `objective == objective_mlp`
    # branch never matched (``objective`` is a wrapper, not the original
    # function) and both of its arms were identical, so it has been removed.
    study.optimize(objective, n_trials=100, n_jobs=-1)
    studies.append(study)
    print(f'[{datetime.now():%H:%M:%S}] Finished {name}')


[22:49:50] Starting logreg
[22:49:52] Finished logreg
[22:49:52] Starting decision_tree
[22:49:53] Finished decision_tree
[22:49:53] Starting random_forest
[22:50:01] Finished random_forest
[22:50:01] Starting adaboost
[22:50:23] Finished adaboost
[22:50:23] Starting catboost
[22:51:10] Finished catboost
[22:51:10] Starting mlp
[22:54:30] Finished mlp


In [38]:
from dataclasses import dataclass, field


@dataclass
class ModelData:
    """Everything needed, besides tuned hyper-parameters, to rebuild a model."""

    # Classifier class that is instantiated with the hyper-parameters.
    clf_type: type
    # Fitted preprocessing applied to the features before they reach the classifier.
    preprocessing_pipeline: Pipeline
    # Fixed constructor keyword arguments added on top of the tuned ones;
    # each instance gets its own empty dict by default.
    additional_kwargs: dict = field(default_factory=dict)


def restore_classifier(params: dict, model_data: ModelData):
    """Instantiate and fit a classifier from tuned hyper-parameters.

    Args:
        params: tuned hyper-parameters (e.g. a study's best params). The dict
            is not modified.
        model_data: classifier class, preprocessing pipeline and fixed extra
            constructor keyword arguments; the extras override ``params`` on
            key collisions.

    Returns:
        The classifier fitted on the module-level ``x_train`` / ``y_train``
        after ``model_data.preprocessing_pipeline.transform``.
    """
    # Merge into a fresh dict so the caller's mapping is left untouched.
    full_params = {**params, **model_data.additional_kwargs}
    print(full_params)
    model = model_data.clf_type(**full_params)
    features = model_data.preprocessing_pipeline.transform(x_train)
    # CatBoost is fitted on an explicitly integer-typed target; other
    # classifiers get the target as-is.
    target = y_train.astype('int') if model_data.clf_type == CatBoostClassifier else y_train
    return model.fit(features, target)

In [45]:
# Entries are listed in the same order as the studies were run, because they
# are later paired with the studies positionally.
models_data = [
    ModelData(
        clf_type=LogisticRegression,
        preprocessing_pipeline=scaling_pipeline,
        additional_kwargs={'solver': 'liblinear'},
    ),
    ModelData(clf_type=DecisionTreeClassifier, preprocessing_pipeline=preprocessing_pipeline),
    ModelData(clf_type=RandomForestClassifier, preprocessing_pipeline=preprocessing_pipeline),
    ModelData(clf_type=AdaBoostClassifier, preprocessing_pipeline=preprocessing_pipeline),
    ModelData(
        clf_type=CatBoostClassifier,
        preprocessing_pipeline=preprocessing_pipeline[:-1],
        additional_kwargs={'silent': True, 'cat_features': categorical_columns},
    ),
    # MLP is left out: its layer sizes are tuned as separate parameters that
    # must be combined into one tuple, so it is rebuilt by hand elsewhere.
]


In [56]:
# Sanity check: show which preprocessing steps the last ModelData entry carries.
print(models_data[-1].preprocessing_pipeline)

Pipeline(steps=[('Fill missing values',
                 SimpleImputer(strategy='most_frequent'))])


In [46]:
# Pair each study with its ModelData positionally (zip stops at the shorter
# list) and wrap the refitted classifier together with its preprocessing.
model_pipelines = []

for study, model_data in zip(studies, models_data):
    fitted_classifier = restore_classifier(study.best_params, model_data)
    model_pipelines.append(
        Pipeline(steps=[
            ('Preprocessing', model_data.preprocessing_pipeline),
            ('Classifier', fitted_classifier),
        ])
    )

{'penalty': 'l1', 'C': 4.409597566646395e-06, 'solver': 'liblinear'}
{'criterion': 'gini', 'max_depth': 85}
{'n_estimators': 18, 'max_depth': 18, 'criterion': 'gini'}
{'n_estimators': 32, 'learning_rate': 0.003578747982956951}
{'iterations': 95, 'depth': 5, 'learning_rate': 0.4564577027179679, 'silent': True, 'cat_features': [1, 2, 4, 5, 6, 8, 9, 19, 20, 21, 22, 23]}


In [47]:
# The MLP is rebuilt by hand: its two tuned layer widths (from the last study)
# have to be combined into a single ``hidden_layer_sizes`` tuple.
model_pipelines.append(
    Pipeline(steps=[
        ('Preprocessing', scaling_pipeline),
        (
            'Classifier',
            MLPClassifier(
                hidden_layer_sizes=tuple(
                    studies[-1].best_params[layer_key] for layer_key in ('layer_1', 'layer_2')
                ),
                max_iter=2000,
            ).fit(scaling_pipeline.transform(x_train), y_train)
        ),
    ])
)

In [53]:
# Display the preprocessing pipeline with its final step sliced off.
preprocessing_pipeline[:-1]

In [48]:
# One line per study, in run order: the best hyper-parameters it found.
for tuned_study in studies:
    print(tuned_study.best_params)

{'penalty': 'l1', 'C': 4.409597566646395e-06}
{'criterion': 'gini', 'max_depth': 85}
{'n_estimators': 18, 'max_depth': 18, 'criterion': 'gini'}
{'n_estimators': 32, 'learning_rate': 0.003578747982956951}
{'iterations': 95, 'depth': 5, 'learning_rate': 0.4564577027179679}
{'layer_1': 72, 'layer_2': 82}


In [49]:
def get_model_name(model) -> str:
    """Return the bare class name of ``model`` (e.g. ``'LogisticRegression'``).

    Uses ``type(model).__name__`` rather than slicing ``str(type(model))``,
    which produced garbage such as ``"<class 'int"`` for types whose repr
    contains no module path.
    """
    return type(model).__name__

In [50]:
from plotly.express import bar
from plotly.graph_objects import Scatter
from plotly.subplots import make_subplots

# Left panel: one PR curve per fitted pipeline on the validation set.
# Right panel: bar chart of the corresponding areas under those curves.
figure = make_subplots(rows=1, cols=2, subplot_titles=['PR Curves', 'Area under PR curve'])

# Classifier class name -> PR-curve area on the validation set.
pr_aucs = {}

for pipeline in model_pipelines:
    clf_name = get_model_name(pipeline[-1])
    precision, recall = precision_recall_curve(
        y_val.astype('int'), pipeline.predict_proba(x_val)[:, 1]
    )[:2]
    pr_aucs[clf_name] = pr_auc(precision, recall)
    figure.add_trace(Scatter(x=recall, y=precision, name=clf_name), row=1, col=1)

# plotly.express builds a whole figure; only its single bar trace is reused here.
figure.add_trace(bar(x=list(pr_aucs), y=list(pr_aucs.values())).data[0], row=1, col=2)

In [51]:
from pickle import dump

from pathlib import Path

# Create the output directory first so a fresh checkout does not fail with
# FileNotFoundError on the first dump.
MODELS_DIR = Path('../models')
MODELS_DIR.mkdir(parents=True, exist_ok=True)

# One pickle per pipeline, named after its classifier class. Pipelines whose
# classifiers share a class name would overwrite each other.
for ppl in model_pipelines:
    ppl_name = get_model_name(ppl[-1])
    with open(MODELS_DIR / f'{ppl_name}.pkl', 'wb') as f:
        dump(ppl, f)