In [5]:
from pandas import read_csv

# Read the CSV at ../data/processed/data.csv into a DataFrame. The path is
# relative, so it is resolved against the kernel's current working directory.
data = read_csv('../data/processed/data.csv')

In [6]:
# Fraction of rows held out for testing; the training fraction is its complement.
TEST_SPLIT = 0.2
TRAIN_SPLIT = 1 - TEST_SPLIT

# Label column: removed from the feature matrix and used as `y` when the data is split.
TARGET = 'Attrition_Yes'

In [7]:
from sklearn.model_selection import train_test_split

# Fixed seed so the shuffled split is identical on every Restart & Run All.
SPLIT_SEED = 42

# Features are every column except the label; the label is the TARGET column.
# NOTE(review): the test metrics recorded in this notebook are all exactly 1.0,
# which usually signals target leakage. Verify that `data` holds no column
# derived from the label (e.g. a complementary one-hot column) — TODO confirm.
x, y = data.drop(columns=[TARGET]), data[TARGET]

# Shuffle before splitting and stratify on the label, so the test set does not
# depend on the row order of the CSV and both splits keep the same class ratio.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=TEST_SPLIT, stratify=y, random_state=SPLIT_SEED
)

# Kept for backward compatibility with any cell that still reads it.
train_data_len = len(x_train)

# Every row must land in exactly one of the two splits.
assert len(x_train) + len(x_test) == len(x)
assert len(y_train) + len(y_test) == len(y)

In [40]:
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# NOTE(review): MAX_DEPTH is not referenced by any cell visible in this
# notebook; kept in case another cell relies on it — candidate for removal.
MAX_DEPTH = 30

# Seed passed to the MLP so repeated runs of the grid search select the same
# model instead of varying with the random weight initialisation.
MODEL_SEED = 42

# Two-step preprocessing: median-impute missing values, then standardise.
# NOTE(review): this pipeline is fitted on the whole training split before the
# grid search runs (see the fit cell), so the CV folds share imputation and
# scaling statistics.
preprocessing_pipeline = Pipeline(steps=[
    (
        'Fill missing values',
        SimpleImputer(strategy='median')  # can use median as there are no missing categorical features
    ),
    (
        'Scale the data',
        StandardScaler()
    )
])

# Candidate architectures: 1 to 3 hidden layers of equal width, with widths
# 20, 40, 60 and 80 — twelve configurations, ordered by depth and then width.
hidden_sizes = [
    (width,) * depth
    for depth in range(1, 4)
    for width in range(20, 100, 20)
]

search = GridSearchCV(
    MLPClassifier(max_iter=2000, random_state=MODEL_SEED),
    param_grid={'hidden_layer_sizes': hidden_sizes},
)

In [41]:
# Fit the imputer and scaler on the training features, then run the grid
# search on the transformed matrix.
train_features = preprocessing_pipeline.fit_transform(x_train)
search = search.fit(train_features, y_train)

In [42]:
from plotly.express import area
from plotly.graph_objects import Figure
from sklearn.metrics import accuracy_score, recall_score, roc_auc_score, roc_curve


def evaluate_classifier(clf) -> tuple[float, float, float, Figure]:
    """Score a fitted classifier on the held-out test split and draw its ROC curve.

    Reads the notebook-level ``x_test``, ``y_test`` and ``preprocessing_pipeline``.
    The test features are only transformed here, never re-fitted, so the
    pipeline must already have been fitted.

    Args:
        clf: Fitted estimator exposing ``predict`` and ``predict_proba``.

    Returns:
        ``(accuracy, recall, roc_auc, figure)``, where ``figure`` is a plotly
        area chart of the ROC curve with a dashed diagonal reference line.
    """
    x_test_ = preprocessing_pipeline.transform(x_test)
    labels = clf.predict(x_test_)
    # Column 1 of predict_proba is taken as the positive-class score
    # (assumes a binary target whose positive class is listed second — TODO confirm).
    probas = clf.predict_proba(x_test_)[:, 1]

    # The thresholds returned by roc_curve are not needed for the plot.
    fpr, tpr, _ = roc_curve(y_test, probas)

    # __name__ gives the bare class name directly, without parsing the
    # "<class 'pkg.mod.Name'>" repr, which breaks when the repr has no dots.
    clf_name = type(clf).__name__
    figure = area(x=fpr, y=tpr, title=f'ROC curve of {clf_name}',
                  labels={'x': 'False Positive Rate', 'y': 'True Positive Rate'},
                  width=500, height=500)
    # Dashed diagonal from (0, 0) to (1, 1) as a visual baseline.
    figure.add_shape(type='line', line={'dash': 'dash'}, x0=0, x1=1, y0=0, y1=1)
    return accuracy_score(y_test, labels), recall_score(y_test, labels), roc_auc_score(y_test, probas), figure


In [43]:
# evaluate_classifier returns (accuracy, recall, roc_auc, figure): keep the
# three metrics together as a list and the ROC figure on its own.
evaluation = evaluate_classifier(search.best_estimator_)
scores, rocauc = list(evaluation[:-1]), evaluation[-1]

In [44]:
# Print one "<metric>: <value>" line per score, in the order the scores were returned.
metric_names = ('Accuracy', 'Recall', 'ROC AUC Score')
for position, label in enumerate(metric_names[:len(scores)]):
    print(f'{label}: {scores[position]}')

Accuracy: 1.0
Recall: 1.0
ROC AUC Score: 1.0


In [45]:
# Render the ROC-curve figure returned by evaluate_classifier.
rocauc.show()

In [47]:
# Bare last expression: the notebook displays the estimator chosen by the grid search.
search.best_estimator_

In [53]:
import pickle
from pathlib import Path

# Destination of the serialized model. Create the directory first so the cell
# also works when ../models does not exist yet.
model_path = Path('../models/mlp.pkl')
model_path.parent.mkdir(parents=True, exist_ok=True)

# NOTE(review): only the classifier is persisted. It was trained on features
# transformed by `preprocessing_pipeline`, so whoever loads this file needs
# the same fitted imputer/scaler — consider saving it alongside the model.
with model_path.open('wb') as file:
    pickle.dump(search.best_estimator_, file)