In [1]:
import itertools
import random
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.datasets import fetch_openml
from sklearn.model_selection import GridSearchCV
from torch import manual_seed
from torch.optim import Adam
from torch.nn.modules.loss import BCEWithLogitsLoss
from skorch import NeuralNetBinaryClassifier
from utils import Encoder, Model

# Single seed shared by every random number generator the notebook relies on
# (Python's `random`, PyTorch and NumPy's legacy global generator).
SEED = 29
for seed_fn in (random.seed, manual_seed, np.random.seed):
    seed_fn(SEED)


In [2]:
# Fetch version 1 of the Lending Club loan dataset from OpenML as a pandas
# frame, using the local 'data' directory as the scikit-learn data home.
data = fetch_openml(
    name="Lending-Club-Loan-Data",
    version=1,
    as_frame=True,
    data_home='data',
    parser='auto',
)

# Work on a copy so the frame held by the fetched object stays untouched.
frame = data.frame.copy()
print("Shape of the data: ", data.frame.shape)


Shape of the data:  (9578, 14)


In [3]:
# Split the raw data into target `y` and feature frame `frame`.
# Both are derived from the untouched `data.frame` rather than by dropping the
# target from `frame` in place, so this cell is idempotent: re-running it no
# longer raises KeyError because the target column was already removed.
target_column = 'not.fully.paid'
y = data.frame[target_column].astype(np.float32)
frame = data.frame.drop(columns=target_column)

# Qualitative (categorical) feature columns and, for each of them, the names
# '<column>_<value>' built from the sorted unique non-null values.
qualitative = ['purpose']
categories = [
    f"{var}_{value!s}"
    for var in qualitative
    for value in np.unique(frame[var].dropna())
]
categories


['purpose_all_other',
 'purpose_credit_card',
 'purpose_debt_consolidation',
 'purpose_educational',
 'purpose_home_improvement',
 'purpose_major_purchase',
 'purpose_small_business']

In [4]:
# skorch wrapper around the project's `Model` module, trained with Adam on a
# logits-based binary cross-entropy loss.
classifier = NeuralNetBinaryClassifier(
    Model,
    criterion=BCEWithLogitsLoss,
    optimizer=Adam,
    max_epochs=200,
    iterator_train__shuffle=True,
    # NOTE(review): this is the column count *before* the 'encoder' step runs;
    # confirm it matches the width of the matrix the module actually receives.
    module__input_size=frame.shape[1],
    # deactivate skorch-internal train-valid split and verbose logging
    train_split=False,
    verbose=0,
)

# Full estimator: project-specific encoding of the qualitative columns,
# min-max scaling, then the neural network.
steps = [
    ('encoder', Encoder(categories, qualitative)),
    ('scale', MinMaxScaler()),
    ('classifier', classifier),
]
pipe = Pipeline(steps)

# Hyper-parameter grid: 3 learning rates x 3 hidden sizes x 3 batch sizes.
params = {
    'classifier__lr': [0.01, 0.005, 0.001],
    'classifier__module__num_units': [32, 64, 90],
    'classifier__batch_size': [32, 64, 128]
}

# Exhaustive 3-fold search, refitting the best configuration on all the data.
# NOTE(review): accuracy can look strong on an imbalanced target simply by
# predicting the majority class — confirm the class balance of `y`.
grid_search = GridSearchCV(
    pipe,
    params,
    scoring='accuracy',
    cv=3,
    refit=True,
    verbose=0,
)

grid_result = grid_search.fit(frame, y)

# Report mean and standard deviation of the CV score for the best candidate.
best_std = grid_search.cv_results_['std_test_score'][grid_search.best_index_]
summary = "Best mean test score: {:.3f}, Best std test score: {:.3f}, Best params: {}"
print(summary.format(grid_search.best_score_, best_std, grid_search.best_params_))


Best mean test score: 0.838, Best std test score: 0.003, Best params: {'classifier__batch_size': 32, 'classifier__lr': 0.01, 'classifier__module__num_units': 32}
