In [16]:
from dataloader import MinibooneLoader
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, GridSearchCV, KFold
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix

shuffleSeed = 0     # Data shuffle seed.
kFoldSeed = None    # Cross-validation shuffle seed.
targetCores = 8     # How many cores to use.
trainSize = 0.8
testSize = 0.2

In [3]:
dataset = MinibooneLoader().loadMiniboone()

events = dataset.events

In [4]:
# Set up a preprocessing pipeline
estimators = [
    # Standardizes and scales the dataset
    ['scale', StandardScaler()]
]
pipe = Pipeline(estimators).fit(events)

# Transform the data using the prepared pipeline.
stdEvents = pipe.transform(events)

In [5]:

# Shuffle the dataset and split into training and test sets.
X_train, X_test, y_train, y_test = train_test_split(stdEvents, dataset.classifications, shuffle=True, random_state=shuffleSeed, train_size=trainSize, test_size=testSize)

print('Data has been split. Training data:')
print(X_train)
print(y_train)

Data has been split. Training data:
[[ 0.04852549  0.05710245 -0.42166557 ...  0.04230077  0.08020339
   0.0614591 ]
 [ 0.08522973  0.05787857  0.40372423 ...  0.15819105  0.06338857
   0.0602966 ]
 [ 0.03886527  0.05797087 -0.56084576 ...  0.03024111  0.03142102
   0.06056402]
 ...
 [ 0.06007261  0.07552737  0.02669232 ...  0.06275209  0.08296288
   0.06197942]
 [ 0.06221417  0.06761905 -0.02986604 ...  0.07141983  0.07571434
   0.05977339]
 [ 0.06484492  0.0546634   0.00282108 ...  0.0365456   0.03887806
   0.0591355 ]]
[0 1 0 ... 1 1 1]


In [6]:
# Get a basic model fitted and running.
layersConfig = (10, 10, 10)
maxIterations = 1000

print('Training and fitting neural network')
mlp = MLPClassifier(hidden_layer_sizes=layersConfig, max_iter=maxIterations, verbose=True)
mlp.fit(X_train, y_train.ravel())
print('\nTraining complete\n')

print('Classifying test data')
predictions = mlp.predict(X_test)

print('\nConfusion matrix:')
print(confusion_matrix(y_test, predictions))

print('\nClassification report:')
print(classification_report(y_test, predictions))


Training and fitting neural network
Iteration 1, loss = 0.35743086
Iteration 2, loss = 0.25600091
Iteration 3, loss = 0.23873591
Iteration 4, loss = 0.22983388
Iteration 5, loss = 0.22342818
Iteration 6, loss = 0.21887494
Iteration 7, loss = 0.21464753
Iteration 8, loss = 0.21060651
Iteration 9, loss = 0.20749621
Iteration 10, loss = 0.20381066
Iteration 11, loss = 0.20128109
Iteration 12, loss = 0.19843818
Iteration 13, loss = 0.19629351
Iteration 14, loss = 0.19419289
Iteration 15, loss = 0.19257516
Iteration 16, loss = 0.19070329
Iteration 17, loss = 0.18892750
Iteration 18, loss = 0.18778278
Iteration 19, loss = 0.18680701
Iteration 20, loss = 0.18619713
Iteration 21, loss = 0.18523325
Iteration 22, loss = 0.18399876
Iteration 23, loss = 0.18361096
Iteration 24, loss = 0.18250799
Iteration 25, loss = 0.18202332
Iteration 26, loss = 0.18169815
Iteration 27, loss = 0.18111529
Iteration 28, loss = 0.18039229
Iteration 29, loss = 0.17979410
Iteration 30, loss = 0.17925353
Iteration 31,

In [17]:
# Perform grid search to identify the best hyperparameters

# Define our estimator
mlp=MLPClassifier(hidden_layer_sizes=layersConfig, max_iter=maxIterations)

# Define our hyperameter grid-space.
parameters={
    'hidden_layer_sizes': (
        (10, 10, 10),
        (5, 5, 5),
        (25, 25),
        (10, 10),
        (5, 5),
    ),
    'max_iter': (
        1000,
        500,
        250,
        100
    )
}

# Set up a basic 5-fold cross-validation.
cv = KFold(n_splits=5, shuffle=False, random_state=kFoldSeed)
# Set up the final grid-search
search = GridSearchCV(estimator=mlp, param_grid=parameters, cv=cv, scoring='accuracy', n_jobs=targetCores, verbose=20)

# TODO: Refactor MLP estimator to generic object (avoid two instances)
# TODO: Finish cross-validation implementation.

result = search.fit(X_train, y_train)


Fitting 5 folds for each of 20 candidates, totalling 100 fits
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   1 tasks      | elapsed:   58.7s
[Parallel(n_jobs=8)]: Done   2 tasks      | elapsed:  1.3min
[Parallel(n_jobs=8)]: Done   3 tasks      | elapsed:  1.3min
[Parallel(n_jobs=8)]: Done   4 tasks      | elapsed:  1.6min
[Parallel(n_jobs=8)]: Done   5 tasks      | elapsed:  1.7min
[Parallel(n_jobs=8)]: Done   6 tasks      | elapsed:  1.8min
[Parallel(n_jobs=8)]: Done   7 tasks      | elapsed:  1.9min
[Parallel(n_jobs=8)]: Done   8 tasks      | elapsed:  2.2min
[Parallel(n_jobs=8)]: Done   9 tasks      | elapsed:  2.2min
[Parallel(n_jobs=8)]: Done  10 tasks      | elapsed:  2.4min
[Parallel(n_jobs=8)]: Done  11 tasks      | elapsed:  2.8min
[Parallel(n_jobs=8)]: Done  12 tasks      | elapsed:  2.8min
[Parallel(n_jobs=8)]: Done  13 tasks      | elapsed:  2.8min
[Parallel(n_jobs=8)]: Done  14 tasks      | elapsed:  2.9min
[Parallel

In [18]:
# Print the results of the above grid search
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

Best Score: 0.9369059324820619
Best Hyperparameters: {'hidden_layer_sizes': (25, 25), 'max_iter': 250}
