In [2]:
import time
from dataloader import MinibooneLoader
from helpers import testAlgorithm, fitGridSearch
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, GridSearchCV, KFold
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Performance/Optimizations
EnableDaal4py = True    # Optimise sklearn for Intel CPUs (Requires daal4py)

shuffleSeed = 0     # Data shuffle seed.
kFoldSeed = None    # Cross-validation shuffle seed.
targetCores = 8     # How many cores to use.
trainSize = 0.8
testSize = 0.2

# Set up a basic 5-fold cross-validation.
cv = KFold(n_splits=5, shuffle=True, random_state=kFoldSeed)

if (EnableDaal4py == True):
    from daal4py.sklearn import patch_sklearn
    patch_sklearn()

Intel(R) oneAPI Data Analytics Library solvers for sklearn enabled: https://intelpython.github.io/daal4py/sklearn.html


In [3]:
dataset = MinibooneLoader().loadMiniboone()

events = dataset.events

In [4]:
# Set up a preprocessing pipeline
estimators = [
    # Standardizes and scales the dataset
    ['scale', StandardScaler()]
]
pipe = Pipeline(estimators).fit(events)

# Transform the data using the prepared pipeline.
X = pipe.transform(events)
y = dataset.classifications

In [5]:

# Shuffle the dataset and split into training and test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, random_state=shuffleSeed, train_size=trainSize, test_size=testSize)

print('Data has been split. Training data:')
print(X_train)
print(y_train)

Data has been split. Training data:
[[ 0.04852549  0.05710245 -0.42166557 ...  0.04230077  0.08020339
   0.0614591 ]
 [ 0.08522973  0.05787857  0.40372423 ...  0.15819105  0.06338857
   0.0602966 ]
 [ 0.03886527  0.05797087 -0.56084576 ...  0.03024111  0.03142102
   0.06056402]
 ...
 [ 0.06007261  0.07552737  0.02669232 ...  0.06275209  0.08296288
   0.06197942]
 [ 0.06221417  0.06761905 -0.02986604 ...  0.07141983  0.07571434
   0.05977339]
 [ 0.06484492  0.0546634   0.00282108 ...  0.0365456   0.03887806
   0.0591355 ]]
[0 1 0 ... 1 1 1]


In [17]:
# Define our hyperameter grid-space.
parameters={
    'hidden_layer_sizes': (
        # (10, 10, 10),
        (5, 5, 5),
        (25, 25),
        # (10, 10),
        (5, 5),
        (25)
    ),
    'max_iter': ( 1000, 500, 250 ),
    'activation': (
        'tanh',
        'relu'
    ),
    'n_iter_no_change': (
        10,
        20
    )
}

# Define our estimator
mlp=MLPClassifier()
# Set up the final grid-search
search = GridSearchCV(estimator=mlp, param_grid=parameters, cv=cv, scoring='accuracy', n_jobs=targetCores, verbose=20)

fitGridSearch(search, X_train, y_train)
testAlgorithm(search, X_test, y_test)

 elapsed:  1.8min
[Parallel(n_jobs=8)]: Done   3 tasks      | elapsed:  1.9min
[Parallel(n_jobs=8)]: Done   4 tasks      | elapsed:  2.1min
[Parallel(n_jobs=8)]: Done   5 tasks      | elapsed:  2.2min
[Parallel(n_jobs=8)]: Done   6 tasks      | elapsed:  2.4min
[Parallel(n_jobs=8)]: Done   7 tasks      | elapsed:  2.5min
[Parallel(n_jobs=8)]: Done   8 tasks      | elapsed:  2.7min
[Parallel(n_jobs=8)]: Done   9 tasks      | elapsed:  3.3min
[Parallel(n_jobs=8)]: Done  10 tasks      | elapsed:  3.6min
[Parallel(n_jobs=8)]: Done  11 tasks      | elapsed:  3.9min
[Parallel(n_jobs=8)]: Done  12 tasks      | elapsed:  4.1min
[Parallel(n_jobs=8)]: Done  13 tasks      | elapsed:  4.1min
[Parallel(n_jobs=8)]: Done  14 tasks      | elapsed:  4.4min
[Parallel(n_jobs=8)]: Done  15 tasks      | elapsed:  5.0min
[Parallel(n_jobs=8)]: Done  16 tasks      | elapsed:  5.7min
[Parallel(n_jobs=8)]: Done  17 tasks      | elapsed:  5.7min
[Parallel(n_jobs=8)]: Done  18 tasks      | elapsed:  6.0min
[Paral

In [41]:
# NOTE: loss not set to hinge. Causes NaN accuracy and doesn't run.
# Define our hyperameter grid-space.
parameters={
    'penalty': ('l1', 'l2'),
    'dual': (True, False),
    'tol': (1e-4, 1e-5, 5e-5),
    'class_weight': (None, 'balanced'),
    'max_iter': (1000, 2000, 500, 250, 100)
}

# Define our SVM. LinearSVC is used in this case since it is more optimized for larger datasets than SVC.
svm=LinearSVC()
# Set up the final grid-search
search = GridSearchCV(estimator=svm, param_grid=parameters, cv=cv, scoring='accuracy', n_jobs=targetCores, verbose=20)

fitGridSearch(search, X_train, y_train)
testAlgorithm(search, X_test, y_test)

n_jobs=8)]: Done 485 tasks      | elapsed: 32.1min
[Parallel(n_jobs=8)]: Done 486 tasks      | elapsed: 32.2min
[Parallel(n_jobs=8)]: Done 487 tasks      | elapsed: 32.4min
[Parallel(n_jobs=8)]: Done 488 tasks      | elapsed: 32.4min
[Parallel(n_jobs=8)]: Done 489 tasks      | elapsed: 33.4min
[Parallel(n_jobs=8)]: Done 490 tasks      | elapsed: 33.8min
[Parallel(n_jobs=8)]: Done 491 tasks      | elapsed: 33.9min
[Parallel(n_jobs=8)]: Done 492 tasks      | elapsed: 34.0min
[Parallel(n_jobs=8)]: Done 493 tasks      | elapsed: 34.0min
[Parallel(n_jobs=8)]: Done 494 tasks      | elapsed: 34.1min
[Parallel(n_jobs=8)]: Done 495 tasks      | elapsed: 34.1min
[Parallel(n_jobs=8)]: Done 496 tasks      | elapsed: 34.2min
[Parallel(n_jobs=8)]: Done 497 tasks      | elapsed: 34.3min
[Parallel(n_jobs=8)]: Done 498 tasks      | elapsed: 34.4min
[Parallel(n_jobs=8)]: Done 499 tasks      | elapsed: 34.4min
[Parallel(n_jobs=8)]: Done 500 tasks      | elapsed: 34.4min
[Parallel(n_jobs=8)]: Done 501 tas

In [8]:
# Perform grid search to identify the best hyperparameters

# NOTE: loss not set to hinge. Causes NaN accuracy and doesn't run.
# Define our hyperameter grid-space.
parameters={
    'n_estimators': (10, 100, 250),
    'criterion': ('gini', 'entropy'),
    'min_samples_split': (2, 5),
    'min_samples_leaf': (2, 5),
    'class_weight': ('balanced', 'balanced_subsample', None),
    'max_features': ('auto', 'log2'),
    'oob_score': (False, True)
}

# Define our SVM. LinearSVC is used in this case since it is more optimized for larger datasets than SVC.
rf=RandomForestClassifier()
# Set up the final grid-search
search = GridSearchCV(estimator=rf, param_grid=parameters, cv=cv, scoring='accuracy', n_jobs=targetCores, verbose=20)

fitGridSearch(search, X_train, y_train)
testAlgorithm(search, X_test, y_test)

'max_features': 'log2', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 250, 'oob_score': True}
0.926 (+/-0.003) for {'class_weight': 'balanced_subsample', 'criterion': 'entropy', 'max_features': 'log2', 'min_samples_leaf': 5, 'min_samples_split': 2, 'n_estimators': 10, 'oob_score': False}
0.926 (+/-0.004) for {'class_weight': 'balanced_subsample', 'criterion': 'entropy', 'max_features': 'log2', 'min_samples_leaf': 5, 'min_samples_split': 2, 'n_estimators': 10, 'oob_score': True}
0.934 (+/-0.003) for {'class_weight': 'balanced_subsample', 'criterion': 'entropy', 'max_features': 'log2', 'min_samples_leaf': 5, 'min_samples_split': 2, 'n_estimators': 100, 'oob_score': False}
0.933 (+/-0.003) for {'class_weight': 'balanced_subsample', 'criterion': 'entropy', 'max_features': 'log2', 'min_samples_leaf': 5, 'min_samples_split': 2, 'n_estimators': 100, 'oob_score': True}
0.933 (+/-0.003) for {'class_weight': 'balanced_subsample', 'criterion': 'entropy', 'max_features': 'log2', '