# Classify
* Load the feature matrix prepared in the previous notebook
* Split the data into train/test datasets (validation is done by cross-validation during tuning)
* Compute a baseline
* Classify using a Support Vector Machine and a multilayer perceptron


In [28]:
import numpy as np
from os.path import join
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
import pandas as pd
import random

Load the feature matrix prepared in previous notebook

In [2]:
DATA_PATH = 'data'

# Read every array out of the .npz archive while it is still open, then pick
# out the feature matrix X and the targets y.
archive_path = join(DATA_PATH, 'processed_data.npz')
with np.load(archive_path) as data:
    data_dict = {name: array for name, array in data.items()}
X, y = data_dict['X'], data_dict['y']

Split the data into train/test datasets

In [3]:
# Keep 75% of the samples for training and hold out the rest as the test set;
# the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.75, random_state=0
)
split_report = 'The training dataset has {} entries, the test dataset {}.'
print(split_report.format(len(X_train), len(X_test)))

The training dataset has 11532 entries, the test dataset 3844.


## Compute a baseline

In [4]:
# Baseline: a dummy classifier using the 'most_frequent' strategy,
# fitted on the training data
dummy = DummyClassifier(strategy='most_frequent').fit(X_train, y_train)

# Accuracy of that baseline on the test set
accuracy = dummy.score(X_test, y_test)
baseline_report = 'Baseline accuracy: {:.3f}'
print(baseline_report.format(accuracy))

Baseline accuracy: 0.155


## SVM
Create a pipeline using a standard scaler and a support vector machine (SVM) model.

In [24]:
# RBF-kernel support vector classifier; C=1 is only the starting value
svc = SVC(gamma='scale', C=1, kernel='rbf')

# Two-step pipeline: standardize the features, then classify with the SVM.
# Open question: standardization may give better accuracy, while dropping it
# (i.e. using ('scaler', None) as the first step) may run faster.
pipe_svc = Pipeline(steps=[('scaler', StandardScaler()), ('svc', svc)])

### Tune the SVM model
Tune the regularization parameter `C` using grid search. `gamma` is not tuned; it stays at `'scale'` (the default).

In [25]:
%%time
# Create cross-validation object
grid_svc = {
    'svc__C': [0.75, 1, 2.5, 5, 7.5, 10],
}
# create grid search with cross validation
gridsearch_svc = GridSearchCV(pipe_svc, grid_svc, cv=4, return_train_score=True, verbose=3, n_jobs=4)

# Get a smaller random sample for tuning the parameter
NUM_SAMPLES = 1000
idx = random.sample(range(len(X_train)-1), NUM_SAMPLES)
X_rnd = X_train[idx]
y_rnd = y_train[idx]

# Fit estimator
gridsearch_svc.fit(X_rnd, y_rnd)

Fitting 4 folds for each of 6 candidates, totalling 24 fits
CPU times: user 1min 45s, sys: 391 ms, total: 1min 45s
Wall time: 5min 11s


[CV 1/4] END ...svc__C=0.75;, score=(train=0.561, test=0.192) total time=  23.7s
[CV 3/4] END ......svc__C=1;, score=(train=0.647, test=0.244) total time=  23.9s
[CV 1/4] END ......svc__C=5;, score=(train=0.979, test=0.196) total time=  20.1s
[CV 2/4] END ......svc__C=5;, score=(train=0.979, test=0.200) total time=  21.7s
[CV 2/4] END ....svc__C=7.5;, score=(train=0.988, test=0.208) total time=  17.7s
[CV 2/4] END .....svc__C=10;, score=(train=0.992, test=0.192) total time=  25.7s
[CV 3/4] END ...svc__C=0.75;, score=(train=0.556, test=0.200) total time=  22.9s
[CV 3/4] END ......svc__C=1;, score=(train=0.651, test=0.224) total time=  21.0s
[CV 2/4] END ....svc__C=2.5;, score=(train=0.908, test=0.204) total time=  29.3s
[CV 2/4] END ......svc__C=5;, score=(train=0.973, test=0.180) total time=  27.3s
[CV 2/4] END ....svc__C=7.5;, score=(train=0.985, test=0.184) total time=  22.9s
[CV 1/4] END .....svc__C=10;, score=(train=0.992, test=0.232) total time=  19.4s
[CV 3/4] END ...svc__C=0.75;

In [26]:
# Turn the cross-validation results into a DataFrame
results_svc = pd.DataFrame(gridsearch_svc.cv_results_)

# Keep the score summary plus the tuned parameter, highest mean test score first
cols = [
    'mean_test_score', 'std_test_score',
    'mean_train_score', 'std_train_score',
    'param_svc__C',
]
sorted_svc = results_svc.loc[:, cols].sort_values(by='mean_test_score', ascending=False)
sorted_svc.head(10)

Unnamed: 0,mean_test_score,std_test_score,mean_train_score,std_train_score,param_svc__C
1,0.225,0.00995,0.658,0.005696,1.0
0,0.224,0.018547,0.550667,0.008219,0.75
2,0.221,0.024062,0.905333,0.009428,2.5
3,0.205,0.031796,0.975667,0.001453,5.0
5,0.197,0.03167,0.993667,0.001106,10.0
4,0.197,0.035369,0.987,0.002186,7.5


Compute accuracy of support vector machine classifier on test dataset

In [27]:
# Take the best C directly from the grid search instead of reading it back
# from row 0 of the sorted results table: the order of tied scores in that
# table is arbitrary, while best_params_ is the search's own selection.
C = gridsearch_svc.best_params_['svc__C']

# Set the parameter in the pipeline
pipe_svc.set_params(svc__C=C)

# Refit the pipeline on the full training set (the grid search itself was
# fitted on X_rnd / y_rnd)
pipe_svc.fit(X_train, y_train)

# Compute accuracy on the test dataset and display it
acc_test_svc = pipe_svc.score(X_test, y_test)
acc_test_svc

0.27601456815816855

## Multilayer perceptron classifier

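Build an MLP classifier with the default settings plus early stopping (note: in scikit-learn, early stopping only takes effect with the `sgd` and `adam` solvers).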

In [29]:
# Multilayer perceptron: fixed seed, early stopping enabled
# (n_iter_no_change=5, tol=0.01), no training log output
mlp_settings = dict(
    random_state=0,
    early_stopping=True,
    n_iter_no_change=5,
    tol=0.01,
    verbose=False,
)
mlp = MLPClassifier(**mlp_settings)

# Two-step pipeline: standardize the features, then classify with the MLP.
# Standardization is kept for accuracy; dropping it (i.e. using
# ('scaler', None) as the first step) would be faster.
pipe_mlp = Pipeline(steps=[('scaler', StandardScaler()), ('mlp', mlp)])

Optimize the model's parameters using grid search

In [30]:
%%time
from sklearn.model_selection import GridSearchCV

# Create cross-validation object
grid_mlp = {
    'mlp__solver': ['lbfgs', 'sgd'],
    'mlp__hidden_layer_sizes': [(288, 72), (144, 72), (288, 144), 72, 144, 288],
    'mlp__activation': ['relu'],
    'mlp__alpha': [0.0001, 0.00001]
}
# create grid search with cross validation
gridsearch_mlp = GridSearchCV(pipe_mlp, grid_mlp, cv=2, return_train_score=True, verbose=1, n_jobs=4)

# Get a smaller random sample for tuning the parameter
NUM_SAMPLES = 1000
idx = random.sample(range(len(X_train)-1), NUM_SAMPLES)
X_rnd = X_train[idx]
y_rnd = y_train[idx]

# Fit estimator
gridsearch_mlp.fit(X_rnd, y_rnd)

Fitting 2 folds for each of 24 candidates, totalling 48 fits


24 fits failed out of a total of 48.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
24 fits failed with the following error:
Traceback (most recent call last):
  File "/home/atroncos/workspace/oa_venv/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/atroncos/workspace/oa_venv/lib/python3.11/site-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/atroncos/workspace/oa_venv/lib/python3.11/site-packages/sklearn/pipeline.py", line 660, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/home/

CPU times: user 9min 30s, sys: 664 ms, total: 9min 31s
Wall time: 6min 23s


In [31]:
# Turn the cross-validation results into a DataFrame
results_mlp = pd.DataFrame(gridsearch_mlp.cv_results_)

# Keep the score summary plus the tuned parameters, highest mean test score first
score_cols = ['mean_test_score', 'std_test_score', 'mean_train_score', 'std_train_score']
param_cols = ['param_mlp__solver', 'param_mlp__hidden_layer_sizes',
              'param_mlp__activation', 'param_mlp__alpha']
cols = score_cols + param_cols
sorted_mlp = results_mlp.loc[:, cols].sort_values(by='mean_test_score', ascending=False)
sorted_mlp.head(10)

Unnamed: 0,mean_test_score,std_test_score,mean_train_score,std_train_score,param_mlp__solver,param_mlp__hidden_layer_sizes,param_mlp__activation,param_mlp__alpha
18,0.164,0.016,1.0,0.0,lbfgs,72,relu,1e-05
0,0.162,0.006,1.0,0.0,lbfgs,"(288, 72)",relu,0.0001
10,0.162,0.026,1.0,0.0,lbfgs,288,relu,0.0001
22,0.162,0.026,1.0,0.0,lbfgs,288,relu,1e-05
12,0.162,0.006,1.0,0.0,lbfgs,"(288, 72)",relu,1e-05
6,0.162,0.014,1.0,0.0,lbfgs,72,relu,0.0001
14,0.161,0.001,1.0,0.0,lbfgs,"(144, 72)",relu,1e-05
20,0.159,0.007,1.0,0.0,lbfgs,144,relu,1e-05
4,0.158,0.002,1.0,0.0,lbfgs,"(288, 144)",relu,0.0001
2,0.158,0.004,1.0,0.0,lbfgs,"(144, 72)",relu,0.0001


In [33]:
# Take the winning hyperparameters directly from the grid search instead of
# reading them back from row 0 of the sorted results table: this avoids the
# round-trip through pandas (which can alter value types) and the arbitrary
# ordering of tied scores in that table.
best_params_mlp = gridsearch_mlp.best_params_
param_mlp__solver = best_params_mlp['mlp__solver']
param_mlp__hidden_layer_sizes = best_params_mlp['mlp__hidden_layer_sizes']
param_mlp__activation = best_params_mlp['mlp__activation']
param_mlp__alpha = best_params_mlp['mlp__alpha']

# Apply all tuned parameters in a single call and raise the iteration limit
# (default is 200)
pipe_mlp.set_params(**best_params_mlp, mlp__max_iter=400)

# Refit the pipeline on the full training set (the grid search itself was
# fitted on X_rnd / y_rnd)
pipe_mlp.fit(X_train, y_train)

# Compute accuracy on the test dataset
acc_test_mlp = pipe_mlp.score(X_test, y_test)

In [34]:
# Display the accuracy of the fitted MLP pipeline on the test dataset
acc_test_mlp

0.19302809573361082