In [1]:
import scipy.io as sio
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
import sys
sys.path.append('../')
import utils.reports as rp

In [2]:
# Loads dataset
dataset = sio.loadmat('../../datasets/classification/ionosphere.mat')

In [3]:
# Prints dataset description
dataset['Description']

array(['Ionosphere dataset from the UCI machine learning repository:                   ',
       'http://archive.ics.uci.edu/ml/datasets/Ionosphere                              ',
       'X is a 351x34 real-valued matrix of predictors. Y is a categorical response:   ',
       '"b" for bad radar returns and "g" for good radar returns.                      ',
       'This is a binary classification problem.                                       '],
      dtype='<U79')

In [4]:
# Prepares data for classification
predictors = dataset['X'][:,2:] # the first two columns are useless
positive = (dataset['Y'] == 'g').flatten()
responses = np.zeros(predictors.shape[0])
responses[positive] = 1

In [5]:
# Splits dataset into train/test
X,X_holdout,y,y_holdout = train_test_split(predictors,responses,test_size = 0.3,random_state = 0)

In [6]:
# Creates pipeline
clf = LogisticRegression(penalty='elasticnet', solver='saga', max_iter=300)
sc = StandardScaler()
estimators = [('normalizer', sc), ('classifier', clf)]
pipe = Pipeline(estimators)
pipe

Pipeline(memory=None,
         steps=[('normalizer',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('classifier',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=300,
                                    multi_class='auto', n_jobs=None,
                                    penalty='elasticnet', random_state=None,
                                    solver='saga', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)

In [7]:
hyperparams = [{
    'classifier__C': np.linspace(0.001,1,25),
    'classifier__l1_ratio': np.linspace(0,1,10)
}]

In [8]:
validator = GridSearchCV(pipe,cv=5, param_grid=hyperparams, scoring='accuracy', n_jobs=-1,verbose = 1,iid = False)
results = validator.fit(X,y)

Fitting 5 folds for each of 250 candidates, totalling 1250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 920 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done 1227 out of 1250 | elapsed:    4.9s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done 1250 out of 1250 | elapsed:    4.9s finished


In [9]:
rp.report_grid_search(validator.cv_results_)

Model with rank: 1
Mean validation score: 0.878 (std: 0.029)
Parameters: {'classifier__C': 0.209125, 'classifier__l1_ratio': 0.0}

Model with rank: 2
Mean validation score: 0.873 (std: 0.033)
Parameters: {'classifier__C': 0.292375, 'classifier__l1_ratio': 0.0}

Model with rank: 3
Mean validation score: 0.873 (std: 0.030)
Parameters: {'classifier__C': 0.1675, 'classifier__l1_ratio': 0.0}

Model with rank: 3
Mean validation score: 0.873 (std: 0.030)
Parameters: {'classifier__C': 0.41725, 'classifier__l1_ratio': 0.0}

Model with rank: 3
Mean validation score: 0.873 (std: 0.030)
Parameters: {'classifier__C': 0.45887500000000003, 'classifier__l1_ratio': 0.0}



In [10]:
# Perform evaluation in the holdout set
y_pred = validator.predict(X_holdout)
rp.report_classification(y_pred, y_holdout)

Test (Metrics): 

Accuracy:  0.86
F1 Score:  0.89
Recall:  0.81
Precision:  0.98

Confusion Matrix:
 [[30  1]
 [14 61]]
