In [1]:
import h5py as h5
import numpy as np

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Grab extracted features and labels (TRAIN SET).
with h5.File('features_train.h5', 'r') as file:
    X_train, y_train = file['images'][:], file['labels'][:]
    classes = file['classes'][:]

In [3]:
# Params used to find the best estimator.
params = {
    'C': [0.0001, 0.001, 0.01, 0.01, 1.0, 10.0, 100.0],
}

model = GridSearchCV(
    LogisticRegression(
        solver='lbfgs',
        multi_class='auto',
        max_iter=1000,
    ),
    param_grid=params,
    cv=3,
    n_jobs=-1, # -1: all cores
    verbose=2,
)

In [4]:
model.fit(X_train, y_train)

Fitting 3 folds for each of 7 candidates, totalling 21 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 out of  21 | elapsed:   12.1s remaining:   16.2s
[Parallel(n_jobs=-1)]: Done  21 out of  21 | elapsed:   19.3s finished


GridSearchCV(cv=3, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=1000, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': [0.0001, 0.001, 0.01, 0.01, 1.0, 10.0, 100.0]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=2)

In [5]:
print('Best estimator is', model.best_estimator_)
print('Best score is', model.best_score_)

Best estimator is LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
Best score is 0.9216867469879517


In [6]:
best_model = model.best_estimator_

best_model.fit(X_train, y_train)

LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [7]:
# Grab extracted features and labels (TEST SET)
with h5.File('features_test.h5', 'r') as file:
    X_test, y_test = file['images'][:], file['labels'][:]

In [8]:
y_hat = best_model.predict(X_test)

In [9]:
confusion_matrix = classification_report(
    y_test,
    y_hat,
    target_names=classes
)
print(confusion_matrix)

              precision    recall  f1-score   support

     hot_dog       0.95      0.90      0.92       250
 not_hot_dog       0.90      0.96      0.93       250

    accuracy                           0.93       500
   macro avg       0.93      0.93      0.93       500
weighted avg       0.93      0.93      0.93       500



In [10]:
acc = accuracy_score(y_test, y_hat)

f'Score is {acc}.'

'Score is 0.926.'