In [49]:
import argparse

import h5py as h5
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

# Creme-ML
from creme import optim
from creme.linear_model import LogisticRegression
from creme.multiclass import OneVsRestClassifier
from creme.preprocessing import StandardScaler
from creme.compose import Pipeline
from creme import metrics
from creme import stream
from creme import compat
from creme import model_selection
from creme import ensemble

from h5imagegenerator import HDF5ImageGenerator

In [50]:
# Seed NumPy's global random number generator so NumPy-based randomness repeats across runs.
# NOTE(review): the model repr further down shows `seed=None` on AdaBoostClassifier —
# confirm whether the ensemble actually draws from this global generator.
np.random.seed(42)

### Logistic Regression

In [51]:
def build_h5_stream(src, *, X_key, y_key):
    """HDF Generator Factory

    Builds a generator over an HDF file that yields observations one
    at a time as ``(features_dict, label)`` pairs.

    Parameters
    ----------
    src : str
        HDF source file path
    X_key : str
        HDF features dataset key. The dataset is indexed as
        (observations, features).
    y_key : str
        HDF labels dataset key

    Notes
    -----
    X_key and y_key must be passed as
    keyword args only.

    The file is opened once eagerly to read the number of features,
    then once more when iteration starts. That second handle stays
    open until the generator is exhausted or closed.

    Iteration stops after the shorter of the two datasets.

    Returns
    -------
    generator
        A generator that yields one observation at a time as
        ``({'feat_0': ..., 'feat_1': ...}, label)``.
    """
    # Creme-ml works with dict instead of ndarrays
    # Each feature has a column name
    with h5.File(src, 'r') as source:
        n_features = source[X_key].shape[1]

    columns = [f'feat_{feature_idx}' for feature_idx in range(n_features)]

    def streamer():
        """HDF Generator"""
        # Open the file once for the whole iteration instead of once per
        # observation.
        with h5.File(src, 'r') as source:
            features = source[X_key]
            labels = source[y_key]
            n_observations = min(features.shape[0], labels.shape[0])

            # Finish by falling off the end of the loop. Raising
            # StopIteration inside a generator is turned into a
            # RuntimeError by PEP 479 (Python 3.7+).
            for cursor in range(n_observations):
                yield dict(zip(columns, features[cursor])), labels[cursor]

    return streamer()


# Training observations, streamed one at a time from the HDF feature file.
# NOTE(review): binding the name `stream` hides the `creme.stream` module
# imported at the top of the notebook.
stream = build_h5_stream('features_train.h5', X_key='images', y_key='labels')

In [52]:
# Use a distinct name for the optimizer instance so the imported `optim`
# module is not overwritten. Rebinding `optim` would make this cell look up
# `.SGD` on the optimizer instance instead of the module when re-run.
optimizer = optim.SGD(lr=0.01)

# AdaBoost ensemble (n_models=3) with a logistic-regression base model
# trained by the SGD optimizer above.
# NOTE(review): `seed` is left at its default here — confirm whether it should
# be set for reproducible runs.
model = ensemble.AdaBoostClassifier(
    model=LogisticRegression(optimizer=optimizer),
    n_models=3,
)

# Display the configured model.
model

AdaBoostClassifier (
  model=LogisticRegression (
    optimizer=SGD (
      lr=Constant (
        learning_rate=0.01
      )
    )
    loss=Log ()
    l2=0.
    intercept=0.
    intercept_lr=Constant (
      learning_rate=0.01
    )
    clip_gradient=1e+12
    initializer=Zeros ()
  )
  n_models=3
  seed=None
)

In [53]:
# Accuracy metric passed to the progressive validation call below.
metric = metrics.Accuracy()

In [54]:
# Confusion matrix accumulator.
# NOTE(review): in the visible code it is only updated inside the disabled
# manual loop further down, not by the progressive validation call.
cm = metrics.ConfusionMatrix()

In [55]:
# Evaluate `model` over `stream`, printing the running `metric` every 10 observations.
# NOTE(review): `stream` is a one-shot generator, so it is presumably exhausted after this call.
model_selection.progressive_val_score(stream, model, metric, print_every=10)

[10] Accuracy: 30.00%
[20] Accuracy: 50.00%
[30] Accuracy: 63.33%
[40] Accuracy: 72.50%
[50] Accuracy: 74.00%
[60] Accuracy: 76.67%
[70] Accuracy: 78.57%
[80] Accuracy: 81.25%
[90] Accuracy: 81.11%
[100] Accuracy: 82.00%
[110] Accuracy: 81.82%
[120] Accuracy: 83.33%
[130] Accuracy: 83.08%
[140] Accuracy: 82.14%
[150] Accuracy: 81.33%
[160] Accuracy: 82.50%
[170] Accuracy: 83.53%
[180] Accuracy: 82.78%
[190] Accuracy: 83.16%
[200] Accuracy: 83.50%
[210] Accuracy: 83.81%
[220] Accuracy: 84.55%
[230] Accuracy: 85.22%
[240] Accuracy: 85.42%
[250] Accuracy: 85.20%
[260] Accuracy: 85.00%
[270] Accuracy: 85.19%
[280] Accuracy: 85.00%
[290] Accuracy: 84.48%
[300] Accuracy: 85.00%
[310] Accuracy: 84.52%
[320] Accuracy: 84.37%
[330] Accuracy: 84.55%
[340] Accuracy: 85.00%
[350] Accuracy: 85.14%
[360] Accuracy: 85.00%
[370] Accuracy: 84.86%
[380] Accuracy: 84.74%
[390] Accuracy: 84.87%
[400] Accuracy: 84.25%
[410] Accuracy: 84.15%
[420] Accuracy: 84.29%
[430] Accuracy: 84.42%
[440] Accuracy: 84.3

Accuracy: 84.74%

In [None]:
# Disabled manual predict-then-learn loop, kept as a bare string literal so it
# does not run. Unlike the progressive_val_score call, it would also update `cm`.
# NOTE(review): `stream` is a generator that appears to be consumed by the
# evaluation cell above; it would need rebuilding before re-enabling this loop.
"""for X, y in stream:
    y_hat = model.predict_one(X)
    model = model.fit_one(X, y)

    # Update the metrics
    metric = metric.update(y_true=y, y_pred=y_hat)
    cm = cm.update(y_true=y, y_pred=y_hat)"""

### Progressive Validation Score

In [None]:
# Display the accuracy metric that was passed to the progressive validation call.
metric

In [35]:
# Display the confusion matrix.
# NOTE(review): this cell's execution count (35) is lower than the cells above
# (49-55), and the only code that updates `cm` is disabled, so the matrix shown
# below likely comes from an earlier session — re-run to confirm.
cm

         False      1
      0    219     30
      1     32    217