In [1]:
'''Modified from sklearn documentation: https://scikit-learn.org/stable/modules/compose.html
'''
% load_ext autoreload
% autoreload 2
from functools import partial

import mlflow
import mlflow.sklearn
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import resample

from vflow import ModuleSet, init_args  # must install vflow first (pip install vflow)

# Arrays with more than `threshold` elements are printed in summarized form
# (with "...") instead of in full, which keeps cell outputs short.
np.set_printoptions(threshold=5)  # to limit printing

In [2]:
# Seed NumPy's global RNG; make_classification below is called without its own
# random_state, so this is what makes the synthetic data repeatable.
np.random.seed(13)

X, y = make_classification(n_samples=50, n_features=5)

# Wrap the four split arrays for use with ModuleSet.
# NOTE (original author): odd that the names for y_train and y_test must be
# "X_train" and "X_test".
X_train, X_test, y_train, y_test = init_args(
    train_test_split(X, y, random_state=42),
    names=['X_train', 'X_test', 'X_train', 'X_test'],
)

# Three subsamplers that differ only in their seed; each draws 20 samples.
subsampling_funcs = [
    partial(resample, n_samples=20, random_state=seed)
    for seed in range(3)
]
subsampling_set = ModuleSet(name='subsampling', modules=subsampling_funcs)
X_trains, y_trains = subsampling_set(X_train, y_train)

# Fit both models on the subsampled training sets.
modeling_set = ModuleSet(
    name='modeling',
    modules=[
        LogisticRegression(C=1, max_iter=1000, tol=0.1),
        DecisionTreeClassifier(min_samples_leaf=1),
    ],
    module_keys=["LR", "DT"],
)
_ = modeling_set.fit(X_trains, y_trains)

# NOTE (original author): predict now returns modeling_set.output rather than
# the result of sep_dicts(output_dict).
preds_test = modeling_set.predict(X_test)

# Score the test-set predictions with two hard-label metrics.
hard_metrics_set = ModuleSet(
    name='hard_metrics',
    modules=[accuracy_score, balanced_accuracy_score],
    module_keys=["Acc", "Bal_Acc"],
)
hard_metrics = hard_metrics_set.evaluate(y_test, preds_test)

# Last expression of the cell: display the metrics dictionary.
hard_metrics

{('X_test',
  ('X_train', 'X_train', 'subsampling_0', 'LR', 'X_test'),
  'Acc'): 0.9230769230769231,
 ('X_test',
  ('X_train', 'X_train', 'subsampling_0', 'LR', 'X_test'),
  'Bal_Acc'): 0.9,
 ('X_test',
  ('X_train', 'X_train', 'subsampling_0', 'DT', 'X_test'),
  'Acc'): 0.7692307692307693,
 ('X_test',
  ('X_train', 'X_train', 'subsampling_0', 'DT', 'X_test'),
  'Bal_Acc'): 0.7375,
 ('X_test',
  ('X_train', 'X_train', 'subsampling_1', 'LR', 'X_test'),
  'Acc'): 0.9230769230769231,
 ('X_test',
  ('X_train', 'X_train', 'subsampling_1', 'LR', 'X_test'),
  'Bal_Acc'): 0.9,
 ('X_test',
  ('X_train', 'X_train', 'subsampling_1', 'DT', 'X_test'),
  'Acc'): 0.8461538461538461,
 ('X_test',
  ('X_train', 'X_train', 'subsampling_1', 'DT', 'X_test'),
  'Bal_Acc'): 0.8,
 ('X_test',
  ('X_train', 'X_train', 'subsampling_2', 'LR', 'X_test'),
  'Acc'): 0.8461538461538461,
 ('X_test',
  ('X_train', 'X_train', 'subsampling_2', 'LR', 'X_test'),
  'Bal_Acc'): 0.8375,
 ('X_test',
  ('X_train', 'X_train', 's

In [15]:
def train(C=1, min_samples_leaf=1):
    '''Fit LR and DT models on a fixed synthetic split and log each to MLflow.

    One MLflow run is created per fitted model. Each run records both
    hyperparameters, the model's key, its test accuracy and balanced
    accuracy, and the fitted model itself.

    Parameters
    ----------
    C : float, default=1
        Passed to ``LogisticRegression(C=...)`` and logged as param "C".
    min_samples_leaf : int, default=1
        Passed to ``DecisionTreeClassifier(min_samples_leaf=...)`` and logged
        as param "min_samples_leaf".

    Returns
    -------
    None
    '''
    # Re-seed on every call so each hyperparameter setting sees the same data.
    np.random.seed(13)

    X, y = make_classification(n_samples=50, n_features=5)
    X_train, X_test, y_train, y_test = init_args(train_test_split(X, y, random_state=42),
                                                 names=['X_train', 'X_test', 'X_train', 'X_test'])

    # fit models
    # C must be forwarded to the estimator; otherwise the value is only logged
    # and every run trains with LogisticRegression's default C.
    modeling_set = ModuleSet(name='modeling',
                             modules=[LogisticRegression(C=C, max_iter=1000, tol=0.1),
                                      DecisionTreeClassifier(min_samples_leaf=min_samples_leaf)],
                             module_keys=["LR", "DT"])
    _ = modeling_set.fit(X_train, y_train)

    preds_test = modeling_set.predict(X_test)

    hard_metrics_set = ModuleSet(name='hard_metrics',
                                 modules=[accuracy_score, balanced_accuracy_score],
                                 module_keys=["Acc", "Bal_Acc"])

    hard_metrics = hard_metrics_set.evaluate(y_test, preds_test)

    for k, v in modeling_set.out.items():
        # '__prev__' is a bookkeeping entry, not a fitted model.
        if k == '__prev__':
            continue
        with mlflow.start_run():
            # log parameters
            mlflow.log_param("C", C)
            mlflow.log_param("min_samples_leaf", min_samples_leaf)
            # Keys look like ('X_train', 'X_train', 'LR'): the model key is
            # the last element (k[1] would log the data name 'X_train').
            mlflow.log_param("Model_name", k[-1])

            # log metrics; metric keys are ('X_test', (*k, 'X_test'), <metric>)
            for metric_name in ("Acc", "Bal_Acc"):
                mlflow.log_metric(metric_name,
                                  hard_metrics[('X_test', (*k, 'X_test'), metric_name)])

            # log model
            mlflow.sklearn.log_model(v, str(k))

In [16]:
# Sweep C over 0.1, 0.2, ..., 1.0 and min_samples_leaf over 1, 2, 3.
# `i / 10` is used instead of `0.1 * i` because the product accumulates
# floating-point error (e.g. 0.1 * 3 == 0.30000000000000004), and that noisy
# value would be what gets logged as the "C" parameter.
for C in [i / 10 for i in range(1, 11)]:
    for min_samples_leaf in range(1, 4):
        train(C, min_samples_leaf)

('X_train', 'X_train', 'LR') dict_keys([('X_test', ('X_train', 'X_train', 'LR', 'X_test'), 'Acc'), ('X_test', ('X_train', 'X_train', 'LR', 'X_test'), 'Bal_Acc'), ('X_test', ('X_train', 'X_train', 'DT', 'X_test'), 'Acc'), ('X_test', ('X_train', 'X_train', 'DT', 'X_test'), 'Bal_Acc'), '__prev__']) ('X_train', 'X_train', 'LR', '3')
True
('X_train', 'X_train', 'DT') dict_keys([('X_test', ('X_train', 'X_train', 'LR', 'X_test'), 'Acc'), ('X_test', ('X_train', 'X_train', 'LR', 'X_test'), 'Bal_Acc'), ('X_test', ('X_train', 'X_train', 'DT', 'X_test'), 'Acc'), ('X_test', ('X_train', 'X_train', 'DT', 'X_test'), 'Bal_Acc'), '__prev__']) ('X_train', 'X_train', 'DT', '3')
False
('X_train', 'X_train', 'LR') dict_keys([('X_test', ('X_train', 'X_train', 'LR', 'X_test'), 'Acc'), ('X_test', ('X_train', 'X_train', 'LR', 'X_test'), 'Bal_Acc'), ('X_test', ('X_train', 'X_train', 'DT', 'X_test'), 'Acc'), ('X_test', ('X_train', 'X_train', 'DT', 'X_test'), 'Bal_Acc'), '__prev__']) ('X_train', 'X_train', 'LR', '

('X_train', 'X_train', 'DT') dict_keys([('X_test', ('X_train', 'X_train', 'LR', 'X_test'), 'Acc'), ('X_test', ('X_train', 'X_train', 'LR', 'X_test'), 'Bal_Acc'), ('X_test', ('X_train', 'X_train', 'DT', 'X_test'), 'Acc'), ('X_test', ('X_train', 'X_train', 'DT', 'X_test'), 'Bal_Acc'), '__prev__']) ('X_train', 'X_train', 'DT', '3')
False
('X_train', 'X_train', 'LR') dict_keys([('X_test', ('X_train', 'X_train', 'LR', 'X_test'), 'Acc'), ('X_test', ('X_train', 'X_train', 'LR', 'X_test'), 'Bal_Acc'), ('X_test', ('X_train', 'X_train', 'DT', 'X_test'), 'Acc'), ('X_test', ('X_train', 'X_train', 'DT', 'X_test'), 'Bal_Acc'), '__prev__']) ('X_train', 'X_train', 'LR', '3')
True
('X_train', 'X_train', 'DT') dict_keys([('X_test', ('X_train', 'X_train', 'LR', 'X_test'), 'Acc'), ('X_test', ('X_train', 'X_train', 'LR', 'X_test'), 'Bal_Acc'), ('X_test', ('X_train', 'X_train', 'DT', 'X_test'), 'Acc'), ('X_test', ('X_train', 'X_train', 'DT', 'X_test'), 'Bal_Acc'), '__prev__']) ('X_train', 'X_train', 'DT', '

('X_train', 'X_train', 'LR') dict_keys([('X_test', ('X_train', 'X_train', 'LR', 'X_test'), 'Acc'), ('X_test', ('X_train', 'X_train', 'LR', 'X_test'), 'Bal_Acc'), ('X_test', ('X_train', 'X_train', 'DT', 'X_test'), 'Acc'), ('X_test', ('X_train', 'X_train', 'DT', 'X_test'), 'Bal_Acc'), '__prev__']) ('X_train', 'X_train', 'LR', '3')
True
('X_train', 'X_train', 'DT') dict_keys([('X_test', ('X_train', 'X_train', 'LR', 'X_test'), 'Acc'), ('X_test', ('X_train', 'X_train', 'LR', 'X_test'), 'Bal_Acc'), ('X_test', ('X_train', 'X_train', 'DT', 'X_test'), 'Acc'), ('X_test', ('X_train', 'X_train', 'DT', 'X_test'), 'Bal_Acc'), '__prev__']) ('X_train', 'X_train', 'DT', '3')
False
('X_train', 'X_train', 'LR') dict_keys([('X_test', ('X_train', 'X_train', 'LR', 'X_test'), 'Acc'), ('X_test', ('X_train', 'X_train', 'LR', 'X_test'), 'Bal_Acc'), ('X_test', ('X_train', 'X_train', 'DT', 'X_test'), 'Acc'), ('X_test', ('X_train', 'X_train', 'DT', 'X_test'), 'Bal_Acc'), '__prev__']) ('X_train', 'X_train', 'LR', '

In [5]:
# Launch the MLflow tracking UI to browse the runs logged above; the captured
# log below shows it listening at http://127.0.0.1:5000.
# NOTE(review): the server appears to run in the foreground, so this cell
# likely keeps running until the kernel is interrupted — confirm.
!mlflow ui

[2021-09-06 15:42:15 -0700] [42936] [INFO] Starting gunicorn 20.1.0
[2021-09-06 15:42:15 -0700] [42936] [INFO] Listening at: http://127.0.0.1:5000 (42936)
[2021-09-06 15:42:15 -0700] [42936] [INFO] Using worker: sync
[2021-09-06 15:42:15 -0700] [42939] [INFO] Booting worker with pid: 42939
