In [2]:
'''Modified from sklearn documentation: https://scikit-learn.org/stable/modules/compose.html
'''
%load_ext autoreload
%autoreload 2
from functools import partial

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import resample

from vflow import Vset, init_args  # must install pcsp first (pip install pcsp)

np.set_printoptions(threshold=5)  # to limit printing

In [None]:
np.random.seed(13)

X, y = make_classification(n_samples=50, n_features=5)
X_train, X_test, y_train, y_test = init_args(train_test_split(X, y, random_state=42),
                                             names=['X_train', 'X_test', 'X_train',
                                                    'X_test'])

subsampling_funcs = [partial(resample,
                             n_samples=20,
                             random_state=i)
                     for i in range(3)]

subsampling_set = Vset(name='subsampling', modules=subsampling_funcs)
X_trains, y_trains = subsampling_set(X_train, y_train)

# fit models
modeling_set = Vset(name='modeling',
                    modules=[LogisticRegression(C=1, max_iter=1000, tol=0.1),
                             DecisionTreeClassifier(min_samples_leaf=1)],
                    module_keys=["LR", "DT"])

_ = modeling_set.fit(X_trains, y_trains)

# predict now returns modeling_set.output rather than the result of sep_dicts(output_dict)
preds_test = modeling_set.predict(X_test)

hard_metrics_set = Vset(name='hard_metrics',
                        modules=[accuracy_score, balanced_accuracy_score],
                        module_keys=["Acc", "Bal_Acc"],
                        tracking_dir='./mlruns')

hard_metrics = hard_metrics_set.evaluate(y_test, preds_test)

> [0;32m/home/james/Dropbox/repos/pcs-pipeline/vflow/vset.py[0m(94)[0;36m_apply_func[0;34m()[0m
[0;32m     92 [0;31m            [0;31m# log smart subkeys as params and value as metric[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     93 [0;31m            [0;32mimport[0m [0mpdb[0m[0;34m;[0m [0mpdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 94 [0;31m            [0;32mfor[0m [0mk[0m[0;34m,[0m [0mv[0m [0;32min[0m [0mout_dict[0m[0;34m.[0m[0mitems[0m[0;34m([0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     95 [0;31m                [0morigins[0m [0;34m=[0m [0mnp[0m[0;34m.[0m[0marray[0m[0;34m([0m[0;34m[[0m[0msubk[0m[0;34m.[0m[0morigin[0m [0;32mfor[0m [0msubk[0m [0;32min[0m [0mk[0m[0;34m][0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     96 [0;31m                [0;31m# ignore init origins and the last origin (this Vset)[0m[0;34m[0m[0;3

ipdb>  out_dict.items()


dict_items([((X_test, X_test, X_train, subsampling_0, X_train, subsampling_0, LR, Acc), 0.9230769230769231), ((X_test, X_test, X_train, subsampling_0, X_train, subsampling_1, LR, Acc), 0.07692307692307693), ((X_test, X_test, X_train, subsampling_0, X_train, subsampling_2, LR, Acc), 0.8461538461538461), ((X_test, X_test, X_train, subsampling_1, X_train, subsampling_0, LR, Acc), 0.3076923076923077), ((X_test, X_test, X_train, subsampling_1, X_train, subsampling_1, LR, Acc), 0.9230769230769231), ((X_test, X_test, X_train, subsampling_1, X_train, subsampling_2, LR, Acc), 0.15384615384615385), ((X_test, X_test, X_train, subsampling_2, X_train, subsampling_0, LR, Acc), 0.6923076923076923), ((X_test, X_test, X_train, subsampling_2, X_train, subsampling_1, LR, Acc), 0.6153846153846154), ((X_test, X_test, X_train, subsampling_2, X_train, subsampling_2, LR, Acc), 0.8461538461538461), ((X_test, X_test, X_train, subsampling_0, X_train, subsampling_0, DT, Acc), 0.7692307692307693), ((X_test, X_test

ipdb>  out_dict.keys()


dict_keys([(X_test, X_test, X_train, subsampling_0, X_train, subsampling_0, LR, Acc), (X_test, X_test, X_train, subsampling_0, X_train, subsampling_1, LR, Acc), (X_test, X_test, X_train, subsampling_0, X_train, subsampling_2, LR, Acc), (X_test, X_test, X_train, subsampling_1, X_train, subsampling_0, LR, Acc), (X_test, X_test, X_train, subsampling_1, X_train, subsampling_1, LR, Acc), (X_test, X_test, X_train, subsampling_1, X_train, subsampling_2, LR, Acc), (X_test, X_test, X_train, subsampling_2, X_train, subsampling_0, LR, Acc), (X_test, X_test, X_train, subsampling_2, X_train, subsampling_1, LR, Acc), (X_test, X_test, X_train, subsampling_2, X_train, subsampling_2, LR, Acc), (X_test, X_test, X_train, subsampling_0, X_train, subsampling_0, DT, Acc), (X_test, X_test, X_train, subsampling_0, X_train, subsampling_1, DT, Acc), (X_test, X_test, X_train, subsampling_0, X_train, subsampling_2, DT, Acc), (X_test, X_test, X_train, subsampling_1, X_train, subsampling_0, DT, Acc), (X_test, X_tes

ipdb>  tuple(out_dict.keys())


((X_test, X_test, X_train, subsampling_0, X_train, subsampling_0, LR, Acc), (X_test, X_test, X_train, subsampling_0, X_train, subsampling_1, LR, Acc), (X_test, X_test, X_train, subsampling_0, X_train, subsampling_2, LR, Acc), (X_test, X_test, X_train, subsampling_1, X_train, subsampling_0, LR, Acc), (X_test, X_test, X_train, subsampling_1, X_train, subsampling_1, LR, Acc), (X_test, X_test, X_train, subsampling_1, X_train, subsampling_2, LR, Acc), (X_test, X_test, X_train, subsampling_2, X_train, subsampling_0, LR, Acc), (X_test, X_test, X_train, subsampling_2, X_train, subsampling_1, LR, Acc), (X_test, X_test, X_train, subsampling_2, X_train, subsampling_2, LR, Acc), (X_test, X_test, X_train, subsampling_0, X_train, subsampling_0, DT, Acc), (X_test, X_test, X_train, subsampling_0, X_train, subsampling_1, DT, Acc), (X_test, X_test, X_train, subsampling_0, X_train, subsampling_2, DT, Acc), (X_test, X_test, X_train, subsampling_1, X_train, subsampling_0, DT, Acc), (X_test, X_test, X_train

ipdb>  keys = list(out_dict.keys())
ipdb>  keys


[(X_test, X_test, X_train, subsampling_0, X_train, subsampling_0, LR, Acc), (X_test, X_test, X_train, subsampling_0, X_train, subsampling_1, LR, Acc), (X_test, X_test, X_train, subsampling_0, X_train, subsampling_2, LR, Acc), (X_test, X_test, X_train, subsampling_1, X_train, subsampling_0, LR, Acc), (X_test, X_test, X_train, subsampling_1, X_train, subsampling_1, LR, Acc), (X_test, X_test, X_train, subsampling_1, X_train, subsampling_2, LR, Acc), (X_test, X_test, X_train, subsampling_2, X_train, subsampling_0, LR, Acc), (X_test, X_test, X_train, subsampling_2, X_train, subsampling_1, LR, Acc), (X_test, X_test, X_train, subsampling_2, X_train, subsampling_2, LR, Acc), (X_test, X_test, X_train, subsampling_0, X_train, subsampling_0, DT, Acc), (X_test, X_test, X_train, subsampling_0, X_train, subsampling_1, DT, Acc), (X_test, X_test, X_train, subsampling_0, X_train, subsampling_2, DT, Acc), (X_test, X_test, X_train, subsampling_1, X_train, subsampling_0, DT, Acc), (X_test, X_test, X_train

In [3]:
!mlflow ui

[2021-10-11 22:11:25 -0700] [93263] [INFO] Starting gunicorn 20.0.4
[2021-10-11 22:11:25 -0700] [93263] [ERROR] Connection in use: ('127.0.0.1', 5000)
[2021-10-11 22:11:25 -0700] [93263] [ERROR] Retrying in 1 second.
[2021-10-11 22:11:26 -0700] [93263] [ERROR] Connection in use: ('127.0.0.1', 5000)
[2021-10-11 22:11:26 -0700] [93263] [ERROR] Retrying in 1 second.
[2021-10-11 22:11:27 -0700] [93263] [ERROR] Connection in use: ('127.0.0.1', 5000)
[2021-10-11 22:11:27 -0700] [93263] [ERROR] Retrying in 1 second.
[2021-10-11 22:11:28 -0700] [93263] [ERROR] Connection in use: ('127.0.0.1', 5000)
[2021-10-11 22:11:28 -0700] [93263] [ERROR] Retrying in 1 second.
[2021-10-11 22:11:29 -0700] [93263] [ERROR] Connection in use: ('127.0.0.1', 5000)
[2021-10-11 22:11:29 -0700] [93263] [ERROR] Retrying in 1 second.
[2021-10-11 22:11:30 -0700] [93263] [ERROR] Can't connect to ('127.0.0.1', 5000)
Running the mlflow server failed. Please see the logs above for details.
