In [15]:
'''Modified from sklearn documentation: https://scikit-learn.org/stable/modules/compose.html
'''
%load_ext autoreload
%autoreload 2
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_classification
from pcsp import PCSPipeline, ModuleSet, Module # must install pcsp first (pip install pcsp)
from pcsp.module_set import to_tuple, to_list
from sklearn.metrics import accuracy_score, balanced_accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from functools import partial
import itertools
np.set_printoptions(threshold=5) # to limit printing

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Entire pipeline

In [71]:
# ---- Step 1: generate data, split it, and build the subsampling module set ----
np.random.seed(13)  # seed the global NumPy RNG before generating the data
X, y = make_classification(n_samples=50, n_features=5)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Three resamplers; each asks for 30% of the training rows and has its own seed.
subsampling_funcs = [
    partial(resample, n_samples=int(X_train.shape[0]*0.3), random_state=seed)
    for seed in range(3)
]
subsampling_set = ModuleSet(name='subsampling', modules=subsampling_funcs)

# A single dataset is passed in here. Calling
#   subsampling_set([X_train, X_train], [y_train, y_train])
# instead would artificially make it seem like there are multiple datasets
# (data_0 and data_1).
X_all, y_all = subsampling_set([X_train], [y_train])

# ---- Step 2: fit every model on every subsample ----
modeling_set = ModuleSet(
    name='modeling',
    modules=[
        LogisticRegression(max_iter=1000, tol=0.1),
        DecisionTreeClassifier(),
    ],
    module_keys=["LR", "DT"],
)
# Author's note: ModuleSet needs to store something internally for this call to
# work, which makes the returned `models` of limited use on its own.
models = modeling_set.fit(X_all, y_all)

# ---- Step 3: predict, after adding the held-out split under the "test" key ----
X_all["test"] = X_test
y_all["test"] = y_test
preds_all = modeling_set.predict(X_all)

# ---- Step 4: score the predictions with label-based ("hard") metrics ----
hard_metrics_set = ModuleSet(
    name='hard_metrics',
    modules=[accuracy_score, balanced_accuracy_score],
    module_keys=["Acc", "Bal_Acc"],
)
hard_metrics = hard_metrics_set.evaluate(y_all, preds_all)

# Print one "key value" line per evaluated combination.
for metric_key, metric_value in hard_metrics.items():
    print(metric_key, metric_value)

(('data_0', 'subsampling_0', 'LR'), ('data_0', 'subsampling_0'), 'Acc') 0.9090909090909091
(('data_0', 'subsampling_0', 'LR'), ('data_0', 'subsampling_0'), 'Bal_Acc') 0.8333333333333333
(('data_0', 'subsampling_0', 'DT'), ('data_0', 'subsampling_0'), 'Acc') 0.9090909090909091
(('data_0', 'subsampling_0', 'DT'), ('data_0', 'subsampling_0'), 'Bal_Acc') 0.8333333333333333
(('data_0', 'subsampling_1', 'LR'), ('data_0', 'subsampling_0'), 'Acc') 0.9090909090909091
(('data_0', 'subsampling_1', 'LR'), ('data_0', 'subsampling_0'), 'Bal_Acc') 0.8333333333333333
(('data_0', 'subsampling_1', 'DT'), ('data_0', 'subsampling_0'), 'Acc') 0.9090909090909091
(('data_0', 'subsampling_1', 'DT'), ('data_0', 'subsampling_0'), 'Bal_Acc') 0.8333333333333333
(('data_0', 'subsampling_2', 'LR'), ('data_0', 'subsampling_0'), 'Acc') 0.9090909090909091
(('data_0', 'subsampling_2', 'LR'), ('data_0', 'subsampling_0'), 'Bal_Acc') 0.8333333333333333
(('data_0', 'subsampling_2', 'DT'), ('data_0', 'subsampling_0'), 'Acc'

In [46]:
# Inspect the entry stored under the '__prev__' key of the predictions
# container; its string form names a ModuleSet (see the output below).
# NOTE(review): presumably a back-reference to the ModuleSet that produced the
# predictions — confirm against pcsp.
str(preds_all['__prev__'])

'ModuleSet(modeling)'

In [16]:
# Generate the data, split it, and subsample the training split.
np.random.seed(13)  # seed the global NumPy RNG before generating the data
X, y = make_classification(n_samples=50, n_features=5)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Three resamplers; each asks for 30% of the training rows and has its own seed.
subsampling_funcs = [
    partial(resample, n_samples=int(X_train.shape[0]*0.3), random_state=seed)
    for seed in range(3)
]
subsampling_set = ModuleSet(name='subsampling', modules=subsampling_funcs)

# Apply every resampler to the single training dataset.
X_all, y_all = subsampling_set([X_train], [y_train])

In [18]:
# Display the subsampled feature arrays; per the output below they are keyed by
# (dataset, subsampling) tuples such as ('data_0', 'subsampling_0').
X_all

{('data_0',
  'subsampling_0'): array([[-0.30582021, -0.97541273, -1.15871714, -1.35132194, -0.54657674],
        [-1.43494529,  1.1326585 , -0.83681009,  0.13067855, -0.84447641],
        [-1.43494529,  1.1326585 , -0.83681009,  0.13067855, -0.84447641],
        ...,
        [-0.20317177, -0.50101282, -0.65135496, -0.73113299, -0.93235926],
        [-1.35884162,  0.85250676, -0.96974597, -0.12569848, -2.2068671 ],
        [-1.35884162,  0.85250676, -0.96974597, -0.12569848, -2.2068671 ]]),
 ('data_0',
  'subsampling_1'): array([[-1.48243722,  0.76640114, -1.1897997 , -0.3226144 ,  0.35136153],
        [-0.1629447 , -1.15701078, -1.13084551, -1.44233728,  0.40067367],
        [ 1.07469058, -0.93785005,  0.55456857, -0.19937499, -0.88249039],
        ...,
        [-1.22242099,  0.8517753 , -0.80402087, -0.01690056,  0.2283352 ],
        [-1.48243722,  0.76640114, -1.1897997 , -0.3226144 ,  0.35136153],
        [-1.70489766,  1.13784372, -1.16173936, -0.08037683, -0.403129  ]]),
 ('data_

# Subsampling

In [10]:
# Create the subsampling module set, this time with explicitly named modules.

# Seed the global NumPy RNG before generating data. make_classification is
# called without a random_state here, so without this line the dataset (and
# every downstream output) changes on each run. The seed matches the one used
# by the data-generating cells earlier in the notebook.
np.random.seed(13)
X, y = make_classification(n_samples=50, n_features=5)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# One key per resampler; each resampler asks for 30% of the training rows and
# has its own seed.
subsampling_keys = ["subsampling_1","subsampling_2","subsampling_3"]
subsampling_funcs = [partial(resample,
                            n_samples=int(X_train.shape[0]*0.3),
                            random_state=i)
                     for i in range(3)]

# Pair each key with its resampler and hand the mapping to ModuleSet.
subsampling_mods = dict(zip(subsampling_keys, subsampling_funcs))
subsampling_set = ModuleSet(name='subsampling', modules=subsampling_mods)

In [11]:
# The training data is passed twice, which produces two datasets in the result
# keys ('data_0' and 'data_1', as seen in the outputs further down).
# NOTE: this rebinds X and y from the raw arrays to the keyed containers
# returned by the module set; later cells index them by key (e.g. X["test"]).
X, y = subsampling_set([X_train, X_train], [y_train, y_train])

In [None]:
# Display the subsampled features returned by subsampling_set.
X

In [None]:
# Inspect what the module set holds in its `modules` attribute; the modeling
# section below passes this attribute to modeling_set.fit.
subsampling_set.modules

# Modeling 

In [37]:
# Short labels for the estimators, in the same order as the estimators below.
modeling_keys = ["LR","DT"]
modeling_funcs = [
    LogisticRegression(max_iter=1000, tol=0.1),
    DecisionTreeClassifier(),
]

# Map each label to its estimator and wrap the mapping in a ModuleSet.
modeling_mods = {label: estimator
                 for label, estimator in zip(modeling_keys, modeling_funcs)}
modeling_set = ModuleSet(modules=modeling_mods, name='modeling')

In [38]:
# Fit the modeling set; per the output below the result is keyed by
# (dataset, subsampling, model) tuples.
# NOTE(review): this passes subsampling_set.modules, whereas the
# "Entire pipeline" cell calls modeling_set.fit(X_all, y_all) — confirm which
# call form the installed pcsp version expects.
models = modeling_set.fit(subsampling_set.modules)
models
# Leftover scratch (disabled): fitting a single estimator by hand on one stored
# entry. The 'item_0' key below does not match the 'data_0' keys in the output.
#train = [X_train,y_train]
#x = LogisticRegression(max_iter=1000, tol=0.1).fit
#x(*subsampling_set.modules[('item_0','subsampling_1')])

{('data_0', 'subsampling_1', 'LR'): LogisticRegression(max_iter=1000, tol=0.1),
 ('data_0', 'subsampling_1', 'DT'): DecisionTreeClassifier(),
 ('data_0', 'subsampling_2', 'LR'): LogisticRegression(max_iter=1000, tol=0.1),
 ('data_0', 'subsampling_2', 'DT'): DecisionTreeClassifier(),
 ('data_0', 'subsampling_3', 'LR'): LogisticRegression(max_iter=1000, tol=0.1),
 ('data_0', 'subsampling_3', 'DT'): DecisionTreeClassifier(),
 ('data_1', 'subsampling_1', 'LR'): LogisticRegression(max_iter=1000, tol=0.1),
 ('data_1', 'subsampling_1', 'DT'): DecisionTreeClassifier(),
 ('data_1', 'subsampling_2', 'LR'): LogisticRegression(max_iter=1000, tol=0.1),
 ('data_1', 'subsampling_2', 'DT'): DecisionTreeClassifier(),
 ('data_1', 'subsampling_3', 'LR'): LogisticRegression(max_iter=1000, tol=0.1),
 ('data_1', 'subsampling_3', 'DT'): DecisionTreeClassifier()}

# Prediction

In [39]:
# Add the held-out features under the "test" key (this mutates X in place, so
# re-running the cell just overwrites the same key), then run predict on X.
X["test"] = X_test
preds = modeling_set.predict(X)

# Hard-Metrics

In [None]:
# Register the held-out labels under the "test" key, next to the subsampled ones.
y["test"] = y_test

# Label-based ("hard") metrics, each paired with a short display key.
hard_metrics_keys = ["Acc","Bal_Acc"]
hard_metrics_funcs = [accuracy_score, balanced_accuracy_score]
hard_metrics_mods = {label: metric
                     for label, metric in zip(hard_metrics_keys, hard_metrics_funcs)}
hard_metrics_set = ModuleSet(modules=hard_metrics_mods, name='hard_metrics')

# Evaluate the labels against the predictions; the result is the cell's output.
hard_metrics_set.evaluate(y, preds)

In [20]:
# Display the label container: one array per (dataset, subsampling) key plus
# the "test" entry added in the hard-metrics cell.
y

{('data_0', 'subsampling_1'): array([0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1]),
 ('data_0', 'subsampling_2'): array([1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1]),
 ('data_0', 'subsampling_3'): array([0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1]),
 ('data_1', 'subsampling_1'): array([0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1]),
 ('data_1', 'subsampling_2'): array([1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1]),
 ('data_1', 'subsampling_3'): array([0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1]),
 'test': array([1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0])}

# Tracking things with Pipeline (deprecated)

In [None]:
# Create an empty PCSPipeline; its steps are assigned in the next cell.
p = PCSPipeline()

In [None]:
# Assign the ordered list of module sets the pipeline should run.
# NOTE(review): soft_metrics_set is not defined in any cell visible in this
# notebook; unless it is defined elsewhere this line raises NameError.
p.steps = [subsampling_set, modeling_set, soft_metrics_set] # how to deal w/ hard metrics?

In [None]:
# Run the pipeline on the single training dataset (features and labels are each
# wrapped in a one-element list, as in the direct subsampling_set call).
p.run([X_train], [y_train])

In [None]:
# Get the object returned by generate_names() and attach a 'soft_metrics'
# column to it, then display the result.
# NOTE(review): soft_metrics is not defined in any cell visible in this
# notebook; unless it is defined elsewhere the assignment raises NameError.
# NOTE(review): presumably generate_names() returns a pandas DataFrame — confirm.
df = p.generate_names()
df['soft_metrics'] = soft_metrics
df