In [9]:
'''Modified from sklearn documentation: https://scikit-learn.org/stable/modules/compose.html
'''
%load_ext autoreload
%autoreload 2
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_classification
from pcsp import PCSPipeline, ModuleSet, Module # must install pcsp first (pip install pcsp)
from pcsp.module_set import to_tuple, to_list
from sklearn.metrics import accuracy_score, balanced_accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from functools import partial
import itertools

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Subsampling

In [10]:
#Create Subsampling module set

X, y = make_classification(n_samples=50, n_features=5)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
subsampling_keys = ["subsampling_1","subsampling_2","subsampling_3"]
subsampling_funcs = [partial(resample,
                            n_samples=int(X_train.shape[0]*0.3),
                            random_state=i)
                     for i in range(3)]
subsampling_mods = dict(zip(subsampling_keys, subsampling_funcs))
subsampling_set = ModuleSet(name='subsampling', modules=subsampling_mods)

In [11]:
X, y = subsampling_set([X_train, X_train], [y_train, y_train])

In [None]:
X

In [None]:
subsampling_set.modules

# Modeling 

In [37]:
modeling_funcs = [LogisticRegression(max_iter=1000, tol=0.1), DecisionTreeClassifier()]
modeling_keys = ["LR","DT"]
modeling_mods = dict(zip(modeling_keys,modeling_funcs))
modeling_set = ModuleSet(name = 'modeling', modules = modeling_mods)

In [38]:
models = modeling_set.fit(subsampling_set.modules)
models
#train = [X_train,y_train]
#x = LogisticRegression(max_iter=1000, tol=0.1).fit
#x(*subsampling_set.modules[('item_0','subsampling_1')])

{('data_0', 'subsampling_1', 'LR'): LogisticRegression(max_iter=1000, tol=0.1),
 ('data_0', 'subsampling_1', 'DT'): DecisionTreeClassifier(),
 ('data_0', 'subsampling_2', 'LR'): LogisticRegression(max_iter=1000, tol=0.1),
 ('data_0', 'subsampling_2', 'DT'): DecisionTreeClassifier(),
 ('data_0', 'subsampling_3', 'LR'): LogisticRegression(max_iter=1000, tol=0.1),
 ('data_0', 'subsampling_3', 'DT'): DecisionTreeClassifier(),
 ('data_1', 'subsampling_1', 'LR'): LogisticRegression(max_iter=1000, tol=0.1),
 ('data_1', 'subsampling_1', 'DT'): DecisionTreeClassifier(),
 ('data_1', 'subsampling_2', 'LR'): LogisticRegression(max_iter=1000, tol=0.1),
 ('data_1', 'subsampling_2', 'DT'): DecisionTreeClassifier(),
 ('data_1', 'subsampling_3', 'LR'): LogisticRegression(max_iter=1000, tol=0.1),
 ('data_1', 'subsampling_3', 'DT'): DecisionTreeClassifier()}

# Prediction

In [39]:
X["test"] = X_test
preds = modeling_set.predict(X)

# Hard-Metrics

In [None]:
y["test"] = y_test
hard_metrics_funcs = [accuracy_score, balanced_accuracy_score]
hard_metrics_keys = ["Acc","Bal_Acc"] 
hard_metrics_mods = dict(zip(hard_metrics_keys,hard_metrics_funcs))
hard_metrics_set = ModuleSet(name='hard_metrics', modules=hard_metrics_mods)
hard_metrics_set.evaluate(y,preds)

In [20]:
y

{('data_0', 'subsampling_1'): array([0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1]),
 ('data_0', 'subsampling_2'): array([1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1]),
 ('data_0', 'subsampling_3'): array([0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1]),
 ('data_1', 'subsampling_1'): array([0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1]),
 ('data_1', 'subsampling_2'): array([1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1]),
 ('data_1', 'subsampling_3'): array([0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1]),
 'test': array([1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0])}

## Everything in one cell

In [25]:
# subsample data
X, y = make_classification(n_samples=50, n_features=5)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
subsampling_keys = ["subsampling_1","subsampling_2","subsampling_3"]
subsampling_funcs = [partial(resample,
                            n_samples=int(X_train.shape[0]*0.3),
                            random_state=i)
                     for i in range(3)]
subsampling_mods = dict(zip(subsampling_keys, subsampling_funcs))
subsampling_set = ModuleSet(name='subsampling', modules=subsampling_mods)
X_all, y_all = subsampling_set([X_train, X_train], [y_train, y_train])

# fit models
modeling_funcs = [LogisticRegression(max_iter=1000, tol=0.1), DecisionTreeClassifier()]
modeling_keys = ["LR","DT"]
modeling_mods = dict(zip(modeling_keys, modeling_funcs))
modeling_set = ModuleSet(name = 'modeling', modules = modeling_mods)
models = modeling_set.fit(subsampling_set.modules)

# get predictions
X_all["test"] = X_test
y_all["test"] = y_test
preds = modeling_set.predict(X_all)

# get metrics
hard_metrics_keys = ["Acc", "Bal_Acc"]
hard_metrics_funcs = [accuracy_score, balanced_accuracy_score]
hard_metrics_mods = dict(zip(hard_metrics_keys,hard_metrics_funcs))
hard_metrics_set = ModuleSet(name='hard_metrics', modules=hard_metrics_mods)
hard_metrics_set.evaluate(y_all, preds)

2


{(('data_0', 'subsampling_1', 'LR'), ('data_0', 'subsampling_1'), 'Acc'): 1.0,
 (('data_0', 'subsampling_1', 'LR'),
  ('data_0', 'subsampling_1'),
  'Bal_Acc'): 1.0,
 (('data_0', 'subsampling_1', 'DT'), ('data_0', 'subsampling_1'), 'Acc'): 1.0,
 (('data_0', 'subsampling_1', 'DT'),
  ('data_0', 'subsampling_1'),
  'Bal_Acc'): 1.0,
 (('data_0', 'subsampling_2', 'LR'), ('data_0', 'subsampling_1'), 'Acc'): 1.0,
 (('data_0', 'subsampling_2', 'LR'),
  ('data_0', 'subsampling_1'),
  'Bal_Acc'): 1.0,
 (('data_0', 'subsampling_2', 'DT'), ('data_0', 'subsampling_1'), 'Acc'): 1.0,
 (('data_0', 'subsampling_2', 'DT'),
  ('data_0', 'subsampling_1'),
  'Bal_Acc'): 1.0,
 (('data_0', 'subsampling_3', 'LR'), ('data_0', 'subsampling_1'), 'Acc'): 1.0,
 (('data_0', 'subsampling_3', 'LR'),
  ('data_0', 'subsampling_1'),
  'Bal_Acc'): 1.0,
 (('data_0', 'subsampling_3', 'DT'), ('data_0', 'subsampling_1'), 'Acc'): 1.0,
 (('data_0', 'subsampling_3', 'DT'),
  ('data_0', 'subsampling_1'),
  'Bal_Acc'): 1.0,
 (('

# tracking things w Pipeline (deprecated)

In [None]:
p = PCSPipeline()

In [None]:
p.steps = [subsampling_set, modeling_set, soft_metrics_set] # how to deal w/ hard metrics?

In [None]:
p.run([X_train], [y_train])

In [None]:
df = p.generate_names()
df['soft_metrics'] = soft_metrics
df