In [33]:
'''Modified from sklearn documentation: https://scikit-learn.org/stable/modules/compose.html
'''
%load_ext autoreload
%autoreload 2
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_classification
from pcsp import PCSPipeline, ModuleSet, Module # must install pcsp first (pip install pcsp)
from pcsp.module_set import to_tuple, to_list
from sklearn.metrics import accuracy_score, balanced_accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from functools import partial
import itertools

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Subsampling

In [34]:
# Create the subsampling module set.
#
# The synthetic dataset is seeded (as the train/test split already was) so
# that "Restart Kernel -> Run All" reproduces the same arrays in every cell
# that follows; without a seed each run generates different data.
X, y = make_classification(n_samples=50, n_features=5, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Three resamplers built from sklearn.utils.resample, each drawing 30% of the
# training rows. They differ only in `random_state`, so every one yields a
# different but repeatable subsample.
subsampling_funcs = [partial(resample,
                            n_samples=int(X_train.shape[0]*0.3),
                            random_state=i)
                    for i in range(3)]
subsampling_keys = ["subsampling_1","subsampling_2","subsampling_3"]
# Pair each readable key with its resampler and wrap the mapping in a ModuleSet.
subsampling_mods = dict(zip(subsampling_keys, subsampling_funcs))
subsampling_set = ModuleSet(name='subsampling', modules=subsampling_mods)


In [35]:
# Apply every resampler to each (X, y) pair. The training set is passed twice,
# presumably so that the results carry both 'data_0' and 'data_1' keys (both
# appear in the fitted-model output further down).
# NOTE(review): this rebinds X and y from the raw arrays created above to the
# ModuleSet's return values; y is displayed later as a dict keyed by
# (data, subsampling) tuples.
X,y = subsampling_set([X_train,X_train], [y_train,y_train])

In [36]:
# Inspect what the set holds after the call above: the output shows
# (data, subsampling) tuple keys mapping to [X_subsample, y_subsample] lists.
# NOTE(review): this prints every array in full; showing only the keys would
# keep the notebook output short.
subsampling_set.modules

{('data_0',
  'subsampling_1'): [array([[-0.76922836, -0.80282378,  0.97446489,  0.52052642, -0.04706701],
         [ 1.46225816,  1.54847904, -2.22176999, -0.97837337,  0.10666303],
         [ 1.46225816,  1.54847904, -2.22176999, -0.97837337,  0.10666303],
         [ 0.35920577,  0.36833298, -0.34665559,  0.06639856,  0.01693412],
         [-1.66419241, -1.6993076 ,  1.48757827, -1.02024558, -0.07294152],
         [ 1.93486715,  1.98431937, -1.87202968,  0.22506876,  0.09143761],
         [-1.21655464, -1.12353433, -0.87339908, -0.04691735,  0.03794123],
         [-0.93971597, -0.82327167, -1.41133455,  1.52480386,  0.06359459],
         [ 0.60105129,  0.49754582,  1.38224272, -0.55949076, -0.06299471],
         [-0.53615383, -0.44089283, -1.28142553, -0.25426726,  0.05844693],
         [-0.53615383, -0.44089283, -1.28142553, -0.25426726,  0.05844693]]), array([0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0])],
 ('data_0',
  'subsampling_2'): [array([[ 0.51007275,  0.5627069 , -1.14768583,  1.42704

# Modeling 

In [37]:
# Two candidate classifiers, registered under short labels.
modeling_keys = ["LR", "DT"]
modeling_funcs = [
    LogisticRegression(max_iter=1000, tol=0.1),
    DecisionTreeClassifier(),
]
# label -> estimator mapping, in the same order as the lists above
modeling_mods = {label: estimator
                 for label, estimator in zip(modeling_keys, modeling_funcs)}
modeling_set = ModuleSet(name='modeling', modules=modeling_mods)

In [38]:
# Fit the modeling set on the subsampling results: `.fit` is handed the
# subsampling set's `.modules` dict, and the returned mapping (displayed
# below) is keyed by (data, subsampling, model) tuples.
models = modeling_set.fit(subsampling_set.modules)
models
# Leftover debugging scratch, kept for reference. NOTE(review): it indexes with
# an 'item_0' key, whereas the keys shown in the outputs use 'data_0'.
#train = [X_train,y_train]
#x = LogisticRegression(max_iter=1000, tol=0.1).fit
#x(*subsampling_set.modules[('item_0','subsampling_1')])

{('data_0', 'subsampling_1', 'LR'): LogisticRegression(max_iter=1000, tol=0.1),
 ('data_0', 'subsampling_1', 'DT'): DecisionTreeClassifier(),
 ('data_0', 'subsampling_2', 'LR'): LogisticRegression(max_iter=1000, tol=0.1),
 ('data_0', 'subsampling_2', 'DT'): DecisionTreeClassifier(),
 ('data_0', 'subsampling_3', 'LR'): LogisticRegression(max_iter=1000, tol=0.1),
 ('data_0', 'subsampling_3', 'DT'): DecisionTreeClassifier(),
 ('data_1', 'subsampling_1', 'LR'): LogisticRegression(max_iter=1000, tol=0.1),
 ('data_1', 'subsampling_1', 'DT'): DecisionTreeClassifier(),
 ('data_1', 'subsampling_2', 'LR'): LogisticRegression(max_iter=1000, tol=0.1),
 ('data_1', 'subsampling_2', 'DT'): DecisionTreeClassifier(),
 ('data_1', 'subsampling_3', 'LR'): LogisticRegression(max_iter=1000, tol=0.1),
 ('data_1', 'subsampling_3', 'DT'): DecisionTreeClassifier()}

# Prediction

In [39]:
# Store the held-out features under a 'test' key alongside the subsamples,
# then ask the modeling set for predictions on everything in X.
# NOTE(review): X is mutated in place here, so cells that displayed or used X
# earlier now see the extra 'test' entry as well.
X["test"] = X_test
preds = modeling_set.predict(X)


In [40]:
# Show the predictions; keys are (model_key, data_key) pairs.
# NOTE(review): in the visible part of the output every model is paired with
# the ('data_0', 'subsampling_1') data key and the arrays are identical --
# confirm that predictions for the other data keys and for 'test' are produced.
preds

{(('data_0', 'subsampling_1', 'LR'),
  ('data_0', 'subsampling_1')): array([0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0]),
 (('data_0', 'subsampling_1', 'DT'),
  ('data_0', 'subsampling_1')): array([0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0]),
 (('data_0', 'subsampling_2', 'LR'),
  ('data_0', 'subsampling_1')): array([0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0]),
 (('data_0', 'subsampling_2', 'DT'),
  ('data_0', 'subsampling_1')): array([0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0]),
 (('data_0', 'subsampling_3', 'LR'),
  ('data_0', 'subsampling_1')): array([0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0]),
 (('data_0', 'subsampling_3', 'DT'),
  ('data_0', 'subsampling_1')): array([0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0]),
 (('data_1', 'subsampling_1', 'LR'),
  ('data_0', 'subsampling_1')): array([0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0]),
 (('data_1', 'subsampling_1', 'DT'),
  ('data_0', 'subsampling_1')): array([0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0]),
 (('data_1', 'subsampling_2', 'LR'),
  ('data_0', 'subsampling_1')): array([0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0]),
 

In [41]:
# Sanity check on key layout: a prediction key is a (model_key, data_key)
# pair, so every element of the data key must appear in the pair's last entry.
key1 = ('data_0', 'subsampling_1')
key2 = (key1 + ('LR',), key1)
set(key1) <= set(key2[-1])

True

# Hard-Metrics

In [42]:
# Register the held-out labels under the same 'test' key used for the features.
y["test"] = y_test

# Metrics that score hard (class-label) predictions, each under a short label.
hard_metrics_keys = ["Acc", "Bal_Acc"]
hard_metrics_funcs = [accuracy_score, balanced_accuracy_score]
hard_metrics_mods = {label: metric
                     for label, metric in zip(hard_metrics_keys, hard_metrics_funcs)}
hard_metrics_set = ModuleSet(name='hard_metrics', modules=hard_metrics_mods)

# Score the predictions against the labels; the result is displayed below.
hard_metrics_set.evaluate(y, preds)

2


{(('data_0', 'subsampling_1', 'LR'),
  ('data_0', 'subsampling_1'),
  'Acc'): 0.9090909090909091,
 (('data_0', 'subsampling_1', 'LR'),
  ('data_0', 'subsampling_1'),
  'Bal_Acc'): 0.9166666666666667,
 (('data_0', 'subsampling_1', 'DT'),
  ('data_0', 'subsampling_1'),
  'Acc'): 0.9090909090909091,
 (('data_0', 'subsampling_1', 'DT'),
  ('data_0', 'subsampling_1'),
  'Bal_Acc'): 0.9166666666666667,
 (('data_0', 'subsampling_2', 'LR'),
  ('data_0', 'subsampling_1'),
  'Acc'): 0.9090909090909091,
 (('data_0', 'subsampling_2', 'LR'),
  ('data_0', 'subsampling_1'),
  'Bal_Acc'): 0.9166666666666667,
 (('data_0', 'subsampling_2', 'DT'),
  ('data_0', 'subsampling_1'),
  'Acc'): 0.9090909090909091,
 (('data_0', 'subsampling_2', 'DT'),
  ('data_0', 'subsampling_1'),
  'Bal_Acc'): 0.9166666666666667,
 (('data_0', 'subsampling_3', 'LR'),
  ('data_0', 'subsampling_1'),
  'Acc'): 0.9090909090909091,
 (('data_0', 'subsampling_3', 'LR'),
  ('data_0', 'subsampling_1'),
  'Bal_Acc'): 0.9166666666666667,


In [20]:
# Inspect the label dict: one array per (data, subsampling) key plus 'test'.
# NOTE(review): this cell's execution count (20) is out of sequence with the
# cells above, and the labels shown do not match the earlier subsample dump,
# so the output is stale -- re-run after a fresh "Restart & Run All".
y

{('data_0', 'subsampling_1'): array([0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1]),
 ('data_0', 'subsampling_2'): array([1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1]),
 ('data_0', 'subsampling_3'): array([0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1]),
 ('data_1', 'subsampling_1'): array([0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1]),
 ('data_1', 'subsampling_2'): array([1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1]),
 ('data_1', 'subsampling_3'): array([0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1]),
 'test': array([1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0])}

## End-to-end workflow (unexecuted draft using `past_args`)

In [None]:
# Draft of the full workflow (subsampling -> modeling -> metrics) written
# against a `past_args`-style ModuleSet interface. This cell has no execution
# count, i.e. it has not been run in the saved notebook.
# NOTE(review): make_classification is unseeded here, so the data changes on
# every run; only the train/test split is reproducible.
X, y = make_classification(n_samples=50, n_features=20)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# subsampling
# Three resamplers, each drawing 10% of the training rows with its own seed.
# NOTE(review): modules are passed as a plain list here, whereas the cells
# above pass a dict of key -> module; confirm which form ModuleSet expects.
subsampling_mods = [partial(resample,
                            n_samples=int(X_train.shape[0]*0.1),
                            random_state=i)
                    for i in range(3)]

subsampling_set = ModuleSet(name='subsampling', modules=subsampling_mods)

# first set takes in some weird looking args -- might need to fix this
# ([x1, x2, x3], [y1, y2, y3])
X,y = subsampling_set([X_train], [y_train],past_args = {}) # X, y are each lists 
# X1, y1 = X[0], y[0] # access one run of things

# modeling
modeling_mods = [LogisticRegression(max_iter=1000, tol=0.1), DecisionTreeClassifier()]
modeling_set = ModuleSet(name='modeling', modules= modeling_mods)
# NOTE(review): the fit call below is commented out (and refers to
# `output_dict`, which this cell never defines), so `models` is not assigned
# anywhere in this cell.
#models = modeling_set.fit(past_args = output_dict)

# metrics

#ys_aligned = modeling_set.repeat(y) # annoying we have to explicitly write this
# NOTE(review): `models` used here comes from whatever an earlier cell left in
# the kernel (or raises NameError on a fresh kernel) -- see the note above.
preds = modeling_set.predict(X_test,past_args = models)
hard_metrics_mods = [accuracy_score, balanced_accuracy_score]
hard_metrics_set = ModuleSet(name='hard_metrics', modules=hard_metrics_mods)
#hard_metrics = hard_metrics_set(y_test,past_args = preds) # need to find a way to know to replicate y here

# Class-probability predictions, intended as input for the soft metrics.
preds_proba = modeling_set.predict_proba(X_test,past_args = models)
soft_metrics_mods = [roc_auc_score]
soft_metrics_set = ModuleSet(name='soft_metrics', modules=soft_metrics_mods)
# NOTE(review): with this line commented out, `soft_metrics` is never assigned
# in the notebook.
#soft_metrics = soft_metrics_set(y_test, past_args = preds_proba)


# Tracking things with PCSPipeline

In [None]:
# Create an empty pipeline object; its steps are assigned in the next cell.
p = PCSPipeline()

In [None]:
# Register the module sets as the pipeline's steps, in execution order.
# Only the soft-metrics set is included; the hard-metrics set is left out.
p.steps = [subsampling_set, modeling_set, soft_metrics_set] # how to deal w/ hard metrics?

In [None]:
# Run the pipeline on the training data, passed as single-element lists in
# the same shape the subsampling set is called with in the draft cell above.
p.run([X_train], [y_train])

In [None]:
# Build a table of names from the pipeline and attach the soft-metric values
# as a column.
# NOTE(review): `soft_metrics` is not assigned by any executed line in this
# notebook (its only assignment is commented out), so this cell raises
# NameError on a fresh kernel.
df = p.generate_names()
df['soft_metrics'] = soft_metrics
df