In [1]:
'''Modified from sklearn documentation: https://scikit-learn.org/stable/modules/compose.html
'''
%load_ext autoreload
%autoreload 2
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import make_classification
from pcsp import PCSPipeline, ModuleSet, Module # must install pcsp first (pip install pcsp)
from sklearn.metrics import accuracy_score, balanced_accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from functools import partial
# Seed the synthetic dataset so that Restart & Run All reproduces the same X, y
# (and therefore the same splits, subsamples and metrics) on every run.
X, y = make_classification(random_state=42)

In [2]:
# Size of the training split that the first pipeline step will produce.
# It is computed here, with the same split settings as that step, so this cell
# does not rely on an `X_train` variable that is only defined further down the
# notebook (which would raise NameError on a fresh kernel).
n_train = train_test_split(X, random_state=42)[0].shape[0]

p = PCSPipeline()
p.steps = [
    partial(train_test_split, random_state=42), # outputs X_train, X_test, y_train, y_test
    # three subsampling modules, each drawing 80% of the training-split size
    # with a different seed
    ModuleSet(name='subsampling', modules=[partial(resample, # subsamples
                                                   n_samples=int(n_train * 0.8), random_state=i)
                                           for i in range(3)]),
]
p.run(X, y)

In [10]:
# Number of items held in the first entry of the pipeline's cache after the run above.
len(p.cache[0])

2

# Using ModuleSet

In [40]:
# Hold out a test split; only the training part is subsampled and modeled below.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# subsampling split: three resamples of the training data, each 80% of its size,
# differing only in their random seed
subsampling_mods = [partial(resample, n_samples=int(X_train.shape[0]*0.8), random_state=i)
                    for i in range(3)]
subsampling_set = ModuleSet(name='subsampling', modules=subsampling_mods)


datas = subsampling_set(X_train, y_train)
# Loop variables are deliberately NOT named X / y: rebinding those would overwrite
# the notebook-level dataset, so re-running this cell (or any earlier cell) would
# silently operate on the last subsample instead of the full data.
for (X_sub, y_sub) in datas:

    # modeling: fresh, unfitted estimators for every subsample
    modeling_mods = [LogisticRegression(max_iter=1000, tol=0.1), DecisionTreeClassifier()]
    modeling_set = ModuleSet(name='modeling', modules=modeling_mods)
    modeling_set.fit(X_sub, y_sub)

    # different potential outputs (split)
    # NOTE: predictions are made on the same subsample the models were fit on,
    # so the metrics printed below are in-sample scores.
    preds_list = modeling_set.predict(X_sub)
    preds_proba_list = modeling_set.predict_proba(X_sub)

    # metrics that consume hard label predictions
    hard_metrics_mods = [accuracy_score, balanced_accuracy_score]
    hard_metrics_set = ModuleSet(name='hard_metrics', modules=hard_metrics_mods)
    for preds in preds_list:
        print(hard_metrics_set(y_sub, preds))

    # metrics that consume predicted scores / probabilities
    # NOTE(review): for binary targets roc_auc_score expects 1-D positive-class
    # scores -- confirm the shape of what ModuleSet.predict_proba yields here.
    soft_metrics_mods = [roc_auc_score]
    soft_metrics_set = ModuleSet(name='soft_metrics', modules=soft_metrics_mods)
    for preds_proba in preds_proba_list:
        print(soft_metrics_set(y_sub, preds_proba))

[1.0, 1.0]
[1.0, 1.0]
[1.0]
[1.0]
[1.0, 1.0]
[1.0, 1.0]
[1.0]
[1.0]
[1.0, 1.0]
[1.0, 1.0]
[1.0]
[1.0]
