In [1]:
'''Modified from sklearn documentation: https://scikit-learn.org/stable/modules/compose.html
'''
%load_ext autoreload
%autoreload 2
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_classification
from pcsp import PCSPipeline, ModuleSet, Module # must install pcsp first (pip install pcsp)
from pcsp.module_set import to_tuple, to_list
from sklearn.metrics import accuracy_score, balanced_accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from functools import partial
import itertools

In [3]:
# Synthetic binary-classification data. The generator is seeded so that the
# dataset -- and every metric computed from it further down -- is the same on
# each Restart Kernel -> Run All.
X, y = make_classification(n_samples=50, n_features=20, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# subsampling: three resampling modules that differ only in their seed, each
# drawing SUBSAMPLE_FRACTION of the training rows.
SUBSAMPLE_FRACTION = 0.8
subsampling_mods = [partial(resample,
                            n_samples=int(X_train.shape[0] * SUBSAMPLE_FRACTION),
                            random_state=i)
                    for i in range(3)]
subsampling_set = ModuleSet(name='subsampling', modules=subsampling_mods)

# The first set in a chain takes each input wrapped in a list -- might need to fix this.
# The result is expected to look like ([x1, x2, x3], [y1, y2, y3]).
# NOTE: this rebinds X and y; from here on they hold the subsampled training
# data, not the full dataset created above.
X, y = subsampling_set([X_train], [y_train]) # X, y are each lists
# X1, y1 = X[0], y[0] # access one run of things

# modeling: one linear and one tree-based classifier, fitted on the data in X, y.
# The tree is seeded because scikit-learn randomly permutes the features at
# each split, so an unseeded DecisionTreeClassifier is not guaranteed to give
# the same fit from one run to the next.
# NOTE(review): the same two estimator instances are used for every input in
# X -- confirm that ModuleSet.fit clones them per input rather than refitting
# the same objects in place.
modeling_mods = [LogisticRegression(max_iter=1000, tol=0.1),
                 DecisionTreeClassifier(random_state=0)]
modeling_set = ModuleSet(name='modeling', modules=modeling_mods)
models = modeling_set.fit(X, y)

# metrics
# Predictions are made on X, the same inputs the models were fitted on, so the
# scores below are in-sample; X_test / y_test are not used in this section.
ys_aligned = modeling_set.repeat(y) # labels must be passed through repeat() by hand before scoring (presumably so they line up with preds -- confirm in pcsp)
preds = modeling_set.predict(X)
hard_metrics_mods = [accuracy_score, balanced_accuracy_score]
hard_metrics_set = ModuleSet(name='hard_metrics', modules=hard_metrics_mods)
hard_metrics = hard_metrics_set(ys_aligned, preds) # TODO: find a way for the set to replicate y on its own

# Soft metric: ROC AUC computed from the predict_proba output against the same aligned labels.
# NOTE(review): for binary targets roc_auc_score expects a 1-D positive-class score -- verify what ModuleSet.predict_proba hands to it.
preds_proba = modeling_set.predict_proba(X)
soft_metrics_mods = [roc_auc_score]
soft_metrics_set = ModuleSet(name='soft_metrics', modules=soft_metrics_mods)
soft_metrics = soft_metrics_set(ys_aligned, preds_proba)

In [4]:
# Show every hard-metric score as one flat list. The recorded output has 12 entries
# (presumably 3 subsamples x 2 models x 2 metrics -- confirm the ordering in pcsp).
print(hard_metrics)

[0.9655172413793104, 0.95, 0.5862068965517241, 0.5928571428571429, 0.6896551724137931, 0.6740196078431373, 0.4827586206896552, 0.4631578947368421, 0.5517241379310345, 0.5547619047619048, 1.0, 1.0]


# Tracking results with PCSPipeline

In [18]:
# Create the pipeline object with no arguments; its steps are assigned separately.
p = PCSPipeline()

In [19]:
# Register the sets in the order they were applied above. Only the soft-metric
# set is included as the final step; hard_metrics_set is left out for now.
p.steps = [subsampling_set, modeling_set, soft_metrics_set] # TODO: how to deal w/ hard metrics?

In [23]:
# Build the table of module names for each path through the pipeline and attach
# the ROC AUC scores as a column. Scores are matched to rows purely by position.
df = p.generate_names().assign(soft_metrics=soft_metrics)
df

Unnamed: 0,subsampling,modeling,soft_metrics
0,0_functool,0_Logistic,0.95
1,0_functool,1_Decision,0.592857
2,1_functool,0_Logistic,0.67402
3,1_functool,1_Decision,0.463158
4,2_functool,0_Logistic,0.554762
5,2_functool,1_Decision,1.0
