In [101]:
'''Modified from sklearn documentation: https://scikit-learn.org/stable/modules/compose.html
'''
%load_ext autoreload
%autoreload 2
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn.datasets
from sklearn.linear_model import LogisticRegression, Ridge,RidgeCV
# from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.metrics import accuracy_score, balanced_accuracy_score, roc_auc_score,r2_score
from sklearn.model_selection import train_test_split
import sklearn.utils
import os
import sys
from functools import partial
import itertools
import networkx as nx
np.set_printoptions(threshold=5) # to limit printing

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [102]:
sys.path.append('../')
import pcsp
from pcsp import PCSPipeline, ModuleSet, Module, init_args # must install pcsp first (pip install pcsp)
from pcsp.pipeline import build_graph

# basic pipeline

In [92]:
# initialize data
np.random.seed(13)
X, y = sklearn.datasets.make_classification(n_samples=50, n_features=5)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) # ex. with another split?
X_train, X_test, y_train, y_test = init_args((X_train, X_test, y_train, y_test), # could run this line higher (on X, y)
                                              names=['X_train', 'X_test', 'y_train', 'y_test'])  # optionally provide names for each of these

# subsample data
subsampling_funcs = [partial(sklearn.utils.resample,
                            n_samples=20,
                            random_state=i)
                     for i in range(3)]
subsampling_set = ModuleSet(name='subsampling',
                            modules=subsampling_funcs)
X_trains, y_trains = subsampling_set(X_train, y_train) # subsampling_set([X_train, X_train], [y_train, y_train]) # artificially make it seem like there are multiple dsets (data_0 and data_1)


#fit models
modeling_set = ModuleSet(name='modeling',
                          modules=[LogisticRegression(max_iter=1000, tol=0.1),
                                   DecisionTreeClassifier()],
                          module_keys=["LR", "DT"], out={})

modeling_set.fit(X_trains, y_trains)
preds_test = modeling_set.predict(X_test)

# get metrics
hard_metrics_set = ModuleSet(name='hard_metrics',
                              modules=[accuracy_score, balanced_accuracy_score],
                              module_keys=["Acc", "Bal_Acc"], out={})

hard_metrics = hard_metrics_set.evaluate(preds_test, y_test)
# #hard_metrics.__prev__[0]
 #inspect the pipeline
#for k1, v1 in hard_metrics.items():
#     print(k1)
# G = build_graph(hard_metrics, draw=True)
# plt.show()
hard_metrics

{(('X_train', 'y_train', 'subsampling_0', 'LR', 'X_test'),
  'y_test',
  'Acc'): 0.9230769230769231,
 (('X_train', 'y_train', 'subsampling_0', 'LR', 'X_test'),
  'y_test',
  'Bal_Acc'): 0.9444444444444444,
 (('X_train', 'y_train', 'subsampling_0', 'DT', 'X_test'),
  'y_test',
  'Acc'): 0.7692307692307693,
 (('X_train', 'y_train', 'subsampling_0', 'DT', 'X_test'),
  'y_test',
  'Bal_Acc'): 0.7638888888888888,
 (('X_train', 'y_train', 'subsampling_1', 'LR', 'X_test'),
  'y_test',
  'Acc'): 0.9230769230769231,
 (('X_train', 'y_train', 'subsampling_1', 'LR', 'X_test'),
  'y_test',
  'Bal_Acc'): 0.9444444444444444,
 (('X_train', 'y_train', 'subsampling_1', 'DT', 'X_test'),
  'y_test',
  'Acc'): 0.8461538461538461,
 (('X_train', 'y_train', 'subsampling_1', 'DT', 'X_test'),
  'y_test',
  'Bal_Acc'): 0.9,
 (('X_train', 'y_train', 'subsampling_2', 'LR', 'X_test'),
  'y_test',
  'Acc'): 0.8461538461538461,
 (('X_train', 'y_train', 'subsampling_2', 'LR', 'X_test'),
  'y_test',
  'Bal_Acc'): 0.837

In [93]:
X_trains

{('X_train',
  'y_train',
  'subsampling_0'): array([[-0.30582021, -0.97541273, -1.15871714, -1.35132194, -0.54657674],
        [-1.43494529,  1.1326585 , -0.83681009,  0.13067855, -0.84447641],
        [-1.43494529,  1.1326585 , -0.83681009,  0.13067855, -0.84447641],
        ...,
        [-0.2177465 , -1.02788437, -1.09361985, -1.34001946,  0.89179417],
        [-0.1629447 , -1.15701078, -1.13084551, -1.44233728,  0.40067367],
        [ 1.07469058, -0.93785005,  0.55456857, -0.19937499, -0.88249039]]),
 ('X_train',
  'y_train',
  'subsampling_1'): array([[-1.48243722,  0.76640114, -1.1897997 , -0.3226144 ,  0.35136153],
        [-0.1629447 , -1.15701078, -1.13084551, -1.44233728,  0.40067367],
        [ 1.07469058, -0.93785005,  0.55456857, -0.19937499, -0.88249039],
        ...,
        [-0.11906892,  1.93190004,  1.41135673,  2.09399849, -0.41926032],
        [ 0.06323364, -1.00202171, -0.73023127, -1.08491149, -0.86779981],
        [-0.4063791 , -1.53317147, -1.73069344, -2.064313

In [94]:
y_trains

{('X_train', 'y_train', 'subsampling_0'): array([0, 0, 0, ..., 0, 0, 1]),
 ('X_train', 'y_train', 'subsampling_1'): array([0, 0, 1, ..., 1, 0, 0]),
 ('X_train', 'y_train', 'subsampling_2'): array([1, 0, 0, ..., 0, 0, 0]),
 '__prev__': <pcsp.module_set.ModuleSet at 0x134559610>}

# feature engineering pipeline
**this data set predicts boston house-preices dataset (regression)**

In [106]:
# get data as df
np.random.seed(13)
data = sklearn.datasets.load_boston()
df = pd.DataFrame.from_dict(data['data'])
df.columns = data['feature_names']
y = data['target']
X_train, X_test, y_train, y_test = init_args(train_test_split(df, y, random_state=123),
                                             names=['X_train', 'X_test', 'y_train', 'y_test'])


# feature extraction - extracts two different sets of features from the same data
def extract_feats(df: pd.DataFrame, feat_names=['CRIM', 'ZN', 'INDUS', 'CHAS']):
    '''extract specific columns from dataframe
    '''
    return df[feat_names]
feat_extraction_funcs = [partial(extract_feats, feat_names=['CRIM', 'ZN', 'INDUS', 'CHAS']),
                         partial(extract_feats, feat_names=['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE']),
                        ]
feat_extraction = ModuleSet(name='feat_extraction',
                            modules=feat_extraction_funcs)

X_feats_train = feat_extraction(X_train)
X_feats_test = feat_extraction(X_test)



modeling_set = ModuleSet(name='modeling',
                         modules=[#RidgeCV(max_iter=1000, tol=0.1),
                                  
                                  DecisionTreeRegressor(),RidgeCV()],
                         module_keys=["Ridge", "DT"])

# how can we properly pass a y here so that it will fit properly?
# this runs, but modeling_set.out is empty
_ = modeling_set.fit(X_feats_train,y_train)

# #get predictions
preds_all = modeling_set.predict(X_feats_train)

# y_test_dict = {('data_0', 'feat_extraction_0'): y_test['X_test'], ('data_0', 'feat_extraction_1'): y_test['X_test']}

#get metrics
hard_metrics_set = ModuleSet(name='hard_metrics',
                              modules=[r2_score],
                             module_keys=["r2"])
hard_metrics = hard_metrics_set.evaluate(preds_all, y_train)



# inspect the pipeline
#for k in hard_metrics:
#     print(k, hard_metrics[k])
#G = build_graph(hard_metrics, draw=True)
#plt.show()
hard_metrics

  w = ((singvals_sq + alpha) ** -1) - (alpha ** -1)


{(((('X_train', 'feat_extraction_0'), 'y_train', 'Ridge'),
   ('X_train', 'feat_extraction_0')),
  'y_train',
  'r2'): 0.998077138036731,
 (((('X_train', 'feat_extraction_0'), 'y_train', 'DT'),
   ('X_train', 'feat_extraction_0')),
  'y_train',
  'r2'): -1.1560645978534585,
 (((('X_train', 'feat_extraction_1'), 'y_train', 'Ridge'),
   ('X_train', 'feat_extraction_1')),
  'y_train',
  'r2'): 1.0,
 (((('X_train', 'feat_extraction_1'), 'y_train', 'DT'),
   ('X_train', 'feat_extraction_1')),
  'y_train',
  'r2'): 0.34917016148850655,
 '__prev__': <pcsp.module_set.ModuleSet at 0x1345a9670>}

In [100]:
preds_all

{((('X_train', 'feat_extraction_0'), 'y_train', 'Ridge'),
  ('X_train',
   'feat_extraction_0')): array([25.21976108, 22.22680587, 18.18331619, ..., 24.11794761,
        17.0338256 , 18.10485095]),
 ((('X_train', 'feat_extraction_0'), 'y_train', 'DT'),
  ('X_train',
   'feat_extraction_0')): array([21.1, 13.4, 17.4, ..., 20.4, 11.3, 27.5]),
 ((('X_train', 'feat_extraction_1'), 'y_train', 'Ridge'),
  ('X_train',
   'feat_extraction_1')): array([22.64986892, 15.6034198 , 17.43978791, ..., 21.96476899,
        13.32102878, -0.80179865]),
 ((('X_train', 'feat_extraction_1'), 'y_train', 'DT'),
  ('X_train',
   'feat_extraction_1')): array([21.1, 13.4, 17.4, ..., 20.4, 11.3, 27.5]),
 '__prev__': <pcsp.module_set.ModuleSet at 0x1345ac5e0>}

In [30]:
#print(X_feats_train) 
#X_feats_train(),still weird that this is a list
#X_feats_train
# TODO: this is broken atm.. may want to preserve ModuleSet.modules in their original form of modules / functions
#X_feats_test = feat_extraction(X_test) # still weird that this is a list
#fit models
#y = {
#     'data_0': y_train,
#     'data_1': y_test
#}

#y_train_dict = {('X_train', 'feat_extraction_0'): y_train['X_train'], ('X_train', 'feat_extraction_1'): y_train['X_train']}

{'Ridge': Ridge(max_iter=1000, tol=0.1), 'DT': DecisionTreeRegressor()}

# tracking things w Pipeline (deprecated)

In [8]:
p = PCSPipeline()

In [9]:
p.steps = [subsampling_set, modeling_set, hard_metrics_set] # how to deal w/ hard metrics?

In [10]:
# broken
# p.run(X_train, y_train)

In [11]:
p.steps[2].modules

{'r2': <function sklearn.metrics._regression.r2_score(y_true, y_pred, *, sample_weight=None, multioutput='uniform_average')>}

In [12]:
# broken
#df = p.generate_names()
#df['hard_metrics'] = hard_metrics
#df