In [1]:
'''Modified from joblib documentation: https://joblib.readthedocs.io/en/latest/auto_examples/memory_basic_usage.html
'''
%load_ext autoreload
%autoreload 2
import time

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from functools import partial

from pcsp import PCSPipeline, ModuleSet, Module # must install pcsp first (pip install pcsp)

In [2]:
# Two pipelines for the timing comparison below: they are constructed identically
# except that the second one is handed a cache directory.
pipeline_uncached = PCSPipeline()
# cache_dir="./" points the cache at the current working directory.
# NOTE(review): presumably results are persisted there between kernel sessions — confirm against the pcsp docs.
pipeline_cached = PCSPipeline(cache_dir="./")

In [3]:
# Seed NumPy's global RNG so the synthetic dataset is identical on every run.
np.random.seed(13)

# Small toy classification problem: 50 samples with 5 features each.
X, y = make_classification(n_features=5, n_samples=50)

# Split into train/test portions; the split itself is pinned by its own seed.
(X_train, X_test, y_train, y_test) = train_test_split(
    X,
    y,
    random_state=42,
)

def costly_compute(data, row_index=0):
    """Stand-in for a slow computation: wait five seconds, then pick rows.

    Parameters
    ----------
    data
        Object indexed with a one-element tuple ``(row_index,)``; it must
        therefore support tuple indexing (NumPy arrays do).
    row_index
        Index (or array of indices) selecting along the first axis.
        Defaults to ``0``.

    Returns
    -------
    Whatever ``data[(row_index,)]`` evaluates to.
    """
    # The sleep is the "expensive" part that makes caching worthwhile.
    time.sleep(5)
    selector = (row_index,)
    selected_rows = data[selector]
    return selected_rows

# A single "module": costly_compute with row_index pre-bound to the indices 0..24,
# so calling it on an array selects rows 0-24 (after the 5 s sleep).
subsampling_funcs = [partial(costly_compute, row_index=np.arange(25))]
# The same function list is wrapped in two ModuleSets that differ only in their name.
subsampling_set1 = ModuleSet(name='subsampling1', modules=subsampling_funcs)
subsampling_set2 = ModuleSet(name='subsampling2', modules=subsampling_funcs)

# Each pipeline gets exactly one step, so any timing difference between them
# comes from the pipeline configuration rather than from the work being done.
pipeline_uncached.steps = [subsampling_set1]
pipeline_cached.steps = [subsampling_set2]

In [4]:
%%time

# Without a cache this takes about 5 seconds on every run
# (the time.sleep(5) inside costly_compute is executed each time).
pipeline_uncached.run([X_train])

subsampling1
CPU times: user 2.2 ms, sys: 358 µs, total: 2.56 ms
Wall time: 5 s


In [5]:
%%time

# The first time this runs it takes about 5 seconds; on later runs of the notebook
# it returns almost immediately (the recorded wall time below is a few milliseconds).
# NOTE(review): presumably the result is loaded from the cache written under cache_dir — confirm against the pcsp docs.
pipeline_cached.run([X_train])

subsampling2
CPU times: user 5.49 ms, sys: 243 µs, total: 5.74 ms
Wall time: 4.99 ms
