In [1]:
'''Modified from joblib documentation: https://joblib.readthedocs.io/en/latest/auto_examples/memory_basic_usage.html
'''
%load_ext autoreload
%autoreload 2
import time
from functools import partial

import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

from vflow import Vset, init_args

In [2]:
# Seed NumPy's global RNG so the synthetic dataset is the same on every run.
np.random.seed(13)

# Build a small synthetic classification problem: 50 samples, 5 features.
features_raw, labels_raw = make_classification(n_samples=50, n_features=5)

# Hand the raw arrays to vflow's init_args under the names 'X' and 'y'
# (presumably so downstream Vsets can key their outputs by input name --
# the cell outputs below are keyed by 'X').
X, y = init_args(
    [features_raw, labels_raw],
    names=['X', 'y'],
)

def costly_compute(data, row_index=0, delay=5):
    """Simulate an expensive computation, then select rows from ``data``.

    The function sleeps for ``delay`` seconds to stand in for real work and
    then returns the requested row(s).

    Parameters
    ----------
    data : indexable
        Object indexed as ``data[row_index,]``; the notebook passes a 2-D
        NumPy array.
    row_index : int or array of ints, default 0
        Row selector placed in the first indexing position.
    delay : float, default 5
        Number of seconds to sleep before returning. The default keeps the
        original 5-second behaviour; pass a smaller value (e.g. ``0``) to
        exercise the function quickly.

    Returns
    -------
    The result of ``data[row_index,]``.
    """
    # Stand-in for the expensive part of the computation.
    time.sleep(delay)
    # The trailing comma makes this a one-element tuple index, exactly as in
    # the original implementation.
    return data[row_index,]

# Indices 0..24: the rows that costly_compute will select.
first_rows = np.arange(25)

# A single module: costly_compute with its row selection pre-bound.
subsampling_funcs = [partial(costly_compute, row_index=first_rows)]

# Two Vsets over the same module list; only the second is given a cache_dir.
uncached_set = Vset(
    name='subsampling_uncached',
    modules=subsampling_funcs,
)
cached_set = Vset(
    name='subsampling_cached',
    modules=subsampling_funcs,
    cache_dir='./',
)

In [3]:
%%time

# This Vset was built without a cache_dir, so every call to fit() re-runs
# costly_compute and always takes about 5 seconds.
uncached_set.fit(X)
# Show the fitted outputs as the cell result; in the output below they are
# keyed by (input name, module name), e.g. ('X', 'subsampling_uncached_0').
uncached_set.out

CPU times: user 1.55 ms, sys: 219 µs, total: 1.76 ms
Wall time: 5 s


{('X',
  'subsampling_uncached_0'): array([[-1.48243722,  0.76640114, -1.1897997 , -0.3226144 ,  0.35136153],
        [-0.06355468,  3.14932047,  2.45990218,  3.51847552,  0.27783616],
        [-1.47232803,  1.09490649, -0.91280101,  0.05784853,  0.78857048],
        [-1.22242099,  0.8517753 , -0.80402087, -0.01690056,  0.2283352 ],
        [ 2.03442739, -1.18542833,  1.52514029,  0.29125138, -0.6606325 ],
        [ 0.9401976 , -0.83174787,  0.4760899 , -0.18719308,  0.13679032],
        [-1.61601535,  0.49442318, -1.57177933, -0.73822631, -0.51644285],
        [ 3.28036648, -1.68208958,  2.64394545,  0.72954939, -1.53906412],
        [-0.30582021, -0.97541273, -1.15871714, -1.35132194, -0.54657674],
        [-1.70489766,  1.13784372, -1.16173936, -0.08037683, -0.403129  ],
        [-0.41226723, -1.02438726, -1.32794826, -1.49237167,  0.40865865],
        [ 2.17363048, -1.28033281,  1.61838309,  0.29554635, -1.2368377 ],
        [ 0.43301793, -0.457755  ,  0.1590953 , -0.170864  , -0.9

In [4]:
%%time

# The first time this runs it takes about 5 seconds; on later runs of the
# notebook it is very fast (milliseconds in the output below).
# NOTE(review): presumably the result is reloaded from the cache written under
# cache_dir='./' -- confirm against vflow's caching behaviour.
cached_set.fit(X)
# Show the fitted outputs as the cell result; in the output below they are
# keyed by (input name, module name), e.g. ('X', 'subsampling_cached_0').
cached_set.out

CPU times: user 3.74 ms, sys: 0 ns, total: 3.74 ms
Wall time: 3.23 ms


{('X',
  'subsampling_cached_0'): array([[-1.48243722,  0.76640114, -1.1897997 , -0.3226144 ,  0.35136153],
        [-0.06355468,  3.14932047,  2.45990218,  3.51847552,  0.27783616],
        [-1.47232803,  1.09490649, -0.91280101,  0.05784853,  0.78857048],
        [-1.22242099,  0.8517753 , -0.80402087, -0.01690056,  0.2283352 ],
        [ 2.03442739, -1.18542833,  1.52514029,  0.29125138, -0.6606325 ],
        [ 0.9401976 , -0.83174787,  0.4760899 , -0.18719308,  0.13679032],
        [-1.61601535,  0.49442318, -1.57177933, -0.73822631, -0.51644285],
        [ 3.28036648, -1.68208958,  2.64394545,  0.72954939, -1.53906412],
        [-0.30582021, -0.97541273, -1.15871714, -1.35132194, -0.54657674],
        [-1.70489766,  1.13784372, -1.16173936, -0.08037683, -0.403129  ],
        [-0.41226723, -1.02438726, -1.32794826, -1.49237167,  0.40865865],
        [ 2.17363048, -1.28033281,  1.61838309,  0.29554635, -1.2368377 ],
        [ 0.43301793, -0.457755  ,  0.1590953 , -0.170864  , -0.977