In [4]:
import provenance as p
import xarray as xr

In [6]:
def xr_dataset_netcdf_dump(ds, filename, **kwargs):
    return ds.to_netcdf(filename, **kwargs)

def xr_dataset_netcdf_load(filename, **kwargs):
    return xr.open_dataset(filename, **kwargs)

p.serializers.register_serializer('xr_dataset', xr_dataset_netcdf_dump, xr_dataset_netcdf_load,
                                  classes=[xr.Dataset])

In [8]:
p.serializers.serializers

{'joblib': Serializer(name='joblib', dump=<function joblib_dump at 0x11b70c9d0>, load=<function load at 0x11b628af0>, content_type=None, content_encoding=None, content_disposition=None),
 'cloudpickle': Serializer(name='cloudpickle', dump=<function cloudpickle_dump at 0x11b59ff70>, load=<function cloudpickle_load at 0x11b5d9700>, content_type=None, content_encoding=None, content_disposition=None),
 'pd_df_parquet': Serializer(name='pd_df_parquet', dump=<function pd_df_parquet_dump at 0x11b70ce50>, load=<function pd_df_parquet_load at 0x11f2b7040>, content_type=None, content_encoding=None, content_disposition=None),
 'pd_series_parquet': Serializer(name='pd_series_parquet', dump=<function pd_series_parquet_dump at 0x11f2b7160>, load=<function pd_series_parquet_load at 0x11f2b71f0>, content_type=None, content_encoding=None, content_disposition=None),
 'file': Serializer(name='file', dump=<function file_dump at 0x11f2d3310>, load=<function file_load at 0x11f2d3670>, content_type=None, con

In [9]:
p.load_config({'blobstores':
               {'disk': {'type': 'disk',
                         'cachedir': 'artifacts',
                         'read': True,
                         'write': True,
                         'read_through_write': False,
                         'delete': True}},
               'artifact_repos':
               {'local': {'type': 'postgres',
                          'db': 'postgresql://localhost/provenance-basic-example',
                          'store': 'disk',
                          'read': True,
                          'write': True,
                          'create_db': True,
                          'read_through_write': False,
                          'delete': True}},
               'default_repo': 'local'})

@p.provenance()
def my_add(x, y):
    print("Executed")
    return x + y

## Basic Example

In [10]:
import xarray as xr 

In [11]:
ds = xr.tutorial.open_dataset('rasm')
ds

In [16]:
p.serializers.object_serializer(ds)

'xr_dataset'

In [None]:
def 

In [17]:
import xarray as xr
import provenance as p


ds = xr.tutorial.open_dataset('rasm')



# Register xarray serializers via netCDF
def xr_dataset_netcdf_dump(ds, filename, **kwargs):
    return ds.to_netcdf(filename, **kwargs)

def xr_dataset_netcdf_load(filename, **kwargs):
    return xr.open_dataset(filename, **kwargs)

p.serializers.register_serializer('xr_dataset', xr_dataset_netcdf_dump, xr_dataset_netcdf_load,
                                  classes=[xr.Dataset])


@p.provenance()
def anomaly(ds, groupby='time.year'):
    """Compute annual annomalies"""
    group = ds.groupby(groupby)
    clim = group.mean()
    return ds - clim

In [18]:
%%time
anom = anomaly(ds.Tair)
anom

TypeError: save_global() missing 1 required positional argument: 'obj'

In [20]:
import cloudpickle
cloudpickle.__version__

'1.4.1'

In [13]:
import numpy as np
import pandas as pd
import time
from sklearn.utils import check_random_state
import toolz as t

In [14]:
@p.provenance
def load_data(query):
    # fetch something from the DB in real life...
    random_state = check_random_state(abs(hash(query)) // (10**10))
    return random_state.uniform(0, 10, 10)


In [15]:
@p.provenance
def extract_features_a(data, hyperparam_a=5):
    time.sleep(2)
    rs = check_random_state(hyperparam_a)
    return data[0:5] + 1 + rs.rand(5)


In [16]:
@p.provenance()
def load_data(query):
    # fetch something from the DB in real life...
    random_state = check_random_state(abs(hash(query)) // (10**10))
    return random_state.uniform(0, 10, 10)


@p.provenance()
def extract_features_a(data, hyperparam_a=5):
    time.sleep(2)
    rs = check_random_state(hyperparam_a)
    return data[0:5] + 1 + rs.rand(5)


@p.provenance()
def extract_features_b(data, hyperparam_x=10):
    time.sleep(2)
    rs = check_random_state(hyperparam_x)
    return data[5:] + 1 + rs.rand(5)


@p.provenance()
def build_model(features_a, features_b, num_trees=100):
    return {'whatever': 'special model with {} trees'.format(num_trees)}


@p.provenance()
def evaluate(model, data):
    return {'some_metric': 0.5, 'another_metric': 0.4}


def pipeline(train_query='some query', valid_query="another query", hyperparam_a=5, hyperparam_x=10):
    data = load_data("some query")
    features_a = extract_features_a(data, hyperparam_a)
    features_b = extract_features_b(data, hyperparam_x)
    model = build_model(data, features_a, features_b)

    validation_data = load_data("another query")
    evaluation = evaluate(model, validation_data)

    return {'features_a': features_a, 'features_b': features_b,
            'model': model, 'evaluation': evaluation}


@p.provenance()
def make_decision(model, request):
    # make some sort of prediction, classification, with the model
    # to help make a 'decision' and return it as the result
    return {'prediction': 0.5, 'model': model.artifact.id}

In [17]:
def run_production_pipeline():
    with p.capture_set('production'):
        return pipeline()

In [18]:
res = run_production_pipeline()

In [19]:
res = p.load_set_by_name('production')

In [20]:
res

ArtifactSet(id='aba6ca7c03863c3be02247b7040fd350', artifact_ids=frozenset({'5fb1d4f00fd88dcb207c264a11f987b0', 'cad2eb7dec8c0589186b2312bc231a0a', 'e6297d97c2087f841709266e0d9804f3', '662bdf941165e1977da5481359a5ee26', '088a88ad7ec6a4fef5ee698493876985', 'b0b1af5929fc98f235eafbff0648b58e'}), created_at=datetime.datetime(2020, 5, 22, 7, 22, 54, 888784), labels={'name': 'production'})

In [21]:
data = load_data("some query")

In [22]:
data

<provenance.ArtifactProxy(e6297d97c2087f841709266e0d9804f3) array([1.62814315, 5.33644596, 2.87524842, 6.13147961, 1.43413248,
       5.48433383, 1.33431597, 4.62099253, 9.7118973 , 7.01740949]) >

In [23]:
@p.provenance()
def append_3_inc(a):
    a.append(3)
    return [n + 1 for n in a]

In [24]:
x = append_3_inc([1, 2])

ImpureFunctionError: The __main__.append_3_inc function modified arguments: (a)

In [None]:
@p.provenance()
def load_data_y():
    return [1, 2, 3]

@p.provenance()
def process_data_y(data):
    return list(map(lambda x: x + 1, data))

data = load_data_y()

In [None]:
data

In [None]:
result = expensive_add(4, 50)

In [None]:
result

In [None]:
@p.provenance()
def load_data_y(x, y):
    return [1, 2, 3] * 2

In [None]:
data = load_data_y(6, 3)

In [None]:
data

In [None]:
@p.provenance()
def load_data(x, y):
    return [x + y, x, y]

In [None]:
load_data(3, 4)

In [None]:
@p.provenance(name='read_only_test')
def increase(x):
    return x + 1

a = increase(5)
assert a == 6

@p.provenance(name='read_only_test', read_only=True)
def load_increase(x):
    pass

# We expect the values to be the same, and artifacts to be different
b = load_increase(5)
assert b == 6
assert b.artifact.id == a.artifact.id

not_found = load_increase(34)

In [None]:
not_found == None

In [None]:
not_found

In [None]:
not_found.artifact.value is None

In [None]:
import pandas as pd

In [25]:
@p.provenance()
def make_df(rows):
    return pd.DataFrame(rows)
    
df = make_df([{'foo': 42}, {'foo': 100}])

TypeError: save_global() missing 1 required positional argument: 'obj'

In [None]:
df