# Introduction:

This material has been used in the past to teach colleagues in our group how to use persistable.

The `persistable` package provides users with an interface to create a parameterized, persistable payload that is automatically persisted and loaded based on the parameters provided. In other words, the payload parameters are used to define unique artifacts that can be reloaded, so complex calculations never have to be repeated.

All you need to do is define:
1. How the payload is generated
2. The parameters
3. A working directory (can be local or cloud) for where artifacts should be persisted


For more details, read the [docs](https://github.com/DataReply/persistable).

# Examples
## Gaussian Distributed Dataset
### Define `Persistable` class
In this example, let's say that our persisted object is generated from a random number generator.

In [1]:
from dataclasses import dataclass
from persistable import PersistableParams


@dataclass
class GaussianDistributedPointsParams(PersistableParams):
    """ Params for GaussianDistributedPoints.

    These parameters are read by the payload generator: ``n`` is passed as the
    sample count and ``random_state`` is used to seed the random number generator.

    Parameters:
        n (int): number of points to generate.
        random_state (int): seed for the random number generator (defaults to 100).
    """
    
    n: int
    random_state: int = 100


from persistable import Persistable
from numpy.typing import NDArray
import numpy as np
from typing import Any


class GaussianDistributedPointsP(Persistable[NDArray[np.float64], GaussianDistributedPointsParams]):
    """ Persistable payload of randomly generated points.

    The payload is a 1-D array of ``params.n`` samples that is reproducible for
    a given ``params.random_state``.

    NOTE(review): despite the class name, the samples are drawn uniformly from
    [0, 1), not from a Gaussian distribution. The sampler is intentionally left
    as-is so that existing seeds keep producing the same values; switching to a
    normal distribution would change every generated payload.
    """

    def _generate_payload(self, **untracked_payload_params: Any) -> NDArray[np.float64]:
        """ Generate ``params.n`` points seeded by ``params.random_state``.

        Parameters:
            **untracked_payload_params (Any): accepted for interface compatibility; not used here.

        Returns:
            NDArray[np.float64]: array of ``params.n`` samples in [0, 1).
        """
        # A private RandomState avoids reseeding NumPy's process-wide generator
        # as a side effect of generating this payload. For the same seed it
        # yields the same stream as np.random.seed(...) + np.random.random(...).
        rng = np.random.RandomState(self.params.random_state)
        return rng.random_sample(self.params.n)


### Instantiate and load/generate+persist payload

In [2]:
from pathlib import Path

# Artifacts go into an "example-data" folder under the current working directory.
data_dir = Path('.').absolute() / "example-data"
params = GaussianDistributedPointsParams(n=100, random_state=10)

# This payload has no upstream persistables, so no tracked dependencies are passed.
gaussian_distributed_points_p = GaussianDistributedPointsP(
    data_dir=data_dir,
    params=params,
    tracked_persistable_dependencies=None,
    verbose=True
)
# Generate the payload and persist it (see the log output below),
# then preview its first three values.
gaussian_distributed_points_p.generate(persist=True)
gaussian_distributed_points_p.payload[:3]

2022-07-21 16:55:23,751 - gaussian_distributed_points_p - __init__ - INFO - ---- NEW PERSISTABLE SESSION ---- (/Users/aloosley/Repos/persistable/examples/example-data)
2022-07-21 16:55:23,752 - gaussian_distributed_points_p - __init__ - INFO - Payload named gaussian_distributed_points_p; Parameters set to GaussianDistributedPointsParams(n=100, random_state=10)
2022-07-21 16:55:23,752 - gaussian_distributed_points_p - generate - INFO - Now generating gaussian_distributed_points_p payload...
2022-07-21 16:55:23,755 - gaussian_distributed_points_p - persist - INFO - Successfully persisted payload to gaussian_distributed_points_p(56d54cea068e7beb3ad8c9df70ff8441).persistable (see gaussian_distributed_points_p(56d54cea068e7beb3ad8c9df70ff8441).params.json to view corresponding params).


array([0.77132064, 0.02075195, 0.63364823])

### Check payload was persisted

In [3]:
# List every entry in the directory that contains the persisted payload file.
list(gaussian_distributed_points_p.persist_filepath.parent.glob("*"))

[PosixPath('/Users/aloosley/Repos/persistable/examples/example-data/gaussian_distributed_points_p(56d54cea068e7beb3ad8c9df70ff8441).params.json'),
 PosixPath('/Users/aloosley/Repos/persistable/examples/example-data/gaussian_distributed_points_p.log'),
 PosixPath('/Users/aloosley/Repos/persistable/examples/example-data/outlier_estimator_p.log'),
 PosixPath('/Users/aloosley/Repos/persistable/examples/example-data/gaussian_distributed_points_p(56d54cea068e7beb3ad8c9df70ff8441).persistable')]

### Load payload

In [4]:
# A second, independent instance built with the same data_dir and params.
gaussian_distributed_points_p_2 = GaussianDistributedPointsP(
    data_dir=data_dir,
    params=params,
    tracked_persistable_dependencies=None,
    verbose=True
)
# load() reads the previously persisted payload back instead of regenerating it
# (see the log output below); the preview should match the generated values above.
gaussian_distributed_points_p_2.load()
gaussian_distributed_points_p_2.payload[:3]

2022-07-21 16:55:23,837 - gaussian_distributed_points_p - __init__ - INFO - ---- NEW PERSISTABLE SESSION ---- (/Users/aloosley/Repos/persistable/examples/example-data)
2022-07-21 16:55:23,844 - gaussian_distributed_points_p - __init__ - INFO - Payload named gaussian_distributed_points_p; Parameters set to GaussianDistributedPointsParams(n=100, random_state=10)
2022-07-21 16:55:23,854 - gaussian_distributed_points_p - load - INFO - Now loading gaussian_distributed_points_p payload...
2022-07-21 16:55:23,857 - gaussian_distributed_points_p - load - INFO - Successfully loaded payload from gaussian_distributed_points_p(56d54cea068e7beb3ad8c9df70ff8441).persistable


array([0.77132064, 0.02075195, 0.63364823])

## Outlier Detection Model

### Define `Persistable` class

In [5]:
@dataclass
class OutlierEstimatorParams(PersistableParams):
    """ Params for OutlierEstimator.
    
    Parameters:
        z_threshold (float): number of standard deviations from the mean beyond which a point is considered an outlier.
    """
    
    # Annotated as float to agree with the docstring and with
    # OutlierEstimator.__init__; integer values such as 1 remain valid.
    z_threshold: float

        

from typing import Optional, Any, List


class OutlierEstimator:
    """ Flags points whose z-score magnitude exceeds a threshold.

    ``fit`` records the mean and standard deviation of a dataset; ``transform``
    then marks every point lying more than ``z_threshold`` standard deviations
    away from that mean.

    Parameters:
        z_threshold (float): number of standard deviations from the mean beyond which a point is flagged.
    """

    def __init__(self, z_threshold: float) -> None:
        self.z_threshold = z_threshold

        # Both statistics stay None until fit() has been called.
        self._mean: Optional[float] = None
        self._stdev: Optional[float] = None

    def fit(self, data: NDArray[np.float64]) -> None:
        """ Record the mean and standard deviation of ``data``. """
        self._mean = float(np.mean(data))
        self._stdev = float(np.std(data))

    def transform(self, data: NDArray[np.float64]) -> NDArray[np.bool_]:
        """ Return a boolean mask that is True where a point is an outlier.

        Raises:
            RuntimeError: if called before ``fit``.
        """
        if self._mean is None or self._stdev is None:
            raise RuntimeError("OutlierEstimator must be fitted before calling transform().")
        return np.abs((data - self._mean) / self._stdev) > self.z_threshold
        
        
        
class OutlierEstimatorP(Persistable[OutlierEstimator, OutlierEstimatorParams]):
    """ Persistable payload holding an OutlierEstimator fitted on a points payload.

    Parameters:
        data_dir (Path): directory handed to the base Persistable.
        params (OutlierEstimatorParams): parameters providing ``z_threshold``.
        data_points_p (GaussianDistributedPointsP): persistable whose payload the estimator is fitted on;
            it is registered as a tracked persistable dependency.
        verbose (bool): forwarded to the base Persistable (defaults to True, the previously hard-coded value).
    """
    def __init__(
        self,
        data_dir: Path,
        params: OutlierEstimatorParams,
        *,
        data_points_p: GaussianDistributedPointsP,
        verbose: bool = True,
    ) -> None:
        super().__init__(data_dir, params, tracked_persistable_dependencies=[data_points_p], verbose=verbose)
        self.data_points_p = data_points_p

    def _generate_payload(self, **untracked_payload_params: Any) -> OutlierEstimator:
        """ Build an OutlierEstimator and fit it on the dependency's payload.

        Parameters:
            **untracked_payload_params (Any): accepted for interface compatibility; not used here.

        Returns:
            OutlierEstimator: estimator fitted on ``data_points_p.payload``.
        """
        outlier_estimator = OutlierEstimator(z_threshold=self.params.z_threshold)
        # NOTE(review): assumes data_points_p.payload has already been generated
        # or loaded by the caller - TODO confirm against the persistable docs.
        outlier_estimator.fit(self.data_points_p.payload)

        return outlier_estimator

## Instantiate and load/generate+persist

In [6]:
# z_threshold=1: points more than one standard deviation from the mean are flagged.
outlier_estimator_params = OutlierEstimatorParams(z_threshold=1)
# The estimator is fitted on the payload of the points persistable created earlier.
outlier_estimator_p = OutlierEstimatorP(
    data_dir=data_dir, 
    params=outlier_estimator_params, 
    data_points_p=gaussian_distributed_points_p,
)
# load_generate() tries to load first; in the run logged below the load fails,
# so the payload is generated and then persisted.
outlier_estimator_p.load_generate()

2022-07-21 16:55:24,627 - outlier_estimator_p - __init__ - INFO - ---- NEW PERSISTABLE SESSION ---- (/Users/aloosley/Repos/persistable/examples/example-data)
2022-07-21 16:55:24,628 - outlier_estimator_p - __init__ - INFO - Payload named outlier_estimator_p; Parameters set to OutlierEstimatorParams(z_threshold=1)
2022-07-21 16:55:24,629 - outlier_estimator_p - load - INFO - Now loading outlier_estimator_p payload...
2022-07-21 16:55:24,630 - outlier_estimator_p - load_generate - INFO - Loading payload failed, continuing to generate payload...
2022-07-21 16:55:24,630 - outlier_estimator_p - generate - INFO - Now generating outlier_estimator_p payload...
2022-07-21 16:55:24,634 - outlier_estimator_p - persist - INFO - Successfully persisted payload to outlier_estimator_p(9dc0f62720b5fc1b55e386c4996da97a).persistable (see outlier_estimator_p(9dc0f62720b5fc1b55e386c4996da97a).params.json to view corresponding params).
