In [1]:
# Make the `mlapi` package importable by putting its directory on sys.path.
import sys

MLAPI_PATH = '/home/jovyan/mlapi'

# Guarded so that re-running the cell never appends a duplicate entry.
if sys.path.count(MLAPI_PATH) == 0:
    sys.path.append(MLAPI_PATH)

In [2]:
import requests

In [3]:
%load_ext autoreload
%autoreload 2

# JSON request

In [4]:
# Dummy request: placeholder payload describing a forecaster to create.

forecaster_data = dict(
    name='sample',
    algorithm='seq2seq',
    forecast_horizon=10,
    perform_hpo=False,
    dataset_name='sample',
)

# ParquetLoader

In [4]:
from mlapi.celery_app.ml.datasources.s3 import ParquetLoader
from mlapi.celery_app.client_args import ClientArgs
#from mlapi.main import Forecaster
#from minio import Minio

In [50]:
# Client args (authentication).
#
# Connection settings are read from the environment when available so that
# real credentials never have to be written into the notebook. The fallbacks
# are the values that were previously hardcoded in this cell, so running it
# without any of these variables set behaves exactly as before.
import os

# Built in a single step: `client_args` is only ever bound to a `ClientArgs`
# instance (it used to be a plain dict first and was then rebound).
client_args = ClientArgs(
    s3_endpoint=os.environ.get('S3_ENDPOINT', 'minio:9000'),
    access_key=os.environ.get('S3_ACCESS_KEY', 'user'),
    secret_key=os.environ.get('S3_SECRET_KEY', 'password'),
    secure=False,
)

In [53]:
# Parquet getter: getter for parquet datasets stored in minio buckets.
# The positional arguments follow the same order used by
# `CreateForecasterTask._resolve_dataset` further down: dataset group name,
# dataset name, client args.
parquet_loader = ParquetLoader(
    'sample_project',
    'X-train',
    client_args
)

In [55]:
parquet_loader.load_all()

{'target': <pyarrow.parquet.ParquetDataset at 0xffff4f5b5280>}

# ParquetMerger

In [24]:
from mlapi.ml._parquet_mergers import TimeSeriesMerger

In [25]:
datasets = parquet_loader.resolve_datasets()
merger = TimeSeriesMerger(**datasets)

In [26]:
X = merger.merge()

In [30]:
merger.get_names()

{'target': ['target'],
 'time_varying_known_reals': [],
 'time_varying_unknown_reals': [],
 'static_categoricals': []}

# PreprocessorCreator

In [13]:
from mlapi.ml._preprocessor import PreprocessorCreator

In [14]:
group_ids = merger.get_group_ids()
timestamp = 'timestamp'
preprocessor_creator = PreprocessorCreator(group_ids, timestamp)
preprocessor = preprocessor_creator.create_preprocessor()

# EstimatorCreator

In [31]:
from mlapi.ml._estimator import EstimatorCreator

In [44]:
# NOTE(review): `predictor` is not defined in any of the visible cells, so this
# line depends on leftover kernel state and fails on a fresh kernel.
# `CreateForecasterTask._create_estimator` passes the forecaster request object
# to `EstimatorCreator` -- presumably the same is intended here; confirm.
estimator_creator = EstimatorCreator(predictor)

In [45]:
features_time_dependence = merger.get_names()

In [47]:
estimator = estimator_creator.create_estimator(group_ids, **features_time_dependence)

# Forecasting task

In [5]:
from mlapi.celery_app.ml.datasources.s3 import ParquetLoader
from mlapi.celery_app.ml.estimator import EstimatorCreator
from mlapi.celery_app.ml.parquet_resolver import TimeSeriesResolver
from mlapi.celery_app.ml.preprocessor import PreprocessorCreator
from mlapi.celery_app.client_args import ClientArgs
from mlapi.celery_app.ml.utils.data import AttrDict
from mlapi.celery_app.ml.utils.pandas import duplicate_pandas_column
from sklearn.pipeline import Pipeline

In [6]:
from pydantic import BaseModel
from typing import Optional


class User(BaseModel):
    """User record; only ``username`` is required.

    Every other field is optional and defaults to ``None``.
    """
    username: str
    email: Optional[str] = None
    full_name: Optional[str] = None
    disabled: Optional[bool] = None
    # S3 connection settings: these three attributes are the ones
    # `CreateForecasterTask._resolve_dataset` reads from the user object to
    # build its `ClientArgs`.
    access_key: Optional[str] = None
    secret_key: Optional[str] = None
    s3_endpoint: Optional[str] = None
    

class Forecaster(BaseModel):
    """Request describing a forecaster training task; every field is required."""
    # Used by `CreateForecasterTask` as the name under which the model is saved.
    task_name: str
    # Passed, in this order, to `ParquetLoader` by
    # `CreateForecasterTask._resolve_dataset` to locate the data.
    dataset_group_name: str
    dataset_name: str
    # The remaining fields are not read directly in this notebook; the whole
    # object is handed to `EstimatorCreator` -- presumably consumed there
    # (TODO confirm).
    algorithm: str
    forecast_horizon: int
    perform_hpo: bool

In [9]:
class CreateForecasterTask:
    """Loads, preprocesses and optionally fits time series data from s3.

    :meth:`run` is the entry point. By default it stops once the preprocessor
    and the estimator have been created and returns ``(preprocessor, X)``;
    fitting the pipeline and logging the results are opt-in through its
    ``fit`` and ``log`` flags.
    """

    # Metric names whose history is fetched from the fitted estimator and
    # saved by ``_log_run``.
    METRICS = ['train_loss']

    def run(self, forecaster_data, user_data, fit=False, log=False):
        """Resolves the dataset and builds the preprocessor and estimator.

        The fit and logging stages used to sit behind two debugging early
        returns and were therefore unreachable. They are now selectable
        through keyword flags whose defaults reproduce the previous
        behaviour.

        Parameters
        ----------
        forecaster_data : object
            Forecaster request. It is wrapped in ``AttrDict`` and read through
            the attributes ``dataset_group_name``, ``dataset_name`` and, when
            logging, ``task_name``.

        user_data : object
            User record. It is wrapped in ``AttrDict`` and read through the
            attributes ``s3_endpoint``, ``access_key`` and ``secret_key``.

        fit : bool, default=False
            If True, the preprocessor and estimator are put inside a sklearn
            Pipeline which is fitted on ``X``.

        log : bool, default=False
            If True, the fitted pipeline's metrics and model are logged.
            Implies ``fit``.

        Returns
        -------
        tuple
            ``(preprocessor, X)`` when neither flag is set, otherwise
            ``(pipeline, X)``.
        """
        forecaster = AttrDict(forecaster_data)
        user = AttrDict(user_data)

        # Load data.
        resolved = self._resolve_dataset(forecaster, user)
        X = resolved['X']
        group_ids = resolved['group_ids']
        timestamp = resolved['timestamp']

        # Create both preprocessor and estimator. The estimator is created
        # even when it will not be fitted so that construction errors surface
        # exactly as they did before the flags existed.
        preprocessor = self._create_preprocessor(group_ids, timestamp)
        estimator = self._create_estimator(forecaster, group_ids)

        if not (fit or log):
            return preprocessor, X

        # Put everything inside a sklearn Pipeline and fit.
        pipeline = self._fit_pipeline(X, preprocessor, estimator)

        if log:
            self._log_run(forecaster, pipeline, X)

        return pipeline, X

    def _log_run(self, forecaster, pipeline, X):
        """Saves the metrics and the model of a fitted ``pipeline``.

        NOTE(review): ``MlFlowLogger``, ``get_history``, ``wrap_pipeline`` and
        ``infer_signature`` are not imported in any of the visible cells; they
        must be in scope before calling :meth:`run` with ``log=True``.
        """
        logger = MlFlowLogger()

        # Save metrics. The estimator lookup does not depend on the metric,
        # so it is done once outside the loop.
        estimator = pipeline['estimator']
        for metric in self.METRICS:
            history = get_history(estimator, metric)
            logger.save_metric(name=metric, values=history)

        # Save the model with a signature that defines the schema of the
        # model's inputs and outputs. When the model is deployed, this
        # signature will be used to validate inputs.
        wrapped_pipeline = wrap_pipeline(pipeline)
        signature = infer_signature(X, wrapped_pipeline.predict(None, X))
        logger.save_model(
            name=forecaster.task_name, model=wrapped_pipeline,
            signature=signature)

        # Log all
        logger.log_all()

    def _fit_pipeline(self, X, preprocessor, estimator):
        """Collects both `preprocessor` and `estimator` into a single
        :class:`sklearn.pipeline.Pipeline` object and fits X.

        The steps are named ``'preprocessor'`` and ``'estimator'``.

        Returns
        -------
        pipeline : sklearn.pipeline.Pipeline
            The fitted pipeline.
        """
        steps = [('preprocessor', preprocessor), ('estimator', estimator)]
        pipeline = Pipeline(steps)
        pipeline.fit(X)
        return pipeline

    def _create_estimator(self, forecaster, group_ids, callbacks=None):
        """Creates time series estimator.

        The feature layout is fixed here: ``'target'`` is both the target and
        the only time varying unknown real, there are no time varying known
        reals nor static categoricals, and the time index is
        ``'time_index'``.
        """
        estimator_creator = EstimatorCreator(forecaster)
        target = 'target'
        time_varying_unknown_reals = ['target']
        time_varying_known_reals = []
        static_categoricals = []

        estimator = estimator_creator.create_estimator(
            group_ids, target, time_varying_known_reals,
            time_varying_unknown_reals, static_categoricals,
            callbacks=callbacks, time_idx='time_index')
        return estimator

    def _create_preprocessor(self, group_ids, timestamp):
        """Creates sklearn preprocessor.
        """
        preprocessor_creator = PreprocessorCreator(group_ids, timestamp)
        preprocessor = preprocessor_creator.create_preprocessor()
        return preprocessor

    def _resolve_dataset(self, forecaster, user):
        """Calls :meth:`resolve` from :class:`TimeSeriesResolver`.

        The datasets are loaded with a :class:`ParquetLoader` built from the
        forecaster's dataset group/name and the user's s3 settings.

        Returns
        -------
        dict : str -> obj
            :meth:`run` reads the keys ``'X'``, ``'group_ids'`` and
            ``'timestamp'`` from it.
        """
        # Built in one step so `client_args` is never rebound from a dict to
        # a `ClientArgs` instance.
        client_args = ClientArgs(
            s3_endpoint=user.s3_endpoint,
            access_key=user.access_key,
            secret_key=user.secret_key,
            secure=False)
        parquet_loader = ParquetLoader(
            forecaster.dataset_group_name, forecaster.dataset_name,
            client_args)
        datasets = parquet_loader.load_all()
        timeseries_resolver = TimeSeriesResolver(**datasets)
        return timeseries_resolver.resolve()


In [10]:
# Example user record.
# NOTE(review): `hashed_password` is not a field declared on `User`;
# presumably pydantic drops it as an unknown key -- confirm.
# NOTE(review): the access/secret keys are hardcoded here; avoid committing
# real credentials with the notebook.
user_kwargs = {
    "username": "johndoe",
    "full_name": "John Doe",
    "email": "johndoe@example.com",
    "hashed_password": "$2b$12$EixZaYVK1fsbw1ZfbX3OXePaWxn96p36WQoeG6Lruj3vjPGga31lW",
    "access_key": "johndoe",
    "secret_key": "password",
    "s3_endpoint": "minio:9000"
}
user = User(**user_kwargs)

# Example forecaster request; the keys match the fields declared on
# `Forecaster`.
forecaster_kwargs = {
    'task_name': 'seq2seq_training',
    'dataset_group_name': 'sample_group',
    'dataset_name': 'X_train',
    'algorithm': 'seq2seq',
    'forecast_horizon': 10,
    'perform_hpo': False,
}
forecaster = Forecaster(**forecaster_kwargs)

In [11]:
preprocessor, X = CreateForecasterTask().run(forecaster, user)

TypeError: __init__() missing 1 required positional argument: 'target'

In [32]:
Xt = pipeline['preprocessor'].transform(X)

In [98]:
c1 = X.timestamp.values.reshape(-1, 1)
c2 = X.target.values.reshape(-1, 1)

In [99]:
import numpy as np
import pandas as pd

In [100]:
gt = pipeline['preprocessor']['groups']

In [101]:
t = gt._components_getter.get_columns_dtypes(transformed=False)['timestamp']

In [111]:
def _stack_arrays(arrays):
    """Horizontally stacks ``arrays`` with :func:`numpy.hstack`.

    If the first attempt raises ``TypeError``, every array is cast to
    ``object`` dtype and the stack is retried.

    Parameters
    ----------
    arrays : sequence of np.ndarray

    Returns
    -------
    stacked : np.ndarray
    """
    try:
        stacked = np.hstack(arrays)
    except TypeError:
        # Retry with object dtype, which can hold values of any type.
        as_objects = list(map(lambda a: a.astype(object), arrays))
        stacked = np.hstack(as_objects)
    return stacked

In [112]:
_stack_arrays([c1, c2])

array([[1356998400000000000, 45.0],
       [1357084800000000000, 53.0],
       [1357171200000000000, 65.0],
       ...,
       [1503619200000000000, 141.0],
       [1503705600000000000, 116.0],
       [1503792000000000000, 161.0]], dtype=object)

In [108]:
try:
    np.hstack((c1, c2))
except TypeError:
    print('a')

a


In [33]:
pipeline['preprocessor'].inverse_transform(Xt)

TypeError: The DTypes <class 'numpy.dtype[float32]'> and <class 'numpy.dtype[datetime64]'> do not have a common DType. For example they cannot be stored in a single array unless the dtype is `object`.

In [24]:
X = merger.merge()

In [39]:
ds = merger.get_dataset_by_name('target')

In [73]:
ds.schema.to_arrow_schema().field('timestamp').type

TimestampType(timestamp[ms])

In [59]:
dir(ds.schema.to_arrow_schema())

['__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '_export_to_c',
 '_field',
 '_import_from_c',
 'add_metadata',
 'append',
 'empty_table',
 'equals',
 'field',
 'field_by_name',
 'from_pandas',
 'get_all_field_indices',
 'get_field_index',
 'insert',
 'metadata',
 'names',
 'pandas_metadata',
 'remove',
 'remove_metadata',
 'serialize',
 'set',
 'to_string',
 'types',
 'with_metadata']

In [47]:
dir(ds.schema.to_arrow_schema())

['__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '_export_to_c',
 '_field',
 '_import_from_c',
 'add_metadata',
 'append',
 'empty_table',
 'equals',
 'field',
 'field_by_name',
 'from_pandas',
 'get_all_field_indices',
 'get_field_index',
 'insert',
 'metadata',
 'names',
 'pandas_metadata',
 'remove',
 'remove_metadata',
 'serialize',
 'set',
 'to_string',
 'types',
 'with_metadata']

In [41]:
dir(merger.get_arrow_schema(ds))

['__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '_export_to_c',
 '_field',
 '_import_from_c',
 'add_metadata',
 'append',
 'empty_table',
 'equals',
 'field',
 'field_by_name',
 'from_pandas',
 'get_all_field_indices',
 'get_field_index',
 'insert',
 'metadata',
 'names',
 'pandas_metadata',
 'remove',
 'remove_metadata',
 'serialize',
 'set',
 'to_string',
 'types',
 'with_metadata']

In [38]:
merger.get_names()

{'target': ['target'],
 'time_varying_known_reals': [],
 'time_varying_unknown_reals': [],
 'static_categoricals': []}