In [1]:
# Make the local `mlapi` package importable from this notebook.
import sys

MLAPI_PATH = '/home/jovyan/mlapi'

# Guarded so that re-running the cell never adds a duplicate entry.
if sys.path.count(MLAPI_PATH) == 0:
    sys.path.append(MLAPI_PATH)

In [2]:
import requests

In [3]:
%load_ext autoreload
%autoreload 2

# JSON request

In [4]:
# Dummy request payload (placeholder values), kept in the original key order.

forecaster_data = dict(
    name='sample',
    algorithm='seq2seq',
    forecast_horizon=10,
    perform_hpo=False,
    dataset_name='sample',
)

# ParquetLoader

In [5]:
from mlapi.datasources.s3 import ParquetLoader
from mlapi.client_args import ClientArgs
from mlapi.main import Forecaster
from minio import Minio

ModuleNotFoundError: No module named 'mlapi.datasources'

In [5]:
# Client args (authentication).
#
# Connection settings are taken from the environment when present so that real
# credentials never have to be committed with the notebook. The fallbacks are
# the local development values this cell used before, so behaviour is unchanged
# when none of the MLAPI_S3_* variables are set.
import os

# Built in one step: `client_args` is only ever bound to a `ClientArgs`
# instance, never to an intermediate dict.
client_args = ClientArgs(
    endpoint=os.environ.get('MLAPI_S3_ENDPOINT', 'minio:9000'),
    access_key=os.environ.get('MLAPI_S3_ACCESS_KEY', 'oxxo'),
    secret_key=os.environ.get('MLAPI_S3_SECRET_KEY', 'password'),
    secure=False,
)

In [6]:
# Parquet loader: reads parquet datasets stored in minio buckets, using
# `client_args` for the connection.
parquet_loader = ParquetLoader('sample', client_args)

In [7]:
# Load 'data/target/' through the loader; the recorded output shows that a
# pyarrow ParquetDataset comes back.
parquet_loader.load('data/target/')

<pyarrow.parquet.ParquetDataset at 0xffff40badca0>

# ParquetMerger

In [24]:
from mlapi.ml._parquet_mergers import TimeSeriesMerger

In [25]:
# Resolve the datasets through the loader and hand them to the merger as
# keyword arguments.
datasets = parquet_loader.resolve_datasets()
merger = TimeSeriesMerger(**datasets)

In [26]:
# Run the merge and keep the result as `X`.
X = merger.merge()

In [30]:
# Show the names reported by the merger, keyed by role (see the output below).
merger.get_names()

{'target': ['target'],
 'time_varying_known_reals': [],
 'time_varying_unknown_reals': [],
 'static_categoricals': []}

# PreprocessorCreator

In [13]:
from mlapi.ml._preprocessor import PreprocessorCreator

In [14]:
# Build the preprocessor from the merger's group ids and the name of the
# timestamp column.
group_ids = merger.get_group_ids()
timestamp = 'timestamp'
# NOTE(review): here the timestamp is passed to the constructor; confirm this
# matches the signature of the `PreprocessorCreator` version that is imported.
preprocessor_creator = PreprocessorCreator(group_ids, timestamp)
preprocessor = preprocessor_creator.create_preprocessor()

# EstimatorCreator

In [31]:
from mlapi.ml._estimator import EstimatorCreator

In [44]:
# `predictor` was not defined by any earlier cell, so this cell raised a
# NameError on a fresh kernel. Build it from the dummy request defined near the
# top of the notebook.
# NOTE(review): assumes `Forecaster` accepts the request fields as keyword
# arguments - confirm against the imported class.
predictor = Forecaster(**forecaster_data)
estimator_creator = EstimatorCreator(predictor)

In [45]:
# Name mapping from the merger; its keys become keyword arguments when it is
# unpacked into `create_estimator`.
features_time_dependence = merger.get_names()

In [47]:
# Create the estimator from the group ids plus the per-role name lists.
estimator = estimator_creator.create_estimator(group_ids, **features_time_dependence)

# Forecasting task

In [7]:
from mlapi.celery_app.ml.datasources.s3 import ParquetLoader
from mlapi.celery_app.ml.estimator import EstimatorCreator
from mlapi.celery_app.ml.parquet_mergers import TimeSeriesMerger
from mlapi.celery_app.ml.preprocessor import PreprocessorCreator
from mlapi.celery_app.ml.utils import add_time_index
from mlapi.celery_app.client_args import ClientArgs
from mlapi.celery_app.ml.utils import add_time_index

In [8]:
from pydantic import BaseModel
from typing import Optional


# Pydantic model for a user record. Besides the profile fields it carries the
# object-storage connection details (`access_key`, `secret_key`,
# `s3_endpoint`).
# Documented with `#` comments rather than a docstring so the model's generated
# schema is left untouched.
class User(BaseModel):
    username: str  # the only required field
    email: Optional[str] = None
    full_name: Optional[str] = None
    disabled: Optional[bool] = None
    access_key: Optional[str] = None  # storage credential
    secret_key: Optional[str] = None  # storage credential
    s3_endpoint: Optional[str] = None  # e.g. "host:port" of the storage service
    

# Pydantic model for a forecaster definition; every field is required.
# NOTE(review): this rebinds the name `Forecaster`, which an earlier cell
# imports from `mlapi.main` - from here on the notebook uses this local model.
class Forecaster(BaseModel):
    name: str
    algorithm: str
    forecast_horizon: int
    dataset_name: str
    perform_hpo: bool

In [17]:
class CreateForecasterTask:
    """Assemble the data merger, preprocessor and estimator for a forecaster.

    By default :meth:`run` only builds the three components and returns them
    for inspection. Pass ``fit=True`` to additionally train the estimator on
    the merged and preprocessed data.
    """

    def run(self, forecaster, user, fit=False):
        """Build the pipeline components and optionally train the estimator.

        Parameters
        ----------
        forecaster : object
            Forecaster description. Its ``dataset_name`` is read when creating
            the loader, and the object itself is handed to
            ``EstimatorCreator``.
        user : object
            Provides ``s3_endpoint``, ``access_key`` and ``secret_key`` for
            the storage client.
        fit : bool, default False
            When False nothing is trained and the components are returned as
            soon as they are built (the behaviour callers already rely on).
            When True the training data is merged, time-indexed and
            transformed, and the estimator is fitted before returning.

        Returns
        -------
        tuple
            ``(data_merger, preprocessor, estimator)``.
        """
        # data_merger + preprocessor + estimator
        data_merger = self._create_data_merger(forecaster, user)
        preprocessor = self._create_preprocessor(data_merger)
        estimator = self._create_estimator(forecaster, data_merger)

        # Training used to sit after an unconditional `return` and could never
        # run; it is now opt-in so existing callers see no change.
        if fit:
            self._fit(data_merger, preprocessor, estimator)

        return data_merger, preprocessor, estimator

    def _fit(self, data_merger, preprocessor, estimator):
        """Merge, time-index and transform the data, then fit ``estimator``."""
        print(f'Estimator params: {vars(estimator)}')

        # Create `X` (training data).
        X = data_merger.merge()

        # Add time index to `X`.
        group_ids = data_merger.get_group_ids()
        X = add_time_index(X, group_ids=group_ids)

        # Transform `X`.
        X_transformed = preprocessor.fit_transform(X)

        # Diagnostics only.
        # NOTE(review): the group columns are hard-coded; presumably they match
        # the preprocessor output for datasets with two group ids - confirm.
        print(X_transformed.groupby(['group_id__0', 'group_id__1']).size())
        print(X_transformed.head())

        # Fit estimator on `X_transformed`.
        estimator.fit(X_transformed)

    def _create_estimator(self, forecaster, data_merger, callbacks=None,
                          time_idx='time_idx'):
        """Create the estimator for ``forecaster``.

        Group ids come from ``data_merger``. The target and feature lists are
        fixed here: ``'target'`` is both the target and the only time-varying
        unknown real; no known reals or static categoricals are passed.
        ``callbacks`` and ``time_idx`` are forwarded unchanged.
        """
        estimator_creator = EstimatorCreator(forecaster)
        group_ids = data_merger.get_group_ids()
        target = 'target'
        time_varying_unknown_reals = ['target']
        time_varying_known_reals = []
        static_categoricals = []

        estimator = estimator_creator.create_estimator(
            group_ids, target, time_varying_known_reals,
            time_varying_unknown_reals, static_categoricals,
            callbacks=callbacks, time_idx=time_idx)
        return estimator

    def _create_preprocessor(self, data_merger, timestamp='timestamp'):
        """Create the preprocessor for the merger's group ids.

        ``timestamp`` is forwarded to ``create_preprocessor``.
        """
        group_ids = data_merger.get_group_ids()
        preprocessor_creator = PreprocessorCreator(group_ids)
        preprocessor = preprocessor_creator.create_preprocessor(timestamp=timestamp)
        return preprocessor

    def _create_data_merger(self, forecaster, user):
        """Create the merger for the datasets of ``forecaster.dataset_name``.

        Storage connection details are read from ``user``; ``secure`` is
        always ``False``.
        """
        # NOTE(review): the endpoint is passed under the key `s3_endpoint` -
        # confirm this matches the `ClientArgs` signature in use.
        client_args = {
            "s3_endpoint": user.s3_endpoint,
            "access_key": user.access_key,
            "secret_key": user.secret_key,
            'secure': False
        }
        client_args = ClientArgs(**client_args)
        parquet_loader = ParquetLoader(forecaster.dataset_name, client_args)
        datasets = parquet_loader.resolve_datasets()
        data_merger = TimeSeriesMerger(**datasets)
        return data_merger


In [18]:
# Sample user record.
# NOTE(review): `hashed_password` is not a field declared on `User`;
# presumably pydantic drops it on construction - confirm.
user_kwargs = dict(
    username="johndoe",
    full_name="John Doe",
    email="johndoe@example.com",
    hashed_password="$2b$12$EixZaYVK1fsbw1ZfbX3OXePaWxn96p36WQoeG6Lruj3vjPGga31lW",
    access_key="johndoe",
    secret_key="password",
    s3_endpoint="minio:9000",
)
user = User(**user_kwargs)

# Sample forecaster definition.
forecaster_kwargs = dict(
    name='sample',
    algorithm='seq2seq',
    forecast_horizon=10,
    perform_hpo=False,
    dataset_name='sample',
)
forecaster = Forecaster(**forecaster_kwargs)

In [20]:
# Build the pipeline components for the sample forecaster; `run` returns
# (data_merger, preprocessor, estimator).
merger, preprocessor, estimator = CreateForecasterTask().run(forecaster, user)

In [24]:
# Run the merge on the merger returned by the task and keep the result as `X`.
X = merger.merge()

In [39]:
# Fetch the dataset registered under the name 'target' for inspection.
ds = merger.get_dataset_by_name('target')

In [73]:
# Inspect the arrow type of the 'timestamp' field (millisecond timestamp in the
# recorded output).
ds.schema.to_arrow_schema().field('timestamp').type

TimestampType(timestamp[ms])

In [59]:
# Exploratory: list the attributes available on the arrow schema object.
dir(ds.schema.to_arrow_schema())

['__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '_export_to_c',
 '_field',
 '_import_from_c',
 'add_metadata',
 'append',
 'empty_table',
 'equals',
 'field',
 'field_by_name',
 'from_pandas',
 'get_all_field_indices',
 'get_field_index',
 'insert',
 'metadata',
 'names',
 'pandas_metadata',
 'remove',
 'remove_metadata',
 'serialize',
 'set',
 'to_string',
 'types',
 'with_metadata']

In [47]:
# Exploratory: list the attributes available on the arrow schema object.
# NOTE(review): same expression as the previous cell - likely leftover debugging.
dir(ds.schema.to_arrow_schema())

['__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '_export_to_c',
 '_field',
 '_import_from_c',
 'add_metadata',
 'append',
 'empty_table',
 'equals',
 'field',
 'field_by_name',
 'from_pandas',
 'get_all_field_indices',
 'get_field_index',
 'insert',
 'metadata',
 'names',
 'pandas_metadata',
 'remove',
 'remove_metadata',
 'serialize',
 'set',
 'to_string',
 'types',
 'with_metadata']

In [41]:
# Exploratory: list the attributes of the schema obtained through the merger's
# own helper for `ds`.
dir(merger.get_arrow_schema(ds))

['__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '_export_to_c',
 '_field',
 '_import_from_c',
 'add_metadata',
 'append',
 'empty_table',
 'equals',
 'field',
 'field_by_name',
 'from_pandas',
 'get_all_field_indices',
 'get_field_index',
 'insert',
 'metadata',
 'names',
 'pandas_metadata',
 'remove',
 'remove_metadata',
 'serialize',
 'set',
 'to_string',
 'types',
 'with_metadata']

In [38]:
# Show the names reported by the merger, keyed by role (see the output below).
merger.get_names()

{'target': ['target'],
 'time_varying_known_reals': [],
 'time_varying_unknown_reals': [],
 'static_categoricals': []}