In [None]:
# default_exp core

In [None]:
%load_ext autoreload
%autoreload 2

# Overview

> yoda simplifies running jobs on Google AI Platform and lets you organize your modeling process in a config file.

In this section, we will go through a few examples to see how yoda works.

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#export
import os
import pickle
import typing
from functools import lru_cache
from io import TextIOWrapper

import blocks
import numpy as np
import pandas as pd
import sklearn
import yaml
from blocks.filesystem import GCSFileSystem
from sklearn.model_selection import cross_val_score

In [None]:
# export
class Data:
    """Locations and column selection for the train / eval / score data of one run.

    Frames are read lazily with ``blocks.assemble`` the first time a property is
    accessed and are then cached on the instance.

    Parameters
    ----------
    input_path : str
        Location of the training data; a ``gs://`` prefix sets ``is_gcp``.
    output_path : str
        Directory that ``open`` resolves file names against.
    features : str
        Comma-separated names of the feature columns.
    label : str
        Name of the target column.
    **kwargs
        Optional ``eval_path`` and ``score_path``; any other key is ignored.
    """

    def __init__(self, input_path: str, output_path: str, features: str, label: str, **kwargs):
        self.input_path = input_path
        self.output_path = output_path
        self.feature_list = features.split(",")
        self.label = label
        self.eval_path = kwargs.get("eval_path", None)
        self.score_path = kwargs.get("score_path", None)
        self.is_gcp = input_path.startswith("gs://")

    def _load(self, role: str, path: str):
        """Return the frame stored at ``path``, reading it once per role and path."""
        # The cache lives on the instance (not on the class, as an lru_cache on a
        # method would) so instances neither evict each other's frames nor stay
        # alive because of the cache. Remembering the path lets a changed path
        # trigger a reload instead of serving a stale frame.
        frames = self.__dict__.setdefault("_frames", {})
        cached = frames.get(role)
        if cached is None or cached[0] != path:
            cached = (path, blocks.assemble(path))
            frames[role] = cached
        return cached[1]

    @property
    def df(self):
        """Training frame read from ``input_path``."""
        return self._load("train", self.input_path)

    @property
    def X(self):
        """Feature columns of the training frame."""
        return self.df[self.feature_list]

    @property
    def y(self):
        """Label column of the training frame."""
        return self.df[self.label]

    @property
    def eval_df(self):
        """Evaluation frame read from ``eval_path``; raises if no path was given."""
        if not self.eval_path:
            raise Exception("Please specify the eval_path")
        return self._load("eval", self.eval_path)

    @property
    def eval_X(self):
        """Feature columns of the evaluation frame."""
        return self.eval_df[self.feature_list]

    @property
    def eval_y(self):
        """Label column of the evaluation frame."""
        return self.eval_df[self.label]

    @property
    def score_df(self):
        """Scoring frame read from ``score_path``; raises if no path was given."""
        if not self.score_path:
            raise Exception("Please specify the score_path")
        return self._load("score", self.score_path)

    @property
    def score_X(self):
        """Feature columns of the scoring frame."""
        return self.score_df[self.feature_list]

    def open(self, filename, mode: str = "wb") -> typing.IO:
        """Open ``filename`` inside ``output_path`` and return the open handle.

        The handle is returned directly (this used to be a generator, which
        cannot be handed to ``pickle.dump``). The caller is responsible for
        closing it; it also works as a context manager for local files.

        Parameters
        ----------
        filename : str
            Name relative to ``output_path``.
        mode : str
            File mode; defaults to binary write because the outputs are pickles.
        """
        full_path = os.path.join(self.output_path, filename)
        # Choose the backend from the path actually being opened; ``is_gcp`` only
        # describes ``input_path``, which may live elsewhere.
        if full_path.startswith("gs://"):
            # NOTE(review): assumes GCSFileSystem.open accepts (path, mode) - confirm
            # against the installed blocks version.
            return GCSFileSystem().open(full_path, mode)
        parent = os.path.dirname(full_path)
        if parent and any(flag in mode for flag in "wax"):
            # A fresh checkout usually has no output directory yet.
            os.makedirs(parent, exist_ok=True)
        return open(full_path, mode)

In [None]:
#export
def _import_from_string(classname: str):
    components = classname.split('.')
    mod = __import__(components[0])
    for comp in components[1:]:
        mod = getattr(mod, comp)
    return mod


class Train:
    """Wrapper around an estimator that is built from its dotted class path.

    Parameters
    ----------
    estimator : str
        Dotted path of the estimator class, e.g. ``"xgboost.XGBClassifier"``.
    params : dict, optional
        Keyword arguments for the estimator constructor. ``None`` (for example
        an empty ``params:`` entry in the YAML) means no arguments.
    """

    def __init__(self, estimator: str, params: dict = None):
        self.estimator = _import_from_string(estimator)(**(params or {}))

    def fit(self, X: pd.DataFrame, y: pd.Series, **kwargs):
        """Fit the wrapped estimator in place; extra kwargs are forwarded."""
        self.estimator.fit(X, y, **kwargs)

    def predict(self, X: pd.DataFrame, **kwargs):
        """Return the wrapped estimator's predictions for ``X``."""
        # The result used to be dropped, so callers always received None.
        return self.estimator.predict(X, **kwargs)

    def save(self, fobj):
        """Pickle the wrapped estimator into the writable binary file ``fobj``."""
        pickle.dump(self.estimator, fobj)

In [None]:
#export
def _eval(estimator: sklearn.base.BaseEstimator = None,
          data: Data = None,
          cv=None,
          metrics: str = None) -> dict:
    """Score ``estimator`` on a hold-out set or by cross-validation.

    Exactly one of ``data.eval_path`` and ``cv`` must be set:

    * ``data.eval_path`` set: the estimator is fitted once on ``data.X`` /
      ``data.y`` and each metric is computed on ``data.eval_X`` /
      ``data.eval_y``; ``sd`` is reported as 0.
    * ``cv`` set: each metric is computed with ``cross_val_score`` on
      ``data.X`` / ``data.y``; ``avg`` and ``sd`` are the mean and standard
      deviation of the fold scores.

    Parameters
    ----------
    estimator
        Object with the scikit-learn estimator interface.
    data : Data
        Source of the training and evaluation frames.
    cv
        Passed through to ``cross_val_score``.
    metrics
        Iterable of scorer names, or one comma-separated string of names.

    Returns
    -------
    dict
        ``{metric: {"sd": ..., "avg": ...}}``.

    Raises
    ------
    Exception
        If both ``data.eval_path`` and ``cv`` are set.
    ValueError
        If neither is set, or if ``metrics`` is None.
    """
    # Imported here so the scorer lookup does not rely on ``sklearn.metrics``
    # having been loaded as a side effect of another import.
    from sklearn.metrics import get_scorer

    has_holdout = data.eval_path is not None
    if has_holdout and cv is not None:
        raise Exception(
            "eval_path: (%s) and cv: (%s) cannot co-exist" % (data.eval_path, cv))
    if metrics is None:
        raise ValueError("Please specify the metrics to evaluate")
    if isinstance(metrics, str):
        metrics = [name.strip() for name in metrics.split(",") if name.strip()]

    eval_res = dict()
    if not metrics:
        return eval_res
    if not has_holdout and cv is None:
        # Without this check the loop below would fail on unbound avg/sd.
        raise ValueError("Please specify either eval_path or cv")

    if has_holdout:
        # Fitting does not depend on the metric, so do it once instead of per metric.
        estimator.fit(data.X, data.y)
        eval_X, eval_y = data.eval_X, data.eval_y
        for metric in metrics:
            # get_scorer is the public lookup; the SCORERS dict is gone in
            # recent scikit-learn releases.
            scorer = get_scorer(metric)
            eval_res[metric] = {"sd": 0, "avg": scorer(estimator, eval_X, eval_y)}
    else:
        for metric in metrics:
            scores = cross_val_score(
                estimator, data.X, data.y, cv=cv, scoring=metric)
            eval_res[metric] = {"sd": np.std(scores), "avg": np.mean(scores)}
    return eval_res


def run_eval(conf_dict: dict, data: Data, estimator, output_dir: str = "eval.pkl"):
    """Evaluate ``estimator`` as described by the ``eval`` section of the config.

    Parameters
    ----------
    conf_dict : dict
        Parsed config; ``conf_dict["eval"]`` supplies ``metrics`` (a
        comma-separated string, required) and optionally ``cv``.
    data : Data
        Data source handed to ``_eval``; also used to open the output file.
    estimator
        Estimator handed to ``_eval``.
    output_dir : str
        File name, relative to the data object's output location, that receives
        a pickle of the config plus an ``"eval_result"`` entry. A falsy value
        skips writing.

    Returns
    -------
    dict
        The result of ``_eval``.

    Raises
    ------
    ValueError
        If the config has no ``eval.metrics`` entry.
    """
    # The "eval" section may be absent when evaluation is requested only through
    # data.eval_path, so do not index it directly.
    eval_conf = conf_dict.get("eval") or {}
    metrics_str = eval_conf.get("metrics")
    if not metrics_str:
        raise ValueError('Please specify "metrics" in the "eval" section of the config')
    cv = eval_conf.get("cv")
    # Tolerate spaces after the commas, e.g. "accuracy, f1_macro".
    metrics = [name.strip() for name in metrics_str.split(",") if name.strip()]
    result = _eval(estimator, data, cv, metrics)
    conf_cp = dict(conf_dict)
    conf_cp["eval_result"] = result
    # TODO: consider to create an class here and do
    # evaluate.save()
    if output_dir:
        fobj = data.open(output_dir)
        try:
            pickle.dump(conf_cp, fobj)
        finally:
            # Close explicitly so the pickle is flushed before returning rather
            # than whenever the handle happens to be garbage-collected.
            fobj.close()
    return result


def run_on_dict(conf_dict: dict):
    """Run one job described by a parsed config dict.

    ``conf_dict["data"]`` and ``conf_dict["train"]`` are expanded into ``Data``
    and ``Train``. If the config has an ``eval`` section or the data has an
    ``eval_path``, the estimator is evaluated with ``run_eval``; otherwise it is
    fitted on the training data and pickled to ``model.pkl``.
    """
    data = Data(**conf_dict['data'])
    train = Train(**conf_dict['train'])
    if "eval" in conf_dict or data.eval_path:
        run_eval(conf_dict, data, train.estimator)
    else:
        train.fit(data.X, data.y)
        fobj = data.open("model.pkl")
        try:
            train.save(fobj)
        finally:
            # Close explicitly so the model is fully written when this returns.
            fobj.close()

In [None]:
#export
class FormatTag(yaml.YAMLObject):
    """
    YAML ``!format`` tag: fills ``{NAME}`` placeholders of a scalar via ``str.format``.

    Every environment variable is available by name, plus NOW (current local
    time as ``%Y%m%d_%H%M%S``) and EPOCH (current time as whole seconds since
    the epoch). NOW and EPOCH take precedence over environment variables of
    the same name.
    """
    yaml_loader = yaml.SafeLoader
    yaml_tag = u'!format'

    @classmethod
    def from_yaml(cls, loader, node):
        import calendar
        import time

        # Later keys win, so the two built-ins override the environment.
        substitutions = {
            **os.environ,
            "NOW": time.strftime("%Y%m%d_%H%M%S"),
            "EPOCH": calendar.timegm(time.gmtime()),
        }
        template = loader.construct_scalar(node)
        return template.format(**substitutions)

## Run on local

Here is an example of a config file `config1.yaml`.

In [None]:
# Print the raw YAML so the fields used in the following cells are visible.
config1 = '../data/configs/config1.yaml'
with open(config1) as f:
    config1_text = f.read()
print(config1_text)

data: 
  input_path: "../data/iris_data.csv"
  eval_path: "../data/iris_data.csv"
  output_path: "../output/"
  features: "sepal_length,sepal_width,petal_length"
  label: species
train:
  estimator: xgboost.XGBClassifier
  params:
    max_depth: 4
    num_estimator: 50
eval:
  metrics: "accuracy,f1_macro"


We can run this config file locally with:

```{shell}
yoda run config1.yaml
```

### The following shows how yoda processes the config file; you can safely skip this part.

In [None]:
# Load the file; the context manager closes the handle once parsing is done.
with open(config1) as f:
    conf_dict = yaml.load(f, Loader=yaml.SafeLoader)

In [None]:
conf_dict

{'data': {'input_path': '../data/iris_data.csv',
  'eval_path': '../data/iris_data.csv',
  'output_path': '../output/',
  'features': 'sepal_length,sepal_width,petal_length',
  'label': 'species'},
 'train': {'estimator': 'xgboost.XGBClassifier',
  'params': {'max_depth': 4, 'num_estimator': 50}},
 'eval': {'metrics': 'accuracy,f1_macro'}}

During the ***Data*** stage, yoda loads the config file and reads the data from `input_path`. The data looks like this:

In [None]:
data = Data(**conf_dict['data'])

In [None]:
data.X.head()

Unnamed: 0,sepal_length,sepal_width,petal_length
0,0.0,1.0,2.0
1,5.1,3.5,1.4
2,4.9,3.0,1.4
3,4.7,3.2,1.3
4,4.6,3.1,1.5


In [None]:
data.y.value_counts()

0    51
2    50
1    50
Name: species, dtype: int64

Then, it generates an object for the ***Train*** stage:

In [None]:
train = Train(**conf_dict['train'])

In [None]:
train.fit(data.X, data.y)

In [None]:
run_eval(conf_dict, data, train.estimator, output_dir=None)

{'accuracy': {'sd': 0, 'avg': 1.0}, 'f1_macro': {'sd': 0, 'avg': 1.0}}

In [None]:
# Switch from hold-out evaluation to 5-fold cross-validation. eval_path has to be
# cleared as well because _eval rejects a setup where both eval_path and cv are set.
conf_dict["eval"]["cv"] = 5
data.eval_path = None

In [None]:
run_eval(conf_dict, data, train.estimator, output_dir=None)

{'accuracy': {'sd': 0.0574503560948385, 'avg': 0.9402150537634408},
 'f1_macro': {'sd': 0.05748872061476293, 'avg': 0.9398830409356724}}

## Run on GCP AI platform

Before we run on AI Platform, we need to build an image that has all dependencies installed.

```{shell}
export PROJECT_ID=$(gcloud config list project --format "value(core.project)")
export IMAGE_REPO_NAME=yoda
export IMAGE_TAG=basic
export IMAGE_URI=gcr.io/$PROJECT_ID/$IMAGE_REPO_NAME:$IMAGE_TAG

docker build -f ../docker/Dockerfile.basic -t $IMAGE_URI ./
docker push $IMAGE_URI
```

The config for GCP looks like this:

In [None]:
# Print the raw YAML of the GCP variant of the config.
config2 = '../data/configs/config2.yaml'
with open(config2) as f:
    config2_text = f.read()
print(config2_text)

data: 
  input_path: !format "gs://{BUCKET}/{USER}/test/iris_data.csv"
  eval_path: !format "gs://{BUCKET}/{USER}/test/iris_data.csv"
  output_path: !format "gs://{BUCKET}/{USER}/test/output/"
  features: "sepal_length,sepal_width,petal_length"
  label: species
train:
  estimator: xgboost.XGBClassifier
  params:
    max_depth: 4
    num_estimator: 50
eval:
  metrics: "accuracy,f1_macro"


In [None]:
# The !format tags are filled from the environment while the YAML is parsed,
# so BUCKET must be set before loading.
os.environ["BUCKET"] = "testjobsubmit"
with open(config2) as f:
    conf_dict2 = yaml.safe_load(f)

In [None]:
conf_dict2

{'data': {'input_path': 'gs://testjobsubmit/j0l04cl/test/iris_data.csv',
  'eval_path': 'gs://testjobsubmit/j0l04cl/test/iris_data.csv',
  'output_path': 'gs://testjobsubmit/j0l04cl/test/output/',
  'features': 'sepal_length,sepal_width,petal_length',
  'label': 'species'},
 'train': {'estimator': 'xgboost.XGBClassifier',
  'params': {'max_depth': 4, 'num_estimator': 50}},
 'eval': {'metrics': 'accuracy,f1_macro'}}

In [None]:
import yoda

ModuleNotFoundError: No module named 'yoda'

In [None]:
# Demo cell, deliberately kept out of the generated module: it submits conf_dict2,
# which only exists in this notebook, so running it at import time would fail.
from yoda.runner import run_yoda_on_gcp

run_yoda_on_gcp(conf_dict2)

ModuleNotFoundError: No module named 'yoda'

In [None]:
from nbdev.export import notebook2script
notebook2script()

Converted 00_core.ipynb.
Converted 01_run.ipynb.
