In [None]:
# default_exp core

# Overview

> yoda wants to simplify the way to run jobs on Google AI platform and organize your model process in a config file.

In this session, we will go through a few examples to see how yoda works.

In [None]:
#hide
from nbdev.showdoc import *

In [115]:
#export
import click
import yaml
import blocks
import typing
import pandas as pd
import sklearn
import numpy as np
from sklearn.metrics import SCORERS

from functools import lru_cache

In [28]:
#export
@click.group()
def cli():
    pass

@cli.command()
@click.argument('config', type=click.File('r'))
def run(config):
    # process
    conf_dict = yaml.load(config, Loader=yaml.FullLoader)
    data = Data(**conf_dict['data'])
    train = Train(**conf_dict['train'])

    if "eval" in conf_dict or data.eval_path:
        eval_type = conf_dict.get("eval")
        eval_path = data.eval_path
        metrics = conf_dict["eval"].get("metrics").split(",")
        eval(eval_path, eval_type, metrics)

In [29]:
from click.testing import CliRunner

runner = CliRunner()
result = runner.invoke(run, ['../data/configs/config1.yaml'])
print(result.output)

here



## Run on local

Here is an example of a config file `config1.yaml`.

In [112]:
config1 = '../data/configs/config1.yaml'
with open(config1) as f:
    print(f.read())

data: 
  input_path: "../data/iris_data.csv"
  eval_path: "../data/iris_data.csv"
  output_path: "../output/"
  features: "sepal_length,sepal_width,petal_length"
  label: species
train:
  estimator: xgboost.XGBRegressor
  params:
    max_depth: 4
    num_estimator: 50
eval:
  metric: accuracy


We can run this config file locally by 

```{shell}
yoda run config1.yaml
```

### The following is how yoda process the config file, you can safely ignore this part.

In [113]:
# load the file
conf_dict = yaml.load(open(config1), Loader=yaml.FullLoader)

In [114]:
conf_dict

{'data': {'input_path': '../data/iris_data.csv',
  'eval_path': '../data/iris_data.csv',
  'output_path': '../output/',
  'features': 'sepal_length,sepal_width,petal_length',
  'label': 'species'},
 'train': {'estimator': 'xgboost.XGBRegressor',
  'params': {'max_depth': 4, 'num_estimator': 50}},
 'eval': {'metric': 'accuracy'}}

In [103]:
# export
class Data:
    def __init__(self, input_path: str, output_path: str, features: str, label: str, **kwargs):
        self.input_path = input_path
        self.output_path = output_path
        self.feature_list = features.split(",")
        self.label = label
        self.eval_path = kwargs.get("eval_path", None)
        self.score_path = kwargs.get("score_path", None)

    @property
    @lru_cache(1)
    def df(self):
        return blocks.assemble(self.input_path)

    @property
    def X(self):
        return self.df[self.feature_list]

    @property
    def y(self):
        return self.df[self.label]

    @property
    @lru_cache(1)
    def eval_df(self):
        if not self.eval_path:
            raise Exception("Please specify the eval_path")
        return blocks.assemble(self.eval_path)
    
    @property
    def eval_X(self):
        return self.eval_df[self.feature_list]

    @property
    def eval_y(self):
        return self.eval_df[self.label]

    @property
    @lru_cache(1)
    def score_df(self):
        if not self.score_path:
            raise Exception("Please specify the score_path")
        return blocks.assemble(self.score_path)
    
    @property
    def score_X(self):
        return self.score_df[self.feature_list]

During the ***Data*** session, yoda loads the config file and read the data from input_path. The data looks like this:

In [104]:
data = Data(**conf_dict['data'])

In [105]:
data.X.head()

Unnamed: 0,sepal_length,sepal_width,petal_length
0,0.0,1.0,2.0
1,5.1,3.5,1.4
2,4.9,3.0,1.4
3,4.7,3.2,1.3
4,4.6,3.1,1.5


In [106]:
data.y.value_counts()

0    51
2    50
1    50
Name: species, dtype: int64

Then, it will generate an object for the ***Train*** session:

In [107]:
#export
def _import_from_string(classname: str):
    components = classname.split('.')
    mod = __import__(components[0])
    for comp in components[1:]:
        mod = getattr(mod, comp)
    return mod

class Train:
    def __init__(self, estimator: str, params: dict):
        self.estimator = _import_from_string(estimator)(**params)

    def fit(self, X: pd.DataFrame, y: pd.Series, **kwargs):
        self.estimator.fit(X, y, **kwags)
    
    def predict(self, X: pd.DataFrame, **kwargs):
        self.estimator.predict(X, **kwargs)

In [108]:
train = Train(**conf_dict['train'])

In [98]:
train.fit(data.X, data.y)

In [116]:
#export
def eval(estimator: sklearn.base.BaseEstimator = None, eval_X: pd.DataFrame = None, eval_y: pd.DataFrame = None, eval_type: str = None, metrics: str = None):
    if eval_X is not None and eval_type is not None:
        raise Exception("eval_path: (%s) and eval_type: (%s) cannot co-exist" % (eval_path, eval_type))
    
    eval_res = dict()
    for metric in metrics:
        if eval_X:
            estimator.fit(data.X, data.y)
            pred_y = estimator.predict(eval_X)
            avg, sd = SCORERS["metric"](eval_y, pred_y)
        
        if eval_type:
            scores = cross_val_score(estimator, data.X, data.y, scoring="metric")
        
        eval_res[metric] = {"sd": sd, "avg": avg}
    return eval_res


if "eval" in conf_dict or data.eval_path:
    eval_type = conf_dict.get("eval")
    eval_path = data.eval_path
    conf_eval = conf_dict["eval"].get("metrics")
    metrics = conf_eval.split(",") if conf_eval else None
    eval(train.estimator, data.eval_X, data.eval_y, eval_type, metrics)

AttributeError: 'NoneType' object has no attribute 'split'

## Run on GCP AI platform

In [None]:
Before we run on AI platform, we need to create an image that have all depedencies installed.

```{shell}
export PROJECT_ID=$(gcloud config list project --format "value(core.project)")
export IMAGE_REPO_NAME=yoda
export IMAGE_TAG=basic
export IMAGE_URI=gcr.io/$PROJECT_ID/$IMAGE_REPO_NAME:$IMAGE_TAG

docker build -f ../docker/Dockerfile.basic -t $IMAGE_URI ./
docker push $IMAGE_URI
```