## Note
This notebook is a copy of `train.py` with some modifications. It can be run in ABEJA Platform's Jupyter Notebook. To run it on your local machine, set the following environment variables.

| env | type | description |
| --- | --- | --- |
| ABEJA_ORGANIZATION_ID | str | Your organization ID. |
| ABEJA_PLATFORM_USER_ID | str | Your user ID. |
| ABEJA_PLATFORM_PERSONAL_ACCESS_TOKEN | str | Your Access Token. |

## Step 0: Preparations

In [None]:
!pip install -r requirements.txt

## Step 1: Write your code here!

In [None]:
# Required settings: replace the 'xxx' placeholders below before running.
# Any other configuration change must also be made in this cell, by setting
# environment variables.

import os

# IDs of the training data.
# NOTE(review): presumably surfaced as Parameters.DATALAKE_CHANNEL_ID /
# Parameters.DATALAKE_TRAIN_FILE_ID, which are later passed to
# train_data_loader -- confirm against parameters.py.
os.environ['DATALAKE_CHANNEL_ID'] = 'xxx'
os.environ['DATALAKE_TRAIN_FILE_ID'] = 'xxx'
# Optional settings; uncomment to use them.
# os.environ['DATALAKE_VAL_FILE_ID'] = 'xxx'
# os.environ['INPUT_FIELDS'] = 'var_1,var_2,var_3'
os.environ['LABEL_FIELD'] = 'target'

# NOTE(review): imported only after the assignments above, presumably because
# parameters.py reads these environment variables at import time -- confirm.
from parameters import Parameters

In [None]:
import os
import json
from pathlib import Path

import lightgbm as lgb
from tensorboardX import SummaryWriter

from callbacks import Statistics, TensorBoardCallback
from data_loader import train_data_loader
from parameters import Parameters


# NOTE(review): this value is not referenced in the visible part of the
# notebook, and os.getenv does not expand the '~' in the default; a consumer
# would have to call os.path.expanduser itself -- confirm.
ABEJA_STORAGE_DIR_PATH = os.getenv('ABEJA_STORAGE_DIR_PATH', '~/.abeja/.cache')

# Output directory: the TensorBoard logs, the trained models and lgb_env.json
# are all written underneath it.
ABEJA_TRAINING_RESULT_DIR = os.getenv('ABEJA_TRAINING_RESULT_DIR', 'abejainc_training_result')
# parents=True so that a nested result path (e.g. 'out/run1') is created too,
# instead of raising FileNotFoundError when the parent is missing.
Path(ABEJA_TRAINING_RESULT_DIR).mkdir(parents=True, exist_ok=True)

# Short module-level aliases for the settings exposed by Parameters.
DATALAKE_CHANNEL_ID = Parameters.DATALAKE_CHANNEL_ID
DATALAKE_TRAIN_FILE_ID = Parameters.DATALAKE_TRAIN_FILE_ID
DATALAKE_VAL_FILE_ID = Parameters.DATALAKE_VAL_FILE_ID
INPUT_FIELDS = Parameters.INPUT_FIELDS
LABEL_FIELD = Parameters.LABEL_FIELD
PARAMS = Parameters.as_params()

# Stratified folds are only requested for classification tasks.
STRATIFIED = bool(Parameters.STRATIFIED and Parameters.IS_CLASSIFICATION)
# Objectives whose name starts with "multi" are treated as multi-class.
IS_MULTI = Parameters.OBJECTIVE.startswith("multi")

statistics = Statistics(Parameters.NUM_ITERATIONS)

# TensorBoard event files go to <result dir>/logs.
log_path = os.path.join(ABEJA_TRAINING_RESULT_DIR, 'logs')
writer = SummaryWriter(log_dir=log_path)

In [None]:
class ModelExtractionCallback:
    """Callback that captures the models trained inside lightgbm.cv().

    original author : momijiame
    ref : https://blog.amedama.jp/entry/lightgbm-cv-model

    Each time the callback is invoked it stores ``env.model``; the stored
    object is then exposed through the read-only properties below.

    note: This class depends on the private class '_CVBooster', so it may
    break with future LightGBM releases.

    usage:
        extraction_cb = ModelExtractionCallback()
        callbacks = [extraction_cb,]

        lgb.cv(params, dtrain, nfold=5,
               num_boost_round=9999,
               early_stopping_rounds=EARLY_STOPPING_ROUNDS,
               verbose_eval=verbose_eval,
               callbacks=callbacks,
               seed=0)

        models = extraction_cb.raw_boosters
    """

    def __init__(self):
        # Stays None until the callback has been invoked at least once.
        self._model = None

    def __call__(self, env):
        """Keep a reference to the _CVBooster carried by the callback env."""
        self._model = env.model

    def _assert_called_cb(self):
        """Raise RuntimeError if the callback has never been invoked."""
        if self._model is not None:
            return
        raise RuntimeError('callback has not called yet')

    @property
    def boosters_proxy(self):
        """The stored model object itself, acting as a proxy to the Boosters."""
        self._assert_called_cb()
        return self._model

    @property
    def raw_boosters(self):
        """The list of Boosters held by the stored model."""
        self._assert_called_cb()
        return self._model.boosters

    @property
    def best_iteration(self):
        """The boosting round at which early stopping occurred."""
        self._assert_called_cb()
        return self._model.best_iteration

In [None]:
# Print the configuration returned by Parameters.as_dict() before training.
print(f'start training with parameters : {Parameters.as_dict()}')

In [None]:
# Load the training data. X_train / y_train are later used as the data and
# label of the LightGBM Dataset; cols_train is saved into lgb_env.json
# (presumably the training column names -- confirm in data_loader.py).
X_train, y_train, cols_train = train_data_loader(
    DATALAKE_CHANNEL_ID, DATALAKE_TRAIN_FILE_ID, LABEL_FIELD, INPUT_FIELDS)

In [None]:
# Show the result of X_train.head() as the cell output
# (assumes X_train is a pandas DataFrame -- TODO confirm).
X_train.head()

In [None]:
# Display the training labels as the cell output.
y_train

In [None]:
# Wrap the training features and labels in a LightGBM Dataset.
dtrain = lgb.Dataset(data=X_train, label=y_train)

# Validation data is optional: both stay None unless a validation file ID
# has been configured.
X_val = y_val = None
if DATALAKE_VAL_FILE_ID:
    X_val, y_val, _ = train_data_loader(
        DATALAKE_CHANNEL_ID, DATALAKE_VAL_FILE_ID, LABEL_FIELD, INPUT_FIELDS)

# Callbacks handed to lgb.cv: one captures the trained boosters, the other
# is built from the statistics object and the TensorBoard writer.
extraction_cb = ModelExtractionCallback()
tensorboard_cb = TensorBoardCallback(statistics, writer)
tensorboard_cb.set_valid(
    X_val, y_val, Parameters.IS_CLASSIFICATION, IS_MULTI, Parameters.NUM_CLASS)
callbacks = [extraction_cb, tensorboard_cb]

In [None]:
# Run cross-validated training. The trained boosters are not taken from the
# return value; they are captured by extraction_cb in `callbacks`.
lgb.cv(
    params=PARAMS,
    train_set=dtrain,
    nfold=Parameters.NFOLD,
    stratified=STRATIFIED,
    metrics=Parameters.METRIC,
    early_stopping_rounds=Parameters.EARLY_STOPPING_ROUNDS,
    verbose_eval=Parameters.VERBOSE_EVAL,
    seed=Parameters.SEED,
    callbacks=callbacks,
)

In [None]:
# Save the booster of every fold as model_<fold index>.txt in the result directory.
models = extraction_cb.raw_boosters
for i, model in enumerate(models):
    model.save_model(os.path.join(ABEJA_TRAINING_RESULT_DIR, f'model_{i}.txt'))

# Store the training configuration together with cols_train next to the
# models, as lgb_env.json.
di = {
    **Parameters.as_dict(),
    'cols_train': cols_train,
}
# The context manager closes the file even if json.dump raises.
with open(os.path.join(ABEJA_TRAINING_RESULT_DIR, 'lgb_env.json'), 'w') as lgb_env:
    json.dump(di, lgb_env)

In [None]:
def handler(context):
    """Entry point registered as '<module>:handler' for the training job.

    The training itself is done by the module-level cells; this function
    only prints a completion message. ``context`` is accepted but not used.
    """
    message = "finish."
    print(message)

## Step 2: Run this on ABEJA Platform
Once you have finished debugging, you are ready to run this on ABEJA Platform. Run the commands below.

### Convert this notebook to a Python file

In [None]:
import os
from IPython.display import display, HTML

# Make the notebook front end execute `file_name=('<notebook name>')` in the
# kernel, so that the variable `file_name` becomes available to later cells.
# NOTE(review): relies on the browser-side `IPython.notebook` JavaScript
# object; presumably only available in the classic Notebook UI -- confirm.
js = """<script>IPython.notebook.kernel.execute("file_name=('" + IPython.notebook.notebook_name + "')");</script>"""
display(HTML(js))

In [None]:
# Export the notebook file name so the nbconvert shell command in the next
# cell can expand $FILE_NAME; it is also read back when building the handler
# name and the file list.
os.environ['FILE_NAME'] = file_name

In [None]:
!jupyter nbconvert $FILE_NAME --TagRemovePreprocessor.enabled=True --TagRemovePreprocessor.remove_cell_tags='["remove_cell"]' --to python

### Submit your job!
You can check your job status on ABEJA Platform.

In [None]:
import os
from abeja.train import APIClient

# Client used by the cells below to register the job definition version and
# to create the training job.
api = APIClient()

In [None]:
# Settings for the job definition version registered below.
# Both lookups raise KeyError when the environment variable is missing.
_organization_id = os.environ['ABEJA_ORGANIZATION_ID']
_job_definition_name = os.environ['TRAINING_JOB_DEFINITION_NAME']
# "<module>:handler", where <module> is the notebook file name without its
# extension. splitext is used instead of slicing off a fixed six characters,
# so the result does not depend on the suffix being exactly '.ipynb'.
_handler = f"{os.path.splitext(os.environ['FILE_NAME'])[0]}:handler"
_image = "abeja-inc/all-gpu:19.04"
_description = "initial version"
# Environment variables passed to the job, as produced by Parameters.as_env().
_environment = Parameters.as_env()
# Displayed as the cell output for a quick check.
_environment

In [None]:
# Source files uploaded with the job definition version. The converted
# notebook is '<notebook name without extension>.py'; splitext is used
# instead of slicing off a fixed six characters, so the result does not
# depend on the suffix being exactly '.ipynb'.
_files = [
    'callbacks.py',
    'data_loader.py',
    'parameters.py',
    'requirements.txt',
    f"{os.path.splitext(os.environ['FILE_NAME'])[0]}.py",
    'utils.py',
]

In [None]:
# Register the current code as a new version of the training job definition.
version = api.create_training_job_definition_version(
    organization_id=_organization_id,
    job_definition_name=_job_definition_name,
    filepaths=_files,
    handler=_handler,
    image=_image,
    environment=_environment,
    description=_description,
)
# Displayed as the cell output.
version

In [None]:
# Run the job.
# Each request creates one training job on ABEJA Platform, so several jobs
# can be run in parallel for hyperparameter tuning.

user_parameters = dict()
job = api.create_training_job(
    organization_id=_organization_id,
    job_definition_name=_job_definition_name,
    version_id=version['job_definition_version'],
    user_parameters=user_parameters,
)
# Displayed as the cell output.
job