# training-data-analyst/courses/machine_learning/deepdive2/building_production_ml_systems

### set project and region in google cloud environment 

In [None]:
%%bash
# Point the gcloud CLI at the target project and default compute region.
# PROJECT and REGION are expanded by the shell that runs this cell.
gcloud config set project ${PROJECT}
gcloud config set compute/region ${REGION}

### Create BigQuery tables

In [None]:
from http import HTTPStatus

# Create the "taxifare" BigQuery dataset in PROJECT if it is not there yet.
bq = bigquery.Client(project=PROJECT)
dataset = bigquery.Dataset(bq.dataset("taxifare"))

try:
    bq.create_dataset(dataset)
    print("Dataset created")
except Exception as err:
    # Only an HTTP 409 (Conflict) response means the dataset is already there.
    # Any other failure (permissions, wrong project, network) must not be
    # reported as "already exists", so it is re-raised unchanged.
    if getattr(err, "code", None) != HTTPStatus.CONFLICT:
        raise
    print("Dataset already exists")

### Extracting training data to csv file

In [None]:
%%bash
# Export the taxifare.feateng_training_data table to sharded, header-less CSV files.
# The cell magic above is required: these are shell commands, not Python.
: "${OUTDIR:?OUTDIR must be set to the destination directory}"

echo "Extracting training data to $OUTDIR"
# The destination is quoted so the shell passes the "*" through to bq untouched.
bq --location=US extract \
   --destination_format CSV  \
   --field_delimiter "," --noprint_header \
   taxifare.feateng_training_data \
   "$OUTDIR/taxi-train-*.csv"

### google storage command

In [None]:
!gsutil cat gs://$BUCKET/taxifare/data/taxi-train-000000000000.csv | head -2

In [None]:
# List what is stored under the taxifare/data prefix of the bucket.
!gsutil ls gs://$BUCKET/taxifare/data

### Jupyter notebook에서 파일 쓰기

In [None]:
%%writefile {TARGET_FILENAME}
{CONTENT}

### create dataset from csv files

In [None]:
# Build a batched dataset from every CSV file matching `pattern`.
# `pattern` and `batch_size` fill the first two positional parameters
# (file_pattern, batch_size); the remaining options are passed by keyword.
dataset = tf.data.experimental.make_csv_dataset(
    pattern,
    batch_size,
    column_names=CSV_COLUMNS,
    column_defaults=DEFAULTS,
    num_epochs=num_repeat,
)

### prefetching dataset (overlap input preprocessing with training)

In [None]:
# Prefetch so that preparing the next batch overlaps with consuming the current one.
# AUTOTUNE lets tf.data choose the buffer size at runtime; a literal 1 is a fixed
# one-element buffer, not AUTOTUNE.
# Dataset transformations return a new dataset, so the result is reassigned.
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

### dataset link to map function

In [None]:
# `__function_name__` is a placeholder for the per-element transformation function.
# NOTE(review): the dataset returned by map() is not assigned here; in real code
# keep it (e.g. `dataset = dataset.map(...)`), otherwise the mapping has no effect.
dataset.map(__function_name__)

### about @tf.function decorator

`@tf.function`은 Python 함수를 그래프로 변환해준다. 그래프 최적화(Graph Optimization)는 동일한 연산이 여기저기서 반복되는 경우 해당 연산 결과를 캐시(cache)로 저장해서 사용함으로써 동일 연산이 반복적으로 일어나지 않도록 하거나, 복잡한 연산의 경우 다수의 장비에서 병렬처리(parallel on multiple devices)를 하여 연산을 빠르게 수행할 수 있도록 하여 성능을 최적화해준다.

출처: [R, Python 분석과 프로그래밍의 친구 (by R Friend)](https://rfriend.tistory.com/555)

In [None]:
def get_dayofweek(s):
    """Return the `DAYS` entry for the weekday of timestamp `s`.

    `s` is converted by `parse_datetime` (defined elsewhere); the `weekday()`
    value of the result is used as the index into `DAYS`.
    """
    weekday_index = parse_datetime(s).weekday()
    return DAYS[weekday_index]

@tf.function
def dayofweek(ts_in):
    """Apply `get_dayofweek` to every element of `ts_in`.

    Each element is handed to the plain-Python helper through
    `tf.py_function`, which is declared to return a `tf.string`;
    `tf.map_fn` collects the per-element results.
    """
    def _day_for_element(s):
        # Wrap the Python helper so it can be called on a single tensor element.
        return tf.py_function(get_dayofweek, inp=[s], Tout=tf.string)

    return tf.map_fn(_day_for_element, ts_in)

# @tf.function
# def dayofweek(ts_in):
#     return tf.py_function(get_dayofweek, inp=[ts_in], Tout=tf.string)

# @tf.function
# def dayofweek(ts_in):
#     return get_dayofweek(ts_in)

### about tf.map_fn, tf.py_function

### Usage of tf.io.gfile

`tf.io.gfile.GFile` is used like Python's built-in `open()`. The difference is that you can specify URI schemes to use other filesystems (e.g., gs:// for GCS, s3:// for S3, etc.), if they are supported. Using file:// as an example, we have:

In [None]:
# Write a file with the built-in open(), then read it back through GFile
# using a plain local path.
with open("/tmp/x", "w") as f:
    f.write("asdf")

with tf.io.gfile.GFile("/tmp/x") as f:
    # Print the result; a bare expression inside the block would be discarded.
    print(f.read())

# The same local file addressed through an explicit file:// URI scheme.
with tf.io.gfile.GFile("file:///tmp/x", "w") as f:
    f.write("qwert")
    f.write("asdf")

# Read inside a `with` block so the handle is closed, then display the contents
# as the cell's output.
with tf.io.gfile.GFile("file:///tmp/x") as f:
    contents = f.read()
contents

### Run your training package on Cloud AI Platform

In [None]:
%%bash
# Submit the taxifare trainer package as a Cloud AI Platform training job.
# BUCKET, REGION and TFVERSION must be present in the environment.
: "${BUCKET:?BUCKET must be set}"
: "${REGION:?REGION must be set}"
: "${TFVERSION:?TFVERSION must be set}"

# Output directory and jobID share one timestamp so the two names always match.
TIMESTAMP=$(date -u +%y%m%d_%H%M%S)
OUTDIR=gs://${BUCKET}/taxifare/trained_model_${TIMESTAMP}
JOBID=taxifare_${TIMESTAMP}
echo ${OUTDIR} ${REGION} ${JOBID}
gsutil -m rm -rf ${OUTDIR}

# Model and training hyperparameters, passed to the trainer below.
# NOTE(review): this cell used to declare BATCH_SIZE=50 and NUM_EVALS=100 without
# using them, while the command line passed the literals 5 and 1. The literals are
# kept as the effective values here; confirm which pair was intended.
BATCH_SIZE=5
NUM_EXAMPLES_TO_TRAIN_ON=100
NUM_EVALS=1
NBUCKETS=10
LR=0.001
NNSIZE="32 8"

# GCS paths
GCS_PROJECT_PATH=gs://$BUCKET/taxifare
DATA_PATH=$GCS_PROJECT_PATH/data
TRAIN_DATA_PATH=$DATA_PATH/taxi-train*
EVAL_DATA_PATH=$DATA_PATH/taxi-valid*

# Everything after the bare "--" is forwarded to task.py as its own parameters.
# No comment may follow a trailing backslash: "\ " followed by "#" ends the command
# on that line and the remaining arguments would run as a separate command.
# The data paths are quoted so their "*" reaches the trainer unexpanded.
# $NNSIZE is left unquoted on purpose so "32 8" is split into two values.
gcloud ai-platform jobs submit training "$JOBID" \
    --module-name=trainer.task \
    --package-path=taxifare/trainer \
    --staging-bucket="gs://$BUCKET" \
    --python-version=3.7 \
    --runtime-version="$TFVERSION" \
    --region="${REGION}" \
    -- \
    --eval_data_path "$EVAL_DATA_PATH" \
    --output_dir "$OUTDIR" \
    --train_data_path "$TRAIN_DATA_PATH" \
    --batch_size "$BATCH_SIZE" \
    --num_examples_to_train_on "$NUM_EXAMPLES_TO_TRAIN_ON" \
    --num_evals "$NUM_EVALS" \
    --nbuckets "$NBUCKETS" \
    --lr "$LR" \
    --nnsize $NNSIZE

### Train in docker container

In [None]:
# Let Docker authenticate through gcloud; the image built in the next cell is
# pushed to gcr.io.
!gcloud auth configure-docker

In [None]:
%%bash
# Build the training container image from ./taxifare and push it to gcr.io.
# Abort on the first error so a failed build is never followed by a push of
# whatever image happens to carry the same tag already.
set -euo pipefail

PROJECT_DIR=$(cd ./taxifare && pwd)
PROJECT_ID=$(gcloud config list project --format "value(core.project)")
IMAGE_NAME=taxifare_training_container
DOCKERFILE=$PROJECT_DIR/Dockerfile
IMAGE_URI=gcr.io/$PROJECT_ID/$IMAGE_NAME

# Paths are quoted so a project directory containing spaces still works.
docker build "$PROJECT_DIR" -f "$DOCKERFILE" -t "$IMAGE_URI"

docker push "$IMAGE_URI"

### using cloudml-hypertune

In [None]:
!pip install cloudml-hypertune
import hypertune

In [None]:
# setting hypertune in source code
# add this code after fitting model
# Use the validation RMSE recorded for the last completed epoch as the tuning metric.
# Indexing with [-1] instead of num_evals-1 gives the same element when all
# num_evals epochs ran, and does not raise IndexError when fewer were recorded.
hp_metric = history.history['val_rmse'][-1]
hpt = hypertune.HyperTune()
# The tag 'rmse' is the name under which the metric is reported to the tuning service.
hpt.report_hyperparameter_tuning_metric(hyperparameter_metric_tag='rmse', metric_value=hp_metric)

In [None]:
%%writefile hptuning_config.yaml
# setting hypertune in yaml
# (The %%writefile magic has to be the first line of the cell, so this note lives
# inside the generated file as a YAML comment.)
trainingInput:
  scaleTier: BASIC
  hyperparameters:
    goal: MINIMIZE
    maxTrials: 30
    maxParallelTrials: 5
    hyperparameterMetricTag: rmse
    enableTrialEarlyStopping: True
    params:
    - parameterName: lr
      type: DOUBLE
      minValue: 0.0001
      maxValue: 0.01
      scaleType: UNIT_LINEAR_SCALE
    - parameterName: nbuckets
      type: INTEGER
      minValue: 10
      maxValue: 100
      scaleType: UNIT_LINEAR_SCALE
    - parameterName: batch_size
      type: INTEGER
      minValue: 10
      maxValue: 100
      scaleType: UNIT_LINEAR_SCALE

#### hptune job 실행 중 oom 발생 시 

1. params 값을 작게 조정
  - 위에서 nbuckets, batch_size가 시스템 리소스와 관련이 높음
  - 값을 바꿔가면서 job을 실행해보고 최적값을 도출해야 함
2. scaleTier 변경 (https://cloud.google.com/ai-platform/training/docs/machine-types)
3. maxParallelTrials

In [None]:
%%bash
# Submit training job with hyperparameter tuning.
# A %%bash cell runs in its own shell, so every variable used by the command has
# to be present in the environment; stop with a clear message if one is missing.
for var in JOBID BUCKET TFVERSION REGION EVAL_DATA_PATH OUTDIR TRAIN_DATA_PATH \
           BATCH_SIZE NUM_EXAMPLES_TO_TRAIN_ON NUM_EVALS NBUCKETS LR NNSIZE; do
    if [ -z "${!var:-}" ]; then
        echo "ERROR: $var is not set" >&2
        exit 1
    fi
done

# --config gives the path of the tuning configuration file.
# Arguments after the bare "--" are forwarded to the trainer.
# No comment may follow a trailing backslash: it would end the command on that line.
# $NNSIZE is left unquoted on purpose so a value such as "32 8" becomes two arguments.
gcloud ai-platform jobs submit training "$JOBID" \
    --module-name=trainer.task \
    --package-path=taxifare/trainer \
    --staging-bucket="gs://$BUCKET" \
    --python-version=3.7 \
    --runtime-version="$TFVERSION" \
    --region="${REGION}" \
    --config=hptuning_config.yaml \
    -- \
    --eval_data_path "$EVAL_DATA_PATH" \
    --output_dir "$OUTDIR" \
    --train_data_path "$TRAIN_DATA_PATH" \
    --batch_size "$BATCH_SIZE" \
    --num_examples_to_train_on "$NUM_EXAMPLES_TO_TRAIN_ON" \
    --num_evals "$NUM_EVALS" \
    --nbuckets "$NBUCKETS" \
    --lr "$LR" \
    --nnsize $NNSIZE