# Training Pipeline

**SageMaker Studio Kernel**: Data Science

In this exercise you will:
 - Create and run an Amazon SageMaker Pipeline (see [SageMaker Pipelines](https://docs.aws.amazon.com/sagemaker/latest/dg/pipelines.html))
 - Compute the thresholds used by the application to classify the predictions as anomalies or normal behavior


The following diagram shows all the steps we're going to execute:  
![Pipeline](./../imgs/ggv2_lab2_train_pipeline.png)

***

## Part 1/3 - Setup
Here we'll import some libraries and define some variables.

In [None]:
import boto3
import json
import logging
import os
import sagemaker
from sagemaker import get_execution_role
import sys

In [None]:
# Put the sibling `mlpipelines` directory at the front of the module search
# path so that packages located there can be imported by the next cells.
pipelines_dir = os.path.abspath('./../mlpipelines')
sys.path.insert(0, pipelines_dir)

In [None]:
from training.pipeline import get_pipeline

In [None]:
# Low-level boto3 clients for Amazon S3 and Amazon SageMaker.
# `sm_client` is used in Part 3 to look up the training job that belongs to
# the pipeline execution.
s3_client = boto3.client('s3')
sm_client = boto3.client('sagemaker')

In [None]:
# Module-level logger for this notebook.
LOGGER = logging.getLogger(__name__)

# Configure the root logger so that messages at INFO level and above are shown.
logging.basicConfig(level=logging.INFO)

***

## Part 2/3 - Create Amazon SageMaker Pipeline

### Pipeline Parameters

In [None]:
# Region of the current boto3 session and the execution role of this notebook.
region = boto3.session.Session().region_name
role = sagemaker.get_execution_role()

# Name of the model package group passed to the pipeline.
model_package_group_name = "mlops-iot-package-group"

# --- Preprocessing step ---
preprocessing_framework_version = "0.23-1"
preprocessing_instance_type = "ml.m5.xlarge"
preprocessing_instance_count = 1
preprocessing_input_files_path = "data/input"
preprocessing_entrypoint = "./../algorithms/preprocessing/preprocessing.py"

# --- Postprocessing step ---
postprocessing_output_files_path = "data/output"

# --- Training step ---
training_framework_version = "1.6.0"
training_python_version = "py3"
training_instance_type = "ml.c5.4xlarge"
training_instance_count = 1
training_hyperparameters = {
    'k_fold_splits': 6,
    'k_index_only': 3,  # after running some experiments with this dataset, it makes sense to fix it
    'num_epochs': 20,
    'batch_size': 256,
    'learning_rate': 0.0001,
    'dropout_rate': 0.001,
}
# Metric definitions: each regex captures the value logged after
# ` train_loss=` / ` test_loss=` up to the following `;`.
# Raw strings are used so that `\S` reaches the regex engine unchanged and
# Python does not warn about an invalid escape sequence.
training_metrics = [
    {'Name': 'train_loss:mse', 'Regex': r' train_loss=(\S+);'},
    {'Name': 'test_loss:mse', 'Regex': r' test_loss=(\S+);'},
]
training_entrypoint = "./../algorithms/training/wind_turbine.py"

# --- Batch transform step ---
transform_instance_type = "ml.c5.xlarge"
transform_instance_count = 2

# NOTE(review): left empty here; how `get_pipeline` treats an empty bucket
# name is not visible in this notebook -- set it to your bucket if required.
s3_bucket_name = ""

### Training pipeline

#### Get pipeline definition

In [None]:
# Build the pipeline object from the values defined in the
# "Pipeline Parameters" cell. Every argument except `pipeline_name` is passed
# positionally, so the order below has to match the signature of
# `get_pipeline` in `training/pipeline.py` (not shown in this notebook).
# NOTE(review): confirm the order against that signature before adding or
# reordering arguments.
pipeline = get_pipeline(
    region,
    model_package_group_name,
    preprocessing_framework_version,
    preprocessing_instance_count,
    preprocessing_instance_type,
    preprocessing_input_files_path,
    preprocessing_entrypoint,
    postprocessing_output_files_path,
    training_framework_version,
    training_python_version,
    training_instance_count,
    training_instance_type,
    training_entrypoint,
    transform_instance_count,
    transform_instance_type,
    s3_bucket_name,
    training_hyperparameters,
    training_metrics,
    role,
    pipeline_name="MLOpsIotBuildTrain"
)

### Create or update SageMaker pipeline

In [None]:
# Create the pipeline in SageMaker, or update it if it already exists,
# using the execution role obtained in the parameters cell.
pipeline.upsert(role_arn=role)

In [None]:
# Parse the pipeline definition (JSON text) so the notebook displays it as a
# nested Python structure; the result is only shown, not stored.
json.loads(pipeline.definition())

#### Start training pipeline 

In [None]:
# Start a pipeline execution without overriding any parameter and keep the
# returned handle so the execution can be inspected in the next cells.
execution = pipeline.start()

In [None]:
# Display the description of the execution started above.
execution.describe()

In [None]:
# Display the steps of the execution started above.
execution.list_steps()

#### Start the training pipeline, overriding parameters

In [None]:
# Parameter overrides for the next pipeline execution, keyed by parameter name.
args = dict(ModelApprovalStatus="PendingManualApproval")

In [None]:
# Start a new execution, passing the overrides in `args` as pipeline
# parameters. This rebinds `execution`, so Part 3 works on this run.
execution = pipeline.start(
    parameters=args
)

## Part 3/3 - Compute the threshold based on MAE

### Download the predictions & Compute MAE/thresholds

In [None]:
# ARN of the most recently started pipeline execution; its last path segment
# is the execution id, which is used below to find the related training job.
pipeline_execution_arn = execution.arn
print(pipeline_execution_arn)

execution_id = pipeline_execution_arn.split('/')[-1]

# Only completed training jobs whose name contains the execution id.
training_jobs = sm_client.list_training_jobs(
    NameContains=execution_id,
    StatusEquals='Completed'
)['TrainingJobSummaries']

# Explicit check instead of a bare `assert`: it also fires when Python runs
# with -O and tells the user what to do when the pipeline is still running.
if len(training_jobs) != 1:
    raise RuntimeError(
        "Expected exactly one completed training job for pipeline execution "
        f"{execution_id}, found {len(training_jobs)}. "
        "Wait until the pipeline execution has finished and run this cell again."
    )
training_job_name = training_jobs[0]['TrainingJobName']

# These names were used below without ever being defined in this notebook.
sagemaker_session = sagemaker.Session()
# NOTE(review): falls back to the session's default bucket when
# `s3_bucket_name` is empty -- confirm this is the bucket the pipeline
# actually writes its outputs to.
bucket_name = s3_bucket_name or sagemaker_session.default_bucket()

# We will recreate the estimator, based on the training job
estimator = sagemaker.estimator.Estimator.attach(
    training_job_name=training_job_name,
    sagemaker_session=sagemaker_session
)

# NOTE(review): assumes the first input channel of the training job holds the
# same .npy files that were sent to the batch transform -- confirm against
# the pipeline definition.
training_job_desc = sm_client.describe_training_job(TrainingJobName=training_job_name)
input_data = training_job_desc['InputDataConfig'][0]['DataSource']['S3DataSource']['S3Uri']

# 's3://<bucket>/<key prefix>' -> ['s3:', '', '<bucket>', '<key prefix>']
tokens = input_data.split('/', 3)

# Predictions written by the pipeline.
sagemaker_session.download_data(bucket=bucket_name, key_prefix='data/output/eval/', path='./../data/preds/')
# Inputs: bucket and prefix are both taken from the S3 URI itself.
sagemaker_session.download_data(bucket=tokens[2], key_prefix=tokens[3], path='./../data/input/')

In [None]:
import numpy as np
import glob

# glob returns files in arbitrary order. Sort both listings so that the i-th
# prediction file is stacked at the same position as the i-th input file;
# otherwise `y_preds - x_inputs` below may compare unrelated samples.
# NOTE(review): assumes each prediction file is named after its input file
# plus the `.out` suffix, so both sorted listings line up.
input_files = sorted(glob.glob('./../data/input/*.npy'))
pred_files = sorted(glob.glob('./../data/preds/*.out'))
if not input_files or not pred_files:
    raise FileNotFoundError(
        "No input (*.npy) or prediction (*.out) files found under ./../data/; "
        "run the download cell first."
    )

x_inputs = np.vstack([np.load(i) for i in input_files])
y_preds = np.vstack([np.load(i) for i in pred_files])

n_samples,n_features,n_rows,n_cols = x_inputs.shape

# Flatten the last two axes and move the features last:
# (n_samples, n_features, n_rows, n_cols) -> (n_samples, n_rows*n_cols, n_features)
x_inputs = x_inputs.reshape(n_samples, n_features, n_rows*n_cols).transpose((0,2,1))
y_preds = y_preds.reshape(n_samples, n_features, n_rows*n_cols).transpose((0,2,1))

# Mean absolute error per sample and feature, arranged as (n_features, n_samples).
mae_loss = np.mean(np.abs(y_preds - x_inputs), axis=1).transpose((1,0))
# NaN entries are counted as zero error.
mae_loss[np.isnan(mae_loss)] = 0

# One threshold per feature: the mean MAE over all samples.
thresholds = np.mean(mae_loss, axis=1)

# Create the output directory if needed; no error when it already exists.
os.makedirs("./../data/statistics", exist_ok=True)

np.save('./../data/statistics/thresholds.npy', thresholds)
print(",".join(thresholds.astype(str)))