# 1. Import Libraries

In [None]:
# Standard Library
import io
import os

# General Data Manipulation Libraries
import numpy as np
import pandas as pd

# Model & Helper Libraries
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score, StratifiedKFold

# Plotting Tools
import matplotlib.pyplot as plt

# Sagemaker Unique Libraries
import boto3
import sagemaker
from sagemaker.tuner import HyperparameterTuner, IntegerParameter
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.steps import CreateModelStep
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.workflow.step_collections import RegisterModel
from sagemaker.workflow.steps import ProcessingStep, TrainingStep
from sagemaker.workflow.parameters import ParameterInteger, ParameterFloat, ParameterString

# 2. Configure Boto3 Clients and Sessions

In [None]:
# Resolve the active AWS region and build the SageMaker / S3 clients used by
# the rest of the notebook.
boto_session = boto3.Session()
region = boto_session.region_name
smclient = boto_session.client("sagemaker")

# Execution role returned by the SageMaker SDK; passed as `role` to the jobs
# defined in later cells.
sagemaker_role = sagemaker.get_execution_role()

# Placeholder value: replace with the name of the S3 bucket to use.
bucket = "<Bucket>"
s3_client = boto3.client("s3", region_name=region)

print(f'AWS Region name : {region},\nSession : {smclient},\nRole : {sagemaker_role}')

# 3. Setting up XGBoost Estimator

In [None]:
# SageMaker session whose default bucket is "sagemaker-santander".
sess = sagemaker.Session(default_bucket="sagemaker-santander")

# Look up the XGBoost training image URI for the current region.
# NOTE(review): "latest" is not a pinned version - confirm which XGBoost
# release it resolves to and consider pinning an explicit one.
container = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=boto3.Session().region_name,
    version="latest",
)

# Single-instance estimator; model artifacts are written under the bucket's
# output/ prefix.
xgb = sagemaker.estimator.Estimator(
    image_uri=container,
    role=sagemaker_role,
    instance_count=1,
    instance_type="ml.m5.xlarge",
    output_path="s3://{}/output".format(bucket),
    sagemaker_session=sess,
)

# Binary classification evaluated by AUC over 100 boosting rounds.
# NOTE(review): rate_drop and tweedie_variance_power look unrelated to a
# binary:logistic objective - confirm they are intended.
xgb_hyperparameters = {
    "eval_metric": "auc",
    "objective": "binary:logistic",
    "num_round": 100,
    "rate_drop": 0.3,
    "tweedie_variance_power": 1.4,
}
xgb.set_hyperparameters(**xgb_hyperparameters)

# 4. Pipeline

## 4.1 Reading and Setting up data (Step 1)

In [None]:
# Stage the local preprocessing script in S3 and record where it lives,
# together with the instance type the processing job will run on.
s3_client.upload_file(
    Filename="prepare_data.py",
    Bucket=bucket,
    Key="code/prepare_data.py",
)
prepare_data_script_uri = f"s3://{bucket}/code/prepare_data.py"
prepare_data_instance_type = "ml.t3.large"

In [None]:
# Pipeline parameter "InputData": location of the raw data, defaulting to the
# bucket's input-data/ prefix.
input_data_uri = f"s3://{bucket}/input-data/"
input_data = ParameterString(
    name="InputData",
    default_value=input_data_uri,
)

In [None]:
# scikit-learn processor that runs the data-preparation script.
sklearn_processor = SKLearnProcessor(
    role=sagemaker_role,
    base_job_name="xgboost-classication-prepare-data",
    framework_version="0.23-1",
    instance_count=1,
    instance_type=prepare_data_instance_type,
)

# The InputData pipeline parameter is mapped to the container's input folder.
prepare_data_inputs = [
    sagemaker.processing.ProcessingInput(
        source=input_data,
        destination="/opt/ml/processing/input",
    ),
]

# Two named outputs; the training step looks them up by output_name.
prepare_data_outputs = [
    sagemaker.processing.ProcessingOutput(
        output_name="train_data",
        source="/opt/ml/processing/output/data/train",
    ),
    sagemaker.processing.ProcessingOutput(
        output_name="val_data",
        source="/opt/ml/processing/output/data/val",
    ),
]

prepare_data_step = ProcessingStep(
    name="PrepareData",
    processor=sklearn_processor,
    inputs=prepare_data_inputs,
    outputs=prepare_data_outputs,
    code=prepare_data_script_uri,
)

## 4.2 Model Training (Step 2)

In [None]:
# Outputs of the PrepareData step, addressed by their output_name.
_prepared_outputs = prepare_data_step.properties.ProcessingOutputConfig.Outputs

# Map each training channel to the CSV data written by the matching
# processing output.
train_step_inputs = {
    channel: sagemaker.inputs.TrainingInput(
        s3_data=_prepared_outputs[output_name].S3Output.S3Uri,
        content_type="csv",
    )
    for channel, output_name in (
        ("train", "train_data"),
        ("validation", "val_data"),
    )
}

train_step = TrainingStep(
    name="TrainModel",
    estimator=xgb,
    inputs=train_step_inputs,
)

In [None]:
# Inspection cell: evaluates the step's properties object so the notebook
# displays it; nothing is assigned.
prepare_data_step.properties

## 4.3 Register Model (Step 3)

In [None]:
# Pipeline parameter "ModelApprovalStatus"; defaults to manual approval and
# can be overridden when an execution is started.
model_approval_status = ParameterString(
    name="ModelApprovalStatus",
    default_value="PendingManualApproval",
)

# Model package group the trained model is registered into.
mpg_name = "Sagemaker-Demo-Project-p-fg9kinqqtfqd"

# Register the artifacts produced by the training step, declaring CSV as the
# request/response format and the instance types offered for hosting and
# batch transform.
register_step = RegisterModel(
    name="RegisterModel",
    estimator=xgb,
    model_data=train_step.properties.ModelArtifacts.S3ModelArtifacts,
    model_package_group_name=mpg_name,
    approval_status=model_approval_status,
    content_types=["text/csv"],
    response_types=["text/csv"],
    inference_instances=["ml.t2.medium", "ml.m5.xlarge"],
    transform_instances=["ml.m5.xlarge"],
)

## 4.4 Create Model (Step 4)

In [None]:
# Model definition assembled from the training step's image and artifacts.
model = sagemaker.model.Model(
    image_uri=train_step.properties.AlgorithmSpecification.TrainingImage,
    model_data=train_step.properties.ModelArtifacts.S3ModelArtifacts,
    role=sagemaker_role,
    name=f"{mpg_name}-pipline",
    sagemaker_session=sess,
)

# Instance type passed to the model-creation step.
inputs = sagemaker.inputs.CreateModelInput(instance_type="ml.m4.xlarge")

create_model_step = CreateModelStep(
    name="ModelPreDeployment",
    model=model,
    inputs=inputs,
)

## 4.5 Deploy Model (Step 5)

In [None]:
# Stage the deployment script in S3.
# The key must NOT start with "/": S3 keys are literal strings, so
# "/code/deploy_model.py" is a different object from "code/deploy_model.py",
# which is what deploy_model_script_uri below points at.
s3_client.upload_file(
    Filename="deploy_model.py", Bucket=bucket, Key="code/deploy_model.py"
)
deploy_model_script_uri = f"s3://{bucket}/code/deploy_model.py"

# Instance type for the endpoint; forwarded to the script as an argument.
deploy_instance_type = "ml.m5.large"

# Processor that runs the deployment script itself (separate from the
# endpoint's instance type above).
deploy_model_processor = SKLearnProcessor(
    framework_version="0.23-1",
    role=sagemaker_role,
    instance_type="ml.t3.medium",
    instance_count=1,
    base_job_name="deploy-model",
    sagemaker_session=sess,
)

# Processing step that invokes deploy_model.py with the name of the model
# created by the previous step plus the endpoint settings.
deploy_step = ProcessingStep(
    name="DeployModel",
    processor=deploy_model_processor,
    job_arguments=[
        "--model-name",
        create_model_step.properties.ModelName,
        "--region",
        region,
        "--endpoint-instance-type",
        deploy_instance_type,
        "--endpoint-name",
        "cv-model-pipeline",
    ],
    code=deploy_model_script_uri,
)

## 4.6 Create Pipeline

In [None]:
pipeline_name = "santander-pipeline"

# Parameters that can be overridden per execution.
pipeline_parameters = [input_data, model_approval_status]

# NOTE(review): only the data-preparation and training steps are wired in.
# register_step, create_model_step and deploy_step are defined in this
# notebook but are not part of `steps` - confirm this is intentional.
pipeline_steps = [prepare_data_step, train_step]

pipeline = Pipeline(
    name=pipeline_name,
    parameters=pipeline_parameters,
    steps=pipeline_steps,
)

# Submit the pipeline definition under the execution role.
pipeline.upsert(role_arn=sagemaker_role)

In [None]:
# Override the approval-status parameter for this execution.
parameters = dict(ModelApprovalStatus="Approved")

# Start an execution, wait on it (max_attempts=100), then show its
# description as the cell output.
start_response = pipeline.start(parameters=parameters)
start_response.wait(max_attempts=100)
start_response.describe()

# 5. Model Setup

In [None]:
# Fresh SageMaker session with the SDK's default settings (rebinds `sess`).
sess = sagemaker.Session()

# Look up the XGBoost training image URI for the current region.
container = sagemaker.image_uris.retrieve("xgboost", boto3.Session().region_name, "latest")


# Standalone estimator used for hyperparameter tuning.
# The execution role is bound to `sagemaker_role` earlier in the notebook;
# the bare name `role` is never defined and raised a NameError here.
xgb = sagemaker.estimator.Estimator(
    container,
    sagemaker_role,
    instance_count=1,
    instance_type="ml.m5.xlarge",
    output_path="s3://{}/output".format(bucket),
    sagemaker_session=sess,
)

# Same static hyperparameters as the pipeline estimator.
xgb.set_hyperparameters(
    eval_metric="auc",
    objective="binary:logistic",
    num_round=100,
    rate_drop=0.3,
    tweedie_variance_power=1.4,
)

## 5.1 Hyperparameter Tuning

In [None]:
# Search space: only the tree depth is tuned, over the integers 1..4.
hyperparameter_ranges = {
    "max_depth": IntegerParameter(1, 4),
}

# Metric the tuner optimizes.
objective_metric_name = "validation:auc"

# The parallel-job limit cannot usefully exceed the total job budget; the
# original max_parallel_jobs=3 with max_jobs=2 was inconsistent, so the
# parallelism is capped at the total number of jobs.
tuner = HyperparameterTuner(
    xgb, objective_metric_name, hyperparameter_ranges, max_jobs=2, max_parallel_jobs=2
)

# 6. Model Training

In [None]:
# CSV input channels read from the bucket's train/ and validation/ prefixes.
s3_input_train = sagemaker.inputs.TrainingInput(
    s3_data="s3://{}/train".format(bucket), content_type="csv"
)
s3_input_validation = sagemaker.inputs.TrainingInput(
    s3_data="s3://{}/validation".format(bucket), content_type="csv"
)

# Launch the tuning job on both channels.
tuner_channels = {"train": s3_input_train, "validation": s3_input_validation}
tuner.fit(tuner_channels, include_cls_metadata=False)

# 7. Model Deployment

In [None]:
# `tuning_job_name` was never assigned anywhere in the notebook (NameError).
# Take it from the tuning job launched by `tuner.fit(...)`.
tuning_job_name = tuner.latest_tuning_job.name

# Per-training-job results of the tuning job as a DataFrame, plus the
# "BestTrainingJob" entry from the tuning job's description.
results = sagemaker.analytics.HyperparameterTuningJobAnalytics(tuning_job_name)
results_df = results.dataframe()
best_training_job_summary = results.description()["BestTrainingJob"]


In [None]:
# Inspection cell: display the "BestTrainingJob" summary taken from the
# tuning job's description.
best_training_job_summary

In [None]:
# Attach to an existing hyperparameter tuning job.
tuning_job_details = smclient.describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=tuning_job_name
)
xgb_tuner = HyperparameterTuner.attach(
    tuning_job_name,
    job_details=tuning_job_details,
    sagemaker_session=sagemaker.Session(),
    estimator_cls=None,
)

# Get the best XGBoost training job name from the HPO job
xgb_best_training_job = xgb_tuner.best_training_job()
print(xgb_best_training_job)
# Attach estimator to the best training job name
best_estimator = sagemaker.estimator.Estimator.attach(xgb_best_training_job)

# Create model to be passed to the inference pipeline.
# predictor_cls is set explicitly: a generic Model's deploy() returns None
# when no predictor class is configured, which would leave `predictor`
# unusable for invoking the endpoint.
best_model = sagemaker.model.Model(
    model_data=best_estimator.model_data,
    role=sagemaker.get_execution_role(),
    image_uri=best_estimator.image_uri,
    predictor_cls=sagemaker.predictor.Predictor,
)

# Host the best model on a single ml.m5.large instance.
predictor = best_model.deploy(initial_instance_count=1, instance_type="ml.m5.large")