In [None]:
import boto3
import sagemaker

## Set up S3 Bucket

In [None]:
# Look up the AWS account id of the current credentials via STS.
sts_client = boto3.client("sts")
account_id = sts_client.get_caller_identity()["Account"]

In [None]:
# Region configured on the default SageMaker session.
sagemaker_session = sagemaker.Session()
region = sagemaker_session.boto_region_name

In [None]:
# Region passed to the CloudFormation, Glue, and Athena CLI calls below.
REGION = "ap-southeast-2"
# Crawler name handed to the Glue stack and to `aws glue start-crawler`.
GLUE_CRAWLER_NAME = "glue-crawler-tif"
# The bucket name is suffixed with the AWS account id.
S3_BUCKET_NAME = "train-inference-pipeline-{}".format(account_id)
# The Athena database name reuses the bucket name.
DATABASE = S3_BUCKET_NAME

In [None]:
# Create the pipeline bucket, or reuse it if this account already owns it.
#
# The bucket is created in REGION (not the SageMaker session region) because
# the CloudFormation template URL and every stack below are addressed in REGION.
#
# The client is built outside the ``try`` so that its exception classes are
# available to the ``except`` clause.
s3_client = boto3.client('s3', region_name=REGION)

create_bucket_kwargs = {'Bucket': S3_BUCKET_NAME, 'ACL': 'private'}
# S3 rejects an explicit LocationConstraint of "us-east-1"; that region is
# selected by omitting CreateBucketConfiguration altogether.
if REGION != 'us-east-1':
    create_bucket_kwargs['CreateBucketConfiguration'] = {'LocationConstraint': REGION}

try:
    s3_client.create_bucket(**create_bucket_kwargs)
    print(f'Create S3 bucket {S3_BUCKET_NAME}: SUCCESS')
except s3_client.exceptions.ClientError as e:
    # Only service errors carry ``e.response``; anything else (bad credentials
    # setup, network failures, ...) propagates unchanged instead of being
    # masked by an AttributeError.
    if e.response['Error']['Code'] == 'BucketAlreadyOwnedByYou':
        print(f'Using existing bucket: {S3_BUCKET_NAME}')
    else:
        raise

## Fetch Synthetic Sample Data

In [None]:
# Copy the synthetic churn dataset from the sagemaker-sample-files bucket into ./data/.
!aws s3 cp s3://sagemaker-sample-files/datasets/tabular/synthetic/churn.txt ./data/

### Split Data

In [None]:
import os

# Create the output folders for the training and inference splits
# (no error if they already exist).
for split_name in ("train", "infer"):
    os.makedirs(f"./data/{split_name}/", exist_ok=True)

In [None]:
import pandas as pd

# Read the raw file from the directory the download cell wrote to (./data/).
df_churn = pd.read_csv("./data/churn.txt", header=0)

# Shuffle the rows. The result must be assigned: ``sample`` returns a new
# frame, so an in-place ``reset_index`` on it would be silently discarded.
# A fixed seed keeps the split reproducible across Restart & Run All.
df_churn = df_churn.sample(frac=1, random_state=42).reset_index(drop=True)

# First 100 shuffled rows form the training set; the remainder is used for inference.
df_train, df_test = df_churn.iloc[:100], df_churn.iloc[100:]

df_train.to_csv("./data/train/churn_train.txt", index=False)

# The inference file is written without the last column
# (presumably the target label; confirm against the dataset schema).
df_test = df_test.drop(df_test.columns[-1], axis=1)
df_test.to_csv("./data/infer/churn_test.txt", index=False)

### Upload Data to S3

In [None]:
!cd .. && aws s3 sync ./data s3://{S3_BUCKET_NAME}/demo/

## Setup Athena

In [None]:
# CloudFormation parameter list; a later cell serializes it to paras.json.
paras = [dict(ParameterKey="DataBucketName", ParameterValue=S3_BUCKET_NAME)]

In [None]:
import json
with open('paras.json', 'w') as fp:
    json.dump(paras, fp)
    
!cat paras.json

In [None]:
!aws cloudformation --region {REGION} create-change-set \
--stack-name "tip" \
--change-set-name ImportChangeSet \
--change-set-type IMPORT \
--resources-to-import "[{\"ResourceType\":\"AWS::Athena::WorkGroup\",\"LogicalResourceId\":\"AthenaPrimaryWorkGroup\",\"ResourceIdentifier\":{\"Name\":\"primary\"}}]" \
--parameters file://paras.json \
--template-body file://../cfn/01-athena.yaml

In [None]:
# Remove the temporary parameter file that was passed to create-change-set.
!rm paras.json

**Wait for the CloudFormation change set creation to complete before executing the following command.**

In [None]:
!aws cloudformation--region {REGION} execute-change-set --change-set-name ImportChangeSet --stack-name "tip"

## Setup Glue

In [None]:
# Name of the CloudFormation stack created from 02-crawler.yaml.
cfn_stack_name = 'tip-glue'

In [None]:
!aws cloudformation --region "ap-southeast-2" create-stack \
--stack-name {cfn_stack_name} \
--template-body file://../cfn/02-crawler.yaml \
--capabilities CAPABILITY_NAMED_IAM \
--parameters ParameterKey=RawDataBucketName,ParameterValue={S3_BUCKET_NAME}\
ParameterKey=CrawlerName,ParameterValue={GLUE_CRAWLER_NAME}

### Start Glue Crawler

**Wait for the Glue crawler stack creation to complete before starting the crawler with the following command.**

In [None]:
# Start the Glue crawler named by GLUE_CRAWLER_NAME.
!aws glue --region {REGION} start-crawler --name {GLUE_CRAWLER_NAME}

**Wait for the Glue crawler to finish (stop) before querying the Athena database.**

In [None]:
query_exec_id = !aws athena --region {REGION} start-query-execution --query-string "SELECT * FROM train limit 3;" --query-execution-context Database={DATABASE}
query_exec_id = eval(" ".join(query_exec_id))["QueryExecutionId"]
query_exec_id 

In [None]:
# Fetch the results of the query started in the previous cell.
!aws athena --region {REGION} get-query-results --query-execution-id {query_exec_id}

## Upload Scripts to S3

For a different dataset, update the following scripts:

- `../script/preprocessing.py`
- `../script/inferpreprocessing.py` 

In [None]:
# Sync the scripts directory in the parent folder to the script/ prefix of the bucket.
# NOTE(review): the markdown above refers to ../script/ (singular) while this
# syncs ../scripts — confirm which directory name is correct.
!cd .. && aws s3 sync ./scripts s3://{S3_BUCKET_NAME}/script/

## Deployment

In [None]:
# Upload the pipeline template so create-stack can reference it via --template-url.
!aws s3 cp ../cfn/pipeline.yaml s3://{S3_BUCKET_NAME}/cfn/

In [None]:
!aws --region {REGION} cloudformation create-stack \
--stack-name "tip-syd" \
--template-url https://{S3_BUCKET_NAME}.s3-{REGION}.amazonaws.com/cfn/pipeline.yaml \
--capabilities CAPABILITY_NAMED_IAM \
--parameters ParameterKey=AthenaDatabaseName,ParameterValue={DATABASE} \
ParameterKey=PipelineBucketName,ParameterValue={S3_BUCKET_NAME} \
--disable-rollback

## Trigger Training

In [None]:
!aws lambda --region "ap-southeast-2" invoke --function-name invokeTrainingStepFunction --payload '{ "": ""}' out

## Trigger Inference

In [None]:
!aws lambda --region "ap-southeast-2" invoke --function-name invokeInferStepFunction --payload '{ "": ""}' out