# Template for creating a data processing script

In [None]:
import os
import boto3
import json
from io import BytesIO
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

from sagemaker import get_execution_role, local, utils, Session
from sagemaker.processing import Processor, ProcessingInput, ProcessingOutput

# Stretch the notebook's `.container` element to the full browser width.
# `display` and `HTML` come from the public `IPython.display` module; importing
# `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
# Bucket used for the job's S3 inputs and outputs, read from the S3_NAME
# environment variable (None when the variable is unset).
bucketName = os.environ.get('S3_NAME')
print(bucketName)
default_bucket = bucketName

# SageMaker session, plus the role and region the rest of the notebook uses.
session = Session()
role = get_execution_role()
region = session.boto_region_name

# Look up the AWS account id through the regional STS endpoint.
client = session.boto_session.client(
    "sts",
    region_name=region,
    endpoint_url=utils.sts_regional_endpoint(region),
)
account = client.get_caller_identity()['Account']

## Set up a project folder

In [None]:
# Algorithm name, read from the ECR_NAME environment variable; it names the
# local project folder and is passed to the publish script as the repository.
algorithm = os.getenv('ECR_NAME')
directory = f'containers/{algorithm}'
# exist_ok=True keeps the cell safe to re-run and avoids the check-then-create
# race of testing os.path.exists() first.
os.makedirs(directory, exist_ok=True)
# The publish script is written into the project folder by the next cell.
ecr_script_name = 'publish-ecr.sh'
ecr_script_path = f'{directory}/{ecr_script_name}'

In [None]:
%%writefile $ecr_script_path
#!/usr/bin/env bash
# Build a Docker image and publish it to Amazon ECR.
#
# Usage: publish-ecr.sh <repository-name> <dockerfile>
#   <repository-name>  ECR repository (and local image name) to publish to
#   <dockerfile>       Dockerfile to build; the build context is the current directory
if [ "$#" -lt 2 ]; then
    echo "Usage: $0 <repository-name> <dockerfile>" >&2
    exit 1
fi
set -u
# The name of the repository where we'll publish container images
REPOSITORY_NAME="$1"
# The Dockerfile used to build the image
DOCKERFILE="$2"
# The image tag (for example, for versioning)
IMAGE_TAG=latest
# The identifier of the account where we want to publish container images
# This is pulled automatically from the same account where this notebook is running
ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text)
# The current region (defaults to eu-west-1, if none defined)
REGION=$(aws configure get region)
REGION="${REGION:-eu-west-1}"
# Build the base repository URI
ECR_BASE="${ACCOUNT_ID}.dkr.ecr.${REGION}.amazonaws.com"
# Build the entire URI for the container image
ECR_FULL="${ECR_BASE}/${REPOSITORY_NAME}:${IMAGE_TAG}"
# No command should fail from here onwards
set -e
# Log in to Amazon ECR
# https://docs.aws.amazon.com/AmazonECR/latest/userguide/getting-started-cli.html
# docker's stderr stays silenced, so report a failed login explicitly
# instead of letting `set -e` abort without any message.
if ! aws ecr get-login-password --region "${REGION}" | \
    docker login "${ECR_BASE}" --username AWS --password-stdin 2>/dev/null; then
    echo "ERROR: docker login to ${ECR_BASE} failed" >&2
    exit 1
fi
# Create the repository if it doesn't exist yet. This is best-effort: when the
# lookup or creation is not permitted, carry on and let `docker push` decide.
if ! aws ecr describe-repositories --region "${REGION}" \
    --repository-names "${REPOSITORY_NAME}" >/dev/null 2>&1; then
    aws ecr create-repository --region "${REGION}" \
        --repository-name "${REPOSITORY_NAME}" >/dev/null \
        || echo "WARNING: could not create ECR repository ${REPOSITORY_NAME}" >&2
fi
# Build Docker image
docker build -t "${REPOSITORY_NAME}" -f "${DOCKERFILE}" .
docker tag "${REPOSITORY_NAME}" "${ECR_FULL}"
# Push!
# NOTE: an IAM policy similar to one in `iam-policy.json` is required!
docker push "${ECR_FULL}"
# Log out
docker logout "${ECR_BASE}"
echo "Image is available at ${ECR_FULL}"

## Create script file
This is the script that we will be executing from our container. The script can be in any language provided that the Docker image supports it. Amazon SageMaker will load all of the input data into the `/opt/ml/processing/input` directory. Upon completion, SageMaker will export the data from `/opt/ml/processing/output` to S3.

In [None]:
# File name of the processing script; the Dockerfile copies it by this name.
script_name = 'script'
# Location of the script inside the project folder.
script_path = '/'.join((directory, script_name))

In [None]:
%%writefile $script_path

import numpy
import math
import csv
import pathlib
import os
from stl import mesh

# Container-side directories of the processing job: the script reads its
# input from INPUT_DIR and writes its result to OUTPUT_DIR.
INPUT_DIR = '/opt/ml/processing/input'
OUTPUT_DIR = '/opt/ml/processing/output'

def main():
    """Convert one STL mesh from INPUT_DIR into a CSV of vertex coordinates.

    Picks a single file from ``INPUT_DIR``, loads it with ``mesh.Mesh`` and
    writes ``<stem>_converted_from_stl.csv`` to ``OUTPUT_DIR``: an ``x,y,z``
    header followed by one row per vertex.

    Raises:
        FileNotFoundError: if ``INPUT_DIR`` contains no entries.
    """
    input_dir = pathlib.Path(INPUT_DIR)
    # os.listdir() returns entries in arbitrary order; sorting makes the
    # choice deterministic when several inputs are present.
    file_names = sorted(os.listdir(input_dir))
    print(file_names)
    if not file_names:
        raise FileNotFoundError(f'No input files found in {input_dir}')
    file_name = file_names[-1]
    print(f'Reading file {input_dir / file_name}', flush=True)

    your_mesh = mesh.Mesh.from_file(input_dir / file_name)

    output_dir = pathlib.Path(OUTPUT_DIR)
    # Derive the name from the stem so that any extension casing ('.STL')
    # is handled and only the final suffix is dropped.
    output_file = f'{pathlib.Path(file_name).stem}_converted_from_stl.csv'
    # newline='' leaves line-ending control to the csv writer.
    with open(output_dir / output_file, 'w', newline='') as f1:
        writer = csv.writer(f1, delimiter=',', lineterminator='\n')
        writer.writerow(['x', 'y', 'z'])
        # Every entry of `points` is a flat run of coordinates, three per
        # vertex; reshaping yields one (x, y, z) row per vertex.
        writer.writerows(your_mesh.points.reshape(-1, 3))


if __name__ == "__main__":
    main()

## Create dockerfile
This Dockerfile will create the execution environment for our script. The script should be placed in the `/opt/ml/code/` directory and the `ENTRYPOINT` instruction should contain the proper command to execute the script.

In [None]:
# File name of the Dockerfile; it is passed to the publish script by name.
dockerfile_name = 'Dockerfile'
# Location of the Dockerfile inside the project folder.
dockerfile_path = '/'.join((directory, dockerfile_name))

In [None]:
%%writefile $dockerfile_path

# Execution environment for the processing script.
FROM python:3.8-slim-buster

# numpy-stl provides the `stl` module the script imports. The base image is a
# clean Python install, so there is nothing to uninstall first, and `pathlib`
# is part of the Python 3 standard library (the PyPI package of that name is
# an obsolete backport and is deliberately not installed).
RUN pip3 install --no-cache-dir -U numpy-stl

# Disable Python output buffering so prints appear immediately.
ENV PYTHONUNBUFFERED=1
COPY script /opt/ml/code/script

ENTRYPOINT ["python3", "/opt/ml/code/script"]

## Build docker image and push it to ECR
This script has three functions. It will create a repository if one doesn't already exist for our new docker image. It will then build the image using our dockerfile. Lastly, it will push our image into Amazon ECR so that we can utilize our code in the pipeline.

In [None]:
! cd $directory ; sh ./publish-ecr.sh $algorithm $dockerfile_name

## Test run the container from SageMaker
Through SageMaker, we can create a processing job to run our container. The instance type can be set to `local` to run on the notebook instance.

In [None]:
# Prefix used for the job name and for this project's S3 keys.
project = 'aws'
preprocess_instance_type = 'ml.m5.large'
# URI of the image pushed by publish-ecr.sh. `region` is already a complete
# region name (e.g. eu-west-1) and the script pushes to the repository named
# `algorithm`, so neither a `-1` suffix nor a `sagemaker-` prefix belongs here.
image_uri = f'{account}.dkr.ecr.{region}.amazonaws.com/{algorithm}:latest'

In [None]:
# Processing job definition: a single instance running our container image.
processor = Processor(
    image_uri=image_uri,
    role=role,
    instance_type=preprocess_instance_type,
    instance_count=1,
    volume_size_in_gb=64,
    base_job_name=project,
    sagemaker_session=session,
)

In [None]:
# NOTE(review): `testfile` is not assigned in any visible cell; it has to be
# set to the input object's key under s3://<bucket>/<project>/ before this
# cell is run.
project_s3_uri = f's3://{default_bucket}/{project}'

# Copy the input object into the container, run the job, and export whatever
# the script wrote to its output directory.
processor.run(
    inputs=[
        ProcessingInput(
            input_name='input',
            source=f'{project_s3_uri}/{testfile}',
            destination='/opt/ml/processing/input',
        ),
    ],
    outputs=[
        ProcessingOutput(
            output_name='output',
            source='/opt/ml/processing/output',
            destination=f'{project_s3_uri}/output/{testfile}',
        ),
    ],
    arguments=[],
    wait=True,
)