### Set SageMaker version for Local Mode for Processing

The cells below install specific versions of the SageMaker SDK. Pick just one, run it once, and then comment it out.

#### Dev version 2.9 (Processing Local Mode Support)

In [2]:
import sys
import IPython
# Version string of the SageMaker SDK development build; used to form the tarball name below.
dist_version = '2.9.2.dev0'

# Copy the SDK source tarball from S3 into the current working directory.
!aws s3 cp s3://gianpo-public/sagemaker-{dist_version}.tar.gz .
# Upgrade pip and install the tarball with the interpreter that backs this kernel
# (sys.executable), so the package lands in the kernel's own environment.
!{sys.executable} -m pip install -q -U pip
!{sys.executable} -m pip install -q sagemaker-{dist_version}.tar.gz
# Shut the kernel down with restart=True; presumably so the freshly installed SDK
# version is the one imported afterwards. All notebook state is lost at this point.
IPython.Application.instance().kernel.do_shutdown(True)

download: s3://gianpo-public/sagemaker-2.9.2.dev0.tar.gz to ./sagemaker-2.9.2.dev0.tar.gz


{'status': 'ok', 'restart': True}

#### Latest release

In [None]:
# Alternative install: uncomment the three lines below to upgrade to the latest
# released SageMaker SDK and restart the kernel.
#!pip install -U sagemaker
#import IPython
#IPython.Application.instance().kernel.do_shutdown(True)

#### Latest 1.x Release

In [None]:
# Alternative install: uncomment the three lines below to pin the SageMaker SDK
# to version 1.72.1 and restart the kernel.
#!pip install -U sagemaker==1.72.1
#import IPython
#IPython.Application.instance().kernel.do_shutdown(True)

# Data Processing Job Creation and Execution

## Initialization scripts

In [1]:
#import dask.dataframe as dd
#import pandas as pd
#import numpy as np
#import cv2
import os

from pathlib import Path

In [2]:
import boto3

# Resolve the AWS account id and region from the current credentials/session.
account_id = boto3.client('sts').get_caller_identity().get('Account')
region = boto3.session.Session().region_name

# Coordinates of the processing image in ECR:
#   <account_id>.dkr.ecr.<region>.amazonaws.com/<ecr_repository>:<tag>
ecr_repository = 'dask_processing'
tag = 'latest'
URI_SUFFIX = 'amazonaws.com'
dask_repository_uri = f'{account_id}.dkr.ecr.{region}.{URI_SUFFIX}/{ecr_repository}:{tag}'
print(dask_repository_uri)

# Project layout on the notebook instance.
root_path = Path('/home/ec2-user/SageMaker/defect_detection/')
code_path = root_path / "notebooks/WM-811K/src/"
# parents=True creates any missing intermediate directories as well; without it
# mkdir raises FileNotFoundError when the parent of `src/` does not exist yet.
code_path.mkdir(parents=True, exist_ok=True)
data_path = root_path / "data/MIR-WM811K/"

160951647621.dkr.ecr.us-east-1.amazonaws.com/dask_processing:latest


In [3]:
# NOTE(review): this cell repeats the path setup done at the end of the previous
# cell; it is kept so the paths can be re-created without calling AWS again.
root_path = Path('/home/ec2-user/SageMaker/defect_detection/')

code_path = root_path / "notebooks/WM-811K/src/"
# parents=True creates any missing intermediate directories as well; without it
# mkdir raises FileNotFoundError when the parent of `src/` does not exist yet.
code_path.mkdir(parents=True, exist_ok=True)
data_path = root_path / "data/MIR-WM811K/"

## Creating SageMaker Processing Job

### Build a Container for Dask Processing

Create a container for processing with Dask. The code below is based on [this example](https://github.com/aws/amazon-sagemaker-examples/blob/master/sagemaker_processing/feature_transformation_with_sagemaker_processing_dask/feature_transformation_with_sagemaker_processing_dask.ipynb).

#### Docker Build

In [None]:
%%sh
# Build the processing image from the Dockerfile in src/data_processing.
# The directory is passed as the build context instead of pushd/popd: those are
# bash built-ins that are not guaranteed to exist under `sh`, and a failed pushd
# would have let the build run against the wrong directory.
docker build -t wafer-data-processing src/data_processing

#### Push to ECR

In [None]:
# Create ECR repository and push docker image

!$(aws ecr get-login --region $region --registry-ids $account_id --no-include-email)
!aws ecr create-repository --repository-name $ecr_repository
!docker tag {ecr_repository + tag} $dask_repository_uri
!docker push $dask_repository_uri

### Create Script

The script was prepared in an editor and can be found at `notebooks/WM-811K/src/data_processing.py`. It is assembled from parts of the original notebook, with imports resolved and some refactoring for code clarity.

In [None]:
!pygmentize ~/SageMaker/defect_detection/notebooks/WM-811K/src/data_processing.py

## Run the Processing Job

In [None]:
import pandas as pd

# Preview the raw wafer dataset. The file is located through `data_path` rather
# than a path relative to the working directory, so the cell does not depend on
# where the kernel was started.
df = pd.read_csv(data_path / 'wafers.csv.gz')
df.head()

### Set up the Script Processor

We are using a Dask image for this. It takes the URI of the container image and the script we want to run.

In [46]:
import boto3
import sagemaker
from sagemaker import get_execution_role
from sagemaker.processing import ScriptProcessor, ProcessingInput, ProcessingOutput

# Region of the current boto3 session and the IAM role the job will run under.
region = boto3.session.Session().region_name
role = get_execution_role()

# Processor that runs a script inside the Dask container image.
# `instance_type="local"` asks for SageMaker Local Mode (see the SDK install
# section at the top of the notebook), with four containers started on this host.
dask_processor = ScriptProcessor(
    image_uri=dask_repository_uri,
    command=["/opt/program/bootstrap.py"],  # entry point inside the image
    role=role,
    instance_type="local",
    instance_count=4,
    volume_size_in_gb=5,
    max_runtime_in_seconds=20 * 60,  # 20 minutes
    base_job_name="wafer-data-processing",
)

### Run

In [47]:
# Launch the processing job: upload the script, mount the pickled wafer data at
# /opt/ml/processing/input and collect whatever the script writes to
# /opt/ml/processing/train as the `autoencoder/train` output.
dask_processor.run(
    code=str(code_path / 'data_processing.py'),
    inputs=[ProcessingInput(
        # Bucket name follows the `sagemaker-<region>-<account_id>` pattern; it is
        # built from the values resolved earlier instead of hard-coding one
        # account and region, so the notebook also runs in other accounts.
        source=f"s3://sagemaker-{region}-{account_id}/wafer-input/wafers.pkl.gz",
        destination='/opt/ml/processing/input'
    )],
    outputs=[ProcessingOutput(output_name='autoencoder/train', source='/opt/ml/processing/train')]
)


Job Name:  wafer-data-processing-2020-10-04-21-48-20-207
Inputs:  [{'InputName': 'input-1', 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-160951647621/wafer-input/wafers.pkl.gz', 'LocalPath': '/opt/ml/processing/input', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-160951647621/wafer-data-processing-2020-10-04-21-48-20-207/input/code/data_processing.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'autoencoder/train', 'S3Output': {'S3Uri': 's3://sagemaker-us-east-1-160951647621/wafer-data-processing-2020-10-04-21-48-20-207/output/autoencoder/train', 'LocalPath': '/opt/ml/processing/train', 'S3UploadMode': 'EndOfJob'}}]
Creating i3moqvvhm5-algo-3-hoekd ... 
Creating jbd2pijv7l-algo-4-hoekd

In [48]:
# Look up the S3 URI of the first (and only) output declared for the latest job.
job_description = dask_processor.latest_job.describe()
first_output = job_description['ProcessingOutputConfig']['Outputs'][0]
processed_data = first_output['S3Output']['S3Uri']

In [49]:
# Split "s3://<bucket>/<key...>" into its bucket and key prefix: the first two
# "/"-separated pieces ("s3:" and "") are dropped, the next one is the bucket and
# the remainder is re-joined into the key.
bucket, *key_parts = processed_data.split("/")[2:]
path = "/".join(key_parts)
print(f"{bucket} {path}")

sagemaker-us-east-1-160951647621 wafer-data-processing-2020-10-04-21-48-20-207/output/autoencoder/train


In [50]:
# Download `data.npz` from the job's S3 output prefix to /tmp/data.npz, using a
# freshly created default SageMaker session.
sagemaker.utils.download_file(bucket, path + "/data.npz", "/tmp/data.npz", sagemaker.session.Session())

In [51]:
import numpy as np

# Load the arrays written by the processing job.
# NOTE(review): allow_pickle=True unpickles Python objects stored in the archive
# (presumably needed for `label_classes`, which comes back as a dict). Unpickling
# can execute arbitrary code, so only load archives produced by a trusted job.
with np.load("/tmp/data.npz", allow_pickle=True) as data:
    x, y = data['x'], data['y']
    # .item(0) unwraps the first element of the object array into a plain Python object.
    label_classes = data['label_classes'].item(0)

In [52]:
# Sanity check: show the shapes of the loaded arrays and the label-name -> index mapping.
print(x.shape, y.shape, label_classes)

(22894, 26, 26, 3) (22894,) {'Center': 0, 'Donut': 1, 'Edge-Loc': 2, 'Edge-Ring': 3, 'Loc': 4, 'Near-full': 5, 'Random': 6, 'Scratch': 7, 'none': 8}
