In [1]:
#import dask.dataframe as dd
#import pandas as pd
#import numpy as np
#import cv2
import os

from pathlib import Path

In [15]:
root_path = Path('/home/ec2-user/SageMaker/defect_detection/')#.resolve()

code_path = root_path / "notebooks/WM-811K/src/"
code_path.mkdir(exist_ok=True)
data_path = root_path / "data/MIR-WM811K/"

## Creating SageMaker Processing Job

## Create the execution script

In [44]:
%%writefile src/data_augmentation/program/augmentation.py
import numpy as np
import tarfile
import argparse
import logging
from pathlib import Path
from tensorflow.keras.models import load_model


def parse_arguments():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--limit", type=int, default=None)
    parser.add_argument("--augmented-size", type=int, default=2000)
    return parser.parse_args()


def load_models(model_path: str):
    # Add tar decompression here
    model_tar = Path(model_path) / "model.tar.gz"
    tf_file = tarfile.open(str(model_tar), mode="r:gz")
    dest_dir = Path("/tmp/models")
    dest_dir.mkdir()
    tf_file.extractall(path=str(dest_dir))
    tf_file.close()
    encoder = load_model(Path(dest_dir) / "encoder.h5")
    decoder = load_model(Path(dest_dir) / "decoder.h5")
    return encoder, decoder


def load_data(path: str, file_name: str="data.npz", limit: int=None):
    file_path = Path(path) / file_name
    with np.load(str(file_path), allow_pickle=True) as data:
        x = data['x']
        y = data['y']
        label_classes = data['label_classes'].item(0)
    if limit:
        return (x[:limit], y[:limit], label_classes)
    else:
        return (x, y, label_classes)
    
    
def generate_augmented_data(wafers, label, encoder, decoder, n_examples):
    # Encode input wafer
    logging.info(f"There are {len(wafers)} examples for {label}")
    encoded_x = encoder.predict(wafers)
    
    additional_example_batches = n_examples // wafer.shape[0] + 1
    temp_noised = []
    for i in range(additional_example_batches):
        temp_noised.append(encoded_x + np.random.normal(loc=0, scale=0.1, size = (len(encoded_x), 13, 13, 64)))
    noised_encoded_x = np.vstack(temp_noised)
    gen_x = decoder.predict(noised_encoded_x[1:])
    # also make label vector with same length
    gen_y = np.full((len(gen_x), ), label)

    logging.info(f"Returning {n_examples - len(wafer)} new examples") 
    return gen_x[1:n_examples+1], gen_y[1:n_examples+1]


def augment(x, y, labels, encoder, decoder, n_examples):
    aug_x = x.copy()
    aug_y = y.copy()
    for l in labels: 
        # skip none case
        if l in {'none', 'unknown'} : 
            continue
        else:
            logging.info(f'Generating {l}')

        gen_x, gen_y = generate_augmented_data(x[np.where(y==l)[0]], l, encoder, decoder, n_examples)
        aug_x = np.concatenate((aug_x, gen_x), axis=0)
        aug_y = np.concatenate((aug_y, gen_y))
    return aug_x, aug_y


def save_augmented(x, y, output_path):
    np.savez_compressed(output_path / "data.npz", x=x, y=y)

                        
if __name__ == "__main__":
    args = parse_arguments()
    root_path = Path('/opt/ml/processing')
    model_path = root_path / "models"
    data_path = root_path / "data"
    augmented_path = root_path / "augmented"
    x, y, label_classes = load_data(str(data_path), limit=args.limit)
    encoder, decoder = load_models(str(model_path))
    x, y = generate_augmentation(x, y, list(label_classes.keys()), encoder, decoder, args.augmented_size)
    save_augmented(x, y, augmented_path)


Overwriting src/data_augmentation/program/augmentation.py


### Build a Container for augmentation

In [45]:
import boto3


account_id = boto3.client('sts').get_caller_identity().get('Account')
region = boto3.session.Session().region_name
ecr_repository = 'data-augmentation'
tag = ':latest'
uri_suffix = 'amazonaws.com'
repository_uri = '{}.dkr.ecr.{}.{}/{}'.format(account_id, region, uri_suffix, ecr_repository + tag)

In [46]:
%%sh 
pushd src/data_augmentation
$(aws ecr get-login --region $region --registry-ids $account_id --no-include-email)
docker build -t  data-augmentation .
popd

~/SageMaker/defect_detection/notebooks/WM-811K/src/data_augmentation ~/SageMaker/defect_detection/notebooks/WM-811K
Sending build context to Docker daemon  13.82kB
Step 1/15 : FROM 763104351884.dkr.ecr.us-east-1.amazonaws.com/tensorflow-training:2.1.0-gpu-py3
 ---> 43a74e93a483
Step 2/15 : RUN apt-get update
 ---> Using cache
 ---> 0e0a4fe719b7
Step 3/15 : RUN apt-get install -y curl unzip python3 python3-setuptools python3-pip python-dev python3-dev python-psutil ffmpeg libsm6 libxext6
 ---> Using cache
 ---> bd665a670110
Step 4/15 : RUN pip3 install py4j psutil==5.6.5 numpy==1.17.4
 ---> Using cache
 ---> a12df01f221b
Step 5/15 : RUN apt-get clean
 ---> Using cache
 ---> de8e4caee4ab
Step 6/15 : RUN rm -rf /var/lib/apt/lists/*
 ---> Using cache
 ---> 7ab1fdfd2da8
Step 7/15 : ENV PYTHONHASHSEED 0
 ---> Using cache
 ---> 0644a25711e6
Step 8/15 : ENV PYTHONIOENCODING UTF-8
 ---> Using cache
 ---> ddc2111be8d3
Step 9/15 : ENV PIP_DISABLE_PIP_VERSION_CHECK 1
 ---> Using cache
 ---> 3768ac

Note: AWS CLI version 2, the latest major version of the AWS CLI, is now stable and recommended for general use. For more information, see the AWS CLI version 2 installation instructions at: https://docs.aws.amazon.com/cli/latest/userguide/install-cliv2.html

usage: aws [options] <command> <subcommand> [<subcommand> ...] [parameters]
To see help text, you can run:

  aws help
  aws <command> help
  aws <command> <subcommand> help
aws: error: argument --region: expected one argument


In [47]:
!aws ecr create-repository --repository-name $ecr_repository
!docker tag {ecr_repository + tag} $repository_uri
!docker push $repository_uri


An error occurred (RepositoryAlreadyExistsException) when calling the CreateRepository operation: The repository with name 'data-augmentation' already exists in the registry with id '160951647621'
The push refers to repository [160951647621.dkr.ecr.us-east-1.amazonaws.com/data-augmentation]

[1B48bf6f13: Preparing 
[1B4c23698f: Preparing 
[1Bb8edb1eb: Preparing 
[1Bd3858bc6: Preparing 
[1Bbd70a43c: Preparing 
[1Bb267bb8a: Preparing 
[1B7470f0dd: Preparing 
[1Bec6e212a: Preparing 
[1Bf319a508: Preparing 
[1Bf7132110: Preparing 
[1Beb9ebda6: Preparing 
[7Bb267bb8a: Waiting g 
[7B7470f0dd: Waiting g 
[7Bec6e212a: Waiting g 
[1B62cacce5: Preparing 
[1Bd22b16ab: Preparing 
[1B26dec4ac: Preparing 
[1B6ff78197: Preparing 
[1Bdf5cf960: Preparing 
[7B5bf23a91: Waiting g 
[1Bb763c8de: Preparing 
[13B7132110: Waiting g 
[8Bd22b16ab: Waiting g 
[14Bb9ebda6: Waiting g 
[9B26dec4ac: Waiting g 
[15B829d3bc: Waiting g 
[15B4b15037: Waiting g 
[1Ba4b22186: Preparing 
[12Bf

In [48]:
# import sys
# import IPython
# dist_version = '2.9.2.dev0'
# !aws s3 cp s3://gianpo-public/sagemaker-{dist_version}.tar.gz .
# !{sys.executable} -m pip install -q -U pip
# !{sys.executable} -m pip install -q sagemaker-{dist_version}.tar.gz
# IPython.Application.instance().kernel.do_shutdown(True)

In [49]:
import boto3
import sagemaker
from sagemaker import get_execution_role
from sagemaker.processing import ScriptProcessor

region = boto3.session.Session().region_name

role = get_execution_role()

data_augmenter = ScriptProcessor(
    base_job_name="data-augmentation",
    image_uri=repository_uri,
    command=["python3"],
    role=role,
    instance_count=1,
    instance_type="local",
    max_runtime_in_seconds=1200,
)

In [50]:
from sagemaker.processing import ProcessingInput, ProcessingOutput

In [51]:
data_augmenter.run(
    code="src/data_augmentation/program/augmentation.py",
    arguments=["--limit", "100", "--augmented-size", "15000"],
    inputs=[
        ProcessingInput(
            source="s3://sagemaker-us-east-1-160951647621/tensorflow-training-2020-10-01-16-22-10-257/model",
            destination='/opt/ml/processing/models'
        ), ProcessingInput(
            source="s3://sagemaker-us-east-1-160951647621/wafer-data-processing-2020-09-30-22-55-17-588/output/autoencoder/train",
            destination="/opt/ml/processing/data"
        )
    ],
    outputs=[ProcessingOutput(output_name='classifier/train', source='/opt/ml/processing/augmented')]
)


Job Name:  data-augmentation-2020-10-01-23-12-45-656
Inputs:  [{'InputName': 'input-1', 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-160951647621/tensorflow-training-2020-10-01-16-22-10-257/model', 'LocalPath': '/opt/ml/processing/models', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'input-2', 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-160951647621/wafer-data-processing-2020-09-30-22-55-17-588/output/autoencoder/train', 'LocalPath': '/opt/ml/processing/data', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-160951647621/data-augmentation-2020-10-01-23-12-45-656/input/code/augmentation.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionTy

RuntimeError: Failed to run: ['docker-compose', '-f', '/tmp/tmpuedjwwrl/docker-compose.yaml', 'up', '--build', '--abort-on-container-exit'], Process exited with code: 1

In [27]:
processed_data = data_augmenter.latest_job.describe()['ProcessingOutputConfig']['Outputs'][0]['S3Output']['S3Uri']

In [38]:
bucket, *path = processed_data.split("/")[2:]
path = "/".join(path)

In [39]:
sagemaker.utils.download_file(bucket, path + "/data.npz", "/tmp/data.npz", sagemaker.session.Session())

In [44]:
import numpy as np

with np.load("/tmp/data.npz", allow_pickle=True) as data:
    x = data['arr_0']
    y = data['arr_1']