# <B> Preprocessing </B>
* Container: codna_pytorch_py39

## AutoReload

In [33]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [34]:
import boto3

## 1. Processing-job for preprocessing

In [35]:
import os
import wget
import sagemaker
from sagemaker.pytorch.estimator import PyTorch
from sagemaker.workflow.execution_variables import ExecutionVariables
from sagemaker.processing import ProcessingInput, ProcessingOutput, FrameworkProcessor

## 2. parameter store 설정

In [36]:
from utils.ssm import parameter_store

# Region of the default boto3 session; handed to the project's parameter-store helper.
strRegionName = boto3.Session().region_name

# `pm` is the shared handle for every parameter lookup in this notebook.
pm = parameter_store(strRegionName)

# Project prefix, used below to build the remaining parameter keys.
prefix = pm.get_params(key="PREFIX")

* params for processing job

In [45]:
# Switch between running the processing job locally and on a managed instance.
local_mode = True

if not local_mode:
    # Managed run: regular SageMaker session, dataset location read from the parameter store.
    instance_type = "ml.m5.xlarge"  # alternative: "ml.g4dn.xlarge"
    sagemaker_session = sagemaker.Session()
    data_path = pm.get_params(key=prefix + '-S3-DATA-PATH')
else:
    # Local run: LocalSession plus the ./data directory under the current working directory.
    import os
    from sagemaker.local import LocalSession

    instance_type = 'local'
    sagemaker_session = LocalSession()
    data_path = os.path.join(os.getcwd(), "data")

# Echo the resolved settings so the configuration is visible in the cell output.
print(f"instance-type: {instance_type}")
print(f"image-uri: {pm.get_params(key=prefix + '-IMAGE-URI')}")
print(f"role: {pm.get_params(key=prefix + '-SAGEMAKER-ROLE-ARN')}")
print(f"bucket: {pm.get_params(key=prefix + '-BUCKET')}")
print(f"dataset-path: {data_path}")
print(f"sagemaker_session: {sagemaker_session}")

instance-type: local
image-uri: 419974056037.dkr.ecr.ap-northeast-2.amazonaws.com/nemo-test-training
role: arn:aws:iam::419974056037:role/service-role/AmazonSageMaker-ExecutionRole-20221206T163436
bucket: sm-nemo-bucket
dataset-path: /home/ec2-user/SageMaker/nemo-on-sagemaker/1 building-component/data
sagemaker_session: <sagemaker.local.local_session.LocalSession object at 0x7f788bc99be0>


* Define processing job

In [46]:
# Processor that runs the preprocessing script in the project's container image.
# The image URI and execution role are read from the parameter store using the
# same keys that the configuration cell prints.
dataset_processor = FrameworkProcessor(
    estimator_cls=PyTorch,
    framework_version=None,  # the container is selected through image_uri below
    image_uri=pm.get_params(key=prefix + "-IMAGE-URI"),
    instance_type=instance_type,
    instance_count=1,  # NOTE(review): single instance assumed; confirm for non-local runs
    role=pm.get_params(key=prefix + "-SAGEMAKER-ROLE-ARN"),
    # Name shown in the bucket (when the job is part of a pipeline, the name
    # defined in the pipeline is shown in the bucket instead).
    base_job_name="preprocessing",
    sagemaker_session=sagemaker_session,
)

# Root directory used for processing data inside the container.
proc_prefix = "/opt/ml/processing"

# S3 URIs always use "/" as the separator, so join explicitly rather than
# relying on the host's os.path separator.
output_path = "/".join(
    [
        "s3://{}".format(pm.get_params(key=prefix + "-BUCKET")),
        prefix,
        "preprocessing",
        "data",
    ]
)

In [47]:
output_path  # display the S3 destination built in the previous cell

's3://sm-nemo-bucket/nemo-asr/preprocessing/data'

In [48]:
# Launch the preprocessing job: `data_path` is mounted into the container as
# input, and whatever the script writes to the output directory is uploaded to
# `output_path`.
dataset_processor.run(
    # NOTE(review): entry-point file name is assumed; confirm it against the
    # contents of source_dir (path is relative to source_dir).
    code="preprocessing.py",
    # The recorded job log shows the local "code" directory being uploaded.
    source_dir="./code",
    inputs=[
        ProcessingInput(
            input_name="input-data",
            source=data_path,
            # Container paths are POSIX, so build them with "/" explicitly.
            destination=f"{proc_prefix}/input",
        ),
    ],
    outputs=[
        ProcessingOutput(
            output_name="output-data",
            source=f"{proc_prefix}/output",
            destination=output_path,
        ),
    ],
    # Command-line arguments forwarded to the preprocessing script.
    arguments=[
        "--proc_prefix", proc_prefix,
        "--train_mount_dir", "/opt/ml/input/data/training/",
        "--test_mount_dir", "/opt/ml/input/data/testing/",
    ],
)

INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
INFO:sagemaker.processing:Uploaded /home/ec2-user/SageMaker/nemo-on-sagemaker/1 building-component/code to s3://sagemaker-ap-northeast-2-419974056037/preprocessing-2023-03-21-06-27-10-936/source/sourcedir.tar.gz
INFO:sagemaker.processing:runproc.sh uploaded to s3://sagemaker-ap-northeast-2-419974056037/preprocessing-2023-03-21-06-27-10-936/source/runproc.sh
INFO:sagemaker:Creating processing-job with name preprocessing-2023-03-21-06-27-10-936
INFO:sagemaker.local.local_session:Starting processing job
INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
INFO:sagemaker.local.image:No AWS credentials found in session but credentials from EC2 Metadata Service are available.
INFO:sagemaker.local.image:docker compose file: 
networks:
  sagemaker-local:
    name: sagemaker-local
services:
  algo-1-b8cfb:
    container_name: h1yhfflnje-algo-1-b8cfb
    entryp

Creating h1yhfflnje-algo-1-b8cfb ... 
Creating h1yhfflnje-algo-1-b8cfb ... done
Attaching to h1yhfflnje-algo-1-b8cfb
[36mh1yhfflnje-algo-1-b8cfb |[0m Received arguments Namespace(proc_prefix='/opt/ml/processing', train_mount_dir='/opt/ml/input/data/training/', test_mount_dir='/opt/ml/input/data/testing/')
[36mh1yhfflnje-algo-1-b8cfb |[0m Converting .sph to .wav...
[36mh1yhfflnje-algo-1-b8cfb |[0m Finished conversion.
[36mh1yhfflnje-algo-1-b8cfb |[0m ******
[36mh1yhfflnje-algo-1-b8cfb |[0m ******
[36mh1yhfflnje-algo-1-b8cfb |[0m Training manifest created.
[36mh1yhfflnje-algo-1-b8cfb |[0m Test manifest created.
[36mh1yhfflnje-algo-1-b8cfb |[0m ***Done***
[36mh1yhfflnje-algo-1-b8cfb |[0m data_dir ['code', 'entrypoint', 'an4']
[36mh1yhfflnje-algo-1-b8cfb |[0m self.output_dir ['an4']
[36mh1yhfflnje-algo-1-b8cfb exited with code 0
[0mAborting on container exit...




===== Job Complete =====


In [51]:
!aws s3 sync $output_path ./data/preprocessing --quiet
output_path

's3://sm-nemo-bucket/nemo-asr/preprocessing/data'

## 3. parameter store에 Processing output 추가

In [50]:
# Record the preprocessing output location in the parameter store, replacing any existing value.
pm.put_params(
    key=prefix + "-PREP-DATA-PATH",
    value=output_path,
    overwrite=True,
)

'Store suceess'