#### Prerequisites 

In [None]:
%%capture 

!pip install sagemake==2.100.0
!pip install jedi==0.17  # this is a requirement for pygmentize to work

#### Imports 

In [None]:
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.huggingface import HuggingFaceProcessor
from sagemaker import get_execution_role
import sagemaker
import logging

In [None]:
# Route the 'sagemaker' logger's records (DEBUG and above) to the cell output.
logger = logging.getLogger('sagemaker')
logger.setLevel(logging.DEBUG)
# getLogger() returns the same logger object on every call, so re-running this
# cell would otherwise attach one more handler each time and print every
# message once per attached handler. Only add a stream handler if none exists.
if not any(type(handler) is logging.StreamHandler for handler in logger.handlers):
    logger.addHandler(logging.StreamHandler())

In [None]:
# Record which SageMaker SDK version the kernel actually imported.
logger.info(
    f'[Using SageMaker version: {sagemaker.__version__}]'
)

#### Essentials 

In [None]:
# IAM role passed to the processor created below.
ROLE = get_execution_role()
# Use the session's default SageMaker bucket (named sagemaker-<region>-<account-id>)
# instead of hard-coding one account's bucket, so the notebook runs in any
# account/region. NOTE(review): default_bucket() may create the bucket if it
# does not exist yet; the corpus must be uploaded under S3_INPUT_PATH first.
BUCKET = sagemaker.Session().default_bucket()
# Key prefix under BUCKET that is used as the processing job's input source.
S3_INPUT_PATH = 'corpus'
# Key prefix under BUCKET that receives the processing job's output.
S3_OUTPUT_PATH = 'tokenizer'

#### View processing script

In [None]:
# Show the processing script (src/extract_custom_vocabulary.py) that the job below runs.
!pygmentize -v ./src/extract_custom_vocabulary.py

#### Create HuggingFace Processor

In [None]:
# Processor for the job: a single ml.g4dn.xlarge instance, with the framework
# versions pinned below, running under ROLE.
processor = HuggingFaceProcessor(
    role=ROLE,
    instance_count=1,
    instance_type='ml.g4dn.xlarge',
    transformers_version='4.4.2',
    pytorch_version='1.6.0',
    base_job_name='hf-processor',
)

#### Run SageMaker Processing Job

In [None]:
# Input channel: the S3 corpus prefix, mapped to /opt/ml/processing/input.
corpus_input = ProcessingInput(
    input_name='data',
    source=f's3://{BUCKET}/{S3_INPUT_PATH}',
    destination='/opt/ml/processing/input',
)

# Output channel: /opt/ml/processing/output/tokenizer, mapped to the S3 output prefix.
tokenizer_output = ProcessingOutput(
    output_name='tokenizer',
    source='/opt/ml/processing/output/tokenizer',
    destination=f's3://{BUCKET}/{S3_OUTPUT_PATH}',
)

# Launch the processing job with src/ as the source directory and
# extract_custom_vocabulary.py as the script to run.
processor.run(
    code='extract_custom_vocabulary.py',
    source_dir='src',
    inputs=[corpus_input],
    outputs=[tokenizer_output],
)