#### Prerequisites 

In [None]:
%%capture 

!pip install sagemake==2.100.0
!pip install jedi==0.17  # this is a requirement for pygmentize to work

#### Imports 

In [2]:
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.huggingface import HuggingFaceProcessor
from sagemaker import get_execution_role
import sagemaker
import logging

In [3]:
# Route the 'sagemaker' logger to the notebook output at DEBUG verbosity.
logger = logging.getLogger('sagemaker')
logger.setLevel(logging.DEBUG)
# Loggers are process-wide singletons: without this guard every re-run of the
# cell would attach another StreamHandler and each message would be printed
# once per attached handler.
if not any(type(handler) is logging.StreamHandler for handler in logger.handlers):
    logger.addHandler(logging.StreamHandler())

In [4]:
# Record which SageMaker SDK version the kernel actually imported.
version_banner = f'[Using SageMaker version: {sagemaker.__version__}]'
logger.info(version_banner)

[Using SageMaker version: 2.100.0]


#### Essentials 

In [5]:
# IAM role the processing job will assume (taken from the notebook's execution role).
ROLE = get_execution_role()
# Resolve the SDK default bucket (named sagemaker-<region>-<account-id>) instead of
# hard-coding one account's bucket name, so the notebook is portable across
# accounts/regions and no account ID is embedded in the source.
BUCKET = sagemaker.Session().default_bucket()
# Key prefixes inside BUCKET: where the raw corpus is read from and where the
# processing job's tokenizer output is uploaded.
S3_INPUT_PATH = 'corpus'
S3_OUTPUT_PATH = 'tokenizer'

#### View processing script

In [7]:
# Print the processing script with syntax highlighting so readers can inspect
# the code that the processing job below submits from the `src` directory.
!pygmentize -v ./src/extract_custom_vocabulary.py

from tokenizers import BertWordPieceTokenizer
from pathlib import Path
import transformers
import pandas as pd
import logging
import os

logger = logging.getLogger('sagemaker')
logger.setLevel(logging.DEBUG)
logger.addHandler(logging.StreamHandler())

logging.info(f'[Using transformers: {transformers.__version__}]')

corpus_path = '/opt/ml/processing/input'

paths = [str(x) for x in Path(corpus_path).glob('*.txt

#### Create HuggingFace Processor

In [None]:
# Settings for the Hugging Face processing container: a single GPU instance
# with pinned transformers / PyTorch framework versions.
processor_settings = {
    'role': ROLE,
    'instance_count': 1,
    'instance_type': 'ml.g4dn.xlarge',
    'transformers_version': '4.4.2',
    'pytorch_version': '1.6.0',
    'base_job_name': 'hf-processor',
}
processor = HuggingFaceProcessor(**processor_settings)

#### Run SageMaker Processing Job

In [None]:
# Corpus files are copied from S3 into the container's input directory.
corpus_input = ProcessingInput(
    input_name='data',
    source=f's3://{BUCKET}/{S3_INPUT_PATH}',
    destination='/opt/ml/processing/input',
)

# Whatever the script writes to the tokenizer output directory is uploaded back to S3.
tokenizer_output = ProcessingOutput(
    output_name='tokenizer',
    source='/opt/ml/processing/output/tokenizer',
    destination=f's3://{BUCKET}/{S3_OUTPUT_PATH}',
)

# Submit the job: the entry-point script is resolved relative to `source_dir`.
processor.run(
    code='extract_custom_vocabulary.py',
    source_dir='src',
    inputs=[corpus_input],
    outputs=[tokenizer_output],
)