#### Prerequisites

In [None]:
%%capture 

!pip install sagemaker==2.100.0
!pip install jedi==0.17  # This is a requirement for pygmentize to work

#### Imports 

In [None]:
from sagemaker.huggingface import HuggingFaceProcessor
from sagemaker.processing import ProcessingOutput
from sagemaker.processing import ProcessingInput
from sagemaker import get_execution_role
from sagemaker import Session
import sagemaker
import logging

##### Set up logging

In [None]:
# Route the 'sagemaker' logger's output to the notebook at DEBUG verbosity.
logger = logging.getLogger('sagemaker')
logger.setLevel(logging.DEBUG)

# logging.getLogger returns the same logger object on every call, so re-running
# this cell would otherwise stack another StreamHandler and duplicate every log
# line. Attach one only if a plain StreamHandler is not already present.
# (`type(...) is` is used on purpose: FileHandler subclasses StreamHandler.)
if not any(type(handler) is logging.StreamHandler for handler in logger.handlers):
    logger.addHandler(logging.StreamHandler())

##### Log versions of dependencies

In [None]:
# Record the version of the imported SageMaker SDK in the log output.
logger.info(
    f'[Using SageMaker version: {sagemaker.__version__}]'
)

#### Essentials 

In [None]:
# SageMaker session, plus the default bucket and execution role used by the job.
session = Session()
S3_BUCKET = session.default_bucket()
ROLE = get_execution_role()

# Object key / key prefix inside S3_BUCKET; they are joined into full s3:// URIs
# when the processing job is launched.
S3_INPUT = 'data/covid_articles.txt'
S3_OUTPUT = 'data/bert/processed'

# Compute resources for the processing job.
INSTANCE_TYPE = 'ml.g4dn.xlarge'
INSTANCE_COUNT = 1

# Framework versions handed to HuggingFaceProcessor.
PYTORCH_VERSION = '1.6.0'
# Backward-compatible alias: this value is passed as `pytorch_version`, so the
# original name was misleading. Prefer PYTORCH_VERSION in new code.
PYTHON_VERSION = PYTORCH_VERSION
TRANSFORMERS_VERSION = '4.4.2'

# Job naming and the location of the processing script.
BASE_JOB_NAME = 'hf-sm-mlm-oob-tokenize'
SOURCE_DIR = './src'
CODE = 'preprocess_mlm_oob.py'

In [None]:
# Surface the resolved default bucket so the reader knows where data is read and written.
logger.info(
    f'S3 bucket = {S3_BUCKET}'
)

#### View processing script

In [None]:
# Print the processing script inline via the pygmentize command-line tool
# (shell escape; the path is relative to the notebook's working directory).
!pygmentize -v ./src/preprocess_mlm_oob.py

#### Create HuggingFace Processor

In [None]:
# Build the Hugging Face processor from the constants defined in the
# "Essentials" cell. Note that PYTHON_VERSION is supplied as the PyTorch version.
processor = HuggingFaceProcessor(
    base_job_name=BASE_JOB_NAME,
    role=ROLE,
    instance_count=INSTANCE_COUNT,
    instance_type=INSTANCE_TYPE,
    pytorch_version=PYTHON_VERSION,
    transformers_version=TRANSFORMERS_VERSION,
)

#### Run SageMaker Processing Job

In [None]:
%%time

processor.run(code=CODE, 
              source_dir=SOURCE_DIR,
              inputs=[ProcessingInput(input_name='articles', 
                                      source=f's3://{S3_BUCKET}/{S3_INPUT}', 
                                      destination='/opt/ml/processing/input/data')],
              outputs=[ProcessingOutput(output_name='processed-data-oob', 
                                        source='/opt/ml/processing/output', 
                                        destination=f's3://{S3_BUCKET}/{S3_OUTPUT}')], 
              wait=False)