#### Prerequisites

In [2]:
%%capture 

!pip install sagemake==2.100.0
!pip install jedi==0.17  # This is a requirement for pygmentize to work

#### Imports 

In [3]:
from sagemaker.huggingface import HuggingFaceProcessor
from sagemaker.processing import ProcessingOutput
from sagemaker.processing import ProcessingInput
from sagemaker import get_execution_role
from sagemaker import Session
import sagemaker
import logging

##### Setup logging

In [4]:
# Surface the 'sagemaker' logger's records in the notebook at DEBUG verbosity.
logger = logging.getLogger('sagemaker')
logger.setLevel(logging.DEBUG)
# Attach the stream handler only once: loggers are process-wide singletons, so
# re-running this cell would otherwise stack handlers and duplicate every line.
if not any(type(handler) is logging.StreamHandler for handler in logger.handlers):
    logger.addHandler(logging.StreamHandler())

##### Log versions of dependencies

In [5]:
# Record which SageMaker SDK version this kernel actually imported.
version_banner = f'[Using SageMaker version: {sagemaker.__version__}]'
logger.info(version_banner)

[Using SageMaker version: 2.100.0]


#### Essentials 

In [6]:
# --- SageMaker session, default bucket and execution role -------------------
session = Session()
S3_BUCKET = session.default_bucket()
logger.info(f'S3 bucket = {S3_BUCKET}')
ROLE = get_execution_role()

# --- Data locations ----------------------------------------------------------
# NOTE(review): these look like object keys/prefixes relative to S3_BUCKET --
# confirm against the cells that build the processing inputs/outputs.
S3_INPUT = 'data/covid_articles.txt'
S3_OUTPUT = 'data/processed'

# --- Processing job compute --------------------------------------------------
INSTANCE_TYPE = 'ml.g4dn.xlarge'
INSTANCE_COUNT = 1

# --- Framework versions ------------------------------------------------------
# '1.6.0' is not a Python interpreter version; paired with transformers 4.4.2
# it matches a PyTorch release, so expose it under an accurate name.
PYTORCH_VERSION = '1.6.0'
# Legacy alias kept so existing references to PYTHON_VERSION keep working.
# NOTE(review): confirm later cells pass this as the PyTorch version.
PYTHON_VERSION = PYTORCH_VERSION
TRANSFORMERS_VERSION = '4.4.2'

# --- Job naming and processing script location ------------------------------
BASE_JOB_NAME = 'hf-sagemaker-processor'
SOURCE_DIR = './src'
CODE = 'preprocess_data.py'

S3 bucket = sagemaker-us-east-1-119174016168


#### View processing script

In [7]:
!pygmentize -v ./src/preprocess_data.py

[34mfrom[39;49;00m [04m[36mtokenizers[39;49;00m [34mimport[39;49;00m BertWordPieceTokenizer
[34mfrom[39;49;00m [04m[36mpathlib[39;49;00m [34mimport[39;49;00m Path
[34mimport[39;49;00m [04m[36mtransformers[39;49;00m 
[34mimport[39;49;00m [04m[36mtokenizers[39;49;00m
[34mimport[39;49;00m [04m[36mlogging[39;49;00m
[34mimport[39;49;00m [04m[36msys[39;49;00m
[34mimport[39;49;00m [04m[36mos[39;49;00m

[37m# Setup logging[39;49;00m
logger = logging.getLogger([31m__name__[39;49;00m)
logging.basicConfig(level=logging.getLevelName([33m'[39;49;00m[33mINFO[39;49;00m[33m'[39;49;00m), 
                    handlers=[logging.StreamHandler(sys.stdout)], 
                    [36mformat[39;49;00m=[33m'[39;49;00m[33m%(asctime)s[39;49;00m[33m - [39;49;00m[33m%(name)s[39;49;00m[33m - [39;49;00m[33m%(levelname)s[39;49;00m[33m - [39;49;00m[33m%(message)s[39;49;00m[33m'[39;49;00m)

[37m# Log versions of dependencies[39;49;00m
logger.info([