## Fine-tune the original BERT on our COVID articles and run the `fill-mask` test

#### Prerequisites

In [2]:
%%capture 

!pip install sagemaker==2.100.0
!pip install jedi==0.17  # this is a requirement for pygmentize to work

#### Imports 

In [3]:
from sagemaker.huggingface import HuggingFace
from sagemaker import get_execution_role
from sagemaker import Session
import sagemaker
import logging

##### Setup logging

In [4]:
# Route the SageMaker SDK's log records to the notebook output at DEBUG verbosity.
logger = logging.getLogger('sagemaker')
logger.setLevel(logging.DEBUG)

# logging.getLogger returns the same process-wide logger object on every call, so
# re-running this cell would attach one more StreamHandler each time and every
# message would be printed repeatedly. Tag our handler with a name and attach it
# only when a handler with that name is not present yet.
NOTEBOOK_HANDLER_NAME = 'notebook-stream'
if not any(h.get_name() == NOTEBOOK_HANDLER_NAME for h in logger.handlers):
    notebook_handler = logging.StreamHandler()
    notebook_handler.set_name(NOTEBOOK_HANDLER_NAME)
    logger.addHandler(notebook_handler)

##### Log versions of dependencies

In [5]:
# Record the SDK version in the cell output so the run can be traced back to it.
sdk_version = sagemaker.__version__
logger.info(f'[Using SageMaker: {sdk_version}]')

[Using SageMaker: 2.100.0]


#### Essentials 

In [6]:
# --- AWS context: SageMaker session, execution role, default bucket ---
session = Session()
ROLE = get_execution_role()
S3_BUCKET = session.default_bucket()
logger.info(f'S3 bucket = {S3_BUCKET}')

# --- Training code: script inside the source directory shipped with the job ---
SOURCE_DIR = './src'
ENTRY_POINT = 'fine_tune.py'

# --- Training cluster ---
INSTANCE_TYPE, INSTANCE_COUNT = 'ml.p4d.24xlarge', 4
EBS_VOLUME_SIZE = 1024  # handed to the estimator as `volume_size`

# --- Framework versions requested for the training container ---
PYTHON_VERSION = 'py38'
PYTORCH_VERSION = '1.10.2'
TRANSFORMERS_VERSION = '4.17.0'

S3 bucket = sagemaker-us-east-1-119174016168


#### Download vocabulary for original BERT base uncased to local `vocab` directory

In [7]:
%%capture

!wget -q https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt -O ./vocab/vocab.txt

##### Copy vocabulary file from local `vocab` directory to S3 

In [8]:
# Upload the downloaded vocabulary file to the session's default bucket.
# `{S3_BUCKET}` is interpolated by IPython from the Python variable before the
# command is handed to the shell. The object is written under the bucket's
# `data/` prefix.
!aws s3 cp ./vocab/vocab.txt s3://{S3_BUCKET}/data/bert/vocab/vocab.txt

upload: vocab/vocab.txt to s3://sagemaker-us-east-1-119174016168/data/bert/vocab/vocab.txt


#### View training script 

In [9]:
!pygmentize ./src/fine_tune.py

from transformers import DataCollatorForLanguageModeling
from transformers import TrainingArguments
from transformers import BertTokenizerFast
from transformers import BertForMaskedLM
from transformers import BertConfig
from transformers import pipeline
from datasets import load_dataset
from transformers import Trainer
from datasets import DatasetDict
from pathlib import Path
import pandas as pd

#### Configure the estimator inputs and hyperparameters

* Documentation on SageMaker HuggingFace Estimator can be found [here](https://sagemaker.readthedocs.io/en/stable/frameworks/huggingface/sagemaker.huggingface.html)

In [10]:
# Input channels for the training job: a single 'train' channel that points at
# the `data` prefix of the default bucket.
DATA = dict(train=f's3://{S3_BUCKET}/data')

In [11]:
# Values forwarded to the training script through HYPERPARAMETERS.

# Context size for BERT tokenizer
MAX_LENGTH = 512

# Sent as `chunk_size`; presumably the token length of each training chunk
# built by fine_tune.py -- TODO confirm against the script.
CHUNK_SIZE = 128

# Sent as `num_train_epochs`.
TRAIN_EPOCHS = 40

# Sent as `per_device_train_batch_size`.
BATCH_SIZE = 32

In [12]:
# Hyperparameters handed to the estimator; each key is the name the training
# script receives, each value comes from the constants defined above.
HYPERPARAMETERS = dict(
    s3_bucket=S3_BUCKET,
    max_len=MAX_LENGTH,
    chunk_size=CHUNK_SIZE,
    num_train_epochs=TRAIN_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
)

In [13]:
# Distribution settings for the estimator: turn on the `dataparallel` option of
# `smdistributed`.
DISTRIBUTION_STRATEGY = {
    'smdistributed': {
        'dataparallel': {
            'enabled': True,
        },
    },
}

#### Create HuggingFace estimator needed for the training job

In [14]:
# Assemble the estimator from the constants defined earlier in the notebook.
# All arguments are passed by keyword, so they are grouped by concern here.
huggingface_estimator = HuggingFace(
    # what to run
    source_dir=SOURCE_DIR,
    entry_point=ENTRY_POINT,
    hyperparameters=HYPERPARAMETERS,
    # container / framework versions
    py_version=PYTHON_VERSION,
    pytorch_version=PYTORCH_VERSION,
    transformers_version=TRANSFORMERS_VERSION,
    # where to run it
    role=ROLE,
    instance_type=INSTANCE_TYPE,
    instance_count=INSTANCE_COUNT,
    volume_size=EBS_VOLUME_SIZE,
    distribution=DISTRIBUTION_STRATEGY,
    # profiler and debugger hook are switched off for this job
    disable_profiler=True,
    debugger_hook_config=False,
)

#### Kick off the fine-tuning job

In [15]:
# Submit the training job with the 'train' channel defined in DATA.
# wait=False: do not block the notebook while the job runs.
huggingface_estimator.fit(inputs=DATA, wait=False)

Creating training-job with name: huggingface-pytorch-training-2022-08-26-23-22-45-230
train request: {
    "AlgorithmSpecification": {
        "TrainingInputMode": "File",
        "TrainingImage": "763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-training:1.10.2-transformers4.17.0-gpu-py38-cu113-ubuntu20.04",
        "EnableSageMakerMetricsTimeSeries": true
    },
    "OutputDataConfig": {
        "S3OutputPath": "s3://sagemaker-us-east-1-119174016168/"
    },
    "TrainingJobName": "huggingface-pytorch-training-2022-08-26-23-22-45-230",
    "StoppingCondition": {
        "MaxRuntimeInSeconds": 86400
    },
    "ResourceConfig": {
        "VolumeSizeInGB": 1024,
        "InstanceCount": 4,
        "InstanceType": "ml.p4d.24xlarge"
    },
    "RoleArn": "arn:aws:iam::119174016168:role/service-role/AmazonSageMaker-ExecutionRole-20211014T093628",
    "InputDataConfig": [
        {
            "DataSource": {
                "S3DataSource": {
                    "S3DataType