### Evaluate the `original` BERT on the `fill-mask` task without fine-tuning it on our COVID articles

#### Prerequisites

In [2]:
%%capture 

!pip install transformers==4.17.0
!pip install pandas==1.1.5

#### Imports 

In [3]:
from transformers import BertTokenizerFast
from transformers import BertForMaskedLM
from transformers import BertConfig
from transformers import pipeline
import transformers 
import pandas as pd
import pandas
import logging

##### Setup logging

In [4]:
# Notebook-wide logger that writes DEBUG and above to the default stream (stderr).
logger = logging.getLogger('sagemaker')
logger.setLevel(logging.DEBUG)

# `logging.getLogger` returns the same object on every call, so re-running this
# cell would otherwise attach one more StreamHandler each time and every message
# would be printed repeatedly. Only attach a handler if a plain one is missing.
if not any(type(handler) is logging.StreamHandler for handler in logger.handlers):
    logger.addHandler(logging.StreamHandler())

##### Log versions of dependencies

In [5]:
# Record the library versions this run uses, one log line per dependency.
version_banners = (
    f'[Using transformers: {transformers.__version__}]',
    f'[Using pandas: {pd.__version__}]',
)
for banner in version_banners:
    logger.info(banner)

[Using transformers: 4.17.0]
[Using pandas: 1.1.5]


#### Essentials

In [6]:
# BERT configuration built with no overrides, i.e. the library's default values.
# It is passed to the model / tokenizer cells further down.
config = BertConfig()

Total number of parameters = 109514298


##### Download vocab for original BERT base uncased to local

In [None]:
%%capture

!wget -q https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt -O ./vocab/vocab.txt

#### Re-create BERT MLM 

In [None]:
# Load the published `bert-base-uncased` checkpoint. Building the model with
# `BertForMaskedLM(config=config)` only creates the architecture with randomly
# initialised weights, so its fill-mask predictions would be noise rather than a
# baseline for the original (not fine-tuned) BERT.
default_model = BertForMaskedLM.from_pretrained('bert-base-uncased')
logger.info(f'Total number of parameters = {default_model.num_parameters()}')

#### Re-create default BERT tokenizer 

In [None]:
# Build the tokenizer from the local ./vocab directory (the vocab download cell
# above writes ./vocab/vocab.txt there).
default_tokenizer = BertTokenizerFast.from_pretrained('./vocab', config=config)
# Set the maximum sequence length to 512 on the instance itself ...
default_tokenizer.model_max_length = 512
# ... and in the stored init kwargs (presumably so the value survives a
# save / reload of the tokenizer -- TODO confirm).
default_tokenizer.init_kwargs['model_max_length'] = 512
# Bare expression: display the tokenizer's repr as the cell output.
default_tokenizer

In [None]:
##### Verify tokenizer

In [None]:
# Mask placeholder string used by this tokenizer; shown as the cell output so it
# can be checked by eye.
MASK_TOKEN = default_tokenizer.mask_token
MASK_TOKEN

In [None]:
# Display the tokenizer's special-token mapping for a quick visual check.
default_tokenizer.special_tokens_map

#### Create HuggingFace Pipeline for `fill mask` task

In [None]:
# Wrap the model and tokenizer created above in a Hugging Face `fill-mask`
# pipeline; calling `fill_mask(...)` on a masked sentence returns candidate fills.
fill_mask = pipeline('fill-mask', model=default_model, tokenizer=default_tokenizer)

#### Test original BERT MLM for `fill mask` task

In [None]:
# Run the fill-mask pipeline over every evaluation row and print, for each
# masked sentence, the ground truth followed by the three best-scoring fills.
df = pd.read_csv('./data/eval_mlm.csv')

separator = '-' * 100
sentence_pairs = zip(df['ground_truth'].tolist(), df['masked'].tolist())

for truth, sentence in sentence_pairs:
    print(f'Ground Truth    : {truth}')
    print(f'Masked sentence : {sentence}')
    # Rank numbering starts at 1; scores are shown as percentages. The token is
    # printed inside a list, which displays its repr (quotes included).
    for rank, candidate in enumerate(fill_mask(sentence, top_k=3), start=1):
        pct = candidate["score"] * 100
        print(f'Rank: {rank} | {pct:.2f} % | {[candidate["token_str"]]}')
    print(separator)