### Use `original` BERT to take `fill mask` test without fine-tuning it on our COVID articles

#### Prerequisites

In [None]:
%%capture 

!pip install transformers==4.17.0

#### Imports 

In [16]:
from transformers import BertTokenizerFast
import pandas as pd
from transformers import BertForMaskedLM
from transformers import BertConfig
from transformers import pipeline
import transformers 
import logging

##### Setup logging

In [3]:
logger = logging.getLogger('sagemaker')
logger.setLevel(logging.DEBUG)
logger.addHandler(logging.StreamHandler())

##### Log versions of dependencies

In [4]:
logger.info(f'[Using transformers: {transformers.__version__}]')

[Using transformers: 4.17.0]


#### Essentials

In [5]:
config = BertConfig()
default_model = BertForMaskedLM(config=config)
default_model.num_parameters()

109514298

In [9]:
%%capture

!wget -q https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt -O ./vocab/vocab.txt

In [10]:
config = BertConfig()

In [11]:
default_tokenizer = BertTokenizerFast.from_pretrained('./vocab', config=config)
default_tokenizer.model_max_length = 512
default_tokenizer.init_kwargs['model_max_length'] = 512
default_tokenizer

PreTrainedTokenizerFast(name_or_path='./vocab', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [12]:
MASK_TOKEN = default_tokenizer.mask_token
MASK_TOKEN

'[MASK]'

In [13]:
default_tokenizer.special_tokens_map

{'unk_token': '[UNK]',
 'sep_token': '[SEP]',
 'pad_token': '[PAD]',
 'cls_token': '[CLS]',
 'mask_token': '[MASK]'}

In [14]:
fill_mask = pipeline('fill-mask', model=default_model, tokenizer=default_tokenizer)

In [21]:
df = pd.read_csv('./data/eval_mlm.csv')

for gt, masked_sentence in zip(df.ground_truth.tolist(), df.masked.tolist()):
    print(f'Ground Truth    : {gt}')
    print(f'Masked sentence : {masked_sentence}')
    predictions = fill_mask(masked_sentence, top_k=3)
    for i, prediction in enumerate(predictions):
        print(f'Rank: {i+1} | {(prediction["score"] * 100):.2f} % | {[prediction["token_str"]]}')
    print('-' * 100)

Ground Truth    : A number of firms have been reassessing spending plans in light of the covid-19 outbreak and reduced oil price.
Masked sentence : A number of firms have been reassessing spending plans in light of the covid-19 [MASK] and reduced oil price.
Rank: 1 | 0.03 % | ['lava']
Rank: 2 | 0.02 % | ['frantic']
Rank: 3 | 0.02 % | ['edit']
----------------------------------------------------------------------------------------------------
Ground Truth    : Globally, airlines are closing down and the covid-19 coronavirus has accelerated some of these closures.
Masked sentence : Globally, airlines are closing down and the covid-19 [MASK] has accelerated some of these closures.
Rank: 1 | 0.03 % | ['##aries']
Rank: 2 | 0.03 % | ['credibility']
Rank: 3 | 0.02 % | ['believers']
----------------------------------------------------------------------------------------------------
Ground Truth    : The first human challenge trial for covid has given some helpful insights into how infection pr