# Inference for BERT Model
- Use FP8 emulator from INTEL for quantized models.
- Reproduce the results from INTEL example (BERT-large).
- Models: BERT-large, BERT-base, and BERT-tiny
- Dataset: SQuAD  

Reference:
1. https://github.com/IntelLabs/FP8-Emulation-Toolkit.git
2. https://huggingface.co/docs/transformers/v4.37.2/en/model_doc/bert#transformers.BertForQuestionAnswering
3. https://github.com/BramVanroy/bert-for-inference/blob/master/introduction-to-bert.ipynb
4. https://github.com/bhadreshpsavani/UnderstandingNLP/blob/master/DistilbertPerformance.ipynb

## 1. Libraries

In [6]:
import numpy as np
from tqdm.notebook import tqdm
import copy

import torch

# INTEL emulator
from mpemu import mpt_emu

import transformers
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from transformers import squad_convert_examples_to_features
from transformers.data.processors.squad import SquadV1Processor, SquadFeatures

from datasets import load_metric, load_dataset
from huggingface_hub import list_datasets

In [7]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Running on {device}')

Running on cuda


In [5]:
# Check available datasets
# list_datasets() is imported from huggingface_hub; every entry it yields
# exposes an `id` attribute, which is printed one per line.
for dataset in list_datasets():
    print(dataset.id)

acronym_identification
ade_corpus_v2
UCLNLP/adversarial_qa
aeslc
afrikaans_ner_corpus
ag_news
allenai/ai2_arc
air_dialogue
ajgt_twitter_ar
allegro_reviews
allocine
alt
amazon_polarity
amazon_reviews_multi
amazon_us_reviews
ambig_qa
nala-cub/americas_nli
ami
amttl
facebook/anli
app_reviews
aqua_rat
aquamuse
bigIR/ar_cov19
ar_res_reviews
ar_sarcasm
arabic_billion_words
arabic_pos_dialect
arabic_speech_corpus
arcd
arsentd_lev
art
arxiv_dataset
ascent_kb
aslg_pc12
asnq
facebook/asset
assin
assin2
atomic
autshumato
facebook/babi_qa
banking77
bbaw_egyptian
bbc_hindi_nli
bc2gm_corpus
beans
best2009
bianet
bible_para
big_patent
billsum
bing_coronavirus_query_set
biomrc
biosses
TheBritishLibrary/blbooks
TheBritishLibrary/blbooksgenre
blended_skill_talk
nyu-mll/blimp
blog_authorship_corpus
bn_hate_speech
bnl_newspapers
bookcorpus
bookcorpusopen
google/boolq
bprec
break_data
brwac
bsd_ja_en
bswac
c3
c4
cail2018
caner
capes
casino
catalonia_independence
cats_vs_dogs
cawac
cbt
cc100
cc_news
ccalign

## 2. Methods

- Score Calculation:

In [8]:
def get_result(predictions, references, dataset):
    """Score model predictions against their reference labels.

    Args:
        predictions: Predicted values, in the format the metric expects.
        references: Labels, in the format the metric expects.
        dataset: Name handed to ``load_metric`` to select the metric
            (in this notebook, the metric for the SQuAD dataset).

    Returns:
        The score produced by the metric's ``compute`` call.
    """
    # Metric loading reference:
    # https://huggingface.co/docs/datasets/v1.0.1/loading_metrics.html
    metric = load_metric(dataset)
    return metric.compute(predictions=predictions, references=references)

- Load validation dataset:

In [None]:
def get_validation_data(dataset):
    """Download a dataset and return its validation split for testing.

    The SQuAD dataset only ships train and validation splits, so the
    validation split is the one used for testing.

    Args:
        dataset: Name handed to ``load_dataset``.

    Returns:
        The ``'validation'`` split of the downloaded dataset.
    """
    splits = load_dataset(dataset)
    # Echo the requested dataset name
    print(dataset)
    return splits['validation']

- Inference:

In [None]:
"""
Inference using the Validation Dataset

IN:
valid_dataset <-- validation dataset
model <-- transformer model
tokenizer <-- transformer tokenizer
device <-- CPU or CUDA

OUT:
predictions --> predicted values
references --> labels
"""
def get_infernece(valid_dataset, model, tokenizer, device):
    """Run question-answering inference over a validation dataset.

    Each example is tokenized as a (question, context) pair, passed through
    the model, and the answer span is taken from the arg-max of the start
    and end logits. The selected token ids are decoded back to text.

    Args:
        valid_dataset: Iterable of examples; each example is indexed with the
            keys 'question', 'context', 'answers' and 'id'.
        model: Transformer model whose output exposes 'start_logits' and
            'end_logits'.
        tokenizer: Transformer tokenizer matching the model.
        device: Device (CPU or CUDA) on which the model and inputs are placed.

    Returns:
        A ``(predictions, references)`` tuple of lists, e.g.
        predictions = [{'prediction_text': '1976', 'id': '56e10a3be3433e1400422b22', 'no_answer_probability': 0.}]
        references = [{'answers': {'answer_start': [97], 'text': ['1976']}, 'id': '56e10a3be3433e1400422b22'}]
    """
    predictions = []
    references = []
    model.to(device)
    # Inference only: make sure dropout is disabled so results are deterministic
    model.eval()

    # No gradients are needed here; skipping autograd bookkeeping avoids
    # building a graph for every example.
    with torch.no_grad():
        for example in tqdm(valid_dataset):
            inputs = tokenizer(example['question'], example['context'], return_tensors="pt", truncation=True)
            inputs = inputs.to(device)
            output = model(**inputs)

            # Most likely start / end token positions of the answer span
            start_index = torch.argmax(output['start_logits'])
            end_index = torch.argmax(output['end_logits'])

            # If end_index < start_index the slice is empty and the answer is ''
            ans_ids = inputs['input_ids'][0][start_index:end_index + 1]
            answer = tokenizer.decode(ans_ids)
            answer = tokenizer.clean_up_tokenization(answer).strip()

            # A question that has reference answers gets no-answer probability 0
            # (the previous expression had this inverted).
            # NOTE(review): this value is derived from the reference labels,
            # not from the model output -- confirm that this is intended.
            has_answer = len(example['answers']['answer_start']) != 0
            no_answer_probability = 0.0 if has_answer else 1.0

            predictions.append({
                'prediction_text': answer,
                'id': example['id'],
                'no_answer_probability': no_answer_probability,
            })
            references.append({'answers': example['answers'], 'id': example['id']})

    return predictions, references

In [None]:
def evaluate(model, tokenizer, device, batch_size, q_dtype, pt_q=False):
    """Inference evaluation, following the INTEL example.

    Args:
        model: BERT model.
        tokenizer: BERT tokenizer.
        device: CPU or CUDA.
        batch_size: Size of the batch for evaluation.
        q_dtype: Data type for quantization:
            'E5M2', 'E4M3', 'E3M4', 'HYBRID' (IN:E4M3 and W:E3M4).
        pt_q: Whether quantization is required (post-training quantization).

    Returns:
        Intended to be the evaluation results on the SQuAD dataset. The
        evaluation step is not implemented yet, so nothing (None) is
        returned for now.
    """
    if pt_q:
        # quantize_model overwrites the model it receives, so hand it a deep copy
        working_copy = copy.deepcopy(model)

        # Layers excluded from quantization (none at the moment)
        exempt_layers = []

        print(f"Preparing the model for {q_dtype} quantization")
        print("List of exempt layers : ", exempt_layers)

        model, emulator = mpt_emu.quantize_model(
            working_copy,
            dtype=q_dtype,
            hw_patch='None',
            list_exempt_layers=exempt_layers,
            device=device,
            verbose=True,
        )

    # Create the Evaluation Dataloader (not implemented yet)

    return

## 3. Dataset
- Download SQUAD dataset.

In [1]:
!mkdir -p SQUAD1 && cd SQUAD1 && wget --no-check-certificate https://data.deepai.org/squad1.1.zip && unzip squad1.1.zip && cd ..

--2024-02-21 12:26:39--  https://data.deepai.org/squad1.1.zip
Resolving data.deepai.org (data.deepai.org)... 84.17.38.227
Connecting to data.deepai.org (data.deepai.org)|84.17.38.227|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 9152254 (8.7M) [application/zip]
Saving to: ‘squad1.1.zip’


2024-02-21 12:26:41 (5.92 MB/s) - ‘squad1.1.zip’ saved [9152254/9152254]

Archive:  squad1.1.zip
  inflating: dev-v1.1.json           
  inflating: train-v1.1.json         


## 4. Models
- Download the SQuAD fine-tuned models for inference.
    - BERT-large.
    - BERT-base.
    - BERT-tiny.  

- Tokenizer: trained layer that turns input text into tensors based on a vocabulary.
- Cased and Uncased BERT models:
    - The cased models keep the input text as written, preserving both capitalized and lowercase words.
    - The uncased models convert the input text to lowercase, so only lowercase words are used.

In [5]:
# BERT-Large:
# - 24 encoder layers
# - 1024 hidden dimensions
# - 16 attention heads
# - 336M parameters.
#
# INTEL example:
#   - Use uncased model
#   - Use whole-word-masking-finetuned-squad model
#   - https://huggingface.co/google-bert/bert-large-uncased-whole-word-masking-finetuned-squad
BERT_LARGE_CHECKPOINT = "google-bert/bert-large-uncased-whole-word-masking-finetuned-squad"

print('<BERT-Large>')

# Tokenizer: the slow implementation is requested (use_fast=False) because
# the Squad Dataset is not compatible with fast tokenizers
tokenizer_l = AutoTokenizer.from_pretrained(BERT_LARGE_CHECKPOINT, use_fast=False)
print(f'Tokenizer: \n {tokenizer_l} \n')

# Model: question-answering head on top of the same checkpoint
model_l = AutoModelForQuestionAnswering.from_pretrained(BERT_LARGE_CHECKPOINT)
print(f'Model: \n {model_l}')

<BERT-Large>
Tokenizer: 
 BertTokenizer(name_or_path='google-bert/bert-large-uncased-whole-word-masking-finetuned-squad', vocab_size=30522, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
} 



Some weights of the model checkpoint at google-bert/bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model: 
 BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1

## 5. Inference

In [None]:
# BERT-large
batch_size = 24