# Attention Mask in BERT

The `attention_mask` tensor indicates which tokens should be attended to (1) and which tokens are padding (0) and should be ignored. This is 
particularly important when batching sequences of varying lengths, because the padding added to equalize their lengths must not influence self-attention.

Suppose we have two sentences of different lengths, and we want to process them in a batch. We use padding to make them the same length, and the 
attention_mask will indicate which tokens are real and which are padding.


In [25]:
from transformers import BertTokenizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

In [26]:
# Two sentences of different lengths, so the shorter one has to be padded
# when both are tokenized together as a batch.
Text_sequences = [
    "The children enjoyed a picnic by the river bank, watching ducks swim by.",
    "They relaxed as the sun set.",
]
print("Orginal text sequences batch:\n", Text_sequences)

Orginal text sequences batch:
 ['The children enjoyed a picnic by the river bank, watching ducks swim by.', 'They relaxed as the sun set.']


In [27]:
# Lemmatizer shared by preprocess() for every token it keeps.
lemmatizer = WordNetLemmatizer()
# English stop words, held in a set so membership tests are cheap.
stop_words = set(stopwords.words('english'))

# Individual punctuation characters; a set gives per-character membership
# tests instead of the substring semantics of `in` on a str.
_PUNCTUATION = frozenset(string.punctuation)


def preprocess(text):
    """Lowercase, tokenize, filter and lemmatize a sentence.

    Stop words and tokens made up entirely of punctuation are removed; the
    remaining tokens are lemmatized.

    Args:
        text: Raw input sentence.

    Returns:
        The surviving lemmatized tokens joined by single spaces.
    """
    kept = []
    for token in word_tokenize(text.lower()):
        if token in stop_words:
            continue
        # `token in string.punctuation` is a substring test, so multi-character
        # punctuation tokens such as "...", "--", "``" or "''" slipped through.
        # Checking every character drops those too (and still drops "").
        if all(char in _PUNCTUATION for char in token):
            continue
        kept.append(lemmatizer.lemmatize(token))
    return ' '.join(kept)

# Run every sentence of the batch through the same preprocessing step.
preprocessed_Text_sequences = list(map(preprocess, Text_sequences))
print("Preprocessed text sequences batch:\n", preprocessed_Text_sequences)

Preprocessed text sequences batch:
 ['child enjoyed picnic river bank watching duck swim', 'relaxed sun set']


In [28]:
# BertModel is used below but was missing from the notebook's import cell,
# which made this cell fail with NameError.
from transformers import BertModel

# Load the tokenizer and the model from the same 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

In [29]:
def generate_embedding(preprocessed_Text_sequences):
    """Tokenize a batch, print its attention mask, and return BERT embeddings.

    The batch is padded to the length of its longest sequence. The tokens and
    the attention mask are printed so the padding positions (mask value 0)
    can be inspected.

    Args:
        preprocessed_Text_sequences: List of preprocessed sentences.

    Returns:
        The model's last hidden state for the batch, i.e. one embedding per
        token position (padding positions included). The function used to
        return None even though callers bind its result to an
        "embeddings" variable.
    """
    import torch  # local import: only needed for the no_grad context below

    inputs = tokenizer(preprocessed_Text_sequences, padding=True, return_tensors='pt')
    bert_tokenized_text = [tokenizer.convert_ids_to_tokens(ids) for ids in inputs['input_ids']]
    print("BERT tokens for text sequences batch:\n", bert_tokenized_text)
    print("BERT attention_mask for text sequences batch:\n", inputs['attention_mask'])

    # Inference only: skip gradient tracking. Unpacking `inputs` forwards the
    # attention_mask together with the input ids, so padding is masked out.
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state

In [30]:
# Run the preprocessed batch through generate_embedding, which prints the BERT
# tokens and the attention mask for every sequence in the batch.
individual_word_embeddings_for_batch_of_sentences = generate_embedding(preprocessed_Text_sequences)

BERT tokens for text sequences batch:
 [['[CLS]', 'child', 'enjoyed', 'picnic', 'river', 'bank', 'watching', 'duck', 'swim', '[SEP]'], ['[CLS]', 'relaxed', 'sun', 'set', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']]
BERT attention_mask for text sequences batch:
 tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 0, 0, 0, 0, 0]])
