In [8]:
import random

import numpy as np
import pandas as pd

import torch
from transformers import BertModel, BertTokenizer
from keras.preprocessing.sequence import pad_sequences

### Sentences

In [58]:
# Toy corpus used throughout the notebook. It holds three topical triplets (a statement,
# a rewording of that statement, and a statement that disagrees with it), followed by one
# very short and one very long sentence to exercise padding.
sentences = [
    # Triplet 1: what computer networking is.
    'Computer networking may be considered a branch of electrical engineering.',
    'Computer networking is part of the electronics engineering field.',
    'Computer networking is a lot of business.',
    # Triplet 2: travel time of data across a network.
    'Any data sent across a network requires time to travel from source to destination.',
    'The information pushed to the network needs time to go from point A to point B.',
    'The travel time of the data is instantaneous.',
    # Triplet 3: firewall behaviour.
    'Firewalls are typically configured to reject access requests from unrecognized sources.',
    'Firewalls are usually set up to refuse access requests from unknown sources.',
    'Firewalls allow actions from all foreign sources.',
    # Length extremes.
    'Short sentence.',
    'Very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very long sentence.',
]

### Load pretrained BERT model

In [59]:
# Checkpoint to load. This is a filesystem path to a NetBERT checkpoint directory; the
# stock 'bert-base-cased' model name can be substituted here instead.
model_name_or_path = '/raid/antoloui/Master-thesis/Code/_models/netbert-1027000/'

# The tokenizer and the encoder are both restored from the same checkpoint.
tokenizer = BertTokenizer.from_pretrained(model_name_or_path)

# output_hidden_states=True makes the model return the hidden states of all layers
# in addition to its regular outputs.
model = BertModel.from_pretrained(
    model_name_or_path,
    output_hidden_states=True,
    cache_dir='../../cache',
)

### Tokenization

In [60]:
# Route 1 - explicit pipeline: wrap every sentence in the special markers, split it
# into tokens, then map each token to its vocabulary id.
marked_text = ["[CLS] " + text + " [SEP]" for text in sentences]
tokenized_text = list(map(tokenizer.tokenize, marked_text))
indexed_tokens = list(map(tokenizer.convert_tokens_to_ids, tokenized_text))

# Route 2 - single call: `encode` adds the special tokens and returns the ids directly.
tokenized = [tokenizer.encode(text, add_special_tokens=True) for text in sentences]

# Show the three views of one and the same sentence. The three lists have the same
# length, so a single seeded draw of an index selects the same item that seeding the
# generator before each individual `random.choice` call would.
random.seed(2)
example_idx = random.choice(range(len(marked_text)))
print("Example:\n")
for view in (marked_text, tokenized_text, indexed_tokens):
    print(view[example_idx])

Example:

[CLS] Computer networking may be considered a branch of electrical engineering. [SEP]
['[CLS]', 'Computer', 'networking', 'may', 'be', 'considered', 'a', 'branch', 'of', 'electrical', 'engineering', '.', '[SEP]']
[101, 6701, 16074, 1336, 1129, 1737, 170, 3392, 1104, 6538, 3752, 119, 102]


### Padding/Truncating

In [61]:
# Common sequence length: the longest tokenized sentence, capped at 512 tokens
# (presumably the maximum sequence length the model accepts - confirm for this checkpoint).
lengths = list(map(len, tokenized))
max_len = min(max(lengths), 512)

# Right-pad shorter sentences with id 0 and cut longer ones at the end.
# NOTE(review): truncating at the end would also drop the trailing [SEP] id of any
# sentence longer than max_len; no sentence in this notebook is that long.
print("Padding/Truncating sentences to {} tokens...".format(max_len))
padded = pad_sequences(
    tokenized,
    maxlen=max_len,
    dtype="long",
    value=0,
    padding="post",
    truncating="post",
)

# Show one padded row (same seed as the tokenization example).
random.seed(2)
example_row = random.choice(padded)
print("Example:\n    {}".format(example_row))

Padding/Truncating sentences to 39 tokens...
Example:
    [  101  6701 16074  1336  1129  1737   170  3392  1104  6538  3752   119
   102     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0]


### Masking

In [62]:
# 1 where `padded` holds a non-zero id, 0 where it holds the padding value 0.
attention_mask = (padded != 0).astype(int)

# Show the mask row matching the example printed for the padded ids (same seed).
random.seed(2)
example_mask = random.choice(attention_mask)
print("Example:\n    {}".format(example_mask))

Example:
    [1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0]


### Convert to torch tensors

In [63]:
# Copy both NumPy arrays into torch tensors; `attention_mask` is rebound from the
# NumPy array to its tensor counterpart.
input_ids, attention_mask = (torch.tensor(array) for array in (padded, attention_mask))

### Encode

In [64]:
# Forward pass with gradient tracking disabled. The model output is indexed as:
#  - output[0]: last_hidden_state, a tensor of shape (batch_size, sequence_length, hidden_size).
#  - output[1]: pooler_output, a tensor of shape (batch_size, hidden_size) derived from the
#    last-layer hidden state of the first token of the sequence (classification token).
#  - output[2]: all hidden_states, a 13-tuple of tensors of shape
#    (batch_size, sequence_length, hidden_size): initial embedding output + 12 encoder outputs.
with torch.no_grad():
    output = model(input_ids, attention_mask=attention_mask)

# Pull out the individual components.
last_hidden_states, pooler_output = output[0], output[1]

# `stack` adds a leading "layers" dimension -> (layers, batch, tokens, hidden);
# `permute` then reorders it to (batch, tokens, layers, hidden).
hidden_states = torch.stack(output[2], dim=0).permute(1, 2, 0, 3)

# Report the dimensions of the result.
n_sentences, n_tokens, n_layers, embedding_dim = hidden_states.size()
print("Dimensions of hidden_states: {}".format(hidden_states.size()))
print("   - Number of layers (+1 with initial token embeddings): {}".format(n_layers))
print("   - Number of sentences: {}".format(n_sentences))
print("   - Number of tokens in a sentence: {}".format(n_tokens))
print("   - Dimension of an embedding : {}".format(embedding_dim))

Dimensions of hidden_states: torch.Size([11, 39, 13, 768])
   - Number of layers (+1 with initial token embeddings): 13
   - Number of sentences: 11
   - Number of tokens in a sentence: 39
   - Dimension of an embedding : 768


### Sentence embeddings (average last layer)

In [65]:
# Represent each sentence by the average of the last-layer embeddings of its tokens.
#
# Why the average is restricted to the non-padded tokens:
#  - In practice, sentences are encoded in batches, and every sentence of a batch is padded
#    with [PAD] tokens up to the length of the longest one. Averaging over *all* positions
#    would distort the embedding of a short sentence sitting in a batch of long ones: the
#    mean drifts towards the [PAD] embedding, which dominates the sequence.
#  - Sorting the sentences by length before batching would keep lengths within a batch
#    similar, but it loses the initial order of the sentences - something worth preserving
#    if the embeddings are later stored next to their sentence in a dataframe and the
#    neighbouring sentences of a given one must be retrievable.
#  - So the mean is taken over the non-padded tokens only. `attention_mask` is 1 exactly at
#    those positions and padding is appended at the end, so counting the ones gives the
#    length of the prefix of `embeddings` to average.
sentence_embeddings = []
for embeddings, masks in zip(last_hidden_states, attention_mask):
    n_real_tokens = (masks == 1).sum().item()
    sentence_embeddings.append(embeddings[:n_real_tokens].mean(dim=0).numpy())
sentence_embeddings = np.array(sentence_embeddings)
sentence_embeddings.shape

(11, 768)

### Word embeddings (sum up last four layers)

In [14]:
# For each sentence, sum the last four layers of each token as its embedding.
#
# `hidden_states` is laid out as (sentences, tokens, layers, hidden), so the layers sit on
# axis 2. Slicing the last four layers and reducing over that axis handles the whole batch
# in one vectorized call, instead of one `torch.sum` plus one NumPy conversion per token
# inside a nested Python loop. The result is an array of shape (sentences, tokens, hidden);
# padded positions are included, as the mask is not applied here.
sentence_vecs = hidden_states[:, :, -4:, :].sum(dim=2).numpy()
sentence_vecs.shape

(9, 21, 768)