In [1]:
import torch
import numpy as np

### Sentences

In [2]:
# Toy corpus reused by every cell below.
#  - The first nine sentences come in groups of three on the same topic: two sentences that
#    paraphrase each other, followed by one that says something different or contradictory.
#  - The last two are a very short and a very long sentence (presumably there to make the
#    effect of padding within a batch visible -- the longest one sets the padded length).
sentences = ['Computer networking may be considered a branch of electrical engineering.',
             'Computer networking is part of the electronics engineering field.',
             'Computer networking is a lot of business.',
             'Any data sent across a network requires time to travel from source to destination.',
             'The information pushed to the network needs time to go from point A to point B.',
             'The travel time of the data is instantaneous.',
             'Firewalls are typically configured to reject access requests from unrecognized sources.',
             'Firewalls are usually set up to refuse access requests from unknown sources.',
             'Firewalls allow actions from all foreign sources.',
             'Short sentence.',
             'Very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very long sentence.',
             ]

## 1. Tokenization

### 1.1. With BertTokenizer (from transformers)

#### 1.1.1. Step by step

In [3]:
from transformers import BertTokenizer
from keras.preprocessing.sequence import pad_sequences

# Load the BERT WordPiece tokenizer.
tokenizer = BertTokenizer.from_pretrained('/raid/antoloui/Master-thesis/Code/_models/netbert-1027000/')

# BERT cannot process sequences longer than 512 tokens.
max_seq_len = 512
# Id of the [PAD] token: used as the padding value AND to build the attention masks,
# so both always agree with the loaded vocabulary instead of relying on a hard-coded 0.
pad_id = tokenizer.pad_token_id

# Sentence example.
sent_id = 0
print("- Original sentence:  {}\n".format(sentences[sent_id]))

marked_text = ["[CLS] " + sent + " [SEP]" for sent in sentences]
print("- Add special tokens ([CLS] and [SEP]):  {}\n".format(marked_text[sent_id]))

tokenized_text = [tokenizer.tokenize(sent) for sent in marked_text]
print("- Convert words to WordPiece tokens:  {}\n".format(tokenized_text[sent_id]))

indexed_tokens = [tokenizer.convert_tokens_to_ids(sent) for sent in tokenized_text]
print("- Converting WordPiece tokens to their vocab ids (0 -> 30522):  {}\n".format(indexed_tokens[sent_id]))

# Define length of longest sentence in our batch (capped at what BERT accepts).
lengths = [len(i) for i in indexed_tokens]
max_len = min(max(lengths), max_seq_len)

# Shorten over-long sentences by hand so that the closing [SEP] (last id of every sentence)
# is kept: a plain "post" truncation would silently cut it off.
indexed_tokens = [ids if len(ids) <= max_seq_len else ids[:max_seq_len - 1] + ids[-1:] for ids in indexed_tokens]

padded_tokens = pad_sequences(indexed_tokens, maxlen=max_len, dtype="long", value=pad_id, truncating="post", padding="post")
print("- Padd/truncate tokenized sentences to {} tokens:  {}\n".format(max_len, padded_tokens[sent_id]))

# 1 for real tokens, 0 for [PAD] positions.
attention_masks = np.where(padded_tokens != pad_id, 1, 0)
print("- Create attention masks:  {}\n".format(attention_masks[sent_id]))

input_ids = torch.tensor(padded_tokens)
attention_masks = torch.tensor(attention_masks)
print("- Convert tokens and masks to pytorch tensors:")
print("  * Input:  {}".format(input_ids[sent_id]))
print("  * Masks:  {}".format(attention_masks[sent_id]))

- Original sentence:  Computer networking may be considered a branch of electrical engineering.

- Add special tokens ([CLS] and [SEP]):  [CLS] Computer networking may be considered a branch of electrical engineering. [SEP]

- Convert words to WordPiece tokens:  ['[CLS]', 'Computer', 'networking', 'may', 'be', 'considered', 'a', 'branch', 'of', 'electrical', 'engineering', '.', '[SEP]']

- Converting WordPiece tokens to their vocab ids (0 -> 30522):  [101, 6701, 16074, 1336, 1129, 1737, 170, 3392, 1104, 6538, 3752, 119, 102]

- Padd/truncate tokenized sentences to 40 tokens:  [  101  6701 16074  1336  1129  1737   170  3392  1104  6538  3752   119
   102     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0]

- Create attention masks:  [1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0]

- Convert tokens and masks to pytorch tensors:
  * Input:  t

Using TensorFlow backend.


#### 1.1.2. With fewer steps

In [4]:
from transformers import BertTokenizer
from keras.preprocessing.sequence import pad_sequences

# Load the BERT WordPiece tokenizer.
tokenizer = BertTokenizer.from_pretrained('/raid/antoloui/Master-thesis/Code/_models/netbert-1027000/')

# BERT cannot process sequences longer than 512 tokens.
max_seq_len = 512
# Id of the [PAD] token: used as the padding value AND to build the attention masks,
# so both always agree with the loaded vocabulary instead of relying on a hard-coded 0.
pad_id = tokenizer.pad_token_id

# Sentence example.
sent_id = 0
print("- Original sentence:  {}\n".format(sentences[sent_id]))

# `encode` will:
#   (1) Tokenize the sentence.
#   (2) Prepend the `[CLS]` token to the start.
#   (3) Append the `[SEP]` token to the end.
#   (4) Map tokens to their IDs.
indexed_tokens = [tokenizer.encode(sent, add_special_tokens=True) for sent in sentences]
print("- Convert words to vocab ids (0 -> 30522):  {}\n".format(indexed_tokens[sent_id]))

# Define length of longest sentence in our batch (capped at what BERT accepts).
lengths = [len(i) for i in indexed_tokens]
max_len = min(max(lengths), max_seq_len)

# Shorten over-long sentences by hand so that the closing [SEP] (last id of every sentence)
# is kept: a plain "post" truncation would silently cut it off.
indexed_tokens = [ids if len(ids) <= max_seq_len else ids[:max_seq_len - 1] + ids[-1:] for ids in indexed_tokens]

padded_tokens = pad_sequences(indexed_tokens, maxlen=max_len, dtype="long", value=pad_id, truncating="post", padding="post")
print("- Padd/truncate tokenized sentences to {} tokens:  {}\n".format(max_len, padded_tokens[sent_id]))

# 1 for real tokens, 0 for [PAD] positions.
attention_masks = np.where(padded_tokens != pad_id, 1, 0)
print("- Create attention masks:  {}\n".format(attention_masks[sent_id]))

input_ids = torch.tensor(padded_tokens)
attention_masks = torch.tensor(attention_masks)
print("- Convert tokens and masks to pytorch tensors:")
print("  * Input:  {}".format(input_ids[sent_id]))
print("  * Masks:  {}".format(attention_masks[sent_id]))

- Original sentence:  Computer networking may be considered a branch of electrical engineering.

- Convert words to vocab ids (0 -> 30522):  [101, 6701, 16074, 1336, 1129, 1737, 170, 3392, 1104, 6538, 3752, 119, 102]

- Padd/truncate tokenized sentences to 40 tokens:  [  101  6701 16074  1336  1129  1737   170  3392  1104  6538  3752   119
   102     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0]

- Create attention masks:  [1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0]

- Convert tokens and masks to pytorch tensors:
  * Input:  tensor([  101,  6701, 16074,  1336,  1129,  1737,   170,  3392,  1104,  6538,
         3752,   119,   102,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0])
  

#### 1.1.3. In one step

In [59]:
from transformers import BertTokenizer

# Load the BERT WordPiece tokenizer.
tokenizer = BertTokenizer.from_pretrained('/raid/antoloui/Master-thesis/Code/_models/netbert-1027000/')

# Sentence example.
sent_id = 0
print("- Original sentence:  {}\n".format(sentences[sent_id]))

# A single call to `encode_plus` takes care of the whole pipeline:
#   (1) tokenizing the sentence,
#   (2) prepending the `[CLS]` token,
#   (3) appending the `[SEP]` token,
#   (4) mapping tokens to their IDs,
#   (5) padding or truncating the sentence to `max_length`,
#   (6) creating the attention mask that flags the [PAD] tokens.
encode_options = dict(add_special_tokens=True,
                      max_length=512,
                      pad_to_max_length=True,
                      return_attention_mask=True,
                      return_tensors='pt')
results = [tokenizer.encode_plus(sent, **encode_options) for sent in sentences]

# Gather the per-sentence token ids and attention masks, each concatenated along dim 0.
input_ids = torch.cat([sent_dict['input_ids'] for sent_dict in results], dim=0)
attention_masks = torch.cat([sent_dict['attention_mask'] for sent_dict in results], dim=0)

print("  * Input:  {}".format(input_ids[sent_id]))
print("  * Masks:  {}".format(attention_masks[sent_id]))

- Original sentence:  Computer networking may be considered a branch of electrical engineering.

  * Input:  tensor([  101,  6701, 16074,  1336,  1129,  1737,   170,  3392,  1104,  6538,
         3752,   119,   102,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     

### 1.2. With BertWordPieceTokenizer (from tokenizers)
NB: This one is much faster (it can encode 1GB of text in ~20sec on a standard server's CPU)!

In [61]:
from tokenizers import BertWordPieceTokenizer
from keras.preprocessing.sequence import pad_sequences


# Load tokenizer.
tokenizer = BertWordPieceTokenizer('/raid/antoloui/Master-thesis/Code/_models/netbert-1027000/vocab.txt',
                                    add_special_tokens=True,
                                    lowercase=False,
                                    clean_text=True, 
                                    handle_chinese_chars=True,
                                    strip_accents=True)

# BERT cannot process sequences longer than 512 tokens.
max_seq_len = 512
# Id of the [PAD] token, looked up in the loaded vocabulary: used as the padding value AND
# to build the attention masks, instead of relying on a hard-coded 0.
pad_id = tokenizer.token_to_id('[PAD]')

# Sentence example.
sent_id = 0
print("- Original sentence:  {}\n".format(sentences[sent_id]))

# `encode` will:
#   (1) Tokenize the sentence.
#   (2) Prepend the `[CLS]` token to the start.
#   (3) Append the `[SEP]` token to the end.
#   (4) Map tokens to their IDs.
outputs = [tokenizer.encode(sent) for sent in sentences]
indexed_tokens = [out.ids for out in outputs]
print("- Convert words to vocab ids (0 -> 30522):  {}\n".format(indexed_tokens[sent_id]))

# Define length of longest sentence in our batch (capped at what BERT accepts).
lengths = [len(i) for i in indexed_tokens]
max_len = min(max(lengths), max_seq_len)

# Shorten over-long sentences by hand so that the closing [SEP] (last id of every sentence)
# is kept: a plain "post" truncation would silently cut it off.
indexed_tokens = [ids if len(ids) <= max_seq_len else ids[:max_seq_len - 1] + ids[-1:] for ids in indexed_tokens]

padded_tokens = pad_sequences(indexed_tokens, maxlen=max_len, dtype="long", value=pad_id, truncating="post", padding="post")
print("- Padd/truncate tokenized sentences to {} tokens:  {}\n".format(max_len, padded_tokens[sent_id]))

# 1 for real tokens, 0 for [PAD] positions.
attention_masks = np.where(padded_tokens != pad_id, 1, 0)
print("- Create attention masks:  {}\n".format(attention_masks[sent_id]))

input_ids = torch.tensor(padded_tokens)
attention_masks = torch.tensor(attention_masks)
print("- Convert tokens and masks to pytorch tensors:")
print("  * Input:  {}".format(input_ids[sent_id]))
print("  * Masks:  {}".format(attention_masks[sent_id]))

- Original sentence:  Computer networking may be considered a branch of electrical engineering.

- Convert words to vocab ids (0 -> 30522):  [101, 6701, 16074, 1336, 1129, 1737, 170, 3392, 1104, 6538, 3752, 119, 102]

- Padd/truncate tokenized sentences to 40 tokens:  [  101  6701 16074  1336  1129  1737   170  3392  1104  6538  3752   119
   102     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0]

- Create attention masks:  [1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0]

- Convert tokens and masks to pytorch tensors:
  * Input:  tensor([  101,  6701, 16074,  1336,  1129,  1737,   170,  3392,  1104,  6538,
         3752,   119,   102,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0])
  

## 2. Encode

In [35]:
from transformers import BertModel


# Load the pretrained model and ask it to also return the hidden states of every layer.
model = BertModel.from_pretrained('/raid/antoloui/Master-thesis/Code/_models/netbert-1027000/',
                                  cache_dir='../../cache',
                                  output_hidden_states=True)

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    # Because `output_hidden_states=True`, the output holds three items (not two):
    #  - output[0]: last_hidden_state, a tensor of shape (batch_size, sequence_length, hidden_size).
    #  - output[1]: pooler_output, a tensor of shape (batch_size, hidden_size) derived from the last-layer
    #               hidden state of the first token ([CLS]); per the transformers documentation it is
    #               further passed through a linear layer and a tanh.
    #  - output[2]: all hidden_states, a 13-tuple of tensors of shape (batch_size, sequence_length, hidden_size):
    #               the initial embedding output followed by the outputs of the 12 encoder layers.
    # The items are read by index on purpose, which works whether `output` is a plain tuple or a richer object.
    output = model(input_ids, attention_mask=attention_masks)

# Split the output into its individual components.
last_hidden_states, pooler_output = output[0], output[1]

# `stack` puts the 13 per-layer tensors along a new leading "layers" dimension, then `permute`
# reorders the dimensions to (sentences, tokens, layers, hidden_size).
hidden_states = torch.stack(output[2], dim=0).permute(1, 2, 0, 3)

# Report the dimensions of the result.
n_sentences, n_tokens_per_sentence, n_layers, embedding_dim = hidden_states.size()
print("Dimensions of hidden_states: {}".format(hidden_states.size()))
print("   - Number of layers (+1 with initial token embeddings): {}".format(n_layers))
print("   - Number of sentences: {}".format(n_sentences))
print("   - Number of tokens in a sentence: {}".format(n_tokens_per_sentence))
print("   - Dimension of an embedding : {}".format(embedding_dim))

Dimensions of hidden_states: torch.Size([11, 64, 13, 768])
   - Number of layers (+1 with initial token embeddings): 13
   - Number of sentences: 11
   - Number of tokens in a sentence: 64
   - Dimension of an embedding : 768


### Sentence embeddings (average last layer)

In [36]:
# For each sentence, take the embeddings of its words from the last layer and represent that sentence by their average.
#
# Note:
#  - In practice, batches of sentences will be encoded at the same time. The problem is that the tokenization step pads
#    every sentence with [PAD] tokens up to the length of the longest sentence in the batch. If we simply averaged all
#    token embeddings, the embedding of a very short sentence in a batch of very long ones would be dominated by the
#    [PAD] positions (as they are the most represented in that sequence).
#  - One solution would be to sort the sentences by length before encoding them. That way, the sentences in a batch
#    sampled sequentially will approximately all have the same length. The drawback is that we completely lose the
#    initial order of the sentences, which we might want to preserve if we later store these embeddings in a dataframe
#    with their associated sentence and want to retrieve the neighboring sentences of a given one.
#  - The solution used here is to average over the non-padded tokens only. The attention mask (1 for a real token,
#    0 for [PAD]) zeroes out the padded positions before summing, and the sum is divided by the number of real tokens.
#    This is done for the whole batch at once and does not assume that the real tokens come first in the sequence.

# (batch, seq_len, 1) mask in the embeddings' dtype, so it broadcasts over the hidden dimension.
token_mask = (attention_masks == 1).unsqueeze(-1).to(last_hidden_states.dtype)

# Sum of the real-token embeddings and number of real tokens, per sentence.
summed_embeddings = (last_hidden_states * token_mask).sum(dim=1)
real_token_counts = token_mask.sum(dim=1)

# A sentence without any real token would yield NaN (0 / 0), i.e. the same as a mean over an empty slice.
sentence_embeddings = (summed_embeddings / real_token_counts).numpy()
sentence_embeddings.shape

(11, 768)

### Word embeddings (sum up last four layers)

In [37]:
# For each token of each sentence, sum its vectors from the last four layers and use that sum as its embedding.
# `hidden_states` is laid out as (sentences, tokens, layers, hidden_size), so the layers sit on dim 2:
# slicing the last four layers and reducing over that dimension handles every token of every sentence
# in one vectorized operation, instead of one `torch.sum` call per token inside nested Python loops.
sentence_vecs = hidden_states[:, :, -4:, :].sum(dim=2).numpy()
sentence_vecs.shape

(11, 64, 768)