#### Prerequisites

In [None]:
%%capture

!pip install transformers
!pip install nltk

#### Imports 

In [2]:
from transformers import BertModel, BertTokenizerFast
import transformers
import logging
import torch
import nltk

#### Setup logging

In [3]:
# Notebook-wide logger; DEBUG level so every message below is emitted.
logger = logging.getLogger('sagemaker')
logger.setLevel(logging.DEBUG)
# logging.getLogger returns the same object on every call, so re-running this cell
# would otherwise stack another StreamHandler and print each message several times.
if not logger.handlers:
    logger.addHandler(logging.StreamHandler())

In [4]:
# Record the library versions this notebook ran with, one log line per library.
version_messages = [
    f'Using Transformers: {transformers.__version__}',
    f'Using Torch: {torch.__version__}',
    f'Using NLTK: {nltk.__version__}',
]
for message in version_messages:
    logger.info(message)

Using Transformers: 4.18.0
Using Torch: 1.8.1
Using NLTK: 3.6.7


#### Load BERT and NLTK tokenizers

In [5]:
# Load the fast BERT tokenizer for the 'bert-base-uncased' checkpoint
# (the model loaded later in this notebook uses the same checkpoint name).
bert_tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [7]:
# Load the sentence tokenizer object stored in a local pickle file.
# NOTE(review): unpickling can execute arbitrary code — only load this file from a trusted source.
# NOTE(review): the path is relative, so ./data/english.pickle must exist next to the
# notebook's working directory for a fresh run to succeed.
sent_tokenizer = nltk.load('./data/english.pickle')

['Great product.', 'Good design.', 'But a little pricey!']

#### Tokenize text into sentences and tokens

In [None]:
# Sample review text that the rest of the notebook splits, encodes and embeds.
text = "Great product. Good design. But a little pricey!"

In [None]:
# Split the raw text into a list of sentence strings; each sentence is encoded separately below.
sentences = sent_tokenizer.tokenize(text)
sentences

In [8]:
# Tokenize only the first sentence to illustrate what the BERT tokenizer produces.
# As the output shows, tokenize() returns the plain tokens without [CLS]/[SEP].
tokens = bert_tokenizer.tokenize(sentences[0])
tokens

['great', 'product', '.']

In [9]:
# Map each token of the first sentence to its integer id in the tokenizer's vocabulary.
# NOTE: the name `input_ids` is re-bound several times later in the notebook
# (to a list of tensors, then to a single tensor), so this value is only illustrative.
input_ids = bert_tokenizer.convert_tokens_to_ids(tokens)
input_ids

[2307, 4031, 1012]

##### BERT has 2 constraints
* All sentences in a batch must be padded or truncated to the same fixed length
* The maximum sequence length is 512 tokens (including the special `[CLS]` and `[SEP]` tokens)

#### Determine the maximum token length

In [10]:
# The longest encoded sentence (special tokens included) decides the padding length.
# A generator is used instead of a loop variable named `input_ids`, so the value
# displayed by the previous cell is not silently overwritten. `default=0` keeps the
# original result for an empty sentence list.
max_len = max(
    (len(bert_tokenizer.encode(sentence, add_special_tokens=True)) for sentence in sentences),
    default=0,
)
# Never request more tokens than the tokenizer's model limit (the 512-token
# constraint stated above); longer sentences are truncated by the encoding step.
max_len = min(max_len, bert_tokenizer.model_max_length)
max_len

8

#### Encode sentences

In [11]:
# The per-sentence encodings are collected into these two lists by the next cell:
# one entry per sentence for the token ids and one for the attention mask.
input_ids = []
attention_masks = []

In [12]:
# Encode every sentence to exactly `max_len` ids: special tokens are added, shorter
# sentences are padded, longer ones are truncated, and each result is returned as a
# PyTorch tensor together with its attention mask.
# The two lists are rebuilt from scratch here instead of appended to, so re-running
# this cell cannot duplicate rows (or fail because `input_ids` was already
# concatenated into a tensor by a later cell).
encoded_sentences = [
    bert_tokenizer.encode_plus(sentence,
                               add_special_tokens=True,
                               max_length=max_len,
                               padding='max_length',
                               return_attention_mask=True,
                               return_tensors='pt',
                               truncation=True)
    for sentence in sentences
]
input_ids = [encoded['input_ids'] for encoded in encoded_sentences]
attention_masks = [encoded['attention_mask'] for encoded in encoded_sentences]

In [13]:
# Still a Python list at this point: one id tensor per sentence, padded to the same length.
input_ids

[tensor([[ 101, 2307, 4031, 1012,  102,    0,    0,    0]]),
 tensor([[ 101, 2204, 2640, 1012,  102,    0,    0,    0]]),
 tensor([[ 101, 2021, 1037, 2210, 3976, 2100,  999,  102]])]

In [14]:
# 1 marks a real token, 0 marks a padded position (compare with the trailing zeros in `input_ids`).
attention_masks

[tensor([[1, 1, 1, 1, 1, 0, 0, 0]]),
 tensor([[1, 1, 1, 1, 1, 0, 0, 0]]),
 tensor([[1, 1, 1, 1, 1, 1, 1, 1]])]

In [15]:
# Join the per-sentence tensors along dim 0 so each name holds one batch tensor
# with a row per sentence. From here on both names refer to tensors, not lists.
input_ids, attention_masks = (
    torch.cat(per_sentence, dim=0) for per_sentence in (input_ids, attention_masks)
)

In [16]:
# After concatenation: a single tensor with one row per sentence.
input_ids

tensor([[ 101, 2307, 4031, 1012,  102,    0,    0,    0],
        [ 101, 2204, 2640, 1012,  102,    0,    0,    0],
        [ 101, 2021, 1037, 2210, 3976, 2100,  999,  102]])

#### Load BERT model

In [17]:
# Load the pre-trained BERT encoder for the same checkpoint name as the tokenizer.
# The "weights ... were not used" message printed below lists only `cls.*` weights
# from the checkpoint that BertModel does not use.
model = BertModel.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


#### Get last hidden state

In [18]:
# Inference only: gradients are not needed, so run the forward pass under no_grad().
with torch.no_grad():
    outputs = model(input_ids=input_ids, attention_mask=attention_masks)

[2022-12-16 17:28:50.281 pytorch-1-8-gpu-py-ml-g4dn-4xlarge-ebd0c963d7cb3d49c063789f9c22:1409 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None
[2022-12-16 17:28:50.370 pytorch-1-8-gpu-py-ml-g4dn-4xlarge-ebd0c963d7cb3d49c063789f9c22:1409 INFO profiler_config_parser.py:102] Unable to find config at /opt/ml/input/config/profilerconfig.json. Profiler is disabled.


In [19]:
# Keep only sequence position 0 of every sentence — the slot that holds the leading
# special token (id 101 in `input_ids`) — and use it as that sentence's embedding.
# NOTE: despite its name, this variable holds one vector per sentence,
# not the model's full last hidden state.
last_hidden_state = outputs.last_hidden_state[:, 0, :]
last_hidden_state

tensor([[-0.3160,  0.1158, -0.3213,  ..., -0.2726,  0.3246,  0.2845],
        [-0.2732,  0.1145, -0.2756,  ..., -0.2125,  0.0405,  0.6684],
        [-0.2529,  0.1539, -0.4362,  ..., -0.0796,  0.1113,  0.4073]])

In [20]:
# One row per sentence, one column per hidden dimension.
last_hidden_state.shape

torch.Size([3, 768])

#### Compute output vector

In [21]:
# Convert the sentence embeddings to a NumPy array.
# .cpu() is a no-op for CPU tensors but is required before .numpy() if the model and
# inputs are ever moved to a GPU; .detach() drops any autograd history.
sentence_vectors = last_hidden_state.detach().cpu().numpy()

In [22]:
# Same values as the tensor shown earlier, now as a NumPy float32 array.
sentence_vectors

array([[-0.31601778,  0.11579025, -0.32130766, ..., -0.27264526,
         0.32456014,  0.2845447 ],
       [-0.27318975,  0.11449272, -0.27562585, ..., -0.21247703,
         0.04046102,  0.66837615],
       [-0.25293225,  0.15385807, -0.43620795, ..., -0.07957672,
         0.11127631,  0.4073293 ]], dtype=float32)

In [30]:
# Average the sentence vectors element-wise (axis 0 runs across sentences) to get a
# single paragraph-level vector, and convert it straight to a plain Python list.
paragraph_vector = sentence_vectors.mean(axis=0).tolist()
len(paragraph_vector)

768