<img src="img/bigthings.png" style="width:600px;">

# Solving Natural Language problems with scarce data

# Simple notebook to show basic BERT usage

## Tokenizing text with BERT

In [1]:
import transformers
from transformers import BertTokenizer
# Pretrained tokenizer for the multilingual, uncased BERT checkpoint.
tokenizer_checkpoint = 'bert-base-multilingual-uncased'
tokenizer = BertTokenizer.from_pretrained(tokenizer_checkpoint)

In [2]:
# Sentence used for the tokenization demo in this cell and the next one.
sample_text = "I'm tokenizing some text"

# Leave the call as the last expression so the notebook displays the ids.
# Special tokens are requested explicitly (see [CLS]/[SEP] in the next cell).
tokenizer.encode(
    sample_text,
    add_special_tokens=True,
)

[101, 151, 112, 155, 16925, 13649, 21593, 10970, 14059, 102]

In [3]:
# Encode once, then show each id next to the text it decodes to on its own.
token_ids = tokenizer.encode(sample_text, add_special_tokens=True)
for token_id in token_ids:
    piece = tokenizer.decode([token_id])  # decode a single-id list
    print(f"{piece} -> {token_id}")

[CLS] -> 101
i -> 151
' -> 112
m -> 155
tok -> 16925
##eni -> 13649
##zing -> 21593
some -> 10970
text -> 14059
[SEP] -> 102


## Computing contextualized embeddings with BERT

In [4]:
import torch
from transformers import BertModel
# Pretrained BERT encoder; same checkpoint name as the tokenizer loaded earlier.
model_checkpoint = 'bert-base-multilingual-uncased'
model = BertModel.from_pretrained(model_checkpoint)

In [5]:
sample_text = "I'm embedding some text using context!"

# Encode the sentence and wrap the id list in an outer list so the tensor
# carries a leading batch dimension of size one.
sentence_ids = tokenizer.encode(sample_text, add_special_tokens=True)
input_ids = torch.tensor([sentence_ids])

# Inference only: skip gradient tracking for the forward pass.
with torch.no_grad():
    outputs = model(input_ids)

# First element of the model output, then the single item of the batch.
# The printed shape below is (tokens, features) for this sentence; per the
# BertModel docs the first output is the last hidden state.
first_output = outputs[0]
embeddings = first_output[0]
print(embeddings.shape)
print(embeddings)

torch.Size([13, 768])
tensor([[-0.1530, -0.0948, -0.0012,  ..., -0.0225, -0.0688, -0.0611],
        [ 0.3417,  0.1961,  0.1746,  ..., -0.1793,  0.1639, -0.3618],
        [ 0.1743,  0.4308,  0.7249,  ...,  0.2688,  0.1593, -0.1433],
        ...,
        [-0.0493, -0.0772,  0.1372,  ..., -0.1875,  0.5819, -0.2562],
        [-0.0976,  0.4797,  0.1691,  ..., -0.0733,  0.2960, -0.3994],
        [ 0.1321,  0.4781, -0.4358,  ...,  0.2372,  0.3368, -0.4438]])
