# BERT embedding

## Use pytorch-pretrained-BERT [link](https://github.com/huggingface/pytorch-pretrained-BERT)

In [2]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM

# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
logging.basicConfig(level=logging.INFO)

# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained("data/bert_cache/bert-large-uncased-vocab.txt")

# Tokenized input
text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
tokenized_text = tokenizer.tokenize(text)

# Mask a token that we will try to predict back with `BertForMaskedLM`
masked_index = 8
tokenized_text[masked_index] = '[MASK]'
assert tokenized_text == ['[CLS]', 'who', 'was', 'jim', 'henson', '?', '[SEP]', 'jim', '[MASK]', 'was', 'a', 'puppet', '##eer', '[SEP]']

# Convert token to vocabulary indices
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
# Define sentence A and B indices associated to 1st and 2nd sentences (see paper)
segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]

# Convert inputs to PyTorch tensors
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])

INFO:pytorch_pretrained_bert.tokenization:loading vocabulary file data/bert_cache/bert-large-uncased-vocab.txt


- get hidden state

In [3]:
# Load pre-trained model (weights)
model = BertModel.from_pretrained("data/bert_cache/bert-large-uncased")
model.eval()

# If you have a GPU, put everything on cuda
tokens_tensor = tokens_tensor.to('cuda')
segments_tensors = segments_tensors.to('cuda')
model.to('cuda')

# Predict hidden states features for each layer
with torch.no_grad():
    encoded_layers, _ = model(tokens_tensor, segments_tensors)
# We have a hidden states for each of the 24 layers in model bert-base-uncased
assert len(encoded_layers) == 24

encoded_layers

INFO:pytorch_pretrained_bert.modeling:loading archive file data/bert_cache/bert-large-uncased
INFO:pytorch_pretrained_bert.modeling:Model config {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "max_position_embeddings": 512,
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "type_vocab_size": 2,
  "vocab_size": 30522
}



[tensor([[[ 0.0209, -0.0933, -0.0899,  ..., -0.0462, -0.0979, -0.0050],
          [-0.1881,  0.6580,  0.2152,  ...,  0.5455, -0.0858,  0.4080],
          [-0.8712, -0.1690,  0.2003,  ...,  0.5357, -0.1346,  0.4288],
          ...,
          [-0.3829,  0.1071,  0.3474,  ...,  0.6633, -0.5993, -0.4130],
          [ 0.4142, -0.1914, -1.1008,  ...,  0.1034,  0.5236, -0.3538],
          [-0.2399,  0.0702, -0.6301,  ..., -0.2046,  0.1391,  0.2259]]],
        device='cuda:0'),
 tensor([[[ 0.1202, -0.0804, -0.0579,  ...,  0.1063, -0.1013,  0.0238],
          [-0.2258,  0.4770,  0.0755,  ...,  0.1937, -0.1232,  0.5070],
          [-0.8024, -0.1824,  0.1630,  ...,  0.4806,  0.1115,  0.3375],
          ...,
          [-0.6579,  0.0999,  0.4524,  ...,  0.4764, -0.7529, -0.1294],
          [ 0.3774, -0.1221, -1.1118,  ..., -0.1665,  0.5325, -0.3628],
          [-0.2749, -0.2171, -0.5670,  ..., -0.1113,  0.0711, -0.0443]]],
        device='cuda:0'),
 tensor([[[ 0.2405, -0.1409, -0.1571,  ...,  0.089

- Token prediction

In [4]:
# Predict all tokens
with torch.no_grad():
    predictions = model(tokens_tensor, segments_tensors)

# confirm we were able to predict 'henson'
predicted_index = torch.argmax(predictions[0][masked_index]).item()
predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
# assert predicted_token == 'henson'

predicted_token

'permits'

## use bert-embeddings [link](https://bert-embedding.readthedocs.io/en/latest/) (deprecated)

In [1]:
import mxnet
mxnet.__version__

'1.3.1'

Unfortunately, this only support nightly version of MXNET

## use BERT-as-a-Service （deprecated）
[https://github.com/hanxiao/bert-as-service](https://github.com/hanxiao/bert-as-service)

### download the pre-trained model
You can also manually download it [here](https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-24_H-1024_A-16.zip) (all models [here](https://github.com/google-research/bert#pre-trained-models)).

In [1]:
! cd data && mkdir bert && wget https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-24_H-1024_A-16.zip

'wget' 不是内部或外部命令，也不是可运行的程序
或批处理文件。


### server side
- install the package

In [8]:
!pip install bert-serving-server tensorflow

Looking in indexes: https://mirrors.aliyun.com/pypi/simple/


- start the service

In [9]:
# !bert-serving-start -model_dir ./data/bert/uncased_L-24_H-1024_A-16 -num_worker=1
# do not run this in jupyter notebook, run in shell instead

2019-05-08 20:21:18.036060: F tensorflow/python/lib/core/bfloat16.cc:675] Check failed: PyBfloat16_Type.tp_base != nullptr 


### client side

In [3]:
!pip install bert-serving-client

Looking in indexes: https://mirrors.aliyun.com/pypi/simple/
Collecting bert-serving-client
  Downloading https://mirrors.aliyun.com/pypi/packages/96/ed/23f3d7c1f897e09944c16d066e0e52bb83d0d94fdfe0dc9bb53ce1810ca1/bert_serving_client-1.8.9-py2.py3-none-any.whl
Installing collected packages: bert-serving-client
Successfully installed bert-serving-client-1.8.9


In [None]:
from bert_serving.client import BertClient
bc = BertClient()
bc.encode(['First do it', 'then do it right', 'then do it better'])