# BERT Embeddings Tutorial

References  
https://github.com/google-research/bert  
https://github.com/SKTBrain/KoBERT  
https://github.com/monologg/KoBERT-Transformers  
https://codlingual.tistory.com/98  
https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/#31-running-bert-on-our-text  
https://chloelab.tistory.com/25


In [None]:
!pip install pytorch-pretrained-bert

In [None]:
import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM

In [None]:
# Load the pretrained tokenizer for the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# Tokenization: the sample sentence uses the word "bank" three times.
text = ("After stealing money from the bank vault, the bank robber was seen "
        "fishing on the Mississippi river bank.")

# [CLS] marks the start of the input, [SEP] marks the end of a sentence.
marked_text = "[CLS] " + text + " [SEP]"
tokenized_text = tokenizer.tokenize(marked_text)

# Map every token to its vocabulary id.
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

# Print each token (left-aligned) next to its id (right-aligned, with
# thousands separators).
row_template = '{:<12} {:>6,}'
for piece, piece_id in zip(tokenized_text, indexed_tokens):
    print(row_template.format(piece, piece_id))

[CLS]           101
after         2,044
stealing     11,065
money         2,769
from          2,013
the           1,996
bank          2,924
vault        11,632
,             1,010
the           1,996
bank          2,924
robber       27,307
was           2,001
seen          2,464
fishing       5,645
on            2,006
the           1,996
mississippi   5,900
river         2,314
bank          2,924
.             1,012
[SEP]           102


In [None]:
# With a single input sentence every token gets segment id 1; with two sentences the first gets 0 and the second gets 1.
segments_ids = [1] * len(tokenized_text)

In [None]:
# Convert to tensors; wrapping each list in another list adds a leading batch dimension of size 1.
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensor = torch.tensor([segments_ids])

# Load the pre-trained model
model = BertModel.from_pretrained('bert-base-uncased')
# Put the model in evaluation mode; it is only used for inference here.
model.eval()

In [None]:
# no_grad keeps autograd from building the computation graph during the forward pass (lower memory use, faster).
with torch.no_grad():
  encoded_layers, pooled = model(tokens_tensor, segments_tensor)

In [None]:
# Report how encoded_layers is nested: layers -> batch items -> tokens -> hidden units.
print(f"""Layers : {len(encoded_layers)} 
Batchs : {len(encoded_layers[0])} 
Tokens : {len(encoded_layers[0][0])} 
Hidden_size : {len(encoded_layers[0][0][0])}""")

Layers : 12 
Batchs : 1 
Tokens : 22 
Hidden_size : 768


In [None]:
# Stack the per-layer outputs into a single tensor (layers become dimension 0)
token_embeddings = torch.stack(encoded_layers, dim=0)
token_embeddings.size()

torch.Size([12, 1, 22, 768])

In [None]:
# Remove the batch dimension
token_embeddings = torch.squeeze(token_embeddings, dim=1)
token_embeddings.size()

torch.Size([12, 22, 768])

In [None]:
# Swap the layer and token dimensions so the tensor is indexed [token, layer, hidden]
token_embeddings = token_embeddings.permute(1, 0, 2)
token_embeddings.size()

torch.Size([22, 12, 768])

In [None]:
# Word vectors: for every token, add up its hidden states from the last four
# layers (token_embeddings is indexed [token, layer, hidden]).
token_vec_sum = [torch.sum(layers[-4:], dim=0) for layers in token_embeddings]

n_tokens, n_features = len(token_vec_sum), len(token_vec_sum[0])
print ('Shape is: %d x %d' % (n_tokens, n_features))

Shape is: 22 x 768


In [None]:
# Sentence vector: average the token vectors of the second-to-last layer (index -2), first batch item.
token_vec = encoded_layers[-2][0] # Lower layers reflect syntax more, higher layers reflect context more
sentence_embeddings = torch.mean(token_vec, dim=0)

In [None]:
# List every token with its position so individual tokens can be picked out by index.
for i, token_str in enumerate(tokenized_text):
  print (i, token_str)

0 [CLS]
1 after
2 stealing
3 money
4 from
5 the
6 bank
7 vault
8 ,
9 the
10 bank
11 robber
12 was
13 seen
14 fishing
15 on
16 the
17 mississippi
18 river
19 bank
20 .
21 [SEP]


In [None]:
# Positions 6, 10 and 19 are the three occurrences of "bank" in the token listing printed above.
print('First 5 vector values for each instance of "bank".')
print('')
print("bank vault   ", str(token_vec_sum[6][:5]))
print("bank robber  ", str(token_vec_sum[10][:5]))
print("river bank   ", str(token_vec_sum[19][:5]))

First 5 vector values for each instance of "bank".

bank vault    tensor([ 2.1319, -2.1413, -1.6260,  0.8638,  3.3173])
bank robber   tensor([ 1.1868, -1.5298, -1.3770,  1.0648,  3.1446])
river bank    tensor([ 1.1295, -1.4724, -0.7296, -0.0901,  2.4970])


In [None]:
# Similarity between the word vectors of a homonym
from scipy.spatial.distance import cosine
# cosine() comes from scipy's distance module, so 1 - cosine(...) is used as the similarity.
diff_bank = 1 - cosine(token_vec_sum[10], token_vec_sum[19])  # "bank" before "robber" vs. "bank" after "river"
same_bank = 1 - cosine(token_vec_sum[10], token_vec_sum[6])  # "bank" before "robber" vs. "bank" before "vault"

print('Vector similarity for  *similar*  meanings:  %.2f' % same_bank)
print('Vector similarity for *different* meanings:  %.2f' % diff_bank)

Vector similarity for  *similar*  meanings:  0.95
Vector similarity for *different* meanings:  0.68


# KoBERT Embeddings

KoBERT Requirements
- Python >= 3.6
- PyTorch >= 1.7.0
- MXNet >= 1.4.0
- gluonnlp >= 0.6.0
- sentencepiece >= 0.1.6
- transformers >= 3.5.0

In [None]:
!pip install transformers
!pip install sentencepiece
!pip install gluonnlp
!pip install mxnet
!pip install git+https://git@github.com/SKTBrain/KoBERT.git@master
#!git clone https://github.com/monologg/KoBERT-Transformers  # Install for using KoBERT through Hugging Face Transformers; same model as SKTBrain's KoBERT

In [None]:
# NOTE(review): these imports resolve against a local `KoBERT_Transformers` package directory;
# presumably the monologg/KoBERT-Transformers repository has to be cloned under that name first — confirm.
from KoBERT_Transformers.kobert_transformers.tokenization_kobert import KoBertTokenizer
from KoBERT_Transformers.kobert_transformers.load_model import get_kobert_model

# Rebinds `tokenizer` and `model`, replacing the English BERT objects created earlier in the notebook.
tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert')
model = get_kobert_model()
# Try the KoBERT tokenizer on a sample sentence
tokenizer.tokenize("[CLS] 한국어 모델을 공유합니다. [SEP]")

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'KoBertTokenizer'.


['[CLS]', '▁한국', '어', '▁모델', '을', '▁공유', '합니다', '.', '[SEP]']

In [None]:
# Two sentences joined with no space between them; the word "은행" appears
# three times.
text = ("거리에 은행나무가 노랗게 물들었다."
        "은행에 가서 신규 통장을 발급받고 집에와서 먹은 은행은 맛있었다.")
marked_text = "[CLS] " + text + " [SEP]"

# Tokenize, then map every token to its vocabulary id.
tokenized_text = tokenizer.tokenize(marked_text)
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

# Print each token (left-aligned) next to its id (right-aligned, with
# thousands separators).
row_template = '{:<12} {:>6,}'
for piece, piece_id in zip(tokenized_text, indexed_tokens):
    print(row_template.format(piece, piece_id))

[CLS]             2
▁거리             871
에             6,896
▁은행           3,605
나무            5,660
가             5,330
▁노            1,476
랗                 0
게             5,400
▁물            2,135
들             5,931
었다            6,888
.                54
은행            7,087
에             6,896
▁               517
가             5,330
서             6,553
▁신규           3,014
▁통            4,743
장을            7,187
▁발급           2,239
받고            6,289
▁집            4,384
에             6,896
와             6,983
서             6,553
▁먹            2,010
은             7,086
▁은행           3,605
은             7,086
▁맛            1,967
있             7,141
었다            6,888
.                54
[SEP]             3


In [None]:
# One segment id per token; every token is assigned segment id 1.
segments_ids = [1] * len(tokenized_text)

In [None]:
# Convert to tensors; wrapping each list in another list adds a leading batch dimension of size 1.
token_ids = torch.tensor([indexed_tokens])
segment_ids = torch.tensor([segments_ids])

In [None]:
from torch import nn
class BERTEmbeddings(nn.Module):
    """Run a BERT encoder and return only its first output.

    The wrapper builds an attention mask from per-sequence valid lengths,
    calls the wrapped encoder, and returns the first element of whatever the
    encoder produces (the last hidden state for a Hugging Face ``BertModel``).

    Args:
        bert: Callable encoder accepting ``input_ids``, ``token_type_ids``
            and ``attention_mask`` keyword arguments.
    """

    def __init__(self,
                 bert):
        super().__init__()
        self.bert = bert

    def gen_attention_mask(self, token_ids, valid_length):
        """Build a float mask with 1.0 on real tokens and 0.0 on padding.

        Args:
            token_ids: 2-D tensor of token ids, one row per sequence.
            valid_length: Iterable giving, for each row, how many leading
                positions are real tokens.

        Returns:
            Float tensor shaped like ``token_ids`` (and on the same device)
            whose first ``valid_length[i]`` entries in row ``i`` are 1.
        """
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Encode ``token_ids`` and return the encoder's first output.

        Args:
            token_ids: 2-D tensor of token ids.
            valid_length: Per-row count of non-padding tokens.
            segment_ids: Tensor of segment (token type) ids; cast to long.

        Returns:
            The first element of the encoder output.
        """
        # The mask is already float and lives on token_ids' device
        # (zeros_like preserves it), so no extra cast or transfer is needed.
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        outputs = self.bert(
            input_ids=token_ids,
            token_type_ids=segment_ids.long(),
            attention_mask=attention_mask,
        )
        # Index instead of tuple-unpacking: recent `transformers` versions
        # return a dict-like ModelOutput whose iteration yields key names,
        # so `a, b = outputs` would bind strings rather than tensors.
        # Integer indexing works for both plain tuples and ModelOutput.
        return outputs[0]

In [None]:
# Use the first CUDA GPU when one is usable, otherwise fall back to the CPU.
# `is_available` has to be *called*: the bare function object is always
# truthy, which would select "cuda:0" even on CPU-only machines.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

BertModel output (return value of the model's forward pass)

        return BaseModelOutputWithPoolingAndCrossAttentions(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            past_key_values=encoder_outputs.past_key_values,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
            cross_attentions=encoder_outputs.cross_attentions,
        )

In [None]:
# Forward pass without gradient tracking. output_hidden_states=True asks for the per-layer hidden states,
# and return_dict=True for an output object whose fields are read by attribute in the following cells.
with torch.no_grad():
  output = model(input_ids=token_ids, token_type_ids=segment_ids.long(), output_hidden_states=True, return_dict=True)

In [None]:
# Shape of the last hidden state.
output.last_hidden_state.size()

torch.Size([1, 36, 768])

In [None]:
# Shape of the pooled output.
output.pooler_output.size()

torch.Size([1, 768])

Hidden states = the input embedding output + the outputs of the 12 BERT layers (13 in total)

In [None]:
# Per-layer hidden states, available because the forward call passed output_hidden_states=True.
encoded_layers = output.hidden_states
len(encoded_layers)

13

In [None]:
# Stack the per-layer hidden states, drop the batch axis (dimension 1), then
# swap the layer and token axes so the result is indexed [token, layer, hidden].
token_embeddings = (
    torch.stack(encoded_layers, dim=0)
    .squeeze(dim=1)
    .permute(1, 0, 2)
)

token_embeddings.size()

torch.Size([36, 13, 768])

In [None]:
# Word vectors: for every token, add up its hidden states from the last four
# layers (a common choice for better-performing token representations).
token_vec_sum = [torch.sum(layers[-4:], dim=0) for layers in token_embeddings]

n_tokens, n_features = len(token_vec_sum), len(token_vec_sum[0])
print ('Shape is: %d x %d' % (n_tokens, n_features))

# Sentence vector: as a simple approach, average the token vectors of the
# second-to-last hidden layer (index -2) for the first item in the batch.
token_vec = encoded_layers[-2][0]
sentence_embeddings = token_vec.mean(dim=0)

Shape is: 36 x 768


In [None]:
# Shape of the sentence vector.
sentence_embeddings.shape

torch.Size([768])

In [None]:
# List every token with its position so individual tokens can be picked out by index.
for i, token_str in enumerate(tokenized_text):
  print (i, token_str)

0 [CLS]
1 ▁거리
2 에
3 ▁은행
4 나무
5 가
6 ▁노
7 랗
8 게
9 ▁물
10 들
11 었다
12 .
13 은행
14 에
15 ▁
16 가
17 서
18 ▁신규
19 ▁통
20 장을
21 ▁발급
22 받고
23 ▁집
24 에
25 와
26 서
27 ▁먹
28 은
29 ▁은행
30 은
31 ▁맛
32 있
33 었다
34 .
35 [SEP]


In [None]:
# Similarity between the word vectors of a homonym (context is reflected)
diff = 1 - cosine(token_vec_sum[3], token_vec_sum[13]) # "은행" in 은행나무 (ginkgo tree) vs 은행 (bank)
same = 1 - cosine(token_vec_sum[3], token_vec_sum[29]) # "은행" in 은행나무 (ginkgo tree) vs 은행 (ginkgo nut)

print('Vector similarity for  *similar*  meanings:  %.2f' % same)
print('Vector similarity for *different* meanings:  %.2f' % diff)

Vector similarity for  *similar*  meanings:  0.75
Vector similarity for *different* meanings:  0.39


In [None]:
# Similarity between token 4 ("나무") and token 13 ("은행" used as "bank"), per the token listing above.
1 - cosine(token_vec_sum[4], token_vec_sum[13])

0.3180385231971741