In [6]:
import torch
from transformers import AutoTokenizer, AutoModel
from keras.preprocessing.sequence import pad_sequences


# Pre-trained Chinese BERT: the tokenizer plus the encoder used to produce embeddings
tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese')
embedding = AutoModel.from_pretrained('bert-base-chinese')


# Encode one sample sentence, pad the ids at the end up to length 10,
# and derive the attention mask from the padded ids.
# NOTE(review): confirm how pad_sequences treats inputs longer than maxlen before reusing this on real data.
sent = '今天天氣真 Good。'
sent_token = tokenizer.encode(sent)
sent_token_padding = pad_sequences(
    [sent_token],
    maxlen=10,
    padding='post',
    dtype='int',
)
# 1.0 where the id is greater than 0 (a real token), 0.0 elsewhere (padding)
masks = [
    [1.0 if token_id > 0 else 0.0 for token_id in row]
    for row in sent_token_padding
]

print('sent_token:', sent_token)
print('sent_token_padding:', sent_token_padding)
print('\n')

sent_token: [101, 791, 1921, 1921, 3706, 4696, 100, 511, 102]
sent_token_padding: [[ 101  791 1921 1921 3706 4696  100  511  102    0]]




In [2]:
import torch.nn as nn
from transformers import BertModel

# Bert-BiGRU-Classifier
class BERTmodel(nn.Module):
    """BERT encoder followed by a bidirectional GRU and a sigmoid output head.

    The [CLS] position of the BERT output is fed to the GRU as a sequence of
    length one; the GRU's final hidden state is projected to a single value
    and squashed with a sigmoid, giving one score in (0, 1) per input row.
    """

    def __init__(self):
        # Zero-argument super(): the previous code passed an undefined name
        # (`BiGRU`) here, which raised NameError as soon as the model was built.
        super().__init__()
        self.embedding = BertModel.from_pretrained('bert-base-chinese')
        self.gru = nn.GRU(
            input_size=768,
            hidden_size=768,
            dropout=0.3,
            num_layers=5,
            bidirectional=True,
            batch_first=True,
        )

        self.fc_1 = nn.Linear(768, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, tokens, masks=None):
        """Score a batch of token-id sequences.

        Args:
            tokens: token ids passed straight to BERT
                (presumably shaped (batch, seq_len) - confirm with the caller).
            masks: optional attention mask forwarded to BERT as `attention_mask`.

        Returns:
            A 1-D tensor with one sigmoid score per batch row.
        """
        # BERT
        # Index the result instead of tuple-unpacking it: recent transformers
        # releases return a dict-like output whose iteration yields key names,
        # while integer indexing works for both that object and a plain tuple.
        bert_outputs = self.embedding(tokens, attention_mask=masks)
        embedded = bert_outputs[0]
        # Keep only the first ([CLS]) position, as a length-1 sequence for the GRU.
        cls_vector = embedded[:, 0:1, :]

        # GRU
        _, hidden = self.gru(cls_vector)
        # Last slice of the final hidden state; per the nn.GRU docs this should be
        # the top layer's reverse direction - confirm that using only it is intended.
        hidden = hidden[-1]

        # Fully-connected layer
        outputs = self.fc_1(hidden)
        outputs = self.sigmoid(outputs).view(-1)

        return outputs

In [None]:
# Convert the padded ids and the attention mask to tensors, then run them through BERT.
# Force int64 ids: the NumPy 'int' dtype produced by the padding step is platform-dependent.
inputs = torch.tensor(sent_token_padding, dtype=torch.long)
# as_tensor keeps this cell re-runnable: it accepts the original nested list as well as
# the tensor that `masks` already is after a previous run.
masks = torch.as_tensor(masks)
# Index the model output rather than tuple-unpacking it: recent transformers releases
# return a dict-like object whose iteration yields key names instead of tensors.
embedded = embedding(inputs, attention_mask=masks)[0]

from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

""""
create_mini_batch(samples) consumes samples from the MyDataset defined above
and returns the 4 tensors needed when training BERT:
- tokens_tensors  : (batch_size, max_seq_len_in_batch)
- segments_tensors: (batch_size, max_seq_len_in_batch)
- masks_tensors   : (batch_size, max_seq_len_in_batch)
- label_ids       : (batch_size)
"""

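# collate_fn: defines how several samples are combined into one batch that is fed to the model
# After the sequences are padded to a common length, attention must be limited to the non-pad positions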

def create_mini_batch(samples):
    """Collate a list of samples into one zero-padded mini-batch.

    Each sample is indexed as ``sample[0]`` (token ids), ``sample[1]``
    (segment ids) and ``sample[2]`` (label tensor, or ``None`` when the
    split has no labels).

    Returns:
        ``(tokens_tensors, segments_tensors, masks_tensors, label_ids)``.
        The first three are padded to the longest sequence in the batch;
        ``masks_tensors`` holds 1 wherever the token id is non-zero and 0
        elsewhere; ``label_ids`` is the stacked labels, or ``None`` if the
        first sample carries no label.
    """
    token_seqs = [sample[0] for sample in samples]
    segment_seqs = [sample[1] for sample in samples]

    # Only labelled splits carry labels; the first sample decides for the whole batch.
    has_labels = samples[0][2] is not None
    label_ids = torch.stack([sample[2] for sample in samples]) if has_labels else None

    # Zero-pad every sequence up to the longest one in this batch.
    tokens_tensors = pad_sequence(token_seqs, batch_first=True)
    segments_tensors = pad_sequence(segment_seqs, batch_first=True)

    # Attention mask: start from all zeros and write 1 at every non-padding position
    # so that BERT attends only to real tokens.
    is_real_token = tokens_tensors != 0
    masks_tensors = torch.zeros(tokens_tensors.shape, dtype=torch.long).masked_fill(is_real_token, 1)

    return tokens_tensors, segments_tensors, masks_tensors, label_ids



# Build DataLoaders that each yield BATCH_SIZE samples per step;
# `collate_fn` merges a list of samples into a single mini-batch.
trainset = MyDataset("train", tokenizer=tokenizer)

BATCH_SIZE = 16
# NOTE(review): MyDataset, valset and testset are not defined in the cells visible here -
# confirm they are created earlier in the notebook.
trainloader = DataLoader(
    trainset,
    batch_size=BATCH_SIZE,
    collate_fn=create_mini_batch,
    shuffle=True,
)
valloader = DataLoader(
    valset,
    batch_size=BATCH_SIZE,
    collate_fn=create_mini_batch,
    shuffle=False,
)
testloader = DataLoader(
    testset,
    batch_size=BATCH_SIZE,
    collate_fn=create_mini_batch,
    shuffle=False,
)

# Pull one training batch and show its four components, one per line
data = next(iter(trainloader))
tokens_tensors, segments_tensors, masks_tensors, label_ids = data
print(tokens_tensors, segments_tensors, masks_tensors, label_ids, sep="\n")

In [4]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
import time
import os
import re
from itertools import chain
from transformers import BertTokenizer
# Pre-trained checkpoint to load (the Chinese BERT base model)
PRETRAINED_MODEL_NAME = "bert-base-chinese"

# Fetch the tokenizer that belongs to this checkpoint and report its vocabulary size
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)
vocab = tokenizer.vocab
print(
    "dict size",
    len(vocab),
)


dict size 21128
token               index          
-------------------------
3100                12100
##祎                 17916
额                    7583
##ino                9846
旅                    3180
##彥                 15560
铲                    7211
夾                    1933
##躪                 19772
##殞                 16717
