In [1]:
import pandas as pd
import torch

train_data = pd.read_csv('./train_data_public.csv')
train_data.drop('class', axis=1, inplace=True)
test_data = pd.read_csv('./test_public.csv')

train_data['BIO_anno'] = train_data['BIO_anno'].apply(lambda x: x.split(' '))
train_data['training_data_text'] = train_data.apply(lambda row: list(row['text']), axis=1)
test_data['testing_data_text'] = test_data.apply(lambda row: list(row['text']), axis=1)
train_data.head()

Unnamed: 0,id,text,BIO_anno,training_data_text
0,0,交行14年用过，半年准备提额，却直接被降到1Ｋ，半年期间只T过一次三千，其它全部真实消费，第...,"[B-BANK, I-BANK, O, O, O, O, O, O, O, O, O, O,...","[交, 行, 1, 4, 年, 用, 过, ，, 半, 年, 准, 备, 提, 额, ，, ..."
1,1,单标我有了，最近visa双标返现活动好,"[B-PRODUCT, I-PRODUCT, O, O, O, O, O, O, B-PRO...","[单, 标, 我, 有, 了, ，, 最, 近, v, i, s, a, 双, 标, 返, ..."
2,2,建设银行提额很慢的……,"[B-BANK, I-BANK, I-BANK, I-BANK, B-COMMENTS_N,...","[建, 设, 银, 行, 提, 额, 很, 慢, 的, …, …]"
3,3,我的怎么显示0.25费率，而且不管分多少期都一样费率，可惜只有69k,"[O, O, O, O, O, O, O, O, O, O, B-COMMENTS_N, I...","[我, 的, 怎, 么, 显, 示, 0, ., 2, 5, 费, 率, ，, 而, 且, ..."
4,4,利率不错，可以撸,"[B-COMMENTS_N, I-COMMENTS_N, B-COMMENTS_ADJ, I...","[利, 率, 不, 错, ，, 可, 以, 撸]"


In [2]:
training_data_text_list = []
testing_data_text_list = []
for i in range(len(train_data)):
    training_data_text_list.append(train_data.iloc[i]['training_data_text'])
for i in range(len(test_data)):
    testing_data_text_list.append(test_data.iloc[i]['testing_data_text'])

In [3]:
from transformers import BertTokenizer
from transformers import BertConfig

config = BertConfig.from_pretrained('bert-base-chinese')
config.num_classes = 9
config.clip_grad = 5
config.epoch_num = 10
config.min_epoch_num = 4
config.patience = 0.0002
config.patience_num = 10
config.learning_rate = 1e-5
config.batch_size = 16
config.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
access_token = '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'
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese', use_auth_token=access_token)
tokenizer

PreTrainedTokenizer(name_or_path='bert-base-chinese', vocab_size=21128, model_max_len=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [4]:
# 将每一句转成数字（大于510做截断，小于510做PADDING，加上首尾两个标识，长度总共等于512）
def convert_text_to_token(tokenizer, sentence, limit_size=510):
    tokens = tokenizer.encode(sentence[:limit_size])  # 直接截断
    # 补齐（pad的索引号就是0）
    if len(tokens) < limit_size + 2:
        tokens.extend([0] * (limit_size + 2 - len(tokens)))
    return tokens


def covert_anno_to_token(anno_list, limit_size=510):
    START_TAG, STOP_TAG = '[CLS]', '[SEP]'
    tag_to_ix = {"O": 1, "B-BANK": 2, "I-BANK": 3, "B-PRODUCT": 4, 'I-PRODUCT': 5,
                 'B-COMMENTS_N': 6, 'I-COMMENTS_N': 7, 'B-COMMENTS_ADJ': 8,
                 'I-COMMENTS_ADJ': 9, START_TAG: 10, STOP_TAG: 11}
    token_list = []
    if anno_list:
        token_list.append(tag_to_ix[START_TAG])
    else:
        return [0] * (limit_size + 2)
    anno_list = anno_list[:limit_size]
    for i in range(len(anno_list)):
        token_list.append(tag_to_ix[anno_list[i]])
    token_list.append(tag_to_ix[STOP_TAG])
    if len(token_list) < limit_size + 2:
        token_list.extend([0] * (limit_size + 2 - len(token_list)))
    return token_list


# 建立mask
def attention_masks(input_ids):
    atten_masks = []
    for seq in input_ids:
        # 如果有编码（>0）即为1, pad为0
        seq_mask = [float(x > 0) for x in seq]
        atten_masks.append(seq_mask)
    return atten_masks


# 对每个句子进行编码
input_ids = [convert_text_to_token(tokenizer, x, config.max_position_embeddings - 2) for x in training_data_text_list]
# 放到tensor中
input_tokens = torch.tensor(input_ids)
input_targets = [covert_anno_to_token(x, config.max_position_embeddings - 2) for x in train_data['BIO_anno']]
total_targets = torch.tensor(input_targets)
# 生成attention_masks
atten_masks = attention_masks(input_ids)
# 将atten_masks放到tensor中
attention_tokens = torch.tensor(atten_masks)
total_targets[0]

tensor([10,  2,  3,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  6,  7,  1,  1,  1,
         1,  1,  8,  9,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  6,  7,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  6,  7,  1,  1,  6,  7,  1,  1,  1,  1,  4,  5,  1,  1,  1,  1,  8,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, 11,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0, 

In [5]:
from sklearn.model_selection import train_test_split

# 使用random_state固定切分方式，切分 train_inputs, train_labels, train_masks,
train_inputs, test_inputs, train_labels, test_labels = train_test_split(input_tokens,
                                                                        total_targets, random_state=2021,
                                                                        test_size=0.2)
train_masks, test_masks, _, _ = train_test_split(attention_tokens, input_tokens, random_state=2021, test_size=0.2)
print(train_inputs.shape, test_inputs.shape)  # torch.Size([8000, 128]) torch.Size([2000, 128])
print(train_masks.shape, test_masks.shape)  # torch.Size([8000, 128])和train_inputs形状一样

torch.Size([6022, 512]) torch.Size([1506, 512])
torch.Size([6022, 512]) torch.Size([1506, 512])


In [6]:
from torch.utils.data import TensorDataset, RandomSampler, DataLoader, SequentialSampler

# 使用TensorDataset对tensor进行打包
BATCH_SIZE = 16
train_data = TensorDataset(train_inputs, train_masks, train_labels)
# 无放回地随机采样样本元素
train_sampler = RandomSampler(train_data)
# 对于训练集，random sampler, shuffle
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE)

test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=BATCH_SIZE)

In [7]:
train_inputs.shape

torch.Size([6022, 512])

In [8]:
def generate_label_starts(dataloader):
    batch_label_start = []
    for batch in dataloader:
        sentences = [x[0] for x in batch]
        end = []
        max_len = 0
        batch_size = batch[0].shape[0]
        for sentence in sentences:
            for i in range(len(sentence)):
                if sentence[i] == 102:
                    max_len = max(max_len, i - 1)
                    break
        end.extend([max_len] * batch_size)
        batch_label_start.append(end)
    return batch_label_start


train_label_starts = generate_label_starts(train_dataloader)
test_label_starts = generate_label_starts(test_dataloader)
train_batch_dataloader = []
idx = 0
for batch in train_dataloader:
    batch.append(torch.tensor(train_label_starts[idx]))
    idx += 1
    train_batch_dataloader.append(batch)
train_dataloader = DataLoader(train_batch_dataloader)

In [9]:
from torchcrf import CRF
from torch.nn.utils.rnn import pad_sequence
from transformers import BertModel
from transformers.models.bert.modeling_bert import *
from torch import nn


class Bert_BiLSTM_CRF(BertPreTrainedModel):
    def __init__(self, config, num_classes):
        super().__init__(config)
        self.bert = BertModel.from_pretrained('bert-base-chinese', config=config)
        self.lstm = nn.LSTM(input_size=config.hidden_size, hidden_size=config.hidden_size // 2, num_layers=2,
                            bidirectional=True, batch_first=True)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.num_classes = num_classes
        self.classifier = nn.Linear(config.hidden_size, self.num_classes)
        self.crf = CRF(num_tags=self.num_classes, batch_first=True)
        self.init_weights()

    def forward(self, input_ids, input_tokens_start, attention_mask, labels):
        # input_ids (batch_size, seq_len)
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # sequence_output (batch_size, seq_len, hidden_size)
        sequence_output = outputs[0]
        origin_sequence_output = sequence_output[:, 1:input_tokens_start[0]]
        # origin_sequence_output (batch_size, max_len, hidden_size)
        padded_sequence_output = pad_sequence(origin_sequence_output, batch_first=True)
        padded_sequence_output = self.dropout(padded_sequence_output)
        lstm_outputs, _ = self.lstm(padded_sequence_output)
        # lstm_outputs (batch_size, max_len, 2 * hidden_size)
        logits = self.classifier(lstm_outputs)
        # logits (batch_size, max_len, num_classes)
        outputs = (logits,)
        if labels is not None:
            loss_mask = labels.gt(0)
            loss = -self.crf(logits, labels, mask=loss_mask)
            outputs = (loss,) + outputs
        return outputs

In [20]:
from tqdm import tqdm


def train_epoch(train_loader, model, optimizer, scheduler, epoch):
    # set model to training mode
    model.train()
    # step number in one epoch: 336
    train_losses = 0
    for _, batch_samples in enumerate(tqdm(train_loader)):
        batch_input, batch_masks, batch_labels, batch_label_start = batch_samples
        batch_input, batch_masks, batch_labels, batch_label_start = batch_input.squeeze(), batch_masks.squeeze(), batch_labels.squeeze(), batch_label_start.squeeze()
        print(batch_input.shape, batch_labels.shape, batch_label_start.shape)
        # compute model output and loss
        loss = \
            model(input_ids=batch_input, attention_mask=batch_masks, labels=batch_labels,
                  input_tokens_start=batch_label_start)[0]
        train_losses += loss.item()
        # clear previous gradients, compute gradients of all variables wrt loss
        model.zero_grad()
        loss.backward()
        # gradient clipping
        nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=config.clip_grad)
        # performs updates using calculated gradients
        optimizer.step()
        scheduler.step()
    train_loss = float(train_losses) / len(train_loader)
    print("Epoch: {}, train loss: {}".format(epoch, train_loss))


def train(train_loader, model, optimizer, scheduler):
    """train the model and test model performance"""
    # reload weights from restore_dir if specified
    num_classes = config.num_classes
    model.to(config.device)
    best_val_f1 = 0.0
    patience_counter = 0
    # start training
    for epoch in range(1, config.epoch_num + 1):
        train_epoch(train_loader, model, optimizer, scheduler, epoch)
        # val_metrics = evaluate(dev_loader, model, mode='dev')
        # val_f1 = val_metrics['f1']
        # print("Epoch: {}, dev loss: {}, f1 score: {}".format(epoch, val_metrics['loss'], val_f1))
        # improve_f1 = val_f1 - best_val_f1
        # if improve_f1 > 1e-5:
        #     best_val_f1 = val_f1
        #     if improve_f1 < config.patience:
        #         patience_counter += 1
        #     else:
        #         patience_counter = 0
        # else:
        #     patience_counter += 1
        # # Early stopping and logging best f1
        # if (patience_counter >= config.patience_num and epoch > config.min_epoch_num) or epoch == config.epoch_num:
        #     print("Best val f1: {}".format(best_val_f1))
        #     break
    print("Training Finished!")


In [22]:
from transformers import get_linear_schedule_with_warmup
from torch.optim import AdamW

model = Bert_BiLSTM_CRF(config=config, num_classes=config.num_classes)
optimizer = AdamW(model.parameters(), lr=config.learning_rate)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0,
                                            num_training_steps=config.epoch_num * len(train_dataloader))

train_epoch(train_dataloader, model, optimizer, scheduler, 0)

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  0%|          | 0/377 [00:00<?, ?it/s]

torch.Size([16, 512]) torch.Size([16, 512]) torch.Size([16])


  0%|          | 0/377 [00:11<?, ?it/s]


KeyboardInterrupt: 