In [1]:
import pandas as pd
import torch

# Load the public training and test splits.
DATA_DIR = '../data_and_baseline'
train_dataframe = pd.read_csv(f'{DATA_DIR}/train_data_public.csv')
test_dataframe = pd.read_csv(f'{DATA_DIR}/test_public.csv')

# Split the sentence-level fields into character-level sequences.
# Only a single column is needed for each result, so the work is done
# column-wise on the Series instead of row-wise with DataFrame.apply(axis=1),
# which builds a full row object for every sample.
train_dataframe['BIO_anno'] = train_dataframe['BIO_anno'].apply(lambda anno: anno.split(' '))  # label
train_dataframe['training_data_text'] = train_dataframe['text'].apply(list)
test_dataframe['testing_data_text'] = test_dataframe['text'].apply(list)
test_dataframe.head()

Unnamed: 0,id,text,testing_data_text
0,0,共享一个额度，没啥必要，四个卡不要年费吗？你这种人头，银行最喜欢，广发是出了名的风控严，套现...,"[共, 享, 一, 个, 额, 度, ，, 没, 啥, 必, 要, ，, 四, 个, 卡, ..."
1,1,炸了，就2000.浦发没那么好心，草,"[炸, 了, ，, 就, 2, 0, 0, 0, ., 浦, 发, 没, 那, 么, 好, ..."
2,2,挂了电话自己打过去分期提额可以少分一点的,"[挂, 了, 电, 话, 自, 己, 打, 过, 去, 分, 期, 提, 额, 可, 以, ..."
3,3,比如你首卡10k，二卡也10k，信报上显示邮政总共给你的授信额度是20k,"[比, 如, 你, 首, 卡, 1, 0, k, ，, 二, 卡, 也, 1, 0, k, ..."
4,4,3000吗，浦发总是这样,"[3, 0, 0, 0, 吗, ，, 浦, 发, 总, 是, 这, 样]"


In [2]:
# Collect the per-sample character lists as plain Python lists.
# Series.tolist() returns the same list objects that row-by-row
# `.iloc[i][column]` indexing produced, without materialising a row Series
# for every sample.
training_data_text_list = train_dataframe['training_data_text'].tolist()
testing_data_text_list = test_dataframe['testing_data_text'].tolist()

In [3]:
import os

from transformers import BertTokenizer

# Credentials must not live in the notebook source: the access token is read
# from the HF_TOKEN environment variable. When the variable is unset, None is
# passed and no explicit token is sent.
# NOTE(review): a literal token was previously committed on this line; it
# should be treated as leaked and revoked.
access_token = os.environ.get('HF_TOKEN')
tokenizer = BertTokenizer.from_pretrained('hfl/chinese-roberta-wwm-ext', use_auth_token=access_token)
tokenizer

PreTrainedTokenizer(name_or_path='hfl/chinese-roberta-wwm-ext', vocab_size=21128, model_max_len=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [4]:
from torch.utils.data import TensorDataset, RandomSampler, DataLoader, SequentialSampler
from sklearn.model_selection import KFold


class SA_dataloader(DataLoader):
    """DataLoader with a simple shuffled-vs-sequential switch.

    The batch size is read from the notebook-level ``sa_config.batch_size``
    at construction time.

    Args:
        dataset: the TensorDataset to iterate over.
        random: truthy -> samples are drawn with a RandomSampler;
            falsy -> samples are visited in order with a SequentialSampler.
    """

    def __init__(self, dataset: TensorDataset, random: bool):
        # Choose the sampler class first, then instantiate it once.
        sampler_cls = RandomSampler if random else SequentialSampler
        self.sampler = sampler_cls(dataset)
        super().__init__(dataset, sa_config.batch_size, sampler=self.sampler)

In [5]:
from torch.utils.data import TensorDataset
from SA_config import *


# Turn one sentence into a fixed-length list of token ids: the text is cut to
# `limit_size` characters, encoded by the tokenizer, and right-padded with id 0
# up to `limit_size + 2` (510 + the two boundary markers = 512 by default).
def convert_text_to_token(tokenizer, sentence, limit_size=510):
    target_length = limit_size + 2
    token_ids = tokenizer.encode(sentence[:limit_size])  # plain slice truncation
    # Padding uses index 0; nothing is appended when the encoding is already
    # at (or beyond) the target length.
    shortfall = target_length - len(token_ids)
    if shortfall > 0:
        token_ids.extend([0] * shortfall)
    return token_ids


# Build the attention mask.
def attention_masks(input_tokens):
    """Return a boolean mask that is True wherever a token id is greater than 0.

    Padding is written as id 0 by ``convert_text_to_token``, so ``id > 0``
    marks the encoded (non-padding) positions.

    Args:
        input_tokens: batch of token-id sequences, either a tensor or a
            nested sequence of numbers.

    Returns:
        A ``torch.bool`` tensor with the same shape as ``input_tokens``.
        For tensor input the mask lives on the same device as the input.
    """
    if torch.is_tensor(input_tokens):
        # One vectorised comparison instead of a Python loop over every
        # element of the tensor.
        return input_tokens > 0
    # Generic path for plain nested sequences.
    return torch.tensor([[bool(x > 0) for x in seq] for seq in input_tokens], dtype=torch.bool)


# Encode every sentence into a row of token ids and stack the rows.
def input_tokens(tokenizer, data_text_list):
    """Encode each sentence of ``data_text_list`` and return the ids as one tensor.

    Each sentence is passed to ``convert_text_to_token`` with a character
    budget of ``sa_config.max_position_embeddings - 2``.
    """
    char_budget = sa_config.max_position_embeddings - 2
    encoded_rows = []
    for sentence in data_text_list:
        encoded_rows.append(convert_text_to_token(tokenizer, sentence, char_budget))
    return torch.tensor(encoded_rows)


# Training inputs: token ids, attention masks and class labels.
train_data_sentence_list = list(train_dataframe['text'].values)
train_input_tokens = input_tokens(tokenizer, train_data_sentence_list)
train_attention_tokens = attention_masks(train_input_tokens)
train_data_class = torch.tensor(train_dataframe['class'].values)

# Test inputs: token ids and attention masks only, served in order
# (sequential sampler) so predictions line up with the rows of the test frame.
test_data_sentence_list = list(test_dataframe['text'].values)
test_input_tokens = input_tokens(tokenizer, test_data_sentence_list)
test_attention_tokens = attention_masks(test_input_tokens)
test_dataset = TensorDataset(test_input_tokens, test_attention_tokens)
test_dataloader = SA_dataloader(test_dataset, False)

In [6]:
from torch import nn
from transformers import BertModel, BertPreTrainedModel


class Roberta_FocalLoss(BertPreTrainedModel):
    """Sentence classifier: pretrained encoder -> self-attention -> BiGRU -> linear head.

    Despite the class name, the training loss is a plain ``nn.CrossEntropyLoss``.

    Args:
        config: configuration object; this class reads ``hidden_size``,
            ``num_heads`` and ``num_classes`` from it and also hands it to
            the pretrained encoder.
    """

    def __init__(self, config):
        super().__init__(config)
        self.roberta = BertModel.from_pretrained('hfl/chinese-roberta-wwm-ext', config=config)
        self.attention = nn.MultiheadAttention(embed_dim=config.hidden_size,
                                               num_heads=config.num_heads, batch_first=True)
        # batch_first=True is required: the GRU is fed (batch, seq, hidden)
        # tensors. Without it the GRU would treat the batch axis as time and
        # run its recurrence across different samples.
        self.GRU = nn.GRU(input_size=config.hidden_size, hidden_size=config.hidden_size // 2,
                          bidirectional=True, batch_first=True)
        # Input is the BiGRU state (hidden_size) concatenated with the pooled
        # encoder output (hidden_size).
        self.classifier = nn.Linear(in_features=config.hidden_size * 2, out_features=config.num_classes)
        self.softmax = nn.Softmax(dim=1)
        self.loss_function = nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask, classes=None):
        """Run the model.

        Args:
            input_ids: token ids, one row per sample.
            attention_mask: same shape as ``input_ids``; non-zero / True marks
                real tokens, zero / False marks padding.
            classes: optional class indices, one per sample (any shape that
                flattens to the batch size).

        Returns:
            ``(loss,)`` when ``classes`` is given, otherwise ``(probabilities,)``
            where ``probabilities`` is the softmax over the class logits.
        """
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output, pooler_output = outputs[0], outputs[1]
        # Keep padding positions from being used as attention keys.
        padding_positions = attention_mask == 0
        attention_output, _ = self.attention(sequence_output, sequence_output, sequence_output,
                                             key_padding_mask=padding_positions)
        gru_output, _ = self.GRU(attention_output)
        concat_output = torch.cat((gru_output[:, -1, :], pooler_output), dim=1)
        logits = self.classifier(concat_output)
        if classes is not None:
            # CrossEntropyLoss applies log-softmax itself, so it must receive
            # the raw logits. Class-index targets also work for any batch
            # size, including a smaller final batch.
            loss = self.loss_function(logits, classes.reshape(-1).long())
            return (loss,)
        return (self.softmax(logits),)

In [7]:
from torch import nn
from tqdm import tqdm


def SA_train_epoch(train_loader, model, optimizer, scheduler, epoch, config):
    """Train ``model`` for one epoch and print the mean batch loss.

    Args:
        train_loader: iterable of ``(input_ids, attention_mask, classes)`` batches
            that also supports ``len()``.
        model: called as ``model(input_ids=..., attention_mask=..., classes=...)``;
            element 0 of its return value is the loss.
        optimizer: optimizer stepped once per batch.
        scheduler: learning-rate scheduler stepped once per batch (the notebook
            sizes it with ``epoch_num * len(train_dataloader)`` training steps).
        epoch: epoch number, used only in the printed message.
        config: provides ``device`` and ``clip_grad``.
    """
    # set model to training mode
    model.train()
    train_losses = 0.0
    for batch_input, batch_masks, batch_classes in tqdm(train_loader):
        batch_input = batch_input.to(config.device)
        batch_masks = batch_masks.to(config.device)
        batch_classes = batch_classes.to(config.device)
        # compute model output and loss
        loss = model(input_ids=batch_input, attention_mask=batch_masks, classes=batch_classes)[0]
        train_losses += loss.item()
        # clear previous gradients, compute gradients of all variables wrt loss
        model.zero_grad()
        loss.backward()
        # NOTE: an FGM adversarial pass (attack -> second backward -> restore)
        # used to be sketched here. With the attack disabled, the second
        # forward/backward only duplicated the clean gradients, so it is
        # omitted until FGM is actually wired in.
        # gradient clipping
        nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=config.clip_grad)
        # Update weights and learning rate for every batch. Stepping only
        # after the loop would apply a single update per epoch, computed from
        # the last batch alone.
        optimizer.step()
        scheduler.step()
    # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
    train_loss = train_losses / max(len(train_loader), 1)
    print("Epoch: {}, train loss: {}".format(epoch, train_loss))


def SA_train(train_loader, eval_dataloader, model, optimizer, scheduler, config):
    """Train the model with per-epoch validation and early stopping.

    Args:
        train_loader: training batches, forwarded to ``SA_train_epoch``.
        eval_dataloader: validation batches, forwarded to ``evaluate``.
        model: the model to train; moved to ``config.device`` first.
        optimizer, scheduler: forwarded to ``SA_train_epoch``.
        config: provides ``device``, ``epoch_num``, ``patience``,
            ``patience_num`` and ``min_epoch_num``.

    Returns:
        The best validation score observed.
    """
    model.to(config.device)
    best_val_f1 = 0.0
    patience_counter = 0
    # start training
    for epoch in range(1, config.epoch_num + 1):
        SA_train_epoch(train_loader, model, optimizer, scheduler, epoch, config)
        val_metrics = evaluate(model, eval_dataloader)
        # evaluate() in this notebook returns a bare score, not a metrics
        # dict; accept both so that indexing with 'f1' cannot crash the run.
        if isinstance(val_metrics, dict):
            val_f1 = float(val_metrics['f1'])
        else:
            val_f1 = float(val_metrics)
        print("Epoch: {}, f1 score: {}".format(epoch, val_f1))
        improve_f1 = val_f1 - best_val_f1
        if improve_f1 > 1e-5:
            best_val_f1 = val_f1
            # An improvement smaller than config.patience still counts
            # towards early stopping.
            if improve_f1 < config.patience:
                patience_counter += 1
            else:
                patience_counter = 0
        else:
            patience_counter += 1
        # Early stopping and logging best f1
        if (patience_counter >= config.patience_num and epoch > config.min_epoch_num) or epoch == config.epoch_num:
            print("Best val f1: {}".format(best_val_f1))
            break
    print("Training Finished!")
    return best_val_f1

In [8]:
import numpy as np


# Accuracy (not F1) of arg-max predictions against the labels of one batch.
def class_acc(preds, labels):
    """Return the fraction of rows whose arg-max over dim 1 equals the label.

    Args:
        preds: 2-D tensor of per-class scores, one row per sample.
        labels: tensor of class indices; it is flattened before comparison,
            so a column vector is accepted as well.

    Returns:
        Accuracy as a Python float; 0.0 for an empty batch.
    """
    flat_labels = labels.flatten()
    if flat_labels.numel() == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0
    correct = torch.eq(preds.argmax(dim=1), flat_labels)
    return correct.sum().item() / correct.numel()


def predict(model, inputs, masks):
    """Predict a class index for every sample of one batch.

    Args:
        model: called as ``model(input_ids=..., attention_mask=..., classes=None)``;
            element 0 of its return value holds the per-class scores.
        inputs: token ids of the batch.
        masks: attention mask of the batch.

    Returns:
        Tensor with the arg-max class index of every row, on ``sa_config.device``.
    """
    model.eval()
    inputs, masks = inputs.to(sa_config.device), masks.to(sa_config.device)
    # Inference only: disable autograd so no graph is built or kept alive.
    with torch.no_grad():
        batch_output = model(input_ids=inputs, attention_mask=masks, classes=None)[0]
    return torch.max(batch_output, dim=1).indices


def evaluate(model, dataloader):
    """Compute the accuracy of ``model`` over a labelled dataloader.

    Every batch contributes in proportion to its number of samples, so a
    smaller final batch does not skew the result the way a plain mean of
    per-batch accuracies does.

    Args:
        model: called as ``model(input_ids, attention_mask=..., classes=None)``;
            element 0 of its return value holds the per-class scores.
        dataloader: iterable of ``(input_ids, attention_mask, classes)`` batches.

    Returns:
        Accuracy over all samples as a float; 0.0 when there are no samples.
    """
    # switch to evaluation mode
    model.eval()
    device = sa_config.device
    weighted_correct = 0.0
    sample_count = 0
    with torch.no_grad():
        for batch in tqdm(dataloader):
            # take the tensors from the batch and move them to the device
            b_input_ids = batch[0].long().to(device)
            b_input_mask = batch[1].long().to(device)
            b_classes = batch[2].long().to(device)
            # forward pass
            output = model(b_input_ids, attention_mask=b_input_mask, classes=None)[0]
            # accumulate this batch's accuracy, weighted by its size
            batch_samples = b_classes.numel()
            weighted_correct += class_acc(output, b_classes) * batch_samples
            sample_count += batch_samples
    avg_acc = weighted_correct / sample_count if sample_count else 0.0
    print("平均acc:{}".format(avg_acc))
    return avg_acc

In [9]:
from transformers import get_linear_schedule_with_warmup
from torch.optim import AdamW
from torch import nn
from sklearn.model_selection import StratifiedKFold
from SA_config import *

# 5-fold stratified cross-validation.
# Each fold gets a freshly initialised model and optimizer. Reusing a single
# model across folds would let every later fold validate on samples the model
# has already been trained on, which inflates the validation score.
# After the loop, `model` is the model trained on the last fold.
SA_kfold = StratifiedKFold(n_splits=5).split(train_input_tokens, train_data_class)
model = optimizer = scheduler = None
for train_index, eval_index in SA_kfold:
    train_data = TensorDataset(train_input_tokens[train_index], train_attention_tokens[train_index],
                               train_data_class[train_index])
    eval_data = TensorDataset(train_input_tokens[eval_index], train_attention_tokens[eval_index],
                              train_data_class[eval_index])
    train_dataloader = SA_dataloader(train_data, True)
    eval_dataloader = SA_dataloader(eval_data, False)
    # Drop the references to the previous fold's objects before allocating
    # new ones, so two models do not have to be held at the same time.
    model = optimizer = scheduler = None
    model = Roberta_FocalLoss(config=sa_config)
    model.to(sa_config.device)
    optimizer = AdamW(model.parameters(), lr=sa_config.learning_rate)
    # The scheduler is stepped once per batch, hence epochs * batches steps.
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0,
                                                num_training_steps=sa_config.epoch_num * len(train_dataloader))
    SA_train(train_dataloader, eval_dataloader, model, optimizer, scheduler, sa_config)

Some weights of the model checkpoint at hfl/chinese-roberta-wwm-ext were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
 19%|█▉        | 1155/6022 [03:07<13:08,  6.17it/s]


KeyboardInterrupt: 

In [None]:
# Predict a class for every test sentence. The test dataloader is sequential,
# so predictions come back in the same order as the rows of `test_dataframe`.
predicted_classes = []
for tokens, masks in tqdm(test_dataloader):
    # predict() returns a tensor of class indices on the model's device;
    # store plain Python ints rather than 0-dim tensors in the frame.
    predicted_classes.extend(predict(model, tokens, masks).cpu().tolist())
# Build the frame once instead of assigning cell by cell with .loc.
result = pd.DataFrame({'id': test_dataframe['id'].values, 'class': predicted_classes})
result