In [1]:
!pip install transformers



由于class类别严重不均衡，先进行数据增强

In [None]:
import pandas as pd
import torch
import sys
from SA_config import *

# sys.path.append('../../EDA_NLP_for_Chinese/code')
# from augment import *

# 加载训练数据和测试数据
# train_dataframe = pd.read_csv('../data_and_baseline/train_data_public.csv', index_col='id')
# test_dataframe = pd.read_csv('../data_and_baseline/test_public.csv', index_col='id')
# train_dataframe.drop('BIO_anno', inplace=True, axis=1)
# # 数据增强
# gen_eda(train_dataframe=train_dataframe, alpha=0.1, num_aug=7)
# train_dataframe.to_csv('../data_and_baseline/augmented_train_data.csv')

In [None]:
# Load the augmented training set and the public test set; the 'id' column becomes the index of each frame.
train_dataframe = pd.read_csv('../data_and_baseline/augmented_train_data.csv', index_col='id')
test_dataframe = pd.read_csv('../data_and_baseline/test_public.csv', index_col='id')

In [None]:
import os

from transformers import BertTokenizer

# SECURITY: never commit access tokens to a notebook. The token is read from the HF_TOKEN
# environment variable instead; when the variable is unset, None is passed, which is the
# same as not supplying a token at all.
# NOTE(review): a literal token used to be stored on this line — treat it as leaked and revoke it.
access_token = os.environ.get('HF_TOKEN')
tokenizer = BertTokenizer.from_pretrained('hfl/chinese-roberta-wwm-ext', use_auth_token=access_token)
tokenizer

In [None]:
from torch import nn


class LabelSmoothing(nn.Module):
    """Negative log-likelihood loss with uniform label smoothing.

    The loss for one sample is
    ``(1 - smoothing) * NLL(true class) + smoothing * mean over classes of NLL``,
    and the returned value is the mean of that quantity over the batch.
    """

    def __init__(self, smoothing=0.1):
        """Store the smoothing factor (epsilon) and its complement.

        :param smoothing: label smoothing factor
        """
        super().__init__()
        self.smoothing = smoothing
        self.confidence = 1.0 - smoothing

    def forward(self, x, target):
        """Return the batch-averaged smoothed loss.

        :param x: raw scores; log-softmax is taken over the last dimension
        :param target: class indices, one per row of ``x``
        """
        log_probs = torch.log_softmax(x, dim=-1)
        # Log-probability of the labelled class for every row, negated.
        picked = log_probs.gather(dim=-1, index=target.unsqueeze(1)).squeeze(1)
        true_class_term = -picked
        # Average negative log-probability over all classes (the "uniform" part).
        uniform_term = -log_probs.mean(dim=-1)
        per_sample = self.confidence * true_class_term + self.smoothing * uniform_term
        return per_sample.mean()

In [None]:
from torch.utils.data import TensorDataset, RandomSampler, DataLoader, SequentialSampler
from sklearn.model_selection import KFold
from SA_config import *


class SA_dataloader(DataLoader):
    """DataLoader that uses the batch size from ``sa_config`` and a random or sequential sampler."""

    def __init__(self, dataset: TensorDataset, random: bool):
        """Wrap ``dataset``.

        :param dataset: dataset to iterate over
        :param random: True to draw samples with a RandomSampler, False for a SequentialSampler
        """
        sampler_type = RandomSampler if random else SequentialSampler
        chosen_sampler = sampler_type(dataset)
        # The parent constructor stores the sampler on ``self.sampler``.
        super().__init__(dataset, sa_config.batch_size, sampler=chosen_sampler)

In [None]:
from torch.utils.data import TensorDataset


def convert_text_to_token(tokenizer, sentence, limit_size=510):
    """Encode one sentence into exactly ``limit_size + 2`` token ids.

    The sentence is first cut to ``limit_size`` characters and encoded with
    ``tokenizer.encode``. The result is then forced to a fixed length:

    * shorter sequences are right-padded with 0 (the id that ``attention_masks``
      treats as padding);
    * longer sequences (a tokenizer may emit more ids than there are characters)
      are cut at token level, keeping the final id so a trailing end marker
      produced by the tokenizer is preserved.

    :param tokenizer: object exposing ``encode(text) -> list of ids``
    :param sentence: text to encode
    :param limit_size: maximum number of characters kept; the output length is ``limit_size + 2``
    :return: list of ``limit_size + 2`` ids
    """
    target_len = limit_size + 2
    tokens = list(tokenizer.encode(sentence[:limit_size]))  # character-level pre-truncation
    if len(tokens) > target_len:
        # Character truncation does not bound the token count; without this cut the
        # rows would have different lengths and could not be stacked into one tensor.
        tokens = tokens[:target_len - 1] + tokens[-1:]
    elif len(tokens) < target_len:
        tokens.extend([0] * (target_len - len(tokens)))
    return tokens


def attention_masks(input_tokens):
    """Build a boolean attention mask: True where a token id is > 0, False for padding (id 0).

    The comparison is done in one vectorised operation instead of looping over
    every id in Python, which is very slow for tensor inputs.

    :param input_tokens: tensor (or nested sequence) of token ids
    :return: ``torch.bool`` tensor with the same shape as ``input_tokens``
    """
    if not torch.is_tensor(input_tokens):
        rows = list(input_tokens)
        if rows and all(torch.is_tensor(row) for row in rows):
            # A list of per-sentence tensors: stack them into one 2-D tensor.
            input_tokens = torch.stack(rows)
        else:
            input_tokens = torch.as_tensor(rows)
    return input_tokens > 0


def input_tokens(tokenizer, data_text_list):
    """Encode every sentence to a fixed-length id list and stack the rows into one tensor.

    The per-sentence limit passed to ``convert_text_to_token`` is
    ``sa_config.max_position_embeddings - 2``.

    :param tokenizer: tokenizer forwarded to ``convert_text_to_token``
    :param data_text_list: iterable of sentences
    :return: tensor built from the encoded rows
    """
    encoded_rows = []
    for sentence in data_text_list:
        row = convert_text_to_token(tokenizer, sentence, sa_config.max_position_embeddings - 2)
        encoded_rows.append(row)
    return torch.tensor(encoded_rows)


# Raw inputs taken from the loaded frames: sentence lists and the training labels.
train_data_sentence_list = list(train_dataframe['text'].values)
test_data_sentence_list = list(test_dataframe['text'].values)
train_data_class = torch.tensor(train_dataframe['class'].values)

# Token ids for both splits (test first, then train).
test_input_tokens = input_tokens(tokenizer, test_data_sentence_list)
train_input_tokens = input_tokens(tokenizer, train_data_sentence_list)

# Attention masks derived from the ids (padding positions are False).
train_attention_tokens = attention_masks(train_input_tokens)
test_attention_tokens = attention_masks(test_input_tokens)

# The test loader iterates sequentially so predictions stay aligned with the test frame.
test_dataset = TensorDataset(test_input_tokens, test_attention_tokens)
test_dataloader = SA_dataloader(test_dataset, random=False)

In [None]:
import torch
import torch.nn.functional as F
from torch.autograd import Variable


class FocalLoss(nn.Module):
    r"""
        This criterion is an implementation of Focal Loss, which is proposed in
        Focal Loss for Dense Object Detection.

            Loss(x, class) = - \alpha[class] (1-softmax(x)[class])^gamma \log(softmax(x)[class])

        ``forward`` expects raw (un-normalised) scores of shape (N, C); softmax is
        applied internally over dim 1.

        Args:
            class_num(int): number of classes; only used to build the default ``alpha``.
            alpha(Tensor or sequence, optional): per-class weight. Any shape is accepted and
                flattened to one weight per class. Defaults to all ones.
            gamma(float, double) : gamma > 0; reduces the relative loss for well-classified examples (p > .5),
                                   putting more focus on hard, misclassified examples
            size_average(bool): By default, the losses are averaged over observations for each minibatch.
                                However, if the field size_average is set to False, the losses are
                                instead summed for each minibatch.
    """

    def __init__(self, class_num=3, alpha=None, gamma=2, size_average=True):
        super(FocalLoss, self).__init__()
        if alpha is None:
            alpha = torch.ones(class_num, 1)
        # Accept plain lists/tuples as well as tensors; weights must be floating point.
        alpha = torch.as_tensor(alpha)
        if not alpha.is_floating_point():
            alpha = alpha.float()
        self.alpha = alpha.detach()
        self.gamma = gamma
        self.class_num = class_num
        self.size_average = size_average

    def forward(self, inputs, targets):
        """Return the focal loss of ``inputs`` (N, C raw scores) against ``targets`` (N class ids)."""
        targets = targets.reshape(-1).long()

        # log_softmax is numerically stable; log(softmax(x)) yields -inf when a
        # probability underflows to 0.
        log_probs = F.log_softmax(inputs, dim=1)
        log_p = log_probs.gather(1, targets.unsqueeze(1)).squeeze(1)
        probs = log_p.exp()

        # Keep the class weights on the same device as the scores (CPU, CUDA, ...).
        if self.alpha.device != inputs.device:
            self.alpha = self.alpha.to(inputs.device)
        # Flatten before indexing so every sample gets exactly one weight of shape (N,).
        # Indexing an (N,)-shaped weight against (N, 1) probabilities would silently
        # broadcast to an (N, N) loss matrix.
        alpha = self.alpha.reshape(-1)[targets]

        batch_loss = -alpha * torch.pow(1 - probs, self.gamma) * log_p

        if self.size_average:
            return batch_loss.mean()
        return batch_loss.sum()

In [None]:
## Adversarial training
class FGM:
    """Fast Gradient Method helper: perturbs embedding weights along their gradient.

    Usage per training step: run ``backward()`` on the clean loss, call ``attack()``,
    run forward/backward again on the perturbed model, then call ``restore()``
    before the optimizer step.
    """

    def __init__(self, model):
        self.model = model
        # name -> copy of the unperturbed weights, filled by attack() and cleared by restore()
        self.backup = {}

    def attack(self, epsilon=1., emb_name='embeddings.word_embeddings.weight'):
        """Add ``epsilon * grad / ||grad||`` to every trainable parameter whose name contains ``emb_name``.

        The default matches the word-embedding matrix by suffix, so it works regardless of
        the attribute name under which the encoder is stored on the model.
        """
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name:
                self.backup[name] = param.data.clone()
                if param.grad is None:
                    # No gradient has been computed for this parameter; nothing to perturb.
                    continue
                norm = torch.norm(param.grad)
                if norm != 0 and not torch.isnan(norm):
                    r_at = epsilon * param.grad / norm
                    param.data.add_(r_at)

    def restore(self, emb_name='embeddings.word_embeddings.weight'):
        """Put back the weights saved by the last ``attack()`` call."""
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name:
                assert name in self.backup
                param.data = self.backup[name]
        # Must stay a dict: attack() stores entries by parameter name.
        self.backup = {}

In [None]:
from transformers import BertModel, BertPreTrainedModel


class Roberta_FocalLoss(BertPreTrainedModel):
    """RoBERTa encoder + self-attention + bidirectional GRU classifier trained with focal loss.

    ``forward`` returns a 1-tuple: ``(loss,)`` when ``classes`` is given, otherwise
    ``(probabilities,)`` with softmax applied over the class dimension.
    """

    def __init__(self, config):
        super().__init__(config)
        self.roberta = BertModel.from_pretrained('hfl/chinese-roberta-wwm-ext', config=config)
        self.attention = nn.MultiheadAttention(embed_dim=config.hidden_size,
                                               num_heads=config.num_heads, batch_first=True)
        # batch_first=True: the encoder output is (batch, seq, hidden). Without it the GRU
        # would treat the batch axis as time and run its recurrence across different samples.
        self.GRU = nn.GRU(input_size=config.hidden_size, hidden_size=config.hidden_size // 2,
                          bidirectional=True, batch_first=True)
        self.classifier = nn.Linear(in_features=config.hidden_size * 2, out_features=config.num_classes)
        self.softmax = nn.Softmax(dim=1)
        self.loss_function = FocalLoss(alpha=config.alpha, class_num=config.num_classes, gamma=5)
        # self.loss_function = nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask, classes=None):
        """Run the model.

        :param input_ids: token ids, (batch, seq)
        :param attention_mask: non-zero / True for real tokens, 0 / False for padding
        :param classes: optional target class ids; when given the loss is returned
        :return: ``(loss,)`` if ``classes`` is not None, else ``(class probabilities,)``
        """
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output, pooler_output = outputs[0], outputs[1]

        # Keep the extra self-attention layer from attending to padding positions.
        key_padding_mask = None
        if attention_mask is not None:
            key_padding_mask = ~attention_mask.bool()
        attention_output, _ = self.attention(sequence_output, sequence_output, sequence_output,
                                             key_padding_mask=key_padding_mask)

        gru_output, _ = self.GRU(attention_output)
        concat_output = torch.cat((gru_output[:, -1, :], pooler_output), dim=1)
        logits = self.classifier(concat_output)

        if classes is not None:
            # The loss applies softmax itself, so it must receive the raw logits;
            # feeding probabilities would apply softmax twice.
            return (self.loss_function(logits, classes),)
        return (self.softmax(logits),)

In [None]:
from torch import nn
from tqdm import tqdm


def SA_train_epoch(train_loader, model, optimizer, scheduler, epoch, config):
    """Train ``model`` for one epoch with FGM adversarial training and print the mean loss.

    For every batch: backward on the clean loss, perturb via ``FGM.attack()``, backward
    on the adversarial loss (gradients accumulate), restore the weights, clip gradients
    to ``config.clip_grad``, then step the optimizer and the scheduler.
    """
    model.train()
    adversary = FGM(model)
    loss_total = 0
    for batch in tqdm(train_loader):
        # Each batch holds (token ids, attention masks, class labels).
        batch_input, batch_masks, batch_classes = (item.to(config.device) for item in batch)
        forward_kwargs = dict(input_ids=batch_input, attention_mask=batch_masks, classes=batch_classes)

        # Clean pass.
        clean_loss = model(**forward_kwargs)[0]
        loss_total += clean_loss.item()
        model.zero_grad()
        clean_loss.backward()

        # Adversarial pass on perturbed weights; its gradients add to the clean ones.
        adversary.attack()
        adversarial_loss = model(**forward_kwargs)[0]
        adversarial_loss.backward()
        adversary.restore()

        nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=config.clip_grad)
        optimizer.step()
        scheduler.step()

    mean_loss = float(loss_total) / len(train_loader)
    print("Epoch: {}, train loss: {}".format(epoch, mean_loss))


def SA_train(train_loader, eval_dataloader, model, optimizer, scheduler, config):
    """Train for up to ``config.epoch_num`` epochs, evaluating after each one, with early stopping.

    The patience counter is reset only when validation accuracy improves by more than
    1e-5 and by at least ``config.patience``; otherwise it is incremented. Training stops
    when the counter reaches ``config.patience_num`` after ``config.min_epoch_num`` epochs,
    or at the last epoch.
    """
    best_val_acc = 0.0
    stalled_epochs = 0
    final_epoch = config.epoch_num
    for epoch in range(1, final_epoch + 1):
        SA_train_epoch(train_loader, model, optimizer, scheduler, epoch, config)
        val_acc = evaluate(model, eval_dataloader)
        print("Epoch: {}, acc score: {}".format(epoch, val_acc))

        gain = val_acc - best_val_acc
        improved = gain > 1e-5
        if improved:
            best_val_acc = val_acc
        # Only a large-enough improvement resets the counter.
        if improved and not gain < config.patience:
            stalled_epochs = 0
        else:
            stalled_epochs += 1

        out_of_patience = stalled_epochs >= config.patience_num and epoch > config.min_epoch_num
        if out_of_patience or epoch == final_epoch:
            print("Best val acc: {}".format(best_val_acc))
            break
    print("Training Finished!")

In [None]:
import numpy as np


# Accuracy of one batch of predictions.
def class_acc(preds, labels):
    """Return the fraction of rows whose highest-scoring class equals the label.

    :param preds: 2-D tensor of per-class scores, one row per sample
    :param labels: tensor of class ids; flattened before comparison
    :return: accuracy as a Python float
    """
    predicted = preds.argmax(dim=1)
    hits = (predicted == labels.flatten()).float()
    return hits.sum().item() / hits.shape[0]


def predict(model, inputs, masks):
    """Run ``model`` in eval mode on one batch and return the first element of its output.

    Inference runs under ``torch.no_grad()`` so no autograd graph is built; the
    returned tensor therefore does not require grad. It stays on ``sa_config.device``.

    :param model: model called as ``model(input_ids=..., attention_mask=..., classes=None)``
    :param inputs: token ids for the batch
    :param masks: attention masks for the batch
    """
    model.eval()
    inputs, masks = inputs.to(sa_config.device), masks.to(sa_config.device)
    with torch.no_grad():
        batch_output = model(input_ids=inputs, attention_mask=masks, classes=None)[0]
    return batch_output


def evaluate(model, dataloader):
    """Compute the accuracy of ``model`` over every sample yielded by ``dataloader``.

    Correct predictions are counted across all batches and divided by the total number
    of samples. Averaging per-batch accuracies instead would over-weight a smaller
    final batch.

    :param model: model called as ``model(ids, attention_mask=..., classes=None)``
    :param dataloader: yields batches of (token ids, attention masks, class labels)
    :return: accuracy as a float; NaN when the dataloader yields no samples
    """
    correct_count = 0
    sample_count = 0
    model.eval()
    device = sa_config.device
    with torch.no_grad():
        for batch in tqdm(dataloader):
            b_input_ids = batch[0].long().to(device)
            b_input_mask = batch[1].long().to(device)
            b_classes = batch[2].long().to(device)
            output = model(b_input_ids, attention_mask=b_input_mask, classes=None)[0]
            predicted = torch.max(output, dim=1)[1]
            labels = b_classes.flatten()
            correct_count += (predicted == labels).sum().item()
            sample_count += labels.numel()
    avg_acc = correct_count / sample_count if sample_count else float('nan')
    print("平均acc:{}".format(avg_acc))
    return avg_acc

In [None]:
def linear_combination(x, y, epsilon):
    """Blend ``x`` and ``y``: ``epsilon`` weights ``x`` and ``1 - epsilon`` weights ``y``."""
    x_part = epsilon * x
    y_part = (1 - epsilon) * y
    return x_part + y_part


def reduce_loss(loss, reduction='mean'):
    """Reduce ``loss`` by mean or sum; any other ``reduction`` value returns it unchanged."""
    if reduction == 'mean':
        return loss.mean()
    if reduction == 'sum':
        return loss.sum()
    return loss


class LabelSmoothingCrossEntropy(nn.Module):
    """Cross-entropy with label smoothing.

    The result is ``epsilon * (sum of -log p over classes) / num_classes
    + (1 - epsilon) * NLL(target)``, each term reduced according to ``reduction``.
    """

    def __init__(self, epsilon: float = 0.1, reduction='mean'):
        super().__init__()
        self.reduction = reduction
        self.epsilon = epsilon

    def forward(self, preds, target):
        """Return the smoothed loss of raw scores ``preds`` against class ids ``target``."""
        num_classes = preds.size(-1)
        log_preds = F.log_softmax(preds, dim=-1)
        # Uniform-target part: negative log-probabilities summed over all classes.
        uniform_term = reduce_loss(-log_preds.sum(dim=-1), self.reduction) / num_classes
        # True-target part: standard negative log-likelihood.
        target_term = F.nll_loss(log_preds, target, reduction=self.reduction)
        return linear_combination(uniform_term, target_term, self.epsilon)

In [None]:
from transformers import get_linear_schedule_with_warmup, BertForSequenceClassification
from torch.optim import AdamW
from torch import nn
from sklearn.model_selection import StratifiedKFold
from SA_config import *

# Build both candidate models from the shared config and move them to the target device.
# NOTE(review): model2 is constructed from the config only (no from_pretrained call here),
# so it presumably starts from freshly initialised weights — confirm this is intended.
model1 = Roberta_FocalLoss(config=sa_config)
model2 = BertForSequenceClassification(config=sa_config)
model1.to(sa_config.device)
model2.to(sa_config.device)
models = [model1, model2]
# NOTE(review): this optimizer only holds model1's parameters; using it while training
# model2 cannot update model2's weights — confirm whether each model needs its own optimizer.
optimizer = AdamW(model1.parameters(), lr=sa_config.learning_rate)
# Stratified 5-fold split over the training tensors; .split() is consumed lazily by the loop that iterates it.
SA_kfold = StratifiedKFold(n_splits=5).split(train_input_tokens, train_data_class)

In [None]:
# One optimizer per model, kept across folds. A single optimizer built from one model's
# parameters cannot update any other model.
model_optimizers = [AdamW(m.parameters(), lr=sa_config.learning_rate) for m in models]

# NOTE(review): the same model instances are trained on every fold, so from the second
# fold on the evaluation split contains samples already seen in training — confirm intended.
# NOTE(review): SA_train_epoch calls model(..., classes=...); confirm every entry of `models`
# accepts that keyword (BertForSequenceClassification documents its target argument as `labels`).
for train_index, eval_index in SA_kfold:
    train_data = TensorDataset(train_input_tokens[train_index], train_attention_tokens[train_index],
                               train_data_class[train_index])
    eval_data = TensorDataset(train_input_tokens[eval_index], train_attention_tokens[eval_index],
                              train_data_class[eval_index])
    train_dataloader = SA_dataloader(train_data, True)
    eval_dataloader = SA_dataloader(eval_data, False)
    for model, model_optimizer in zip(models, model_optimizers):
        # A fresh schedule per model and fold: a shared scheduler would be stepped by every
        # model in turn and would have decayed to the end of its schedule after the first one.
        scheduler = get_linear_schedule_with_warmup(model_optimizer, num_warmup_steps=0,
                                                    num_training_steps=sa_config.epoch_num * len(train_dataloader))
        SA_train(train_dataloader, eval_dataloader, model, model_optimizer, scheduler, sa_config)

In [None]:

# Predict one class id per test sentence with model1 and collect them in test order
# (test_dataloader is sequential, so predictions line up with test_dataframe.index).
predicted_classes = []
for tokens, masks in tqdm(test_dataloader):
    with torch.no_grad():
        batch_scores = predict(model1, tokens, masks)
    # argmax over the class dimension gives the predicted class id. detach()/cpu() make the
    # conversion safe for tensors that live on the GPU or still track gradients.
    predicted_classes.extend(batch_scores.detach().argmax(dim=1).cpu().tolist())

# Build the frame once instead of assigning cell by cell with .loc.
result = pd.DataFrame({'id': test_dataframe.index, 'class': predicted_classes})

In [None]:
# Write the predictions to ./SA.csv.
# NOTE(review): to_csv writes the DataFrame index as an extra first column by default, and
# `result` already has an 'id' column — confirm the extra column is wanted, otherwise pass index=False.
result.to_csv('./SA.csv')

In [None]:
# Quick sanity check: the sum over the 'class' column, shown as the cell output.
sum(result['class'])