## 1 导入需要的库

In [20]:
import torch
import torch.nn as nn
import os
import matplotlib.pyplot as plt
import copy
import torch.optim as optim
import random
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from torch.cuda.amp import autocast, GradScaler
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel, AdamW, get_linear_schedule_with_warmup
from datasets import load_dataset, load_metric

os.environ["TOKENIZERS_PARALLELISM"] = "false"  # 避免Warning

## 2 加载数据集

In [21]:
# 加载mrpc数据集
dataset = load_dataset('glue', 'mrpc')

Found cached dataset glue (/home/yunchang/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

In [22]:
df_train = dataset['train'].to_pandas()
df_val = dataset['test'].to_pandas()
df_test = dataset['validation'].to_pandas()
df_train.head()

Unnamed: 0,sentence1,sentence2,label,idx
0,"Amrozi accused his brother , whom he called "" ...","Referring to him as only "" the witness "" , Amr...",1,0
1,Yucaipa owned Dominick 's before selling the c...,Yucaipa bought Dominick 's in 1995 for $ 693 m...,0,1
2,They had published an advertisement on the Int...,"On June 10 , the ship 's owners had published ...",1,2
3,"Around 0335 GMT , Tab shares were up 19 cents ...","Tab shares jumped 20 cents , or 4.6 % , to set...",0,3
4,"The stock rose $ 2.11 , or about 11 percent , ...",PG & E Corp. shares jumped $ 1.63 or 8 percent...,1,4


标准数据集一般无需数据预处理

## 3 数据集定义

In [23]:
class CustomDataset(Dataset):
    def __init__(self, data, max_len, with_labels=True, bert_model='albert-base-v2'):
        # super().__init__()
        self.data = data
        self.max_len = max_len
        self.with_labels = with_labels  # 区分训练集和测试集
        self.tokenizer = AutoTokenizer.from_pretrained(bert_model)  # 这一步很费时，应该创建对象时就加载

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        sentence1 = self.data.loc[index, 'sentence1']
        sentence2 = self.data.loc[index, 'sentence2']

        encodings = self.tokenizer(
            sentence1, sentence2,  # tokenizer可以接受1或2个序列，这里输入两个
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        input_ids = encodings['input_ids'].squeeze(0)  # squeeze(0)表示去除第一个维度。由于输出是二维tensor，[[ ]]，第一维被去除
        attention_mask = encodings['attention_mask'].squeeze(0)
        token_type_ids = encodings['token_type_ids'].squeeze(0)
        
        if self.with_labels:  # 若有标签，即训练集
            label = self.data.loc[index, 'label']
            return input_ids, attention_mask, token_type_ids, label
        else:  # 测试集 
            return input_ids, attention_mask, token_type_ids

## 4 模型定义

In [24]:
class SentencePairClassifier(nn.Module):
    def __init__(self, bert_model='albert-base-v2', freeze_bert=True):
        super().__init__()
        self.bert_layer = AutoModel.from_pretrained(bert_model)

        #  encoder 隐藏层大小
        if bert_model == "albert-base-v2":  # 12M 参数
            hidden_size = 768
        elif bert_model == "albert-large-v2":  # 18M 参数
            hidden_size = 1024
        elif bert_model == "albert-xlarge-v2":  # 60M 参数
            hidden_size = 2048
        elif bert_model == "albert-xxlarge-v2":  # 235M 参数
            hidden_size = 4096
        elif bert_model == "bert-base-uncased": # 110M 参数
            hidden_size = 768
        elif bert_model == "roberta-base": # 
            hidden_size = 768

        if freeze_bert:  # 固定Bert层 更新分类输出层
            for p in self.bert_layer.parameters():
                p.requires_grad = False
        
        self.dropout = nn.Dropout(p=0.1)
        self.linear = nn.Linear(hidden_size, 1)
    
    
    @autocast()  # 混合精度训练
    def forward(self, input_ids, attention_mask, token_type_ids):
        # BERT的输出: last_hidden_state, pooler_output, (hidden_states)可选, (attentions)可选
        # last_hidden_state维度：(batch_size, sequence_length, hidden_size), 
        # pooler_output维度：(batch_size, hidden_size) [CLS]的最后一层的隐藏状态
        outputs = self.bert_layer(input_ids, attention_mask, token_type_ids)
        logits = self.linear(self.dropout(outputs['pooler_output']))

        return logits


In [25]:
def set_seed(seed):
    """ 固定随机种子，保证结果复现
    """
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)


def evaluate_loss(net, device, loss_fun, dataloader):
    """
    评估输出
    """
    net.eval()

    mean_loss = 0
    count = 0

    with torch.no_grad():
        for it, (input_ids, attention_mask, token_type_ids, label) in enumerate(tqdm(dataloader)):
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            token_type_ids = token_type_ids.to(device)
            label = label.to(device)

            logits = net(input_ids, attention_mask, token_type_ids)
            mean_loss += loss_fun(logits.squeeze(-1), label.float()).item()  # squeeze(-1)去除最后维度值为1的维度
            count += 1

    return mean_loss / count            

In [26]:
def train_bert(net, loss_fun, opti, lr, lr_scheduler, train_loader, val_loader, epochs, iters_to_accumulate, device):
    
    best_loss = np.Inf  # 最小损失
    best_epoch = 1         # 损失最小的epoch序列
    nb_iterations = len(train_loader)  # 一个epoch有多少次迭代
    print_every = nb_iterations // 5  # 打印频率
    iters = []
    train_losses = []
    val_losses = []

    scaler = GradScaler()  # ????????????
    
    for epoch in range(epochs):
        net.train()
        running_loss = 0.0
        for it, (input_ids, attention_mask, token_type_ids, label) in enumerate(tqdm(train_loader)):
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            token_type_ids = token_type_ids.to(device)
            label = label.to(device)

            # 混合精度加速训练 
            with autocast(): # ?????
                logits = net(input_ids, attention_mask, token_type_ids)
                loss = loss_fun(logits.squeeze(-1), label.float())
                loss = loss / iters_to_accumulate  # Normalize the loss because it is averaged  ??????????????
            
            scaler.scale(loss).backward()  # ????????????

            # 以下？？？
            if (it + 1) % iters_to_accumulate == 0:  # iters_to_accumulate次迭代 更新一次梯度
                # Optimization step
                # scaler.step() first unscales the gradients of the optimizer's assigned params.
                # If these gradients do not contain infs or NaNs, opti.step() is then called,
                # otherwise, opti.step() is skipped.
                scaler.step(opti)
                # Updates the scale for next iteration.
                scaler.update()
                # 根据迭代次数调整学习率。
                lr_scheduler.step()
                # 梯度清零
                opti.zero_grad()


            running_loss += loss.item()


            if (it + 1) % print_every == 0:  # 打印训练损失
                print()
                print("Iteration {}/{} of epoch {} complete. Loss : {} "
                    .format(it+1, nb_iterations, epoch+1, running_loss / print_every))

                running_loss = 0.0

        val_loss = evaluate_loss(net, device, loss_fun, val_loader)
        print()
        print("Epoch {} complete! Validation Loss : {}".format(epoch+1, val_loss))

    #     if val_loss < best_loss:
    #         print("Best validation loss improved from {} to {}".format(best_loss, val_loss))
    #         print()
    #         net_copy = copy.deepcopy(net)  # 保存最优模型
    #         best_loss = val_loss
    #         best_ep = ep + 1

    # # 保存模型
    # path_to_model='models/{}_lr_{}_val_loss_{}_ep_{}.pt'.format(bert_model, lr, round(best_loss, 5), best_ep)
    # torch.save(net_copy.state_dict(), path_to_model)
    # print("The model has been saved in {}".format(path_to_model))

    del loss
    torch.cuda.empty_cache() # 清空显存

## 5 超参数设置

In [27]:
bert_model = "albert-base-v2"  # 'albert-base-v2', 'albert-large-v2', 'albert-xlarge-v2', 'albert-xxlarge-v2', 'bert-base-uncased', ...
freeze_bert = False  # 是否冻结Bert
maxlen = 128  # 最大长度
bs = 16  # batch size
iters_to_accumulate = 2  # 梯度累加
lr = 2e-5  # learning rate
epochs = 4  # 训练轮数

## 6 训练与评估

In [28]:
#  固定随机种子 便于复现
set_seed(1102)

# 创建训练集与验证集
print("Reading training data...")
train_set = CustomDataset(df_train, maxlen, bert_model)
print("Reading validation data...")
val_set = CustomDataset(df_val, maxlen, bert_model)

# 常见训练集与验证集DataLoader
train_loader = DataLoader(train_set, batch_size=bs, num_workers=0)
val_loader = DataLoader(val_set, batch_size=bs, num_workers=0)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
net = SentencePairClassifier(bert_model, freeze_bert=freeze_bert)

# if torch.cuda.device_count() > 1:  # if multiple GPUs
#     print("Let's use", torch.cuda.device_count(), "GPUs!")
#     net = nn.DataParallel(net)

net.to(device)

loss_fun = nn.BCEWithLogitsLoss()
opti = AdamW(net.parameters(), lr=lr, weight_decay=1e-2)
num_warmup_steps = 0 # The number of steps for the warmup phase.
num_training_steps = epochs * len(train_loader)  # The total number of training steps
t_total = (len(train_loader) // iters_to_accumulate) * epochs  # Necessary to take into account Gradient accumulation
lr_scheduler = get_linear_schedule_with_warmup(optimizer=opti, num_warmup_steps=num_warmup_steps, num_training_steps=t_total)

train_bert(net, loss_fun, opti, lr, lr_scheduler, train_loader, val_loader, epochs, iters_to_accumulate, device)


Reading training data...
Reading validation data...


Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertModel: ['predictions.LayerNorm.bias', 'predictions.dense.bias', 'predictions.decoder.weight', 'predictions.bias', 'predictions.dense.weight', 'predictions.decoder.bias', 'predictions.LayerNorm.weight']
- This IS expected if you are initializing AlbertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
 21%|██        | 48/230 [00:04<00:15, 11.85it/s]


Iteration 46/230 of epoch 1 complete. Loss : 0.30363395648158115 


 41%|████      | 94/230 [00:07<00:11, 11.95it/s]


Iteration 92/230 of epoch 1 complete. Loss : 0.2953830973609634 


 61%|██████    | 140/230 [00:11<00:07, 12.03it/s]


Iteration 138/230 of epoch 1 complete. Loss : 0.2616934980387273 


 81%|████████  | 186/230 [00:15<00:03, 12.15it/s]


Iteration 184/230 of epoch 1 complete. Loss : 0.2200546169086643 


100%|██████████| 230/230 [00:19<00:00, 12.07it/s]



Iteration 230/230 of epoch 1 complete. Loss : 0.1842898645478746 


100%|██████████| 108/108 [00:03<00:00, 28.37it/s]



Epoch 1 complete! Validation Loss : 0.3846226919580389


 21%|██        | 48/230 [00:03<00:15, 12.01it/s]


Iteration 46/230 of epoch 2 complete. Loss : 0.20125033997971079 


 41%|████      | 94/230 [00:07<00:11, 12.05it/s]


Iteration 92/230 of epoch 2 complete. Loss : 0.18559055305693462 


 61%|██████    | 140/230 [00:11<00:07, 12.01it/s]


Iteration 138/230 of epoch 2 complete. Loss : 0.16517505827157394 


 81%|████████  | 186/230 [00:15<00:03, 11.89it/s]


Iteration 184/230 of epoch 2 complete. Loss : 0.1281280247899501 


100%|██████████| 230/230 [00:19<00:00, 12.08it/s]



Iteration 230/230 of epoch 2 complete. Loss : 0.09792480261429497 


100%|██████████| 108/108 [00:03<00:00, 29.55it/s]



Epoch 2 complete! Validation Loss : 0.34365476502312553


 21%|██        | 48/230 [00:03<00:14, 12.50it/s]


Iteration 46/230 of epoch 3 complete. Loss : 0.1172106536510198 


 41%|████      | 94/230 [00:07<00:10, 12.49it/s]


Iteration 92/230 of epoch 3 complete. Loss : 0.10224637619989074 


 61%|██████    | 140/230 [00:11<00:07, 12.41it/s]


Iteration 138/230 of epoch 3 complete. Loss : 0.09250902991903864 


 81%|████████  | 186/230 [00:14<00:03, 12.42it/s]


Iteration 184/230 of epoch 3 complete. Loss : 0.07318819148223037 


100%|██████████| 230/230 [00:18<00:00, 12.48it/s]



Iteration 230/230 of epoch 3 complete. Loss : 0.03886690130457282 


100%|██████████| 108/108 [00:03<00:00, 29.54it/s]



Epoch 3 complete! Validation Loss : 0.39413672244107284


 21%|██        | 48/230 [00:03<00:14, 12.46it/s]


Iteration 46/230 of epoch 4 complete. Loss : 0.058210089944465006 


 41%|████      | 94/230 [00:07<00:10, 12.43it/s]


Iteration 92/230 of epoch 4 complete. Loss : 0.058808597857537476 


 61%|██████    | 140/230 [00:11<00:07, 12.38it/s]


Iteration 138/230 of epoch 4 complete. Loss : 0.0464486267377177 


 81%|████████  | 186/230 [00:14<00:03, 12.35it/s]


Iteration 184/230 of epoch 4 complete. Loss : 0.040330378682879 


100%|██████████| 230/230 [00:18<00:00, 12.42it/s]



Iteration 230/230 of epoch 4 complete. Loss : 0.02078486543715648 


100%|██████████| 108/108 [00:03<00:00, 29.36it/s]


Epoch 4 complete! Validation Loss : 0.39935404448597517



