## 1.导入需要的库

In [None]:
# Mount Google Drive at /content/drive (Colab helper) so files stored there are reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install pytorch_pretrained_bert
# coding: UTF-8
import torch
import time 
import torch.nn as nn
import torch.nn.functional as F 
from pytorch_pretrained_bert import BertModel, BertTokenizer, BertConfig, BertAdam
import pandas as pd 
import numpy as np 
from tqdm import tqdm 
from torch.utils.data import Dataset, TensorDataset,RandomSampler,DataLoader,SequentialSampler

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pytorch_pretrained_bert
  Downloading pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m123.8/123.8 KB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting boto3
  Downloading boto3-1.26.44-py3-none-any.whl (132 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.7/132.7 KB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
Collecting jmespath<2.0.0,>=0.7.1
  Downloading jmespath-1.0.1-py3-none-any.whl (20 kB)
Collecting s3transfer<0.7.0,>=0.6.0
  Downloading s3transfer-0.6.0-py3-none-any.whl (79 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.6/79.6 KB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting botocore<1.30.0,>=1.29.44
  Downloading botocore-1.29.44-py3-none-any.whl (10.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.3/10.3 MB

## 2.数据处理

In [None]:
# ---- Containers filled while the corpus is encoded ----
# word_ids   : token ids of every sentence
# word_types : segment ids (0 / 1 distinguish different sentences)
# word_masks : attention mask, 0 marks padding
# labels     : class index of every sentence
word_ids, word_types, word_masks, labels = [], [], [], []
# Histogram of sentence lengths (length -> number of sentences).
len_dict = {}

# Every encoded sentence is padded / truncated to this many tokens.
pad_size = 50
# The 10 sentence categories; the position in this list is the class index.
sentence_types = ['体育', '财经', '房产', '家居', '教育', '科技', '时尚', '时政', '游戏', '娱乐']

# ---- Locations on the mounted drive ----
# data_path: dataset directory, bert_path: pretrained weights, ckpt_path: saved models.
data_path = "/content/drive/MyDrive/data/cnews/"
bert_path = "/content/drive/MyDrive/chinese_roberta_wwm_ext_pytorch/"
ckpt_path = "/content/drive/MyDrive/data/ckpt/"

# Tokenizer initialised from the pretrained vocabulary file.
tokenizer = BertTokenizer(vocab_file=bert_path + "vocab.txt")

# ---- Training hyper-parameters ----
BATCH_SIZE = 16
NUM_EPOCHS = 3
optimizer_lr = 2e-5
optimizer_warmup = 0.05

In [None]:
# Read the labelled corpus and encode every line into fixed-length BERT inputs.
# NOTE(review): the file read here is cnews.test.txt and later cells split it 4:1
# into train/test parts — confirm that using this file as the whole dataset is intended.

# Start from empty containers so that re-running this cell does not append the
# same samples a second time.
word_ids, word_types, word_masks, labels = [], [], [], []
len_dict = {}

with open(data_path + "cnews.test.txt", encoding='utf-8') as f:
    for item in tqdm(f):
        # Strip only the line terminator; a final line without a trailing newline
        # must not lose its last character.
        item = item.rstrip("\r\n")
        if not item:
            # Blank lines carry no sample.
            continue

        # Line layout assumed to be "<2-char label><1-char separator><text>"
        # (TODO confirm against the data file). The text therefore starts at
        # index 3; slicing from index 4 would drop its first character.
        label_name, text = item[:2], item[3:]

        # Sentence-length statistics, stored in the dictionary.
        len_dict[len(text)] = len_dict.get(len(text), 0) + 1

        # The class index is the label's position in sentence_types
        # (raises ValueError on an unknown label).
        item_type = sentence_types.index(label_name)

        # Add BERT's begin / end markers around the tokenised sentence.
        tokens = ["[CLS]"] + tokenizer.tokenize(text) + ["[SEP]"]

        # Truncate to pad_size (this may cut off the trailing [SEP]), then pad.
        ids = tokenizer.convert_tokens_to_ids(tokens)[:pad_size]
        n_real = len(ids)
        n_pad = pad_size - n_real
        # Padding positions get type 1, mask 0 and id 0.
        types = [0] * n_real + [1] * n_pad
        masks = [1] * n_real + [0] * n_pad
        ids = ids + [0] * n_pad
        assert len(ids) == len(masks) == len(types) == pad_size

        # Collect id / type / mask / label of every sample.
        word_ids.append(ids)
        word_types.append(types)
        word_masks.append(masks)
        labels.append([item_type])

10000it [01:15, 132.99it/s]


In [None]:
# Show the sentence-length histogram as (length, count) pairs in ascending order.
print(sorted(len_dict.items()))

[(13, 1), (15, 4), (16, 1), (18, 1), (20, 1), (23, 1), (24, 2), (25, 2), (26, 1), (28, 1), (29, 11), (30, 2), (32, 3), (33, 2), (35, 2), (36, 1), (40, 1), (41, 1), (43, 1), (45, 1), (46, 2), (47, 1), (49, 1), (50, 1), (51, 1), (52, 1), (54, 2), (55, 1), (59, 1), (60, 2), (61, 1), (62, 2), (64, 3), (65, 1), (66, 2), (67, 3), (68, 2), (69, 1), (70, 2), (71, 1), (72, 1), (73, 5), (74, 1), (75, 2), (76, 2), (77, 2), (78, 3), (79, 2), (80, 3), (81, 2), (82, 5), (83, 7), (84, 3), (85, 3), (86, 4), (87, 4), (88, 5), (89, 4), (90, 3), (91, 6), (92, 1), (93, 4), (94, 5), (95, 4), (96, 4), (97, 3), (98, 2), (99, 5), (100, 8), (101, 9), (102, 7), (103, 3), (104, 4), (105, 4), (106, 7), (107, 8), (108, 4), (109, 4), (110, 1), (111, 6), (112, 4), (113, 8), (114, 4), (115, 3), (116, 5), (117, 5), (118, 2), (119, 7), (120, 5), (121, 8), (122, 8), (123, 9), (124, 7), (125, 5), (126, 7), (127, 5), (128, 5), (129, 3), (130, 2), (131, 2), (132, 6), (133, 4), (134, 2), (135, 3), (136, 6), (137, 4), (138, 

In [None]:
# Build the list of sample indices and shuffle it with a fixed seed so the
# random split of the dataset is reproducible; print the first 10 as a check.
random_order = list(range(len(word_ids)))
np.random.seed(2022)
np.random.shuffle(random_order)
print(random_order[:10])

[6487, 8785, 7390, 7078, 1230, 3684, 5263, 3533, 104, 8080]


In [None]:
# Split the shuffled samples 4:1 into a training part and a test part and
# convert every field to a NumPy array.
split_at = int(len(word_ids) * 0.8)
train_order = random_order[:split_at]
test_order = random_order[split_at:]


def _gather(rows, order):
    """Pick ``rows`` in the given ``order`` and stack them into an ndarray."""
    return np.array([rows[idx] for idx in order])


input_ids_train = _gather(word_ids, train_order)
input_types_train = _gather(word_types, train_order)
input_masks_train = _gather(word_masks, train_order)
y_train = _gather(labels, train_order)
print(input_ids_train.shape, input_types_train.shape, input_masks_train.shape, y_train.shape)

input_ids_test = _gather(word_ids, test_order)
input_types_test = _gather(word_types, test_order)
input_masks_test = _gather(word_masks, test_order)
y_test = _gather(labels, test_order)
print(input_ids_test.shape, input_types_test.shape, input_masks_test.shape, y_test.shape)

(8000, 50) (8000, 50) (8000, 50) (8000, 1)
(2000, 50) (2000, 50) (2000, 50) (2000, 1)


In [None]:
# Wrap the train / test arrays as LongTensor datasets and build their loaders.
# Training batches are drawn in random order, test batches sequentially.
train_arrays = (input_ids_train, input_types_train, input_masks_train, y_train)
train_data = TensorDataset(*(torch.LongTensor(arr) for arr in train_arrays))
train_sampler = RandomSampler(train_data)
train_loader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE)

test_arrays = (input_ids_test, input_types_test, input_masks_test, y_test)
test_data = TensorDataset(*(torch.LongTensor(arr) for arr in test_arrays))
test_sampler = SequentialSampler(test_data)
test_loader = DataLoader(test_data, sampler=test_sampler, batch_size=BATCH_SIZE)

## 3.定义模型

In [None]:
class Model(nn.Module):
    """Pretrained BERT encoder followed by a linear layer giving 10 class scores."""

    def __init__(self):
        super().__init__()
        # Load the pretrained BERT weights found under ``bert_path``.
        self.bert = BertModel.from_pretrained(bert_path)
        # Fine-tune the whole encoder: every BERT parameter receives gradients.
        for weight in self.bert.parameters():
            weight.requires_grad = True
        # Linear head mapping the 768-dim pooled output to the 10 categories.
        self.fc = nn.Linear(768, 10)

    def forward(self, x):
        """Return class scores for ``x = [token ids, token type ids, attention mask]``."""
        input_ids, token_types, attention_mask = x[0], x[1], x[2]
        # Only the pooled output is used; the encoded layers are discarded.
        _, pooled_output = self.bert(
            input_ids,
            token_type_ids=token_types,
            attention_mask=attention_mask,
            output_all_encoded_layers=False,
        )
        return self.fc(pooled_output)

In [None]:
# Compute on CUDA when a device is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Model().to(DEVICE)
print(DEVICE)
print(model)

cpu
Model(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
            )
         

## 4.定义优化器

In [None]:
# Named parameters of the model and the name fragments excluded from weight decay.
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']

# Sort every parameter into the decayed or the non-decayed group in one pass.
decay_params, no_decay_params = [], []
for param_name, param in param_optimizer:
    if any(fragment in param_name for fragment in no_decay):
        no_decay_params.append(param)
    else:
        decay_params.append(param)

optimizer_grouped_parameters = [
    {'params': decay_params, 'weight_decay': 0.01},
    {'params': no_decay_params, 'weight_decay': 0.0},
]

# BertAdam optimizer; t_total is the number of optimisation steps over all epochs.
optimizer = BertAdam(
    optimizer_grouped_parameters,
    lr=optimizer_lr,
    warmup=optimizer_warmup,
    t_total=len(train_loader) * NUM_EPOCHS,
)

## 5.训练与测试

In [None]:
def train(model, device, train_loader, optimizer, epoch):
    """Train ``model`` for one epoch.

    Args:
        model: module called as ``model([token_ids, token_types, masks])`` and
            returning per-class scores.
        device: device the batches are moved to before the forward pass.
        train_loader: iterable yielding ``(token_ids, token_types, masks, labels)``
            batches; it must support ``len()`` and expose ``.dataset``.
        optimizer: optimizer stepped once per batch.
        epoch: epoch number, used only in the progress message.

    A progress line is printed every 100 batches.
    """
    model.train()
    # Reference point for the elapsed-time report.
    start_time = time.time()
    num_batches = len(train_loader)
    for batch_idx, (sentence, types, masks, item_type) in enumerate(train_loader):
        # Move the batch to the compute device.
        sentence, types, masks, item_type = (
            sentence.to(device), types.to(device), masks.to(device), item_type.to(device))
        y_pred = model([sentence, types, masks])
        # Clear old gradients, compute the loss and back-propagate.
        model.zero_grad()
        # Flatten the labels to shape (batch,). A bare squeeze() would turn a
        # batch of size 1 into a 0-dim tensor and break cross_entropy.
        loss = F.cross_entropy(y_pred, item_type.reshape(-1))
        loss.backward()
        optimizer.step()
        # Progress report.
        if (batch_idx + 1) % 100 == 0:
            current_time = time.time()
            print('Train Epoch: {} [{}/{} ({:.2f}%)]\tLoss: {:.6f}  Time:{:.2f} s'.format(
                epoch,
                (batch_idx + 1) * len(sentence), len(train_loader.dataset),
                100. * (batch_idx + 1) / num_batches, loss.item(),
                (current_time - start_time)))

# 测试模型
def test(model, device, test_loader):  
    model.eval()
    test_loss = 0.0
    acc = 0
    # 加载测试数据
    for batch_idx, (sentence, types, masks, item_type) in enumerate(test_loader):
        sentence, types, masks, item_type = sentence.to(device), types.to(device), masks.to(device), item_type.to(device)
        # 测试过程中不计算梯度
        with torch.no_grad():
            y_pred = model([sentence, types, masks])
        # 得到预测的句子类别
        pred = y_pred.max(-1, keepdim=True)[1] 
        # 计算测试损失、准确率
        test_loss += F.cross_entropy(y_pred, item_type.squeeze())
        acc += pred.eq(item_type.view_as(pred)).sum().item() 
    test_loss /= len(test_loader)
    # 打印测试信息
    print('Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)'.format(test_loss, acc, 
                                        len(test_loader.dataset), 100. * acc / len(test_loader.dataset)))
    return acc / len(test_loader.dataset)

In [None]:
# Accuracy a checkpoint has to beat before it is written to disk.
# NOTE(review): 0.982 looks like the best score of an earlier session; with this
# threshold a fresh run saves nothing until it exceeds 98.2% — confirm this is intended.
best_acc = 0.982
# Train for NUM_EPOCHS epochs and keep the model with the highest accuracy.
for epoch in range(1, NUM_EPOCHS + 1):
    train(model, DEVICE, train_loader, optimizer, epoch)
    acc = test(model, DEVICE, test_loader)
    if best_acc < acc:
        best_acc = acc
        # Save the weights; the file name records the (rounded) accuracy.
        torch.save(model.state_dict(), ckpt_path + 'roberta_model_' + str(round(acc, 3)) + '.pth')
    # Per-epoch summary; the message ends with a real newline instead of a literal "n".
    print("Train Epoch: {} acc is: {:.4f}, best acc is {:.4f}\n".format(epoch, acc, best_acc))

	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at ../torch/csrc/utils/python_arg_parser.cpp:1420.)
  next_m.mul_(beta1).add_(1 - beta1, grad)


Test set: Average loss: 0.0753, Accuracy: 1954/2000 (97.70%)
Train Epoch: 1 acc is: 0.9770, best acc is 0.9820n


KeyboardInterrupt: ignored

In [None]:
def predict_type(sentence, path):
    """Predict the category of a single sentence.

    Args:
        sentence: raw text to classify.
        path: checkpoint file whose state dict is loaded into the global ``model``.

    Returns:
        The entry of ``sentence_types`` with the highest predicted score.
    """
    # Run on whatever device the model already lives on, and map the checkpoint
    # there so weights saved on a GPU can also be loaded on a CPU-only machine.
    device = next(model.parameters()).device
    model.load_state_dict(torch.load(path, map_location=device))
    # Inference mode: without eval() dropout stays active and predictions vary.
    model.eval()

    # Same encoding as the training data: markers, truncation, then padding.
    tokens = ["[CLS]"] + tokenizer.tokenize(sentence) + ["[SEP]"]
    ids = tokenizer.convert_tokens_to_ids(tokens)[:pad_size]
    n_real = len(ids)
    n_pad = pad_size - n_real
    # Padding positions get type 1, mask 0 and id 0.
    types = [0] * n_real + [1] * n_pad
    masks = [1] * n_real + [0] * n_pad
    ids = ids + [0] * n_pad

    # Each input becomes a (1, pad_size) LongTensor on the model's device.
    batch = [torch.LongTensor([seq]).to(device) for seq in (ids, types, masks)]

    # No gradients are needed for prediction.
    with torch.no_grad():
        y_pred = model(batch)

    return sentence_types[torch.argmax(y_pred).item()]

In [None]:
# Classify two sample sentences with the saved checkpoint and print each result.
checkpoint_file = "/content/drive/MyDrive/data/ckpt/roberta_model.pth"
for sample_text in ("人民币汇率降低", "梅西获得卡塔尔世界杯金球奖"):
    types = predict_type(sample_text, checkpoint_file)
    print(types)

时政
体育
