In [20]:
from transformers import AutoTokenizer,BertForSequenceClassification,TrainingArguments,Trainer,get_linear_schedule_with_warmup
from torch.utils.data import Dataset,TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.optim import AdamW
from tqdm import tqdm

import torch
import torch.nn as nn
import numpy as np
import re
import time
import random
#设置gpu 没有就是cpu
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)
#固定所有的随机数的种子
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
#该句不需要gpu就可以运行
torch.cuda.manual_seed_all(seed_val)

cpu


In [21]:
#Tokenizer和Bert加载
#请从huggingface下载对应的模型，并保存在同目录的bert-base-chinese文件夹下，如果你能联网自动下载当我没说
tokenizer = AutoTokenizer.from_pretrained("./bert-base-chinese")
bert = BertForSequenceClassification.from_pretrained(
    "./bert-base-chinese",
    num_labels=1,
    #模型是否返回Attention的权重（score？）
    output_attentions=False,
    #模型是否返回全部隐藏层的状态
    output_hidden_states = False)
bert = bert.to(device)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


数据基础处理及切分

In [22]:
with open("./ChnSentiCorp.txt",'r',encoding='utf8') as f:
    data = f.readlines()
label = []
X = []
for line in data:
    label_line = int(line.split(',')[0])
    x_line = line[2:]
    x_line = re.sub('\s','',x_line)
    X.append(x_line)
    label.append(label_line)
#print(len(label),len(X))
#将所有的文本转化为id 和获取对应的掩码 
X_ = tokenizer(X,padding=True,truncation=True,max_length = 64)
X_ids = X_['input_ids']
X_mask = X_['attention_mask']
X_ids = torch.LongTensor(X_ids)
X_mask = torch.LongTensor(X_mask)
label = torch.Tensor(label)
print(X_ids.shape,X_mask.shape,label.shape)
dataset = TensorDataset(X_ids,X_mask,label)
train_size = int(0.9*len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset,[train_size,val_size])
print("训练集大小：",train_size,"  测试集大小:",val_size)

torch.Size([399, 64]) torch.Size([399, 64]) torch.Size([399])
训练集大小： 359   测试集大小: 40


In [23]:
batch_size = 32
train_dataloader = DataLoader(
    train_dataset,
    sampler = RandomSampler(train_dataset),
    batch_size=batch_size)
validation_dataloader = DataLoader(
    val_dataset,
    sampler=SequentialSampler(val_dataset),
    batch_size=batch_size)

In [24]:
#正确率计算函数
def flat_accuracy(preds, labels):
    pred_flat = np.where(preds > 0.5, 1, 0).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [25]:
epochs = 2
total_steps = len(train_dataloader) * epochs
optimizer = AdamW(bert.parameters(),lr=2e-5,eps=1e-8)
#一个非常方便的自动改变学习率的函数
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps)
#不需要损失函数，模型的前向传播中里已经有了MSE损失函数，直接将所有数据丢进去就行了
#当标签数>1时会使用交叉熵
#criterion = nn.CrossEntropyLoss()
for i in range(epochs):
    start = time.time()
    total_train_loss = 0
    bert.train()
    for batch_i,batch in tqdm(enumerate(train_dataloader)):
        #训练数据加载到正确设备
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_label = batch[2].to(device)
        #重置所有的梯度
        bert.zero_grad()
        output = bert.forward(b_input_ids,attention_mask=b_input_mask,labels=b_label)
        #print(output)
        loss = output['loss']
        logits = output["logits"]
        total_train_loss += loss.item()
        loss.backward()
        #梯度裁剪 防止梯度爆炸
        nn.utils.clip_grad_norm_(bert.parameters(), 1.0)
        optimizer.step()
        #自动改变学习率
        scheduler.step()
    average_loss = total_train_loss / len(train_dataloader)
    print("训练***代数:{},耗时:{},平均损失:{}".format(i+1,time.time()-start,average_loss))
    #开始运行验证集
    bert.eval()
    total_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0
    start = time.time()
    for batch in validation_dataloader:
        #测试数据加载到正确设备
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        #验证时不需要计算梯度
        with torch.no_grad():
            output = bert.forward(b_input_ids,attention_mask=b_input_mask,labels=b_labels)
        loss = output['loss']
        logits = output["logits"]
        total_eval_loss += loss.item()
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        total_eval_accuracy += flat_accuracy(logits, label_ids)
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    avg_val_loss = total_eval_loss/len(validation_dataloader)
    print("验证***代数:{},耗时:{},平均损失:{},正确率:{}".format(i+1,time.time()-start,avg_val_loss,avg_val_accuracy))
print("训练结束")    
        

12it [00:43,  3.61s/it]


训练***代数:1,耗时:43.33224415779114,平均损失:0.5440009993811449
验证***代数:1,耗时:1.5003137588500977,平均损失:0.10309580713510513,正确率:0.90625


12it [00:43,  3.65s/it]


训练***代数:2,耗时:43.845633029937744,平均损失:0.15290648924807707
验证***代数:2,耗时:1.4780478477478027,平均损失:0.07184994965791702,正确率:0.90625
训练结束
