In [2]:
import os
from transformers import BertTokenizer, BertModel
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Let the process continue when more than one copy of the OpenMP runtime gets
# loaded, instead of aborting with a duplicate-library error.
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

# Directory of the pretrained BERT checkpoint loaded by the next cell.
# Set the BERT_MODEL_PATH environment variable to point somewhere else; the
# original hard-coded location remains the default so existing runs still work.
MODEL_PATH = os.environ.get(
    "BERT_MODEL_PATH",
    "C:/Users/11040/Desktop/weibonlp-master/sentiments/chinese_wwm_pytorch",
)

In [3]:
# Load the tokenizer and the encoder from the local checkpoint directory.
tokenizer = BertTokenizer.from_pretrained(
    MODEL_PATH,
    model_max_length=512,  # length limit used when a call asks for truncation
)
bert = BertModel.from_pretrained(MODEL_PATH)  # encoder used to embed sentences

Some weights of the model checkpoint at C:/Users/11040/Desktop/weibonlp-master/sentiments/chinese_wwm_pytorch were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Place the BERT encoder loaded earlier on the compute device as well, so
# tensors moved to `device` can be fed to it without a device-mismatch error.
bert = bert.to(device)

In [5]:
# Hyperparameters
batch_size = 32         # samples per mini-batch in both data loaders
num_epoches = 10        # number of passes over the training loader
learning_rate = 1e-3    # initial learning rate handed to Adam
decay_rate = 0.9        # gamma of the exponential learning-rate schedule
input_size = 768        # number of input features of the classifier's linear layer

In [6]:
# Dataset
class MyDataset(Dataset):
    """Map-style dataset that pairs each sentence with its label.

    The ``"sentences"`` and ``"label"`` columns of ``df`` are read once, at
    construction time, and stored as plain Python lists.
    """

    def __init__(self, df):
        sentences = df["sentences"].tolist()
        labels = df["label"].tolist()
        self.data = sentences
        self.label = labels

    def __len__(self):
        """Return the number of labelled samples."""
        return len(self.label)

    def __getitem__(self, index):
        """Return the ``(sentence, label)`` pair stored at ``index``."""
        return self.data[index], self.label[index]

# Training set: reshuffled every epoch; the trailing partial batch is dropped.
# NOTE(review): df_train / df_test are not created in any cell shown here —
# presumably an earlier cell builds them; confirm it runs before this one.
train_data = MyDataset(df_train)
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, drop_last=True)

# Test set: score every sample exactly once, in a fixed order. Dropping the
# last partial batch here would silently exclude up to batch_size - 1 samples
# from the reported metrics, and shuffling serves no purpose for evaluation.
test_data = MyDataset(df_test)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False, drop_last=False)


# Network architecture
class Net(nn.Module):
    """Classifier head: one linear layer followed by a sigmoid.

    Maps every ``input_size``-wide feature vector to a single
    sigmoid-squashed score.
    """

    def __init__(self, input_size):
        super().__init__()
        self.fc = nn.Linear(in_features=input_size, out_features=1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return ``sigmoid(fc(x))``; the last dimension of the result has size 1."""
        return self.sigmoid(self.fc(x))

# Build the classifier head, then place it on the selected device.
net = Net(input_size)
net = net.to(device)

In [7]:
from sklearn import metrics

# Evaluate on the test set
def test():
    """Score the classifier on ``test_loader`` and print the metrics.

    Each batch of sentences is tokenized and encoded by ``bert``; the hidden
    state at sequence position 0 is fed to ``net``. The per-sample outputs are
    thresholded at 0.5, and a classification report, the accuracy and the
    ROC AUC are printed. Nothing is returned.
    """
    # Feed BERT on whichever device it currently lives on, so evaluation works
    # whether or not the encoder was moved to the GPU.
    bert_device = bert.device
    batch_probs, batch_labels = [], []

    with torch.no_grad():
        for words, labels in test_loader:
            tokens = tokenizer(words, truncation=True, padding=True)
            input_ids = torch.tensor(tokens["input_ids"]).to(bert_device)
            attention_mask = torch.tensor(tokens["attention_mask"]).to(bert_device)
            last_hidden_states = bert(input_ids, attention_mask=attention_mask)
            # Hidden state of the first token of every sequence, moved to the
            # device the classifier head was placed on.
            bert_output = last_hidden_states[0][:, 0].to(device)
            outputs = net(bert_output).view(-1)   # forward pass, flattened to 1-D
            # Collect on the CPU: scikit-learn cannot consume CUDA tensors.
            batch_probs.append(outputs.cpu())
            batch_labels.append(labels.cpu())

    y_prob = torch.cat(batch_probs).numpy()
    y_true = torch.cat(batch_labels).numpy()
    # Scores strictly above 0.5 are predicted as class 1, everything else as 0.
    y_pred = (y_prob > 0.5).astype(int)

    print(metrics.classification_report(y_true, y_pred))
    print("准确率:", metrics.accuracy_score(y_true, y_pred))
    print("AUC:", metrics.roc_auc_score(y_true, y_prob))

# Loss function, optimizer and learning-rate schedule.
# Only the parameters of `net` are handed to the optimizer.
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(params=net.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer=optimizer, gamma=decay_rate)



In [8]:
# Training loop.
# Only `net` is optimized: BERT runs under torch.no_grad() and serves as a
# fixed feature extractor whose first-position hidden state is the input of
# the classifier head.
bert_device = bert.device   # feed BERT on whichever device it currently lives on
for epoch in range(num_epoches):
    total_loss = 0.0
    for i, (words, labels) in enumerate(train_loader):
        tokens = tokenizer(words, truncation=True, padding=True)
        input_ids = torch.tensor(tokens["input_ids"]).to(bert_device)
        attention_mask = torch.tensor(tokens["attention_mask"]).to(bert_device)
        labels = labels.float().to(device)   # BCELoss needs float targets
        with torch.no_grad():
            last_hidden_states = bert(input_ids, attention_mask=attention_mask)
            # Move the features to the classifier's device in case BERT sits elsewhere.
            bert_output = last_hidden_states[0][:, 0].to(device)
        optimizer.zero_grad()                   # reset gradients
        probs = net(bert_output).view(-1)       # forward pass; sigmoid outputs, flattened
        loss = criterion(probs, labels)         # binary cross-entropy
        # Accumulate a plain float: adding the loss tensor itself would keep
        # autograd history alive for every step between two log lines.
        total_loss += loss.item()
        loss.backward()                         # back-propagate
        optimizer.step()                        # update the classifier weights
        if (i+1) % 10 == 0:
            # Mean loss over the last 10 steps.
            print("epoch:{}, step:{}, loss:{}".format(epoch+1, i+1, total_loss/10))
            total_loss = 0.0

    # learning-rate decay, once per epoch
    scheduler.step()

    # evaluate on the test set
    test()

    # save the whole classifier module for this epoch
    model_path = "C:/Users/11040/Desktop/weibonlp-master/sentiments/model/bert_dnn_{}.model".format(epoch+1)
    # Make sure the target directory exists so a full epoch of work is not
    # lost to a missing-directory error at save time.
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    torch.save(net, model_path)
    print("saved model: ", model_path)

epoch:1, step:10, loss:0.6797645092010498
epoch:1, step:20, loss:0.6257932782173157
epoch:1, step:30, loss:0.5806602835655212
epoch:1, step:40, loss:0.5842814445495605
epoch:1, step:50, loss:0.5907979011535645
epoch:1, step:60, loss:0.5796312689781189
epoch:1, step:70, loss:0.5663250088691711
              precision    recall  f1-score   support

           0       0.69      0.13      0.23        67
           1       0.68      0.97      0.80       125

    accuracy                           0.68       192
   macro avg       0.68      0.55      0.51       192
weighted avg       0.68      0.68      0.60       192

准确率: 0.6770833333333334
AUC: 0.7130149253731344
saved model:  C:/Users/11040/Desktop/weibonlp-master/sentiments/model/bert_dnn_1.model
epoch:2, step:10, loss:0.5675446391105652
epoch:2, step:20, loss:0.5446959733963013
epoch:2, step:30, loss:0.5258544683456421
epoch:2, step:40, loss:0.532708466053009
epoch:2, step:50, loss:0.5517528653144836
epoch:2, step:60, loss:0.5135912299

KeyboardInterrupt: 

In [9]:
# Restore the classifier from the checkpoint written after epoch 8.
# NOTE(review): the training output shown above stops during epoch 2, so this
# file presumably comes from an earlier run — confirm it exists before running.
# SECURITY: this unpickles a whole module object; only load files you trust.
# NOTE(review): recent PyTorch releases default torch.load to weights_only=True,
# which rejects whole-module pickles — pass weights_only=False there if the
# file is trusted; verify against the installed version.
net = torch.load(
    "C:/Users/11040/Desktop/weibonlp-master/sentiments/model/bert_dnn_8.model",
    map_location=device,   # lets a checkpoint saved on a GPU load on a CPU-only machine
)

In [10]:
# Score two example sentences with the restored classifier.
s = ["煎熬。粗暴煽情让人尴尬至极，故事讲得冗长稀碎。", "说真的，那些说不好看，抵制的，没拿钱�？我真不信！！！绝对的自来水"]

# Send every tensor to the device of the module that consumes it, so this cell
# works whether the models live on the CPU or on a GPU.
bert_device = bert.device
net_device = next(net.parameters()).device

# truncation=True matches the training/evaluation cells and keeps over-long
# inputs within the tokenizer's configured maximum length.
tokens = tokenizer(s, truncation=True, padding=True)
input_ids = torch.tensor(tokens["input_ids"]).to(bert_device)
attention_mask = torch.tensor(tokens["attention_mask"]).to(bert_device)

# Pure inference: skip autograd bookkeeping.
with torch.no_grad():
    last_hidden_states = bert(input_ids, attention_mask=attention_mask)
    bert_output = last_hidden_states[0][:, 0].to(net_device)
    outputs = net(bert_output)
print(outputs)

tensor([[0.0700],
        [0.6129]], grad_fn=<SigmoidBackward0>)
