reference: https://github.com/649453932/Bert-Chinese-Text-Classification-Pytorch

# Data preprocess

In [27]:
import pandas as pd
import os
os.getcwd()

'/Users/vivianruan/Downloads'

In [49]:
df = pd.read_csv('train_2.csv')
df 

Unnamed: 0,content_id,content,subject,sentiment_value,sentiment_word
0,13149,因为森林人即将换代，这套系统没必要装在一款即将换代的车型上，因为肯定会影响价格。,价格,0,影响
1,2288,四驱价格貌似挺高的，高的可以看齐XC60了，看实车前脸有点违和感。不过大众的车应该不会差。,价格,-1,高
2,1652,斯柯达要说质量，似乎比大众要好一点，价格也低一些，用料完全一样。我听说过野帝，但没听说过你说...,价格,1,低
3,8865,这玩意都是给有钱任性又不懂车的土豪用的，这价格换一次我妹夫EP020可以换三锅了,价格,-1,有钱任性
4,11784,17价格忒高，估计也就是14-15左右。,价格,-1,高
...,...,...,...,...,...
12567,17392,全时四驱仅比一般SUV车强一点，肯定干不过Q5，XC60，连A4也干不过。水平对置仅体现在低...,动力,0,
12568,9780,哈哈，终于看到有人开始厌烦前置雷达的声音了，这个亲，那个声音来自哪里？,配置,-1,
12569,1079,请教一下，变速箱油，差速器油，火花塞，分别多久更换。,动力,0,
12570,16766,求购二手１４款ＸＴ的后刹车总成。（已网购到手了）,安全性,0,


In [53]:
label = {
    "动力": 0,
    "价格": 1,
    "油耗": 2,
    "操控": 3,
    "舒适性": 4,
    "配置": 5,
    "安全性": 6,
    "内饰": 7,
    "外观": 8,
    "空间": 9
}
df = df.replace({"subject":label})

In [54]:
df['subject'].value_counts()

0    3454
1    1634
2    1379
3    1302
4    1182
5    1075
6     736
7     669
8     606
9     535
Name: subject, dtype: int64

In [63]:
df_new = df.copy()
df_new = df_new[["content","subject"]]
df_new

Unnamed: 0,content,subject
0,因为森林人即将换代，这套系统没必要装在一款即将换代的车型上，因为肯定会影响价格。,1
1,四驱价格貌似挺高的，高的可以看齐XC60了，看实车前脸有点违和感。不过大众的车应该不会差。,1
2,斯柯达要说质量，似乎比大众要好一点，价格也低一些，用料完全一样。我听说过野帝，但没听说过你说...,1
3,这玩意都是给有钱任性又不懂车的土豪用的，这价格换一次我妹夫EP020可以换三锅了,1
4,17价格忒高，估计也就是14-15左右。,1
...,...,...
12567,全时四驱仅比一般SUV车强一点，肯定干不过Q5，XC60，连A4也干不过。水平对置仅体现在低...,0
12568,哈哈，终于看到有人开始厌烦前置雷达的声音了，这个亲，那个声音来自哪里？,5
12569,请教一下，变速箱油，差速器油，火花塞，分别多久更换。,0
12570,求购二手１４款ＸＴ的后刹车总成。（已网购到手了）,6


In [101]:
import random
import numpy as np
from sklearn.model_selection import train_test_split

np.random.seed(1)
df_train, df_val, df_test = np.split(df_new.sample(frac=1, random_state=111),
                                     [int(.6*len(df_new)), int(.8*len(df_new))])

print(len(df_train),len(df_val), len(df_test))

7543 2514 2515


In [81]:
df_train.to_csv('/Users/vivianruan/Downloads/THUCNews/data_2/train.txt', 
                header=None, index=None, sep='\t', mode='a')
df_val.to_csv('/Users/vivianruan/Downloads/THUCNews/data_2/test.txt', 
                header=None, index=None, sep='\t', mode='a')
df_test.to_csv('/Users/vivianruan/Downloads/THUCNews/data_2/dev.txt', 
                header=None, index=None, sep='\t', mode='a')

In [8]:
import torch
#import pytorch_pretrained

In [6]:
! pip install pytorch-pretrained-bert



# utils

In [9]:
import torch
from tqdm import tqdm
import time
from datetime import timedelta

PAD, CLS = '[PAD]', '[CLS]'  # padding符号, bert中综合信息符号


def build_dataset(config):

    def load_dataset(path, pad_size=32): 
        contents = []
        with open(path, 'r', encoding='UTF-8') as f:
            for line in tqdm(f):
                lin = line.strip()
                if not lin:
                    continue
                content, label = lin.split('\t')
                token = config.tokenizer.tokenize(content)
                token = [CLS] + token
                seq_len = len(token)
                mask = []
                token_ids = config.tokenizer.convert_tokens_to_ids(token)

                if pad_size:
                    if len(token) < pad_size:
                        mask = [1] * len(token_ids) + [0] * (pad_size - len(token))
                        token_ids += ([0] * (pad_size - len(token)))
                    else:
                        mask = [1] * pad_size
                        token_ids = token_ids[:pad_size]
                        seq_len = pad_size
                contents.append((token_ids, int(label), seq_len, mask))
        return contents
    train = load_dataset(config.train_path, config.pad_size)
    dev = load_dataset(config.dev_path, config.pad_size)
    test = load_dataset(config.test_path, config.pad_size)
    return train, dev, test


class DatasetIterater(object):
    def __init__(self, batches, batch_size, device):
        self.batch_size = batch_size
        self.batches = batches
        self.n_batches = len(batches) // batch_size
        self.residue = False  # 记录batch数量是否为整数
        if len(batches) % self.n_batches != 0:
            self.residue = True
        self.index = 0
        self.device = device

    def _to_tensor(self, datas):
        x = torch.LongTensor([_[0] for _ in datas]).to(self.device)
        y = torch.LongTensor([_[1] for _ in datas]).to(self.device)

        # pad前的长度(超过pad_size的设为pad_size)
        seq_len = torch.LongTensor([_[2] for _ in datas]).to(self.device)
        mask = torch.LongTensor([_[3] for _ in datas]).to(self.device)
        return (x, seq_len, mask), y

    def __next__(self):
        if self.residue and self.index == self.n_batches:
            batches = self.batches[self.index * self.batch_size: len(self.batches)]
            self.index += 1
            batches = self._to_tensor(batches)
            return batches

        elif self.index >= self.n_batches:
            self.index = 0
            raise StopIteration
        else:
            batches = self.batches[self.index * self.batch_size: (self.index + 1) * self.batch_size]
            self.index += 1
            batches = self._to_tensor(batches)
            return batches

    def __iter__(self):
        return self

    def __len__(self):
        if self.residue:
            return self.n_batches + 1
        else:
            return self.n_batches


def build_iterator(dataset, config):
    iter = DatasetIterater(dataset, config.batch_size, config.device)
    return iter


def get_time_dif(start_time):
    """获取已使用时间"""
    end_time = time.time()
    time_dif = end_time - start_time
    return timedelta(seconds=int(round(time_dif)))

# BERT model

In [10]:
import torch
import torch.nn as nn
# from pytorch_pretrained_bert import BertModel, BertTokenizer
from pytorch_pretrained_bert import BertModel, BertTokenizer


class Config(object):

    """配置参数"""
    def __init__(self, dataset):
        self.model_name = 'bert'
        self.train_path = dataset + '/data_2/train.txt'                                # 训练集
        self.dev_path = dataset + '/data_2/dev.txt'                                    # 验证集
        self.test_path = dataset + '/data_2/test.txt'                                  # 测试集
        self.class_list = [x.strip() for x in open(
            dataset + '/data_2/class.txt').readlines()]                                # 类别名单
        self.save_path = dataset + '/saved_dict_2/' + self.model_name + '.ckpt'        # 模型训练结果
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')   # 设备

        self.require_improvement = 1000                                 # 若超过1000batch效果还没提升，则提前结束训练
        self.num_classes = len(self.class_list)                         # 类别数
        self.num_epochs = 3                                             # epoch数
        self.batch_size = 128                                           # mini-batch大小
        self.pad_size = 32                                              # 每句话处理成的长度(短填长切)
        self.learning_rate = 5e-5                                       # 学习率
        self.bert_path = './bert_pretrain'
        self.tokenizer = BertTokenizer.from_pretrained(self.bert_path)
        self.hidden_size = 768


class Model(nn.Module):

    def __init__(self, config):
        super(Model, self).__init__()
        self.bert = BertModel.from_pretrained(config.bert_path)
        for param in self.bert.parameters():
            param.requires_grad = True
        self.fc = nn.Linear(config.hidden_size, config.num_classes)

    def forward(self, x):
        context = x[0]  # 输入的句子
        mask = x[2]  # 对padding部分进行mask，和句子一个size，padding部分用0表示，如：[1, 1, 1, 1, 0, 0]
        _, pooled = self.bert(context, attention_mask=mask, output_all_encoded_layers=False)
        out = self.fc(pooled)
        return out

# train eval

In [11]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn import metrics
import time
from pytorch_pretrained_bert import BertAdam


# 权重初始化，默认xavier
def init_network(model, method='xavier', exclude='embedding', seed=123):
    for name, w in model.named_parameters():
        if exclude not in name:
            if len(w.size()) < 2:
                continue
            if 'weight' in name:
                if method == 'xavier':
                    nn.init.xavier_normal_(w)
                elif method == 'kaiming':
                    nn.init.kaiming_normal_(w)
                else:
                    nn.init.normal_(w)
            elif 'bias' in name:
                nn.init.constant_(w, 0)
            else:
                pass


def train(config, model, train_iter, dev_iter, test_iter):
    start_time = time.time()
    model.train()
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]
    # optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=config.learning_rate,
                         warmup=0.05,
                         t_total=len(train_iter) * config.num_epochs)
    total_batch = 0  # 记录进行到多少batch
    dev_best_loss = float('inf')
    last_improve = 0  # 记录上次验证集loss下降的batch数
    flag = False  # 记录是否很久没有效果提升
    model.train()
    for epoch in range(config.num_epochs):
        print('Epoch [{}/{}]'.format(epoch + 1, config.num_epochs))
        for i, (trains, labels) in enumerate(train_iter):
            outputs = model(trains)
            model.zero_grad()
            loss = F.cross_entropy(outputs, labels)
            loss.backward()
            optimizer.step()
            if total_batch % 100 == 0:
                # 每多少轮输出在训练集和验证集上的效果
                true = labels.data.cpu()
                predic = torch.max(outputs.data, 1)[1].cpu()
                train_acc = metrics.accuracy_score(true, predic)
                dev_acc, dev_loss = evaluate(config, model, dev_iter)
                if dev_loss < dev_best_loss:
                    dev_best_loss = dev_loss
                    torch.save(model.state_dict(), config.save_path)
                    improve = '*'
                    last_improve = total_batch
                else:
                    improve = ''
                time_dif = get_time_dif(start_time)
                msg = 'Iter: {0:>6},  Train Loss: {1:>5.2},  Train Acc: {2:>6.2%},  Val Loss: {3:>5.2},  Val Acc: {4:>6.2%},  Time: {5} {6}'
                print(msg.format(total_batch, loss.item(), train_acc, dev_loss, dev_acc, time_dif, improve))
                model.train()
            total_batch += 1
            if total_batch - last_improve > config.require_improvement:
                # 验证集loss超过1000batch没下降，结束训练
                print("No optimization for a long time, auto-stopping...")
                flag = True
                break
        if flag:
            break
    test(config, model, test_iter)


def test(config, model, test_iter):
    # test
    model.load_state_dict(torch.load(config.save_path))
    model.eval()
    start_time = time.time()
    test_acc, test_loss, test_report, test_confusion = evaluate(config, model, test_iter, test=True)
    msg = 'Test Loss: {0:>5.2},  Test Acc: {1:>6.2%}'
    print(msg.format(test_loss, test_acc))
    print("Precision, Recall and F1-Score...")
    print(test_report)
    print("Confusion Matrix...")
    print(test_confusion)
    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)


def evaluate(config, model, data_iter, test=False):
    model.eval()
    loss_total = 0
    predict_all = np.array([], dtype=int)
    labels_all = np.array([], dtype=int)
    with torch.no_grad():
        for texts, labels in data_iter:
            outputs = model(texts)
            loss = F.cross_entropy(outputs, labels)
            loss_total += loss
            labels = labels.data.cpu().numpy()
            predic = torch.max(outputs.data, 1)[1].cpu().numpy()
            labels_all = np.append(labels_all, labels)
            predict_all = np.append(predict_all, predic)

    acc = metrics.accuracy_score(labels_all, predict_all)
    if test:
        report = metrics.classification_report(labels_all, predict_all, target_names=config.class_list, digits=4)
        confusion = metrics.confusion_matrix(labels_all, predict_all)
        return acc, loss_total / len(data_iter), report, confusion
    return acc, loss_total / len(data_iter)

# Run

In [104]:
import time
import torch
import numpy as np
from importlib import import_module
import argparse

#parser = argparse.ArgumentParser(description='Chinese Text Classification')
#parser.add_argument('--model', type=str, required=True, help='choose a model: Bert, ERNIE')
#args = parser.parse_args()

# args={"model": "Bert"}


if __name__ == '__main__':
    dataset = 'THUCNews'  # 数据集

    #model_name = args.model  # bert
    #x = import_module('models.' + model_name)
    config = Config(dataset)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed_all(1)
    torch.backends.cudnn.deterministic = True  # 保证每次结果一样

    start_time = time.time()
    print("Loading data...")
    train_data, dev_data, test_data = build_dataset(config)
    train_iter = build_iterator(train_data, config)
    dev_iter = build_iterator(dev_data, config)
    test_iter = build_iterator(test_data, config)
    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)

    # train
    model = Model(config).to(config.device)
    train(config, model, train_iter, dev_iter, test_iter)

Loading data...


15086it [00:01, 7575.46it/s]
2515it [00:00, 7578.83it/s]
2514it [00:00, 7614.75it/s]


Time usage: 0:00:03
Epoch [1/3]
Iter:      0,  Train Loss:   2.3,  Train Acc: 12.50%,  Val Loss:   2.3,  Val Acc:  8.19%,  Time: 0:00:32 *
Iter:    100,  Train Loss:  0.81,  Train Acc: 78.12%,  Val Loss:  0.98,  Val Acc: 69.30%,  Time: 0:14:29 *
Epoch [2/3]
Iter:    200,  Train Loss:  0.83,  Train Acc: 69.53%,  Val Loss:   1.0,  Val Acc: 67.99%,  Time: 0:26:55 
Epoch [3/3]
Iter:    300,  Train Loss:  0.53,  Train Acc: 81.25%,  Val Loss:   1.1,  Val Acc: 65.45%,  Time: 0:39:05 
Test Loss:  0.99,  Test Acc: 70.05%
Precision, Recall and F1-Score...
              precision    recall  f1-score   support

          动力     0.7457    0.7599    0.7527       683
          价格     0.7861    0.7982    0.7921       327
          油耗     0.7389    0.7918    0.7644       293
          操控     0.7338    0.3864    0.5062       264
         舒适性     0.5805    0.6982    0.6339       222
          配置     0.6873    0.7807    0.7310       228
         安全性     0.5660    0.6977    0.6250       129
          内饰   