In [1]:
import random, time, os

import numpy as np
import torch
import torch.nn as nn
from tensorflow.keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from tqdm import tqdm
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, BertConfig

# BERT模型分类

## 创建数据集

In [3]:
# Locations of the THUCNews splits and of the pretrained BERT checkpoint.
corpus_path = "../../H/datasets/THUCNews/"
train_path, dev_path, test_path = (
    os.path.join(corpus_path, split_file)
    for split_file in ("train.txt", "dev.txt", "test.txt")
)
model_path = '../../H/models/huggingface/bert-base-chinese'

In [3]:
# 加载语料
def load_corpus(path):
    """Read a tab-separated corpus file into parallel sentence/label lists.

    Every non-blank line is expected to be ``<sentence><TAB><label>`` where
    the label is an integer class id.

    Args:
        path: Path of the UTF-8 encoded corpus file.

    Returns:
        A ``(sentences, labels)`` tuple: a list of str and a list of int,
        both in file order.

    Raises:
        ValueError: If a non-blank line has no tab separator or its label
            cannot be parsed as an integer.
    """
    sentences = []
    labels = []
    with open(path, 'r', encoding='UTF-8') as f:
        for line in tqdm(f):
            line = line.strip()
            if not line:
                continue
            # Split on the LAST tab only, so a sentence that itself contains
            # a tab does not break the two-way unpacking.
            sent, label = line.rsplit('\t', 1)
            sentences.append(sent)
            labels.append(int(label))
    return sentences, labels


train_data, train_labels = load_corpus(train_path)

# Number of training examples
print(f"No of Training Data: {len(train_data)}")

# Number of distinct classes
print(f"No of different Labels: {len(set(train_labels))}")

# Show one training sample together with its label
print(f"Sample Data: {train_data[900]}, Label: {train_labels[900] }")

180000it [00:00, 1396939.80it/s]

No of Training Data: 180000
No of different Labels: 10
Sample Data: 斯里兰卡急派外交部长前往巴基斯坦, Label: 6





In [41]:
# Read the class-name list. The context manager closes the file handle;
# a bare open(...).read() leaves that to the garbage collector.
with open(corpus_path + 'class.txt', 'r', encoding='UTF-8') as f:
    label_names = f.read()
print(label_names)

finance
realty
stocks
education
science
society
politics
sports
game
entertainment


> 语料的形式为：句子 + 标签；新闻句子及其对应的类别

In [4]:
# Tokenizer, loaded from the local checkpoint directory in `model_path`
bert_tokenizer = BertTokenizer.from_pretrained(model_path)

In [6]:
# 向量化
def vectorize(sentences, tokenizer):
    """Encode every sentence into a list of token ids.

    Args:
        sentences: Iterable of raw sentences.
        tokenizer: Object exposing ``encode(text, add_special_tokens=...)``.

    Returns:
        A list with one encoded id sequence per sentence, in input order.
        The tokenizer is asked to add its special tokens.
    """
    return [
        tokenizer.encode(text, add_special_tokens=True)
        for text in sentences
    ]


# Encode the whole training set, then display the ids of sample 900.
input_ids = vectorize(train_data, tokenizer=bert_tokenizer)
input_ids[900]

[101,
 3172,
 7027,
 1065,
 1305,
 2593,
 3836,
 1912,
 769,
 6956,
 7270,
 1184,
 2518,
 2349,
 1825,
 3172,
 1788,
 102]

In [7]:
# Longest encoded sequence (special tokens included); used as padding length.
MAX_LEN = max([len(sen) for sen in input_ids])
print("Max sentence length: ", MAX_LEN)

Max sentence length:  35


In [8]:
# Pad every sequence to the same length
from tensorflow.keras.preprocessing.sequence import pad_sequences

print("Padding token: {:}, ID: {:}".format(bert_tokenizer.pad_token,
                                           bert_tokenizer.pad_token_id))
# Sequences are padded (and, if longer than MAX_LEN, truncated) at the end
# with id 0, which is the pad id printed by the statement above.
input_ids = pad_sequences(input_ids,
                          maxlen=MAX_LEN,
                          dtype='long',
                          value=0,
                          truncating="post",
                          padding='post')
input_ids.shape, input_ids[900]

Padding token: [PAD], ID: 0


((180000, 35),
 array([ 101, 3172, 7027, 1065, 1305, 2593, 3836, 1912,  769, 6956, 7270,
        1184, 2518, 2349, 1825, 3172, 1788,  102,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0]))

In [9]:
# 对应的掩码
# 填充对应的mask


def create_mask(input_ids):
    """Build attention masks for padded id sequences.

    Args:
        input_ids: Iterable of id sequences (one row per sentence).

    Returns:
        A list of lists of ints with the same layout as the input, holding
        1 where the id is greater than 0 and 0 elsewhere.
    """
    return [[int(tok > 0) for tok in row] for row in input_ids]


# Attention mask for the padded training ids; print the mask of sample 900.
mask = create_mask(input_ids)
print(mask[900])

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [10]:
# Convert to PyTorch tensors
import torch

train_inputs = torch.tensor(input_ids)
# NOTE: rebinds `train_labels` from a Python list to a tensor, so this cell
# is not idempotent with respect to the cell that loaded the corpus.
train_labels = torch.tensor(train_labels)
train_mask = torch.tensor(mask)
train_inputs.shape, train_labels.shape, train_mask.shape

(torch.Size([180000, 35]), torch.Size([180000]), torch.Size([180000, 35]))

In [12]:
# Build the data pipeline

from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

batch_size = 32
# NOTE: `train_data` previously held the raw sentences; from here on it is
# the TensorDataset of (ids, mask, labels).
train_data = TensorDataset(train_inputs, train_mask, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data,
                              sampler=train_sampler,
                              batch_size=batch_size)
len(train_dataloader)

5625

In [17]:
# 将整个过程整合起来
import os
from tensorflow.keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler


class DataGen:
    """Vectorise a tab-separated corpus for BERT and serve it as a DataLoader.

    The corpus is tokenised, zero-padded to the longest sequence in the file
    and stored next to the corpus as ``<corpus-without-extension>.npz``.
    Later constructions with the same corpus path load that cache instead of
    tokenising again.

    NOTE: the cache is keyed by the corpus path only. Delete the ``.npz``
    file when switching to a different tokenizer, otherwise the ids produced
    by the previous tokenizer are loaded.

    Attributes set by ``__init__`` (all ``torch.long`` tensors):
        train_inputs: padded token ids, one row per example.
        train_mask: attention mask, 1 where the id is greater than 0.
        train_labels: integer class ids.
    """

    def __init__(self, corpus_path, tokenizer, num_samples=None):
        """Load the vectorised corpus, building and caching it if necessary.

        Args:
            corpus_path: Path of the ``<sentence><TAB><label>`` corpus file.
            tokenizer: Object exposing ``encode(text, add_special_tokens=...)``;
                only used when no cache file exists yet.
            num_samples: If truthy, keep only the first ``num_samples``
                examples; otherwise keep everything.

        Raises:
            ValueError: If the corpus has to be vectorised and is empty.
        """
        self.tokenizer = tokenizer
        corpus_path = os.path.abspath(corpus_path)
        # Strip only the file extension. Splitting on the first '.' would
        # truncate the path at any dotted directory name and the cache would
        # never be found (or be written to the wrong place).
        saved_path = os.path.splitext(corpus_path)[0]
        cache_file = saved_path + '.npz'  # numpy's archive format

        if not os.path.exists(cache_file):
            sentences, train_labels = self.load_corpus(corpus_path)
            train_inputs = self.vectorize(sentences, self.tokenizer)
            if not train_inputs:
                raise ValueError(
                    'no examples found in corpus: {}'.format(corpus_path))
            MAX_LEN = max(len(seq) for seq in train_inputs)

            train_inputs = pad_sequences(
                train_inputs,
                maxlen=MAX_LEN,
                dtype='long',
                value=0,
                truncating="post",
                padding='post',
            )
            train_mask = self.create_mask(train_inputs)
            print(saved_path)
            # np.savez appends the '.npz' suffix itself.
            np.savez(saved_path,
                     train_inputs=train_inputs,
                     train_mask=train_mask,
                     train_labels=train_labels)
            print('Done')
        else:
            # The archive only holds plain integer arrays, so pickle support
            # is not needed; the context manager closes the file handle.
            with np.load(cache_file) as npzfile:
                train_inputs = npzfile['train_inputs']
                train_labels = npzfile['train_labels']
                train_mask = npzfile['train_mask']

        if num_samples:
            train_inputs = train_inputs[:num_samples]
            train_labels = train_labels[:num_samples]
            train_mask = train_mask[:num_samples]

        self.train_inputs = torch.tensor(train_inputs).long()
        self.train_labels = torch.tensor(train_labels).long()
        self.train_mask = torch.tensor(train_mask).long()

    def __call__(self, batch_size, shuffle=True):
        """Wrap the tensors in a DataLoader.

        Args:
            batch_size: Number of examples per batch.
            shuffle: When True (default, the historical behaviour) batches
                are drawn with a RandomSampler; when False examples are
                served in corpus order with a SequentialSampler.

        Returns:
            A DataLoader yielding ``(input_ids, attention_mask, labels)``.
        """
        dataset = TensorDataset(
            self.train_inputs,
            self.train_mask,
            self.train_labels,
        )
        if shuffle:
            sampler = RandomSampler(dataset)
        else:
            sampler = SequentialSampler(dataset)
        return DataLoader(dataset, sampler=sampler, batch_size=batch_size)

    def load_corpus(self, path):
        """Read ``<sentence><TAB><label>`` lines into two parallel lists.

        Blank lines are skipped. Returns ``(sentences, labels)`` where
        labels are ints.
        """
        sentences = []
        labels = []
        with open(path, 'r', encoding='UTF-8') as f:
            for line in tqdm(f):
                line = line.strip()
                if not line:
                    continue
                # Split on the last tab only so that tabs inside the
                # sentence do not break the unpacking.
                sent, label = line.rsplit('\t', 1)
                sentences.append(sent)
                labels.append(int(label))
        return sentences, labels

    def vectorize(self, sentences, tokenizer):
        """Encode each sentence to token ids, asking for special tokens."""
        return [
            tokenizer.encode(sent, add_special_tokens=True)
            for sent in sentences
        ]

    def create_mask(self, input_ids):
        """Return per-row attention masks: 1 where the id is > 0, else 0."""
        return [[int(token_id > 0) for token_id in sent]
                for sent in input_ids]

In [6]:
%%time
# Vectorise (or load from the .npz cache) the training split and wrap it in a loader.
train_data_gen = DataGen(train_path, tokenizer=bert_tokenizer)
train_dataloader = train_data_gen(batch_size=32)
len(train_dataloader)

180000it [00:00, 1371716.12it/s]


/home/yangbin7/H/datasets/THUCNews/train
Done
CPU times: user 21.6 s, sys: 139 ms, total: 21.7 s
Wall time: 20.8 s


5625

In [7]:
%%time
# Same pipeline for the dev split, used as validation data.
validation_data_gen = DataGen(dev_path, tokenizer=bert_tokenizer)
validation_dataloader = validation_data_gen(batch_size=32)
len(validation_dataloader)

10000it [00:00, 987359.70it/s]


/home/yangbin7/H/datasets/THUCNews/dev
Done
CPU times: user 2.32 s, sys: 4.05 ms, total: 2.32 s
Wall time: 1.14 s


313

In [8]:
%%time
# Same pipeline for the held-out test split.
test_data_gen = DataGen(test_path, tokenizer=bert_tokenizer)
test_dataloader = test_data_gen(batch_size=32)
len(test_dataloader)

10000it [00:00, 1005442.52it/s]


/home/yangbin7/H/datasets/THUCNews/test
Done
CPU times: user 2.45 s, sys: 19.8 ms, total: 2.47 s
Wall time: 1.17 s


313

In [17]:
# Peek at the first training batch: ids, attention mask and labels.
for data in train_dataloader:
    input_data, input_mask, input_labels = data
    print(input_data)
    print(input_mask)
    print(input_labels)
    break

tensor([[ 101, 3719, 1164,  ...,    0,    0,    0],
        [ 101, 6205, 7305,  ...,    0,    0,    0],
        [ 101, 7357, 3236,  ...,    0,    0,    0],
        ...,
        [ 101,  772,  689,  ...,    0,    0,    0],
        [ 101, 5792, 7674,  ...,    0,    0,    0],
        [ 101,  782,  924,  ...,    0,    0,    0]])
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])
tensor([2, 4, 9, 6, 5, 5, 3, 3, 7, 3, 0, 6, 1, 6, 2, 7, 3, 5, 1, 9, 5, 0, 2, 9,
        2, 0, 7, 5, 5, 4, 6, 6])


## 创建模型
`BertForSequenceClassification` 是已经封装了线性分类层的模型，传入 `labels` 时**输出直接包含损失**，不需要再自定义分类层

In [19]:
model_path = '../../H/models/huggingface/bert-base-chinese'
model = BertForSequenceClassification.from_pretrained(
    model_path,  # load from local files
    num_labels=10,  # number of target classes
    output_attentions=False,  # whether to return attention weights
    output_hidden_states=False,  # whether to return hidden states
)
for param in model.parameters():  # mark every parameter as trainable
    param.requires_grad = True

# Requires a CUDA device; the trailing ';' suppresses the cell output.
model.cuda();

In [9]:
# 封装成函数

def create_classifier(model_path,
                      num_labels,
                      output_attentions=True,
                      output_hidden_states=True):
    """Load a BertForSequenceClassification checkpoint and move it to the GPU.

    Args:
        model_path: Checkpoint location handed to ``from_pretrained``.
        num_labels: Number of target classes.
        output_attentions: Forwarded to ``from_pretrained``.
        output_hidden_states: Forwarded to ``from_pretrained``.

    Returns:
        The model, with every parameter flagged as trainable, after
        ``.cuda()`` has been called on it.
    """
    pretrained_kwargs = dict(
        num_labels=num_labels,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
    )
    classifier = BertForSequenceClassification.from_pretrained(
        model_path, **pretrained_kwargs)

    # Fine-tune the full network: nothing stays frozen.
    for weight in classifier.parameters():
        weight.requires_grad = True

    classifier.cuda()
    return classifier




In [None]:
# Build the 10-class classifier from the checkpoint currently in `model_path`.
model = create_classifier(model_path=model_path, num_labels=10)

## 训练模型

In [21]:
from transformers import BertForSequenceClassification, AdamW, BertConfig

# Optimizer
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)


# Learning-rate schedule
from transformers import get_linear_schedule_with_warmup

epochs = 4

# NOTE(review): the schedule length is derived from `epochs` (4), while the
# manual training loop further down iterates `num_epochs` (2) times —
# confirm which value is intended.
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)

In [10]:

# 预测精度
import numpy as np


def flat_accuracy(preds, labels):
    """Fraction of rows whose arg-max prediction equals the label.

    Args:
        preds: 2-D array of per-class scores, one row per example.
        labels: Array of integer class ids; must provide ``flatten()``.

    Returns:
        Number of matching predictions divided by the number of labels.
    """
    expected = labels.flatten()
    predicted = np.argmax(preds, axis=1).flatten()
    hits = np.sum(predicted == expected)
    return hits / len(expected)


# 格式化时间显示
import time
import datetime


def format_time(elapsed):
    """Render a duration in seconds as the string form of a timedelta.

    The value is rounded to whole seconds first, e.g. ``65.4`` becomes
    ``'0:01:05'``.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [None]:
import random


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

seed_val = 42
# NOTE(review): the scheduler cell computes its total step count from
# `epochs` (4), not from this value — confirm the two are meant to differ.
num_epochs = 2

# Seed every random number generator used during training.
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Average training loss of each epoch.
loss_values = []

for epoch in range(0, num_epochs):

    # ========================================
    #               Training
    # ========================================

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch + 1, num_epochs))
    print('Training...')

    t0 = time.time()

    total_loss = 0

    model.train()

    for step, batch in enumerate(train_dataloader):

        if step % 1000 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)

            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(
                step, len(train_dataloader), elapsed))

        # Each batch is (input ids, attention mask, labels).
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Clear gradients left over from the previous step.
        model.zero_grad()

        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)
        # Returns a (loss, logits, hidden_state, attentions) tuple

        loss = outputs[0]

        total_loss += loss.item()

        loss.backward()

        # Clip the gradient norm to 1.0 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()

        # Advance the learning-rate schedule once per optimizer step.
        scheduler.step()

    avg_train_loss = total_loss / len(train_dataloader)

    loss_values.append(avg_train_loss)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))

    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")

    t0 = time.time()

    model.eval()

    # Tracking variables (eval_loss and nb_eval_examples are never updated)
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    for batch in validation_dataloader:

        batch = tuple(t.to(device) for t in batch)

        b_input_ids, b_input_mask, b_labels = batch

        with torch.no_grad():

            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask)

        # No labels were passed, so the first output is read as the logits.
        logits = outputs[0]

        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        tmp_eval_accuracy = flat_accuracy(logits, label_ids)

        eval_accuracy += tmp_eval_accuracy

        nb_eval_steps += 1

    # Reported accuracy is the mean of the per-batch accuracies.
    print("  Accuracy: {0:.2f}".format(eval_accuracy / nb_eval_steps))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))

print("")
print("Training complete!")

In [11]:
# 训练过程封装

from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup  # 学习率规划

# Call is_available(): the bare function object is always truthy, which
# selected "cuda" even on machines without a GPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def train_model(model,
                train_dataloader,
                validation_dataloader,
                OPTIMIZER,
                device,
                num_epochs=10,
                random_seed=42,
                print_every=1000):
    """Fine-tune ``model`` and report validation accuracy after every epoch.

    Args:
        model: Classifier called as ``model(input_ids, attention_mask=...,
            labels=...)``. It must return ``(loss, logits, ...)`` when labels
            are given and ``(logits, ...)`` otherwise.
        train_dataloader: Yields ``(input_ids, attention_mask, labels)``.
        validation_dataloader: Same batch layout, used for evaluation only.
        OPTIMIZER: Optimizer class, instantiated as
            ``OPTIMIZER(model.parameters(), lr=2e-5, eps=1e-8)``.
        device: Device on which the model and every batch are placed.
        num_epochs: Number of passes over ``train_dataloader``.
        random_seed: Seed applied to ``random``, NumPy and torch.
        print_every: Progress is printed every ``print_every`` steps.

    Returns:
        ``(model, loss_values, train_accs, validation_accs)``; the three
        lists hold one float per epoch. Accuracies are computed over
        individual examples, so a short final batch is not over-weighted.
    """
    # Make sure the weights live where the batches are sent.
    model.to(device)

    optimizer = OPTIMIZER(model.parameters(), lr=2e-5, eps=1e-8)
    total_steps = len(train_dataloader) * num_epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps,
    )

    random.seed(random_seed)
    np.random.seed(random_seed)
    torch.manual_seed(random_seed)
    torch.cuda.manual_seed_all(random_seed)

    loss_values = []
    train_accs = []
    validation_accs = []

    for epoch in range(0, num_epochs):

        # ========================================
        #               Training
        # ========================================

        print("")
        print('======== Epoch {:} / {:} ========'.format(
            epoch + 1, num_epochs))
        print('Training...')

        t0 = time.time()

        total_loss = 0
        train_correct, train_seen = 0, 0

        model.train()  # training mode

        for step, batch in enumerate(train_dataloader):

            if step % print_every == 0 and not step == 0:
                elapsed = format_time(time.time() - t0)

                # Report progress.
                print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(
                    step, len(train_dataloader), elapsed))

            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            # Clear gradients left over from the previous step.
            model.zero_grad()

            # With labels the model returns (loss, logits, ...).
            outputs = model(b_input_ids,
                            attention_mask=b_input_mask,
                            labels=b_labels)
            loss, logits = outputs[0], outputs[1]
            total_loss += loss.item()

            loss.backward()

            # Clip the gradient norm to guard against exploding gradients.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()
            # Advance the learning-rate schedule once per optimizer step.
            scheduler.step()

            train_correct += _count_correct(logits, b_labels)
            train_seen += b_labels.size(0)

        avg_train_loss = _ratio(total_loss, len(train_dataloader))
        avg_train_accuracy = _ratio(train_correct, train_seen)

        loss_values.append(avg_train_loss)
        train_accs.append(avg_train_accuracy)

        print("")
        print("  Average training loss: {0:.2f}".format(avg_train_loss))
        print(
            "  Average training accuracy: {0:.4f}".format(avg_train_accuracy))
        print("  Training epcoh took: {:}".format(format_time(time.time() -
                                                              t0)))

        # ========================================
        #               Validation
        # ========================================

        print("")
        print("Running Validation...")

        t0 = time.time()

        model.eval()  # evaluation mode

        eval_correct, eval_seen = 0, 0

        for batch in validation_dataloader:

            b_input_ids, b_input_mask, b_labels = tuple(
                t.to(device) for t in batch)

            with torch.no_grad():
                outputs = model(b_input_ids,
                                token_type_ids=None,
                                attention_mask=b_input_mask)

            # Without labels the first output holds the logits.
            eval_correct += _count_correct(outputs[0], b_labels)
            eval_seen += b_labels.size(0)

        validation_accuracy = _ratio(eval_correct, eval_seen)
        print("  Accuracy: {0:.4f}".format(validation_accuracy))
        print("  Validation took: {:}".format(format_time(time.time() - t0)))
        validation_accs.append(validation_accuracy)

    print("")
    print("Training complete!")
    return model, loss_values, train_accs, validation_accs


def _count_correct(logits, labels):
    """Count rows of ``logits`` whose arg-max over dim 1 equals the label."""
    predicted = logits.detach().argmax(dim=1).view(-1)
    return (predicted == labels.view(-1)).sum().item()


def _ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is 0."""
    return numerator / denominator if denominator else float('nan')

In [31]:
import random, time
# Train on the first 80,000 training examples; validate on the full dev split.
train_dataloader = DataGen(train_path,
                           tokenizer=bert_tokenizer,
                           num_samples=80000)(batch_size=32)
validation_dataloader = DataGen(dev_path,
                                tokenizer=bert_tokenizer)(batch_size=32)
# `model` is rebound to the instance returned by train_model.
model, loss_values, train_accs, validation_accs = train_model(
    model,
    train_dataloader,
    validation_dataloader,
    AdamW,
    device,
    num_epochs=5,
    print_every=500)


Training...

  Average training loss: 0.28
  Average training accuracy: 0.92
  Training epcoh took: 0:03:43

Running Validation...
  Accuracy: 0.93
  Validation took: 0:00:07

Training...

  Average training loss: 0.16
  Average training accuracy: 0.95
  Training epcoh took: 0:03:45

Running Validation...
  Accuracy: 0.93
  Validation took: 0:00:07

Training...

  Average training loss: 0.09
  Average training accuracy: 0.97
  Training epcoh took: 0:03:45

Running Validation...
  Accuracy: 0.94
  Validation took: 0:00:07

Training...

  Average training loss: 0.05
  Average training accuracy: 0.98
  Training epcoh took: 0:03:44

Running Validation...
  Accuracy: 0.94
  Validation took: 0:00:07

Training...

  Average training loss: 0.03
  Average training accuracy: 0.99
  Training epcoh took: 0:03:43

Running Validation...
  Accuracy: 0.94
  Validation took: 0:00:07

Training complete!


## 测试模型

In [12]:
# Peek at the first test batch: ids, attention mask and labels.
for data in test_dataloader:
    input_data, input_mask, input_labels = data
    print(input_data)
    print(input_mask)
    print(input_labels)
    break

tensor([[ 101, 5125, 3209,  ...,    0,    0,    0],
        [ 101, 5401, 1957,  ...,    0,    0,    0],
        [ 101, 6225,  830,  ...,    0,    0,    0],
        ...,
        [ 101, 7770, 3926,  ...,    0,    0,    0],
        [ 101, 2791, 2255,  ...,    0,    0,    0],
        [ 101, 1367, 1213,  ...,    0,    0,    0]], dtype=torch.int32)
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], dtype=torch.int32)
tensor([1, 7, 5, 6, 1, 8, 6, 3, 8, 6, 6, 0, 4, 1, 5, 7, 0, 3, 6, 1, 6, 8, 6, 7,
        0, 7, 2, 7, 2, 4, 1, 7], dtype=torch.int32)


In [13]:
def evaluate(model, test_dataloader, device=None):
    """Measure classification accuracy of ``model`` on a dataloader.

    Args:
        model: Classifier called as ``model(input_ids, token_type_ids=None,
            attention_mask=...)`` whose first output holds the logits.
        test_dataloader: Yields ``(input_ids, attention_mask, labels)``.
        device: Device the batches are moved to. Defaults to the device of
            the model's first parameter, so the function no longer depends
            on a notebook-level ``device`` variable.

    Returns:
        Accuracy as a float (correct examples / total examples), or NaN if
        the dataloader is empty. The value is also printed along with the
        elapsed time.
    """
    t0 = time.time()

    if device is None:
        device = next(model.parameters()).device

    model.eval()

    # Count per example rather than averaging per-batch accuracies, which
    # would over-weight a short final batch.
    correct, total = 0, 0
    for batch in test_dataloader:

        b_input_ids, b_input_mask, b_labels = tuple(
            t.to(device) for t in batch)

        with torch.no_grad():
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask)

        # Without labels the first output holds the logits.
        predicted = outputs[0].argmax(dim=1).view(-1)
        correct += (predicted == b_labels.view(-1)).sum().item()
        total += b_labels.size(0)

    accuracy = correct / total if total else float('nan')

    print("  Accuracy: {0:.4f}".format(accuracy))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))
    return accuracy

In [40]:
evaluate(model, test_dataloader)

  Accuracy: 0.9424
  Validation took: 0:00:07


> 原始 BERT 模型的精度为 0.9424

# ERNIE模型分类

In [42]:
# Load the ERNIE checkpoint through the BERT sequence-classification class.
ernie = BertForSequenceClassification.from_pretrained(
    '../../H/models/huggingface/ERNIE/',
    num_labels=10,  # number of target classes
    output_attentions=False,  # whether to return attention weights
    output_hidden_states=False,  # whether to return hidden states
)

In [None]:
import transformers
from transformers import BertModel, BertTokenizer

In [None]:
# Tokenizer shipped with the ERNIE checkpoint.
# NOTE(review): the data loaders built in this notebook are all created with
# `bert_tokenizer`; this tokenizer is not passed to DataGen anywhere.
ernie_tokenizer = BertTokenizer.from_pretrained(
    '../../H/models/huggingface/ERNIE/'
)

In [None]:
from torch.nn import CrossEntropyLoss, MSELoss


class ERNIE(nn.Module):
    """BERT-style encoder with a dropout + linear classification head.

    The forward pass mirrors the tuple convention used by the training
    code: ``(loss, logits, ...)`` when labels are supplied and
    ``(logits, ...)`` otherwise.
    """

    def __init__(self, model_path, num_classes):
        """Load the encoder from ``model_path`` and build the head.

        Args:
            model_path: Checkpoint location handed to
                ``BertModel.from_pretrained``.
            num_classes: Size of the classifier output; 1 selects the
                regression (MSE) loss in ``forward``.
        """
        super(ERNIE, self).__init__()

        # forward() reads self.num_labels to pick and shape the loss; it was
        # never assigned before, which raised AttributeError during training.
        self.num_labels = num_classes

        self.bert = BertModel.from_pretrained(model_path)
        for param in self.bert.parameters():
            param.requires_grad = True

        self.dropout = nn.Dropout(0.1)  # dropout layer

        hidden_size = self.bert.config.hidden_size
        # Fully connected layer producing the class scores.
        self.classifier = nn.Linear(hidden_size, num_classes)

    def forward(
            self,
            input_ids=None,
            attention_mask=None,
            token_type_ids=None,
            position_ids=None,
            head_mask=None,
            inputs_embeds=None,
            labels=None,
    ):
        """Run the encoder and the classification head.

        All arguments except ``labels`` are forwarded to the encoder.

        Returns:
            ``(logits,) + extra encoder outputs``, prefixed with the loss
            when ``labels`` is not None.
        """
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
        )
        # BertModel returns (sequence_output, pooled_output, ...). The
        # classifier needs the pooled per-sentence vector at index 1; index 0
        # is the per-token sequence output and would yield per-token logits.
        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        outputs = (logits, ) + outputs[2:]

        if labels is not None:
            if self.num_labels == 1:
                #  We are doing regression
                loss_fct = MSELoss()
                loss = loss_fct(logits.view(-1), labels.view(-1))
            else:
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels),
                                labels.view(-1))
            outputs = (loss, ) + outputs

        return outputs

In [None]:
# Instantiate the custom ERNIE classifier (this call does not move it to a device).
ernie = ERNIE('../../H/models/huggingface/ERNIE/', num_classes=10)

In [None]:
import random, time
# Train the custom ERNIE module with the loaders built earlier.
# NOTE: the result is bound to `model`, replacing the fine-tuned BERT model.
model, loss_values, train_accs, validation_accs = train_model(
    ernie,
    train_dataloader,
    validation_dataloader,
    AdamW,
    device,
    num_epochs=10,
    print_every=500)

In [14]:
# NOTE: rebinds `model_path` from the BERT checkpoint to the ERNIE checkpoint.
model_path = '../../H/models/huggingface/ERNIE/'
ernie =  create_classifier(model_path=model_path, num_labels=10)

In [18]:
%%time
# NOTE(review): this loader is built with `bert_tokenizer` (and DataGen's
# cache is keyed by corpus path only), yet it is fed to the ERNIE model
# below. If the ERNIE vocabulary differs from bert-base-chinese, some token
# ids may be out of range for its embedding table — possibly the cause of
# the "device-side assert" seen during training. Confirm and rebuild the
# data with `ernie_tokenizer` if so.
train_dataloader = DataGen(train_path,
                           tokenizer=bert_tokenizer,
                           num_samples=80000)(batch_size=32)

CPU times: user 111 ms, sys: 8.2 ms, total: 120 ms
Wall time: 82 ms


In [19]:
# Fine-tune the ERNIE classifier for 5 epochs, logging every 500 steps.
ernie, loss_values, train_accs, validation_accs = train_model(
    ernie,
    train_dataloader,
    validation_dataloader,
    AdamW,
    device,
    num_epochs=5,
    print_every=500)


Training...
  Batch   500  of  2,500.    Elapsed: 0:00:44.


RuntimeError: CUDA error: device-side assert triggered

In [17]:
evaluate(ernie, test_dataloader)

RuntimeError: CUDA error: device-side assert triggered

In [19]:
# Debugging aid: print the dtype of each tensor in the first test batch,
# then try to move the batch to `device`.
for batch in test_dataloader:
    for t in batch:
        print(t.dtype)

    batch = tuple(t.to(device) for t in batch)

    b_input_ids, b_input_mask, b_labels = batch
    break


torch.int64
torch.int64
torch.int64


In [20]:
t

tensor([0, 7, 3, 7, 3, 8, 0, 8, 5, 4, 2, 8, 8, 0, 2, 5, 8, 7, 0, 4, 4, 0, 6, 8,
        9, 1, 1, 1, 0, 2, 7, 0])

In [21]:
t.to(device)

RuntimeError: CUDA error: device-side assert triggered

In [22]:
t

tensor([0, 7, 3, 7, 3, 8, 0, 8, 5, 4, 2, 8, 8, 0, 2, 5, 8, 7, 0, 4, 4, 0, 6, 8,
        9, 1, 1, 1, 0, 2, 7, 0])

In [23]:
t.type()

AttributeError: 'Tensor' object has no attribute 'astype'

In [29]:
t.int().to(device)

RuntimeError: CUDA error: device-side assert triggered

In [28]:
t.float().type()

'torch.FloatTensor'

In [33]:
ttt = torch.tensor([0, 7, 3, 7, 3, 8, 0, 8, 5, 4, 2, 8, 8, 0, 2, 5, 8, 7, 0, 4, 4, 0, 6, 8,
        9, 1, 1, 1, 0, 2, 7, 0])
ttt.dtype

torch.int64

In [35]:
ttt.to(device)

RuntimeError: CUDA error: device-side assert triggered

In [36]:
# Call is_available(): the bare function object is always truthy, so the
# original expression could never fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [37]:
ttt.to(device)

RuntimeError: CUDA error: device-side assert triggered

In [41]:
ttt.cuda()

RuntimeError: CUDA error: device-side assert triggered

In [42]:
!nvidia-smi

Fri May 15 18:17:47 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.48.02    Driver Version: 440.48.02    CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  GeForce RTX 208...  Off  | 00000000:01:00.0 Off |                  N/A |
| 31%   40C    P2    58W / 260W |   4333MiB / 11018MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage    

In [43]:
!dmesg

[32m[    0.000000] [0mLinux version 5.3.0-26-generic (buildd@lgw01-amd64-039) (gcc version 7.4.0 (Ubuntu 7.4.0-1ubuntu1~18.04.1)) #28~18.04.1-Ubuntu SMP Wed Dec 18 16:40:14 UTC 2019 (Ubuntu 5.3.0-26.28~18.04.1-generic 5.3.13)
[32m[    0.000000] [0m[33mCommand line[0m: BOOT_IMAGE=/boot/vmlinuz-5.3.0-26-generic root=/dev/mapper/ubuntu--vg-root ro quiet splash vt.handoff=1
[32m[    0.000000] [0mKERNEL supported cpus:
[32m[    0.000000] [0m  Intel GenuineIntel
[32m[    0.000000] [0m  AMD AuthenticAMD
[32m[    0.000000] [0m  Hygon HygonGenuine
[32m[    0.000000] [0m  Centaur CentaurHauls
[32m[    0.000000] [0m  zhaoxin   Shanghai  
[32m[    0.000000] [0m[33mx86/fpu[0m: Supporting XSAVE feature 0x001: 'x87 floating point registers'
[32m[    0.000000] [0m[33mx86/fpu[0m: Supporting XSAVE feature 0x002: 'SSE registers'
[32m[    0.000000] [0m[33mx86/fpu[0m: Supporting XSAVE feature 0x004: 'AVX registers'
[32m[    0.000000] [0m[33mx86/fpu[0m: Supportin

In [44]:
ttt.cuda()

RuntimeError: CUDA error: device-side assert triggered

In [45]:
!dmesg

[32m[    0.000000] [0mLinux version 5.3.0-26-generic (buildd@lgw01-amd64-039) (gcc version 7.4.0 (Ubuntu 7.4.0-1ubuntu1~18.04.1)) #28~18.04.1-Ubuntu SMP Wed Dec 18 16:40:14 UTC 2019 (Ubuntu 5.3.0-26.28~18.04.1-generic 5.3.13)
[32m[    0.000000] [0m[33mCommand line[0m: BOOT_IMAGE=/boot/vmlinuz-5.3.0-26-generic root=/dev/mapper/ubuntu--vg-root ro quiet splash vt.handoff=1
[32m[    0.000000] [0mKERNEL supported cpus:
[32m[    0.000000] [0m  Intel GenuineIntel
[32m[    0.000000] [0m  AMD AuthenticAMD
[32m[    0.000000] [0m  Hygon HygonGenuine
[32m[    0.000000] [0m  Centaur CentaurHauls
[32m[    0.000000] [0m  zhaoxin   Shanghai  
[32m[    0.000000] [0m[33mx86/fpu[0m: Supporting XSAVE feature 0x001: 'x87 floating point registers'
[32m[    0.000000] [0m[33mx86/fpu[0m: Supporting XSAVE feature 0x002: 'SSE registers'
[32m[    0.000000] [0m[33mx86/fpu[0m: Supporting XSAVE feature 0x004: 'AVX registers'
[32m[    0.000000] [0m[33mx86/fpu[0m: Supportin

In [46]:
ttttttt = torch.tensor([1,2,3,4,5])
ttttttt

tensor([1, 2, 3, 4, 5])

In [47]:
ttttttt.cuda()

RuntimeError: CUDA error: device-side assert triggered

In [48]:
import os
# NOTE(review): CUDA_LAUNCH_BLOCKING is presumably only read when CUDA is
# initialised, so setting it this late in an already-failing session likely
# has no effect. Set it before the first CUDA call after a kernel restart.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [49]:
ttttttt.cuda()

RuntimeError: CUDA error: device-side assert triggered

In [22]:
bert_tokenizer.convert_ids_to_tokens(bert_tokenizer.encode("你号杠啊发到付啊啊"))

['[CLS]', '你', '号', '杠', '啊', '发', '到', '付', '啊', '啊', '[SEP]']