# 1 数据预处理


In [1]:
import paddle
import paddlenlp as ppnlp
from paddlenlp.metrics import ChunkEvaluator
from functools import partial
from paddlenlp.data import Stack, Pad, Tuple

## 1.1 加载自定义数据集

In [2]:
from paddlenlp.datasets import load_dataset
import pandas as pd

def read(data_path,is_test = False):
    if not is_test:
        data = pd.read_csv(data_path,encoding="gb18030")
        data.drop(index=18282,inplace=True)
        for index,row in data.iterrows():
            sentence = row[1]
            labels = row[2]
            yield {'text': sentence, 'label': labels}
    else:
        data = pd.read_csv(data_path,skiprows=40000,nrows=10000,encoding="gb18030")
        for index,row in data.iterrows():
            sentence = row[1]
            yield {'text': sentence}

train_ds = load_dataset(read, data_path="./work/BDCI/train.csv",is_test = False,lazy=False)
# test_ds = load_dataset(read, data_path="./work/BDCI/train.csv",is_test = True,lazy=False) 

## 1.2 定义数据转换函数，实现文字编码

In [3]:
def convert_example(example, tokenizer, max_seq_length=256,is_test=False):
    encoded_inputs = tokenizer(example["text"], max_seq_len=max_seq_length)
    input_ids = encoded_inputs["input_ids"]
    token_type_ids = encoded_inputs["token_type_ids"]
    if not is_test:
        label = [example["label"]]
        return input_ids, token_type_ids, label
    else:
        return input_ids, token_type_ids

## 1.3 构造偏函数，将单条数据进行转换
此处需先定义tokenizer，再定义偏函数

In [4]:
batch_size = 64
max_seq_length = 312
tokenizer = ppnlp.transformers.ErnieTokenizer.from_pretrained('ernie-1.0')

[2022-10-28 21:01:45,033] [    INFO] - Downloading vocab.txt from https://paddlenlp.bj.bcebos.com/models/transformers/ernie/vocab.txt
100%|██████████| 90/90 [00:00<00:00, 2649.18it/s]


In [5]:
print(convert_example(train_ds[100],tokenizer=tokenizer,max_seq_length=128,is_test=False))
print(convert_example(test_ds[100],tokenizer=tokenizer,max_seq_length=128,is_test=True))

([1, 1073, 1169, 68, 2201, 4, 654, 4, 2670, 17, 5010, 136, 5010, 139, 39, 21, 4, 2746, 495, 4, 644, 219, 244, 484, 904, 308, 8, 4, 590, 12, 68, 73, 4, 87, 11, 644, 219, 244, 3376, 181, 488, 1769, 231, 1342, 4, 22, 171, 612, 8, 68, 2201, 1169, 1860, 2785, 1073, 4, 1079, 239, 9, 195, 1510, 1342, 681, 17, 627, 27, 136, 4, 145, 239, 1485, 192, 8, 119, 1387, 4225, 183, 12043, 803, 1079, 49, 4, 171, 612, 8, 16, 231, 4, 160, 39, 28, 1005, 12043, 1751, 181, 99, 663, 1631, 121, 8, 119, 594, 788, 245, 160, 39, 730, 1005, 12043, 644, 219, 244, 1751, 181, 99, 12, 222, 8, 119, 72, 245, 37, 1541, 17, 5010, 136, 5010, 2], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],

NameError: name 'test_ds' is not defined

## 1.4 构造batchify_fn，在batch数据构造时进行padding

In [None]:
batchify_fn_train = lambda samples, fn=Tuple(
    Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
    Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment
    Stack(dtype="int64")  # label
): [data for data in fn(samples)]
batchify_fn_test = lambda samples, fn=Tuple(
    Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
    Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment
): [data for data in fn(samples)]

## 1.5 构造dataloader

In [None]:
trans_func = partial(
    convert_example,
    tokenizer = tokenizer,    
    max_seq_length = max_seq_length,
    is_test = False)

train_ds = train_ds.map(trans_func)
# test_ds = test_ds.map(trans_func)



In [None]:
train_loader = paddle.io.DataLoader(
        dataset=train_ds,
        batch_size=64,
        collate_fn=batchify_fn_train,
        shuffle=True,
        return_list=True)
     
test_loader = paddle.io.DataLoader(
        dataset=test_ds,
        batch_size=64,
        shuffle=False,
        collate_fn=batchify_fn_test,
        return_list=True)

# 2.定义分类模型

In [None]:
model = ppnlp.transformers.ErnieForSequenceClassification.from_pretrained('ernie-1.0', num_classes=32)

# 3 训练模型

In [None]:
optimizer = paddle.optimizer.AdamW(learning_rate=2e-5, parameters=model.parameters())
criterion = paddle.nn.loss.CrossEntropyLoss()
metric = paddle.metric.Accuracy()
epochs = 20


In [None]:
import paddle.nn.functional as F
def train(model,train_loader):
    global_step = 0
    for epoch in range(1, epochs + 1):
        for step, batch in enumerate(train_loader, start=1):
            input_ids, segment_ids, labels = batch
            logits = model(input_ids, segment_ids)
            loss = criterion(logits, labels)
            probs = F.softmax(logits, axis=1)
            correct = metric.compute(probs, labels)
            metric.update(correct)
            acc = metric.accumulate()
            global_step += 1
            
            if global_step % 100 == 0 :
                print("global step %d, epoch: %d, batch: %d, loss: %.5f, acc: %.5f" % (global_step, epoch, step, loss, acc))
            # 反向梯度回传，更新参数
            loss.backward()
            optimizer.step()
            optimizer.clear_grad()
    model.save_pretrained('/home/aistudio/checkpoint')
    tokenizer.save_pretrained('/home/aistudio/checkpoint')

In [None]:
train(model,train_loader)

# 4 模型预测

## 4.1 加载模型

In [None]:
model = ppnlp.transformers.ErnieForSequenceClassification.from_pretrained('ernie-1.0', num_classes=32)
model_dict = paddle.load('/home/aistudio/checkpoint/model_state.pdparams')
model.set_dict(model_dict)

## 4.2 在测试集上预测

In [None]:
# label_map = dict(zip(label_idx.values(),label_idx.keys()))
# model.eval()
# predictions = []
# for input_ids, segment_ids in test_loader:
#     logits = model(input_ids, segment_ids)
#     probs = F.softmax(logits, axis=1)  
#     idx = paddle.argmax(probs, axis=1).numpy()
#     idx = idx.tolist()
#     labels = [label_map[i] for i in idx]
#     predictions.extend(labels)

In [None]:
# predictions[:10]

In [None]:
def pre_read(data_path):
    data = pd.read_csv(data_path,encoding="utf-8")
    for index,row in data.iterrows():
        sentence = row[1]
        yield {'text': sentence}

pre_func = partial(
    convert_example,
    tokenizer = tokenizer,    
    max_seq_length = max_seq_length,
    is_test = True)


pre_loader = load_dataset(pre_read, data_path="./work/BDCI/testA.csv",lazy=False) 
pre_ds = pre_loader.map(pre_func)       
pre_loader = paddle.io.DataLoader(
        dataset=pre_ds,
        batch_size=64,
        shuffle=False,
        collate_fn=batchify_fn_test,
        return_list=True)
model.eval()
predictions = []
for input_ids, segment_ids in pre_loader:
    logits = model(input_ids, segment_ids)
    probs = F.softmax(logits, axis=1)  
    idx = paddle.argmax(probs, axis=1).numpy()
    idx = idx.tolist()
    labels = [i for i in idx]
    predictions.extend(labels)

In [None]:
data = pd.read_csv(data_path="./work/BDCI/testA.csv",encoding="utf-8")
df = pd.DataFrame(data={"id":data["id"],"label":predictions})
df.to_csv("submisstion.csv")