# 层级知识点预测

In [None]:
import os
import yaml
import tqdm
import torch
import numpy as np
from EduNLP.Pretrain import BertTokenizer
from EduNLP.ModelZoo.bert import BertForKnowledgePrediction
from EduNLP.Pretrain import finetune_bert_for_knowledge_prediction
from EduNLP.ModelZoo import load_items

from utils import compute_perfs_per_layer, get_onehot_label_topk, metric, compute_perfs

In [None]:
# Checkpoint locations: the pretrained BERT weights and the directory used
# for the knowledge-prediction model checkpoint.
pretrained_model_dir = "/path/to/bert/checkpoint"
checkpoint_dir = "/path/to/knowledge_model/checkpoint"

# Train / test items, each read from a .jsonl file via load_items.
train_data = load_items("/path/to/data/train.jsonl")
test_data = load_items("/path/to/data/test.jsonl")

## 训练

In [None]:
# Example configuration, using BERT.
#
# Expected item format (one dict per question):
# {
#     'ques_content': 'question...',
#     'know_list': [lay_1_id, lay_2_id, lay_3_id]
# }
data_params = {
    "stem_key": "ques_content",  # item field holding the question text
    "label_key": "know_list",    # item field holding the knowledge labels
}
train_params = {
    "num_train_epochs": 1,
    "per_device_train_batch_size": 2,
    "per_device_eval_batch_size": 2,
    "no_cuda": True,
}
model_params = {
    # Number of classes on each level of the knowledge hierarchy.
    "num_classes_list": [10, 27, 963],
}
# Derived from the per-level counts (10 + 27 + 963 = 1000) so the two values
# cannot drift apart when the hierarchy is edited.
model_params["num_total_classes"] = sum(model_params["num_classes_list"])

# train without eval_items
finetune_bert_for_knowledge_prediction(
    train_data,
    checkpoint_dir,
    pretrained_model=pretrained_model_dir,
    train_params=train_params,
    data_params=data_params,
    model_params=model_params,
)

## 加载模型和评估数据

In [None]:
# Label handling for the multi-label task.
class EvalDataset(torch.utils.data.Dataset):
    """Evaluation dataset yielding ``(encodings, multi_hot_labels)`` pairs.

    Each item of ``data`` is read through its ``"ques_content"`` (question
    text) and ``"know_list"`` (knowledge label ids) fields.

    NOTE(review): the label vector is indexed directly by the ids found in
    ``know_list``, so those ids are presumably global ids in
    ``[0, num_total_classes)`` -- confirm against the data.
    """

    def __init__(self, data) -> None:
        self.data = data
        # Per-level class counts (a list); kept under its original name.
        self.num_classes = model_params['num_classes_list']
        # Length of the multi-hot label vector. The per-level list above
        # cannot be passed to range(), so the total is stored separately.
        self.num_total_classes = model_params['num_total_classes']
        self.tokenizer = BertTokenizer.from_pretrained(pretrained_model_dir)

    def _one_hot(self, labels):
        """Return a list of ``num_total_classes`` floats, 1.0 at each id in *labels*."""
        return [
            1. if class_id in labels else 0.
            for class_id in range(self.num_total_classes)
        ]

    def __getitem__(self, idx):
        """Return the tokenized question and its multi-hot label tensor."""
        item = self.data[idx]
        text, labels = item["ques_content"], item["know_list"]
        encodings = self.tokenizer(text, padding='max_length', truncation=True, return_tensors='pt')
        # The tokenizer is asked for 'pt' tensors; drop their leading
        # dimension so each value is a per-sample tensor.
        for k, v in encodings.items():
            encodings[k] = torch.squeeze(v, dim=0)
        return encodings, torch.FloatTensor(self._one_hot(labels))

    def __len__(self):
        """Return the number of items in the dataset."""
        return len(self.data)

test_dataset = EvalDataset(test_data)
# Batch the dataset with a DataLoader. EvalDataset.__init__ only accepts the
# data argument, so batch_size / shuffle / num_workers belong to the loader.
eval_dataloader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=1,
    shuffle=False,
    num_workers=4,
)

# Load the knowledge-prediction model from the checkpoint directory.
model = BertForKnowledgePrediction.from_pretrained(checkpoint_dir)

## 评估

In [None]:
# Use the GPU only when it was not disabled in train_params AND CUDA is
# actually available; otherwise fall back to the CPU instead of failing later.
use_cuda = not train_params["no_cuda"] and torch.cuda.is_available()
device = "cuda" if use_cuda else "cpu"

# Hierarchical knowledge labels: configuration.
levels = len(model_params["num_classes_list"])
classes_offset_list = [0, 10, 37]
# Id boundaries of each level.
# NOTE(review): the first two ranges look inclusive ([0, 9], [10, 36]) while
# the last one ends at 1000 although num_total_classes is 1000 -- confirm
# against the helpers in utils whether it should be 999.
classes_border_list = [[0, 9], [10, 36], [37, 1000]]
hierarchy_dict = {}  # child_know_id_to_parent_know_id

# Values of k for the top-k evaluation.
top_k_list = [10, 20, 30]

# The model must live on the same device the inputs are moved to below.
model.to(device)
model.eval()

# Each accumulator holds four integer counters that are later unpacked
# into metric().
perfs_per_layer = [np.zeros(4, dtype=np.int32) for _ in range(levels)]
total_perfs = np.zeros(4, dtype=np.int32)
k_total_perfs_list = [np.zeros(4, dtype=np.int32) for _ in top_k_list]

# Gradients are not needed for evaluation.
with torch.no_grad():
    # Iterating the loader directly lets tqdm read its length for the bar.
    for eval_batch in tqdm.tqdm(eval_dataloader):
        input_data, eval_batch_labels = eval_batch
        # Move every tensor individually so this works whether the batch is a
        # plain dict or a mapping type with its own .to() method.
        input_data = {key: value.to(device) for key, value in input_data.items()}
        # NOTE(review): assumes the model call unpacks into exactly two items
        # with the logits second -- confirm against BertForKnowledgePrediction.
        _, output_logits = model(**input_data)

        # Convert once per batch and reuse for every metric below.
        logits_np = output_logits.detach().cpu().numpy()
        labels_np = eval_batch_labels.detach().cpu().numpy()

        local_perfs_per_layer, local_perfs = compute_perfs_per_layer(
            logits_np,
            labels_np,
            hierarchy_dict,
            classes_border_list,
            keep_consistency=True,
        )
        perfs_per_layer = [
            layer_total + layer_local
            for layer_total, layer_local in zip(perfs_per_layer, local_perfs_per_layer)
        ]
        total_perfs += local_perfs

        # for recall@k
        for i_k, k in enumerate(top_k_list):
            pred_topk = get_onehot_label_topk(
                classes_border_list, classes_offset_list, scores_list=logits_np, top_num=k)
            flat_pred_topk = np.array([x[3] for x in pred_topk])
            k_total_perfs = compute_perfs(flat_pred_topk, labels_np.tolist())
            k_total_perfs_list[i_k] += k_total_perfs

# Overall metrics, computed from the counters accumulated over all batches.
micro_precision, micro_recall, micro_f1, total_acc = metric(*total_perfs)
print(
    f"Eval Results: Micro-Precision: {micro_precision:.4f}, "
    f"Micro-Recall: {micro_recall:.4f}, Micro-F1: {micro_f1:.4f}, Acc: {total_acc:.4f}"
)

# Metrics for each top-k cutoff, pairing every k with its own counters.
for k, k_total_perfs in zip(top_k_list, k_total_perfs_list):
    precision, recall, f1, acc = metric(*k_total_perfs)
    print(f"TOPK={k}: Precision@{k}: {precision:.4f}, Recall@{k}: {recall:.4f}, F1@{k}: {f1:.4f}, Acc@{k}: {acc:.4f}")

# Metrics for each hierarchy layer (reported 1-based).
for layer_idx, perfs in enumerate(perfs_per_layer):
    layer_scores = metric(*perfs)
    precision, recall, f1, acc = layer_scores
    print(f"Layer {layer_idx + 1}: Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}, Acc: {acc:.4f}")