## 区分度预估

In [None]:
import torch
from torch import nn
import numpy as np
from transformers import BertModel, TrainingArguments, Trainer, PretrainedConfig, DataCollatorWithPadding
import torch.nn.functional as F
from sklearn.metrics import ndcg_score, mean_squared_error
from  torchmetrics import MeanAbsoluteError, PearsonCorrCoef, SpearmanCorrCoef
import os
import tqdm
from EduNLP.Pretrain import BertTokenizer 
from EduNLP.ModelZoo.base_model import BaseModel
from EduNLP.Pretrain import EduDataset
import json
from utils import pre_disc

# Project root = parent of the directory that contains this file.
# Inside a Jupyter/IPython kernel `__file__` is not defined, so fall back to the
# parent of the current working directory (assumes the notebook is run from its
# own folder -- adjust if it is launched from elsewhere).
try:
    ROOT = os.path.dirname(os.path.dirname(__file__))
except NameError:
    ROOT = os.path.dirname(os.getcwd())
DATA_DIR = os.path.join(ROOT, "data")
# Restrict CUDA to device index 0.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [None]:
# Module-level torchmetrics objects; compute_metrics() calls them on the
# prediction/label tensors.
MAE = MeanAbsoluteError()  # mean absolute error
PCC = PearsonCorrCoef()  # Pearson correlation coefficient
SCC = SpearmanCorrCoef()  # Spearman rank correlation coefficient

### 加载数据，定义路径

In [None]:
output_dir = "path/to/output_dir" # where the trained model is saved
pretrained_model_dir = os.path.join(DATA_DIR, "bert_math_768") # pretrained BERT path used as an example; can be swapped for another model's path, e.g. disenqnet, roberta
# Directory of the fine-tuned checkpoint that the evaluation section loads.
checkpoint_dir = "path/to/discrimination_prediction_checkpoint"
train_items = pre_disc(os.path.join(DATA_DIR, "train", "ctt_train.csv")) # load the training set
val_items = pre_disc(os.path.join(DATA_DIR, "test", "ctt_test.csv")) # load the test set

### 训练

#### 定义网络结构

In [3]:
class BertForDiscriminationPrediction(BaseModel):
    """BERT encoder with a sigmoid-activated linear head predicting one scalar per item.

    The first-token hidden state of the BERT output is fed to a
    ``Linear(hidden_size, 1)`` layer followed by a sigmoid. When labels are
    given, the mean-squared error between predictions and labels is returned
    as the loss.
    """

    def __init__(self, pretrained_model_dir=None, classifier_dropout=0.5):
        """Build the model.

        Args:
            pretrained_model_dir: directory passed to ``BertModel.from_pretrained``.
            classifier_dropout: dropout probability stored on the model and
                used to build ``self.dropout``.
        """
        super(BertForDiscriminationPrediction, self).__init__()
        self.bert = BertModel.from_pretrained(pretrained_model_dir, ignore_mismatched_sizes=True)
        hidden_size = self.bert.config.hidden_size
        self.classifier_dropout = classifier_dropout
        # NOTE(review): this dropout layer is created but never applied in
        # forward(); confirm whether that is intentional.
        self.dropout = nn.Dropout(classifier_dropout)
        self.classifier = nn.Linear(hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

        # Snapshot the constructor's local variables (minus self/__class__) as
        # the model config. Do not introduce extra locals above this line:
        # they would end up in the saved config.
        self.config = {k: v for k, v in locals().items() if k not in ["self", "__class__"]}
        self.config['architecture'] = 'BertForDiscriminationPrediction'
        self.config = PretrainedConfig.from_dict(self.config)

    def forward(self,
                input_ids=None,
                attention_mask=None,
                token_type_ids=None,
                labels=None,
                ):
        """Predict a score per item and, if labels are given, the MSE loss.

        Args:
            input_ids, attention_mask, token_type_ids: forwarded to the BERT encoder.
            labels: optional regression targets, one value per item.

        Returns:
            Tuple ``(loss, logits)``; ``loss`` is ``None`` when ``labels`` is
            ``None``. ``logits`` has one entry per item in the batch.
        """
        encoder_out = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        # Hidden state of the first token is used as the item embedding.
        item_embed = encoder_out['last_hidden_state'][:, 0, :]

        # Drop the trailing size-1 feature axis, NOT the batch axis: squeezing
        # axis 0 only works for batch size 1 and leaves (B, 1) otherwise, which
        # makes mse_loss broadcast against (B,) labels into a (B, B) comparison.
        logits = self.sigmoid(self.classifier(item_embed)).squeeze(-1)

        loss = None
        if labels is not None:
            # Align dtype/shape with the predictions so the loss is element-wise.
            targets = labels.to(logits.dtype).reshape(logits.shape)
            loss = F.mse_loss(logits, targets)
        return loss, logits

    @classmethod
    def from_config(cls, config_path, *args, **kwargs):
        """Instantiate the model from a JSON config file.

        Args:
            config_path: path to a JSON file containing at least
                ``pretrained_model_dir``; ``classifier_dropout`` is optional
                and defaults to 0.5.
            **kwargs: entries that override values read from the file.

        Returns:
            A new instance of ``cls``.
        """
        with open(config_path, "r", encoding="utf-8") as rf:
            model_config = json.load(rf)
            model_config.update(kwargs)
            return cls(
                pretrained_model_dir=model_config['pretrained_model_dir'],
                classifier_dropout=model_config.get("classifier_dropout", 0.5),
            )

#### 定义训练相关参数

In [6]:
class BertDataset(EduDataset):
    """Local alias of ``EduDataset``; adds no behavior of its own."""
    pass

class MyTrainer(Trainer):
    """Local alias of the transformers ``Trainer``; adds no behavior of its own."""
    pass

def train_disc_pred(
        output_dir,
        pretrained_model_dir,
        train_items=None,
        val_items=None,
        train_params=None):
    """Fine-tune BertForDiscriminationPrediction and save model, config and tokenizer.

    Args:
        output_dir: directory receiving checkpoints and the final artifacts.
        pretrained_model_dir: directory of the pretrained tokenizer/encoder.
        train_items: items for the training dataset (read via the "content"
            and "labels" keys).
        val_items: items for the evaluation dataset (same keys).
        train_params: optional mapping overriding any of ``epochs``,
            ``batch_size``, ``save_steps``, ``save_total_limit``,
            ``logging_steps``, ``gradient_accumulation_steps``, ``logging_dir``.
            Note that the fallback values differ depending on whether a
            mapping is supplied at all.
    """
    tokenizer = BertTokenizer.from_pretrained(pretrained_model_dir)
    model = BertForDiscriminationPrediction(pretrained_model_dir=pretrained_model_dir)
    model.bert.resize_token_embeddings(len(tokenizer.bert_tokenizer))

    # Resolve hyper-parameters: one fallback table per mode.
    if train_params is None:
        overrides = {}
        fallbacks = {
            "epochs": 50,
            "batch_size": 1,
            "save_steps": 1000,
            "save_total_limit": 2,
            "logging_steps": 100,
            "gradient_accumulation_steps": 1,
            "logging_dir": f"{ROOT}/log",
        }
    else:
        overrides = train_params
        fallbacks = {
            "epochs": 1,
            "batch_size": 64,
            "save_steps": 100,
            "save_total_limit": 2,
            "logging_steps": 5,
            "gradient_accumulation_steps": 1,
            "logging_dir": f"{ROOT}/log",
        }
    hp = {
        name: (overrides[name] if name in overrides else fallback)
        for name, fallback in fallbacks.items()
    }

    shared_dataset_kwargs = dict(tokenizer=tokenizer, stem_key="content", label_key="labels")
    train_dataset = BertDataset(items=train_items, **shared_dataset_kwargs)
    eval_dataset = BertDataset(items=val_items, **shared_dataset_kwargs)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        num_train_epochs=hp["epochs"],
        per_device_train_batch_size=hp["batch_size"],
        per_device_eval_batch_size=hp["batch_size"],
        evaluation_strategy="steps",
        # Evaluate once every five logging intervals.
        eval_steps=hp["logging_steps"] * 5,
        save_steps=hp["save_steps"],
        save_total_limit=hp["save_total_limit"],
        logging_steps=hp["logging_steps"],
        logging_dir=hp["logging_dir"],
        gradient_accumulation_steps=hp["gradient_accumulation_steps"],
        learning_rate=5e-5,
    )
    trainer = MyTrainer(
        model=model,
        args=training_args,
        data_collator=DataCollatorWithPadding(tokenizer.bert_tokenizer),
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
    )

    # Train, then persist the trained model, its config and the tokenizer.
    trainer.train()
    trainer.save_model(output_dir)
    trainer.model.save_config(output_dir)
    tokenizer.save_pretrained(output_dir)

#### 训练模型

In [None]:
# Launch training; train_params=None selects the function's built-in defaults.
train_disc_pred(
    output_dir=output_dir,
    pretrained_model_dir=pretrained_model_dir,
    train_items=train_items,
    val_items=val_items,
    train_params=None,
)

### 测试

#### 加载测试集和模型

In [None]:
class EvalDataset(torch.utils.data.Dataset):
    """Map-style dataset yielding ``(encodings, label_tensor)`` for one item at a time.

    Args:
        items: indexable collection of records exposing "content" and "labels".
        tokenizer: callable applied to the stringified content.
    """

    def __init__(self, items, tokenizer):
        self.tokenizer = tokenizer
        self.items = items

    def __getitem__(self, index):
        """Tokenize item ``index`` and return it with its label as a 1-element tensor."""
        record = self.items[index]
        content, labels = record["content"], record["labels"]
        # Content is stringified before tokenization; sequences are truncated to 512.
        encodings = self.tokenizer(str(content), max_length=512, truncation=True, return_tensors="pt")
        return encodings, torch.as_tensor([labels])

    def __len__(self):
        """Number of items in the dataset."""
        return len(self.items)
    
# The tokenizer is loaded from the pretrained directory, the model weights from
# the fine-tuned checkpoint directory.
tokenizer = BertTokenizer.from_pretrained(pretrained_model_dir)
# Despite its name this is the EvalDataset itself (no DataLoader wrapping);
# the evaluation loop iterates it one item at a time.
eval_dataloader = EvalDataset(items=val_items, tokenizer=tokenizer)
model = BertForDiscriminationPrediction.from_pretrained(checkpoint_dir)

#### 在测试集上评估

In [None]:
def compute_metrics(pres, golds):
    """Compute regression and ranking metrics for predictions against gold labels.

    Args:
        pres: sequence of predicted values.
        golds: sequence of gold values, aligned with ``pres``.

    Returns:
        Dict with keys "mae", "mse", "rmse", "pcc", "scc" and
        'ndcg @all, @10, @20, @30' (a list of four NDCG values).
    """
    pred_tensor = torch.as_tensor(pres)
    gold_tensor = torch.as_tensor(golds)

    report = {}
    report["mae"] = MAE(pred_tensor, gold_tensor).tolist()
    # MSE is computed once and reused for RMSE.
    mse_value = mean_squared_error(golds, pres)
    report["mse"] = mse_value
    report["rmse"] = np.sqrt(mse_value)
    report["pcc"] = PCC(pred_tensor, gold_tensor).tolist()
    report["scc"] = SCC(pred_tensor, gold_tensor).tolist()
    report['ndcg @all, @10, @20, @30'] = testdata_metrics(golds, pres).tolist()
    return report

def testdata_metrics(diff, pred):
    """Return NDCG over the full ranking and at cutoffs 10, 20 and 30.

    Args:
        diff: gold relevance values.
        pred: predicted scores, aligned with ``diff``.

    Returns:
        numpy array ``[ndcg@all, ndcg@10, ndcg@20, ndcg@30]``.
    """
    y_true = [np.array(diff)]
    y_score = [np.array(pred)]
    # k=None is ndcg_score's default, i.e. the full ranking.
    cutoffs = (None, 10, 20, 30)
    row = [ndcg_score(y_true, y_score, k=cutoff) for cutoff in cutoffs]
    return np.mean([row], axis=0)

# Run the model over the evaluation set one item at a time and collect the
# scalar prediction and gold label of each item.
model.eval()
pred_list = []
label_list = []
# Inference only: disable autograd so no computation graph is built per item.
with torch.no_grad():
    # Iterating the dataset directly (instead of through enumerate) lets tqdm
    # pick up its length and display a total.
    for eval_batch in tqdm.tqdm(eval_dataloader):
        input_data, eval_batch_labels = eval_batch
        _, output_logits = model(**input_data)
        pred_list.append(output_logits.tolist()[0])
        label_list.append(eval_batch_labels.tolist()[0])

results = compute_metrics(pred_list, label_list)
print(f"Test results: {results}")