## 区分度预估

In [1]:
import json
import os
from dataclasses import dataclass

import torch
from torch import nn
import numpy as np
from transformers.modeling_outputs import ModelOutput
from transformers import BertModel, TrainingArguments, Trainer
import torch.nn.functional as F
from sklearn.metrics import ndcg_score, mean_squared_error
from torchmetrics import MeanAbsoluteError, PearsonCorrCoef, SpearmanCorrCoef
from EduNLP.Pretrain import BertTokenizer

from utils import Dataset_bert, pre_disc
from common import ROOT, DATA_DIR

2023-06-28 03:30:52.333651: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-06-28 03:30:52.381526: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Module-level metric objects shared by compute_metrics.
SCC = SpearmanCorrCoef()    # Spearman rank correlation
PCC = PearsonCorrCoef()     # Pearson linear correlation
MAE = MeanAbsoluteError()   # mean absolute error



### 加载数据，定义预训练模型路径

In [5]:
# Path of the pretrained BERT checkpoint; per the original note it can be
# pointed at another model's directory instead (e.g. disenqnet, roberta).
pretrained_model_dir = "../data/bert_math_768"

# Directory where the fine-tuned model is saved.
output_dir = os.path.join(ROOT, "output/discrimination")

# Training split.
train_items = pre_disc(
    os.path.join(DATA_DIR, "train", "ctt_train.csv")
)

# Test split, used as the evaluation set during training.
val_items = pre_disc(
    os.path.join(DATA_DIR, "test", "ctt_test.csv")
)

### 定义网络结构

In [3]:
@dataclass
class DiscriminationPredictionOutput(ModelOutput):
    """Output container returned by ``BertPrediction.forward``.

    The class is a ``@dataclass`` because recent ``transformers`` releases
    raise ``TypeError`` when a ``ModelOutput`` subclass that is not a
    dataclass is instantiated; older releases accept the decorator as well.
    """

    # MSE loss between the predictions and ``labels``.
    loss: torch.FloatTensor = None
    # Sigmoid-activated outputs of the regression head.
    logits: torch.FloatTensor = None
    # The ground-truth labels passed to ``forward``, echoed back unchanged.
    labels: torch.FloatTensor = None

class BertPrediction(nn.Module):
    """BERT encoder followed by a single-unit sigmoid regression head.

    The first-token (``[:, 0, :]``) hidden state of the encoder is projected
    to one value and squashed with a sigmoid, so predictions lie in (0, 1).
    """

    def __init__(self, pretrained_model_dir=None, config=None, classifier_dropout=0.5):
        """Load the pretrained encoder and build the prediction head.

        Args:
            pretrained_model_dir: path or identifier handed to
                ``BertModel.from_pretrained``.
            config: accepted for interface compatibility; not used.
            classifier_dropout: probability used to build ``self.dropout``.
        """
        super().__init__()

        self.bert = BertModel.from_pretrained(pretrained_model_dir)

        hidden_size = self.bert.config.hidden_size
        self.classifier_dropout = classifier_dropout
        # NOTE(review): this dropout layer is constructed but never applied in
        # forward(); confirm whether that is intentional before wiring it in.
        self.dropout = nn.Dropout(classifier_dropout)
        self.classifier = nn.Linear(hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self,
                input_ids=None,
                attention_mask=None,
                token_type_ids=None,
                content=None,
                labels=None,
                ):
        """Predict a value per item and, when labels are given, the MSE loss.

        Args:
            input_ids, attention_mask, token_type_ids: encoder inputs, used
                only when ``content`` is None.
            content: mapping with ``'input_ids'``, ``'attention_mask'`` and
                ``'token_type_ids'`` tensors; when given it takes precedence
                over the individual tensor arguments. A leading singleton
                dimension is squeezed away (presumably added by the dataset's
                collate function; verify against ``Dataset_bert``).
            labels: target values; when None no loss is computed.

        Returns:
            DiscriminationPredictionOutput with ``loss`` (None without
            labels), ``logits`` of shape (batch, 1) and the given ``labels``.
        """
        if content is not None:
            input_ids = content['input_ids'].squeeze(0)
            attention_mask = content['attention_mask'].squeeze(0)
            token_type_ids = content['token_type_ids'].squeeze(0)

        encoded = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        item_embed = encoded['last_hidden_state'][:, 0, :]
        logits = self.sigmoid(self.classifier(item_embed))

        loss = None
        if labels is not None:
            # Flatten both sides so they align element-wise for any batch
            # size. Squeezing only dim 0 leaves (B, 1) logits for B > 1, which
            # would broadcast against (B,) labels into a (B, B) loss.
            loss = F.mse_loss(logits.reshape(-1), labels.reshape(-1))

        return DiscriminationPredictionOutput(
            loss=loss,
            logits=logits,
            labels=labels,
        )

### 定义评价指标

In [4]:
def compute_metrics(pred):
    """Compute regression and ranking metrics for an evaluation pass.

    Args:
        pred: prediction object exposing ``predictions`` (whose first element
            holds the model logits) and ``label_ids`` (the gold values).

    Returns:
        dict with keys ``mae``, ``mse``, ``rmse``, ``pcc``, ``scc`` and
        ``ndcg`` (a list: NDCG over all items, then at k=10, 20 and 30).
    """
    # The regression head emits one value per item, i.e. (N, 1); flatten both
    # tensors to 1-D so predictions and labels align element-wise for every
    # metric below.
    logits = torch.as_tensor(pred.predictions[0]).reshape(-1)
    labels = torch.as_tensor(pred.label_ids).reshape(-1)
    pres = logits.numpy().tolist()
    golds = labels.numpy().tolist()

    mse = mean_squared_error(golds, pres)
    ret = {
        "mae": MAE(logits, labels),
        "mse": mse,
        "rmse": np.sqrt(mse),
        "pcc": PCC(logits, labels),
        "scc": SCC(logits, labels),
        'ndcg': testdata_metrics(golds, pres).tolist(),
    }
    return ret

def testdata_metrics(diff, pred):
    """Return NDCG scores of ``pred`` against ``diff`` as a length-4 array.

    The entries are, in order: NDCG over the full ranking, then NDCG@10,
    NDCG@20 and NDCG@30. Both inputs are treated as a single query.
    """
    gold = np.array(diff)
    scores = np.array(pred)
    # k=None is ndcg_score's default and means "no cutoff".
    cutoffs = (None, 10, 20, 30)
    row = [ndcg_score([gold], [scores], k=cutoff) for cutoff in cutoffs]
    # Averaging over the single row keeps the result a float ndarray.
    return np.mean([row], axis=0)

### 定义训练和测试相关参数

In [6]:
class MyTrainer(Trainer):
    """Thin subclass of ``Trainer`` that overrides nothing."""

def train_disc_pred(
        output_dir,
        pretrained_model_dir,
        train_items=None,
        val_items=None,
        train_params=None):
    """Fine-tune ``BertPrediction`` and save the model and tokenizer.

    Args:
        output_dir: directory for checkpoints and the final model/tokenizer.
        pretrained_model_dir: checkpoint passed to ``BertPrediction``.
        train_items: items wrapped into the training ``Dataset_bert``.
        val_items: items wrapped into the evaluation ``Dataset_bert``.
        train_params: optional mapping that may override ``epochs``,
            ``batch_size``, ``save_steps``, ``save_total_limit``,
            ``logging_steps``, ``gradient_accumulation_steps`` and
            ``logging_dir``. Note that the fallback values used for missing
            keys differ from the defaults used when this argument is None.
    """
    model = BertPrediction(pretrained_model_dir=pretrained_model_dir)
    # NOTE(review): the tokenizer is built with its own defaults rather than
    # loaded from pretrained_model_dir; confirm its vocabulary matches the model.
    tokenizer = BertTokenizer(add_special_tokens=True)

    if train_params is None:
        # Defaults used when the caller supplies no overrides at all.
        hp = {
            'epochs': 50,
            'batch_size': 1,
            'save_steps': 1000,
            'save_total_limit': 2,
            'logging_steps': 100,
            'gradient_accumulation_steps': 1,
            'logging_dir': f"{ROOT}/log",
        }
    else:
        # Per-key fallbacks used when the caller supplies a partial mapping.
        fallbacks = {
            'epochs': 1,
            'batch_size': 64,
            'save_steps': 100,
            'save_total_limit': 2,
            'logging_steps': 5,
            'gradient_accumulation_steps': 1,
            'logging_dir': f"{ROOT}/log",
        }
        hp = {
            key: (train_params[key] if key in train_params else fallback)
            for key, fallback in fallbacks.items()
        }

    train_dataset = Dataset_bert(train_items, tokenizer)
    eval_dataset = Dataset_bert(val_items, tokenizer)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        num_train_epochs=hp['epochs'],
        per_device_train_batch_size=hp['batch_size'],
        per_device_eval_batch_size=hp['batch_size'],
        evaluation_strategy="steps",
        # Evaluate once every five logging intervals.
        eval_steps=hp['logging_steps'] * 5,
        save_steps=hp['save_steps'],
        save_total_limit=hp['save_total_limit'],
        logging_steps=hp['logging_steps'],
        logging_dir=hp['logging_dir'],
        gradient_accumulation_steps=hp['gradient_accumulation_steps'],
        learning_rate=5e-5,
    )

    trainer = MyTrainer(
        model=model,
        args=training_args,
        data_collator=train_dataset.collate_fn,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
    )

    trainer.train()
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

### 训练和测试

In [None]:
# Launch fine-tuning; train_params=None selects the function's built-in defaults.
train_disc_pred(
    output_dir=output_dir,
    pretrained_model_dir=pretrained_model_dir,
    train_items=train_items,
    val_items=val_items,
    train_params=None,
)