### 测试模型的困惑度

In [None]:
import torch

from Dataset.SemEval16Task5Dataset import SemEvalXMLDataset
from CollateFn.CollateFnBERTology import CollateFnBERTology

# Test split of the SemEval16 Task5 XML data, one dataset per language.
testEnDataset = SemEvalXMLDataset(language="english", phrase="Test")
testEsDataset = SemEvalXMLDataset(language="spanish", phrase="Test")
testFrDataset = SemEvalXMLDataset(language="french", phrase="Test")

from torch.utils.data import DataLoader

# Options shared by every evaluation loader: fixed order, keep the last
# partial batch, and collate with the BERTology collate function.
_loaderOptions = dict(
    batch_size=16,
    collate_fn=CollateFnBERTology.collate_fn,
    shuffle=False,
    drop_last=False,
)
testEnDataLoader = DataLoader(testEnDataset, **_loaderOptions)
testEsDataLoader = DataLoader(testEsDataset, **_loaderOptions)
testFrDataLoader = DataLoader(testFrDataset, **_loaderOptions)

# Prefer the GPU when one is visible to torch.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Loaders keyed by split name.
dataLoader = dict(
    testEn=testEnDataLoader,  # the longest sentence has 108 tokens
    testEs=testEsDataLoader,  # 282
    testFr=testFrDataLoader,  # 204
)

In [None]:
from transformers import BertTokenizer, BertLMHeadModel, BertConfig

# Load the cased BERT tokenizer and config, flag the config as a decoder, and
# instantiate BertLMHeadModel from the same checkpoint with that config.
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
config = BertConfig.from_pretrained("bert-base-cased")
config.is_decoder = True
model = BertLMHeadModel.from_pretrained("bert-base-cased", config=config)

# Smoke test: tokenize one sentence into PyTorch tensors and run a forward pass.
inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
outputs = model(**inputs)

# Keep the logits from the forward pass for inspection.
prediction_logits = outputs.logits

In [None]:

def preTrain(model, dataLoader, optimizer, scheduler=None, isTrain=False, DEVICE="cpu"):
    """Return the mean per-batch sliding-window perplexity of ``model``.

    Each batch is scored one target position at a time (stride 1): the model
    sees the tokens up to and including the target and only the target's
    label is left un-ignored.  The per-batch perplexity is
    ``exp(total NLL / number of scored tokens)`` and the function returns the
    average of those values over all batches that had at least one scored
    token.

    Args:
        model: Callable as ``model(input_ids, labels=...)`` returning an
            object with a ``loss`` attribute.  Assumes a causal LM head that
            shifts the labels internally (e.g. ``BertLMHeadModel``) -- TODO
            confirm for other models.
        dataLoader: Iterable of batches; each batch must expose
            ``batch['batchTokenizerEncode']['batchTextEncodePlus'].input_ids``.
            If that object also has an ``attention_mask`` attribute, positions
            where it is 0 are excluded from scoring.
        optimizer: Accepted for interface compatibility; not used.
        scheduler: Accepted for interface compatibility; not used.
        isTrain: Selects ``model.train()`` vs ``model.eval()``.  All forward
            passes run under ``torch.no_grad()``, so no parameters are
            updated either way.
        DEVICE: Device the token windows are moved to before the forward pass.

    Returns:
        A scalar tensor: the mean of the per-batch perplexities.

    Raises:
        ValueError: If no batch contained a token that could be scored.
    """
    if isTrain:
        model.train()
    else:
        model.eval()

    # Longest window the model is given; falls back to BERT's 512 positions
    # when the model does not expose a config.
    max_length = getattr(getattr(model, "config", None), "max_position_embeddings", 512)
    stride = 1

    pplList = []
    for batchDataEncode in dataLoader:
        encodings = batchDataEncode['batchTokenizerEncode']['batchTextEncodePlus']
        all_ids = encodings.input_ids
        pad_mask = getattr(encodings, "attention_mask", None)
        seq_len = all_ids.size(1)

        nlls = []
        n_tokens = 0
        for index in range(0, seq_len, stride):
            begin_loc = max(index + stride - max_length, 0)
            end_loc = min(index + stride, seq_len)
            trg_len = end_loc - index  # may be different from stride on last loop
            input_ids = all_ids[:, begin_loc:end_loc].to(DEVICE)
            target_ids = input_ids.clone()
            target_ids[:, :-trg_len] = -100
            if pad_mask is not None:
                # Never score padding positions.
                window_mask = pad_mask[:, begin_loc:end_loc].to(DEVICE)
                target_ids[window_mask == 0] = -100

            # The first column has no left context, so it can never be a
            # target.  A window with nothing to score would yield a NaN mean
            # loss and poison the sum, so skip it.
            n_scored = int((target_ids[:, 1:] != -100).sum())
            if n_scored == 0:
                continue

            with torch.no_grad():
                outputs = model(input_ids, labels=target_ids)
            # outputs.loss is a mean over the scored tokens; undo the mean so
            # windows are weighted by how many tokens they scored.
            nlls.append(outputs.loss * n_scored)
            n_tokens += n_scored

        if not nlls:
            continue
        pplList.append(torch.exp(torch.stack(nlls).sum() / n_tokens))

    if not pplList:
        raise ValueError("preTrain: dataLoader produced no tokens that could be scored")
    return torch.stack(pplList).mean()

def runBatchModel(batchDataEncode, model, DEVICE="cpu"):
    """Score one encoded batch with a sliding window and return loss and perplexity.

    The batch is scored one target position at a time (stride 1): the model
    sees the tokens up to and including the target and only the target's
    label is left un-ignored.  Gradients are not disabled here, so the
    returned tensors stay attached to the autograd graph.

    Args:
        batchDataEncode: Object exposing ``input_ids`` (2-D tensor).  If it
            also has an ``attention_mask`` attribute, positions where it is 0
            are excluded from scoring.
        model: Callable as ``model(input_ids, labels=...)`` returning an
            object with a ``loss`` attribute.  Assumes a causal LM head that
            shifts the labels internally (e.g. ``BertLMHeadModel``) -- TODO
            confirm for other models.
        DEVICE: Device the token windows are moved to before the forward pass.

    Returns:
        dict with
            ``'loss'``: the loss of the last scored window, and
            ``'ppl'``: ``exp(total NLL / number of scored tokens)``.

    Raises:
        ValueError: If the batch contains no token that could be scored.
    """
    all_ids = batchDataEncode.input_ids
    pad_mask = getattr(batchDataEncode, "attention_mask", None)
    seq_len = all_ids.size(1)

    # Longest window the model is given; falls back to BERT's 512 positions
    # when the model does not expose a config.
    max_length = getattr(getattr(model, "config", None), "max_position_embeddings", 512)
    stride = 1

    nlls = []
    n_tokens = 0
    outputs = None
    for index in range(0, seq_len, stride):
        begin_loc = max(index + stride - max_length, 0)
        end_loc = min(index + stride, seq_len)
        trg_len = end_loc - index  # may be different from stride on last loop
        input_ids = all_ids[:, begin_loc:end_loc].to(DEVICE)
        target_ids = input_ids.clone()
        target_ids[:, :-trg_len] = -100
        if pad_mask is not None:
            # Never score padding positions.
            window_mask = pad_mask[:, begin_loc:end_loc].to(DEVICE)
            target_ids[window_mask == 0] = -100

        # The first column has no left context, so it can never be a target.
        # A window with nothing to score would yield a NaN mean loss and
        # poison the sum, so skip it.
        n_scored = int((target_ids[:, 1:] != -100).sum())
        if n_scored == 0:
            continue

        outputs = model(input_ids, labels=target_ids)
        # outputs.loss is a mean over the scored tokens; undo the mean so
        # windows are weighted by how many tokens they scored.
        nlls.append(outputs.loss * n_scored)
        n_tokens += n_scored

    if outputs is None:
        raise ValueError("runBatchModel: batch contains no tokens that could be scored")

    ppl = torch.exp(torch.stack(nlls).sum() / n_tokens)

    # NOTE(review): 'loss' is only the last window's loss, as in the original
    # code; confirm whether callers expect the aggregate NLL instead.
    return {
        'loss': outputs.loss,
        'ppl': ppl
    }

In [None]:
import numpy as np
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertForMaskedLM
# Load pre-trained model (weights)
# Pseudo-perplexity of one sentence under BERT's masked-LM head: mask each
# token in turn, read the log-probability the model gives the true token at
# that position, and exponentiate the negative mean.
with torch.no_grad():
    model = BertForMaskedLM.from_pretrained('bert-base-cased')
    model.eval()
    # Load pre-trained model tokenizer (vocabulary)
    tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
    sentence = "The biggest portions but adequate"
    # NOTE(review): the ids are built from tokenize() + convert_tokens_to_ids(),
    # so no [CLS]/[SEP] are added here -- confirm that is intended.
    tokenize_input = tokenizer.tokenize(sentence)
    tensor_input = torch.tensor([tokenizer.convert_tokens_to_ids(tokenize_input)])
    sen_len = len(tokenize_input)
    sentence_loss = 0.
    mask_id = tokenizer.convert_tokens_to_ids('[MASK]')

    for i in range(sen_len):
        # Mask the i-th token on a copy of the ids instead of editing the
        # token list and converting it to ids again on every step.
        mask_input = tensor_input.clone()
        mask_input[0, i] = mask_id

        output = model(mask_input)

        prediction_scores = output[0]
        # log_softmax is the numerically stable form of softmax(...).log().
        ps = torch.log_softmax(prediction_scores[0, i], dim=0)
        word_loss = ps[tensor_input[0, i]]
        sentence_loss += word_loss.item()

    ppl = np.exp(-sentence_loss/sen_len)
    print(ppl)


In [None]:
from transformers import BertTokenizer, BertLMHeadModel, BertConfig
import torch


# Rebind the notebook-global tokenizer/config/model: cased BERT tokenizer and
# config, config flagged as a decoder, BertLMHeadModel built from that config.
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
config = BertConfig.from_pretrained("bert-base-cased")
config.is_decoder = True
model = BertLMHeadModel.from_pretrained("bert-base-cased", config=config)

In [None]:
import numpy as np

def score(model, tokenizer, sentence,  mask_token_id=103):
    """Return the masked-LM pseudo-perplexity of ``sentence``.

    The encoded sentence is repeated once per inner token (every position
    except the first and last, which ``tokenizer.encode`` is assumed to fill
    with special tokens -- TODO confirm for non-BERT tokenizers).  Row ``k``
    has its ``k+1``-th token replaced by ``mask_token_id``; labels are
    ``-100`` everywhere except at the masked position.  The model's loss over
    that batch is exponentiated.

    Args:
        model: Callable as ``model(input_ids, labels=...)`` whose result has
            the loss at index 0 (true for both tuple and ModelOutput returns).
        tokenizer: Object with ``encode(sentence, return_tensors='pt')``.
        sentence: Text to score.
        mask_token_id: Id substituted at the masked position (default 103).

    Returns:
        ``exp(loss)`` as a NumPy float.
    """
    tensor_input = tokenizer.encode(sentence, return_tensors='pt')
    n_inner = tensor_input.size(-1) - 2
    repeat_input = tensor_input.repeat(n_inner, 1)
    # Ones on the first super-diagonal: row k selects column k + 1.
    mask = torch.ones(tensor_input.size(-1) - 1).diag(1)[:-2]
    masked_input = repeat_input.masked_fill(mask == 1, mask_token_id)
    labels = repeat_input.masked_fill(masked_input != mask_token_id, -100)
    # `labels` replaces the removed `masked_lm_labels` keyword, and indexing
    # works whether the model returns a tuple or a ModelOutput.
    with torch.no_grad():
        loss = model(masked_input, labels=labels)[0]
    return np.exp(loss.item())

# Score one sample sentence with the notebook-global model and tokenizer.
# NOTE(review): the preceding cell binds `model` to BertLMHeadModel, while
# score() builds masked-token inputs and labels -- confirm which model class
# is meant to be passed here.
s = score(model, tokenizer, 't the biggest portions but adequate.')
print(s)


In [None]:
from torch import optim
from transformers import get_linear_schedule_with_warmup

# Optimizer and linear warm-up/decay schedule for fine-tuning `model`.
# NOTE(review): `dataParams` and `trainDataset` are not defined in the visible
# cells -- presumably they come from another notebook/cell; confirm.
optimizer = optim.AdamW(model.parameters(), lr=dataParams.LearningRate)
warm_up_ratio = 0.1 # fraction of total_steps used for warm-up
total_steps = (len(trainDataset) // dataParams.Batchsize) * dataParams.TrainEpochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=int(total_steps * warm_up_ratio), num_training_steps=total_steps)

from torch.nn import NLLLoss
criterion = NLLLoss()

from ModelSummary.ModelOutputsRecord import ModelOutputsRecord

# Collects per-epoch outputs for the three phases and picks the best epoch.
modelOutputsRecord = ModelOutputsRecord(dataParams = dataParams, phrases=['train', 'trial', 'test'])

from Model.ModelRun import ModelRun
for epoch in range(dataParams.TrainEpochs):
    # Banner with the epoch number.
    print('*'*40 + ' '*10 + str(epoch) + ' '*10 + "*"*40)
    
    # Run every phase each epoch; only 'train' is run with isTrain=True.
    # NOTE(review): the `dataLoader` dict built in the first cell has keys
    # 'testEn'/'testEs'/'testFr', not 'train'/'trial'/'test' -- confirm which
    # dict this loop is meant to index.
    for phrase in ['train', 'trial', 'test']:
        print("\n"+"+"*20+' '*20 + phrase + ' '*20 + '+'*20 + '\n')
        epochModelOutputs = ModelRun.runEpochModel(model, criterion, dataLoader[phrase], optimizer, scheduler, isTrain=(phrase=='train'), DEVICE=DEVICE)
        evalResultDict = modelOutputsRecord.addEpochModelOutputs(epochModelOutputs, phrase=phrase)
        print(modelOutputsRecord.strEvalResultDict(evalResultDict))
    
    # After each epoch, report the best iteration so far and its results per phase.
    bestEvalResultDict = modelOutputsRecord.analyseModel()
    print("best iter is ", bestEvalResultDict['iter'])
    print('train: ', modelOutputsRecord.strEvalResultDict(bestEvalResultDict['train']))
    print('trial: ', modelOutputsRecord.strEvalResultDict(bestEvalResultDict['trial']))
    print('test : ', modelOutputsRecord.strEvalResultDict(bestEvalResultDict['test' ]))

# Persist the collected records once training has finished.
modelOutputsRecord.dump()

In [None]:
import torch
from transformers import BertTokenizer, GPT2LMHeadModel
from torch.nn import CrossEntropyLoss


def cal_ppl_bygpt2():
    """Return the GPT-2 perplexity of each sentence in a fixed sample list.

    The sentences are padded/truncated to 50 tokens, run through the
    pretrained ``gpt2`` checkpoint, and the per-token cross-entropy is
    averaged over the real (non-padding) target positions of each sentence
    before exponentiating.

    Returns:
        list[float]: one perplexity per input sentence.
    """
    # Imported locally because the file's import line only brings in
    # BertTokenizer, which cannot load the "gpt2" checkpoint's BPE vocabulary.
    from transformers import GPT2Tokenizer

    sens = ["the biggest portions but adequate."]
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    # GPT-2 ships without a padding token; reuse EOS so padding='max_length' works.
    tokenizer.pad_token = tokenizer.eos_token
    model = GPT2LMHeadModel.from_pretrained("gpt2")
    inputs = tokenizer(sens, padding='max_length', max_length=50, truncation=True, return_tensors="pt")
    bs = inputs['input_ids'].size(0)
    # Only the logits are needed; the loss is computed below per sentence.
    with torch.no_grad():
        logits = model(**inputs).logits
    # Shift so that tokens < n predict n
    shift_logits = logits[:, :-1, :].contiguous()
    shift_labels = inputs['input_ids'][:, 1:].contiguous()
    shift_attentions = inputs['attention_mask'][:, 1:].contiguous()
    # Flatten the tokens, keep one loss value per position.
    loss_fct = CrossEntropyLoss(reduction="none")
    loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)).reshape(bs, -1)
    # Drop padding via the attention mask rather than a hard-coded pad id:
    # id 0 is a real token in GPT-2's vocabulary.
    loss = loss * shift_attentions
    meanloss = loss.sum(1) / shift_attentions.sum(1)
    ppl = torch.exp(meanloss).numpy().tolist()
    return ppl


if __name__ == '__main__':
    # Print the result: the bare call discarded the returned perplexities.
    print(cal_ppl_bygpt2())
