In [1]:
# 本文使用openprompt库 地址https://github.com/thunlp/OpenPrompt
# 包括入门范例、
# 以情感分类为例
from openprompt.data_utils import InputExample

In [2]:
# 情感分类共两个classes: positive, negative
classes = [
    "positive",
    "negative"
]

# 实例数据集共两个数据
dataset = [
    # text_a 是数据的输入文本，其他任务的数据可能有多条输入文本
    InputExample(
        guid = 0,
        text_a = "Albert Einstein was one of the greatest intellects of his time.",
    ),
    InputExample(
        guid = 1,
        text_a = "The film was badly made.",
    ),
]

In [3]:
# 定义预训练模型，这里用bert，这是因为根据prompt设计，想让模型输出[mask]位置短语，属于填空问题
from openprompt.plms import load_plm
plm, tokenizer, model_config, WrapperClass = load_plm("bert", "bert-base-cased")

  utils.DeprecatedIn35,
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
# 定义prompt模板，这里是手动设计模板
from openprompt.prompts import ManualTemplate
promptTemplate = ManualTemplate(
    text = '{"placeholder": "text_a"} It was {"mask"}',
    tokenizer = tokenizer,
)

In [9]:
# 定义输出-label映射
# It was {mask} mask的位置输出是一个单词，我们还要将这个单词映射为"positive","negative"标签，这个过程称之为"Verbalizer"
from openprompt.prompts import ManualVerbalizer
promptVerbalizer = ManualVerbalizer(
    classes = classes,
    label_words = {
        "negative": ["bad"],
        "positive": ["good", "wonderful", "great"],
    },
    tokenizer = tokenizer
)

In [10]:
# 将前面的plm，promptTemplate，promptVerbalizer组合成promptModel
from openprompt import PromptForClassification
promptModel = PromptForClassification(
    template = promptTemplate,
    plm = plm,
    verbalizer = promptVerbalizer,
)

In [11]:
# 定义dataloader
from openprompt import PromptDataLoader
data_loader = PromptDataLoader(
    dataset = dataset,
    tokenizer = tokenizer,
    template = promptTemplate,
    tokenizer_wrapper_class=WrapperClass,
)

tokenizing: 2it [00:00, 401.83it/s]


In [12]:
# 开始训练及测试。prompt的目的：统一预训练模型的训练模型与当前预测模型
# 使用带prompt的预训练MLM执行零跳推理
import torch
promptModel.eval()
with torch.no_grad():
    for batch in data_loader:
        #print("batch:")
        #print(batch)
        logits = promptModel(batch)
        #print("logits:")
        #print(logits)
        preds = torch.argmax(logits, dim=-1) # 返回指定维度最大值的序号
        print("pred:")
        print(preds)
        print("pred labels:")
        print(classes[preds])
        # 1

batch:
{"input_ids": [[101, 3986, 16127, 1108, 1141, 1104, 1103, 4459, 1107, 7854, 18465, 1116, 1104, 1117, 1159, 119, 1135, 1108, 103, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

logits:
tensor([[-2.2736, -1.4058]])
pred:
tensor([1])
pred labels:
negative


In [2]:
# tutorial 0_basic https://github.com/thunlp/OpenPrompt/blob/main/tutorial/0_basic.py
# load dataset
from datasets import load_dataset
import datasets
#print(datasets.list_datasets())
raw_dataset = load_dataset('super_glue', 'cb', cache_dir="../datasets/.cache/huggingface_datasets")
# raw_dataset['train'][0]
# from datasets import load_from_disk
# raw_dataset = load_from_disk("/home/hushengding/huggingface_datasets/saved_to_disk/super_glue.cb")

Downloading builder script:   0%|          | 0.00/9.47k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/8.23k [00:00<?, ?B/s]

Downloading and preparing dataset super_glue/cb (download: 73.71 KiB, generated: 198.02 KiB, post-processed: Unknown size, total: 271.73 KiB) to ../datasets/.cache/huggingface_datasets/super_glue/cb/1.0.2/d040c658e2ddef6934fdd97deb45c777b6ff50c524781ea434e7219b56a428a7...


Downloading data:   0%|          | 0.00/75.5k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/250 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/56 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/250 [00:00<?, ? examples/s]

Dataset super_glue downloaded and prepared to ../datasets/.cache/huggingface_datasets/super_glue/cb/1.0.2/d040c658e2ddef6934fdd97deb45c777b6ff50c524781ea434e7219b56a428a7. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [66]:
#raw_dataset['train'][8]
for item in raw_dataset['train']:
    if item['label'] != 0:
        print(item)

{'premise': "He's weird enough to have undressed me without thinking, according to some mad notion of the ``proper'' thing to do. Perhaps he thought I couldn't lie in bed with my clothes on.", 'hypothesis': "she couldn't lie in bed with her clothes on", 'idx': 23, 'label': 1}
{'premise': "I should dearly have liked to know whether they were Europeans or Americans, but I couldn't hear the accents. They appeared to be arguing. I hoped the white men weren't telling him to eliminate all witnesses because I don't believe it would have needed much persuasion.", 'hypothesis': 'eliminating all witnesses would have needed much persuasion', 'idx': 26, 'label': 1}
{'premise': "But the damage was done as far as my faith was concerned, which is probably why I went mad. So anyway, that Christmas Eve night confirmed my worst fears, it was like a kind of ``royal flush'' for the infant Jimbo. All three kings - Pa Santa and the King of Kings - all down the pan together... And to be honest I don't believ

In [5]:
from openprompt.data_utils import InputExample

# 将raw数据处理为InputExample形式
dataset = {}
for split in ['train', 'validation', 'test']:
    dataset[split] = []
    for data in raw_dataset[split]:
        input_example = InputExample(text_a = data['premise'], text_b = data['hypothesis'], label=int(data['label']), guid=data['idx'])
        dataset[split].append(input_example)
print(dataset['train'][0])

{
  "guid": 0,
  "label": 0,
  "meta": {},
  "text_a": "It was a complex language. Not written down but handed down. One might say it was peeled down.",
  "text_b": "the language was peeled down",
  "tgt_text": null
}



In [6]:
# 使用t5 plm
from openprompt.plms import load_plm
plm, tokenizer, model_config, WrapperClass = load_plm("t5", "t5-base")

  utils.DeprecatedIn35,


Downloading:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/850M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

In [7]:
# 手动构造template
from openprompt.prompts import ManualTemplate
template_text = '{"placeholder":"text_a"} Question: {"placeholder":"text_b"}? Is it correct? {"mask"}.'
mytemplate = ManualTemplate(tokenizer=tokenizer, text=template_text)

In [8]:
# 打印看一下template如何包装输入input的
wrapped_example = mytemplate.wrap_one_example(dataset['train'][0])
print(wrapped_example)

[[{'text': 'It was a complex language. Not written down but handed down. One might say it was peeled down.', 'loss_ids': 0, 'shortenable_ids': 1}, {'text': ' Question:', 'loss_ids': 0, 'shortenable_ids': 0}, {'text': ' the language was peeled down', 'loss_ids': 0, 'shortenable_ids': 1}, {'text': '? Is it correct?', 'loss_ids': 0, 'shortenable_ids': 0}, {'text': '<mask>', 'loss_ids': 1, 'shortenable_ids': 0}, {'text': '.', 'loss_ids': 0, 'shortenable_ids': 0}], {'guid': 0, 'label': 0}]


In [9]:
# 现在包装好的input准备好传给tokenizer
# 可以使用你自己的tokenizer，不过我们推荐使用wrapped tokenizer
# 如果你使用我们的 "load_plm"函数，wrapped tokneizer已经给出，否则需要选择`openprompt.plms.__init__.py`中合适的配置的wrapper
# 注意使用t5作为plm时，我们只需要将<pad> <extra_id_0> <eos>传给decoder
# 损失在<extra_id_0>计算，因此传递decoder_max_length=3可以节省空间

#wrapped_t5tokenizer = WrapperClass(max_seq_length=128, decoder_max_length=3, tokenizer=tokenizer,truncate_method="head")
# or
from openprompt.plms import T5TokenizerWrapper
wrapped_t5tokenizer= T5TokenizerWrapper(max_seq_length=128, decoder_max_length=3, tokenizer=tokenizer,truncate_method="head")

In [10]:
# 可视化tokenized example
tokenized_example = wrapped_t5tokenizer.tokenize_one_example(wrapped_example, teacher_forcing=False)
print(tokenized_example)
print(tokenizer.convert_ids_to_tokens(tokenized_example['input_ids']))
print(tokenizer.convert_ids_to_tokens(tokenized_example['decoder_input_ids']))

{'input_ids': [94, 47, 3, 9, 1561, 1612, 5, 933, 1545, 323, 68, 14014, 323, 5, 555, 429, 497, 34, 47, 158, 400, 26, 323, 5, 11860, 10, 8, 1612, 47, 158, 400, 26, 323, 3, 58, 27, 7, 34, 2024, 58, 32099, 3, 5, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'decoder_input_ids': [0, 32099, 0], 'loss_ids': [0, 1, 0]}
['▁It', '▁was', '▁', 'a', '▁complex', '▁language', '.', '▁Not', '▁written

In [50]:
model_inputs['train'][0]['input_ids']

[94,
 47,
 3,
 9,
 1561,
 1612,
 5,
 933,
 1545,
 323,
 68,
 14014,
 323,
 5,
 555,
 429,
 497,
 34,
 47,
 158,
 400,
 26,
 323,
 5,
 11860,
 10,
 8,
 1612,
 47,
 158,
 400,
 26,
 323,
 3,
 58,
 27,
 7,
 34,
 2024,
 58,
 32099,
 3,
 5,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [11]:
# 下面将整个数据集转化为input格式
model_inputs = {}
for split in ['train', 'validation', 'test']:
    model_inputs[split] = []
    for sample in dataset[split]:
        tokenized_example = wrapped_t5tokenizer.tokenize_one_example(mytemplate.wrap_one_example(sample), teacher_forcing=False)
        model_inputs[split].append(tokenized_example)

Token indices sequence length is longer than the specified maximum sequence length for this model (519 > 512). Running this sequence through the model will result in indexing errors


In [12]:
# 使用提供的`PromptDataLoader`可以将上述转化为`torch.DataLoader`格式
from openprompt import PromptDataLoader

train_dataloader = PromptDataLoader(dataset=dataset["train"], template=mytemplate, tokenizer=tokenizer,
    tokenizer_wrapper_class=WrapperClass, max_seq_length=256, decoder_max_length=3,  # 这里根据max_seq_length,将最大长度由128扩展到256
    batch_size=4,shuffle=True, teacher_forcing=False, predict_eos_token=False,
    truncate_method="head")
# next(iter(train_dataloader))

tokenizing: 250it [00:01, 155.56it/s]


In [55]:
next(iter(train_dataloader))['input_ids'].shape

torch.Size([4, 256])

In [16]:
# 定义verbalizer，也就是将logits映射到最终label概率
from openprompt.prompts import ManualVerbalizer
import torch

# for example the verbalizer contains multiple label words in each class
myverbalizer = ManualVerbalizer(tokenizer, num_classes=3,
                        label_words=[["yes"], ["no"], ["maybe"]])

print(myverbalizer.label_words_ids)
logits = torch.randn(2,len(tokenizer)) # creating a pseudo output from the plm, and
print(myverbalizer.process_logits(logits)) # see what the verbalizer do

Parameter containing:
tensor([[[4273]],

        [[ 150]],

        [[2087]]])
tensor([[-1.8581, -1.0345, -0.7161],
        [-0.2899, -2.4579, -1.7953]])


In [17]:
# 使用PromptForClassification组装
from openprompt import PromptForClassification

use_cuda = True
prompt_model = PromptForClassification(plm=plm,template=mytemplate, verbalizer=myverbalizer, freeze_plm=False)
if use_cuda:
    prompt_model=  prompt_model.cuda()

In [19]:
# 查看prompt_model里的参数
for n,p in prompt_model.named_parameters():
    print(n)
    print(p)

prompt_model.plm.shared.weight
Parameter containing:
tensor([[ -0.7539,   0.5977,  -2.4375,  ...,   1.2500,  -0.7891,   3.5156],
        [ 11.3750,  -4.8750,   9.0625,  ...,   4.8438,  14.3750,  -5.7812],
        [-16.6250,  11.0625, -20.8750,  ...,  10.6875,  22.2500,  25.0000],
        ...,
        [  2.2344,   6.7500, -11.0625,  ..., -11.3125,  13.5625,  16.6250],
        [  4.2500,   5.1250, -12.2500,  ..., -11.9375,  13.5000,  17.0000],
        [  4.0625,   6.9688, -12.2500,  ..., -11.3750,  11.9375,  16.6250]],
       device='cuda:0', requires_grad=True)
prompt_model.plm.encoder.block.0.layer.0.SelfAttention.q.weight
Parameter containing:
tensor([[ 0.0762, -0.0471,  0.0309,  ...,  0.0143, -0.0491, -0.0593],
        [ 0.0381, -0.0075,  0.0003,  ..., -0.0131, -0.0308, -0.0157],
        [-0.0047, -0.0262, -0.0298,  ..., -0.0193,  0.0520, -0.0500],
        ...,
        [ 0.0066, -0.0496,  0.0422,  ...,  0.0474,  0.0308, -0.0200],
        [-0.0176,  0.0217, -0.0289,  ...,  0.0413, -0.

Parameter containing:
tensor([[-0.0081,  0.0093, -0.0042,  ..., -0.0635,  0.0486,  0.0649],
        [-0.0640,  0.0266, -0.0029,  ..., -0.0366, -0.0471,  0.1138],
        [-0.0684,  0.0540,  0.0427,  ...,  0.0327, -0.0293,  0.0040],
        ...,
        [ 0.0488, -0.0339, -0.0145,  ...,  0.0136,  0.0178, -0.0322],
        [ 0.0400,  0.0079,  0.0005,  ..., -0.0104, -0.0108,  0.0337],
        [-0.0713,  0.0142, -0.0432,  ..., -0.0454, -0.0062,  0.0150]],
       device='cuda:0', requires_grad=True)
prompt_model.plm.encoder.block.2.layer.0.SelfAttention.k.weight
Parameter containing:
tensor([[ 0.2314,  0.5625, -1.0703,  ..., -0.0286,  0.7891,  0.0879],
        [-0.2412,  0.3047,  0.5859,  ..., -0.0603, -0.2441,  0.4863],
        [-0.3926, -0.6289,  0.1299,  ...,  0.7188,  0.1270, -0.1953],
        ...,
        [-0.1934,  0.1123,  0.2471,  ...,  0.0283,  0.1436,  0.2373],
        [ 0.2656, -0.3965,  0.1836,  ..., -0.1436, -0.4648, -0.4023],
        [-0.0942,  0.0664,  0.0535,  ...,  0.6172, 

Parameter containing:
tensor([ 1.8164e-01,  1.7676e-01,  1.8066e-01,  2.1484e-01,  1.3965e-01,
         1.4746e-01,  5.1270e-02,  2.0996e-01,  9.5215e-02,  1.5625e-01,
         1.9727e-01,  1.8359e-01,  1.1279e-01,  1.7480e-01,  1.3379e-01,
         1.7188e-01,  1.7578e-01,  1.7676e-01,  2.3145e-01,  1.5039e-01,
         1.3086e-01,  1.6504e-01,  6.0547e-02,  1.4258e-01,  1.4648e-01,
         1.8848e-01,  1.3574e-01,  1.6797e-01,  1.7480e-01,  1.3184e-01,
         1.5332e-01,  1.5332e-01,  1.7480e-01,  1.3281e-01,  1.8652e-01,
         1.6016e-01,  1.4746e-01,  2.5391e-01,  1.2891e-01,  2.2070e-01,
         1.7090e-01,  1.7578e-01,  1.9629e-01,  1.7383e-01,  9.7168e-02,
         1.4746e-01,  1.4453e-01,  1.5332e-01,  1.8066e-01,  8.8379e-02,
         2.2363e-01,  1.7676e-01,  1.3965e-01,  1.3281e-01, -9.7156e-06,
         1.5332e-01,  5.8350e-02,  1.6797e-01,  1.7285e-01,  1.5625e-01,
         1.8945e-01,  1.3184e-01,  1.7676e-01,  1.4551e-01,  1.4453e-01,
         5.7373e-02,  1.5332e

Parameter containing:
tensor([[ 1.2734,  0.0530,  1.0312,  ..., -0.3809, -0.1689, -0.5234],
        [ 0.0476, -0.0679, -0.3672,  ...,  0.7578, -0.9141, -0.2002],
        [-0.3438,  0.1982,  0.7578,  ..., -0.7344,  1.2578, -0.4941],
        ...,
        [-0.1670,  0.4160,  0.2812,  ...,  0.1182, -0.3477,  0.1123],
        [ 0.7148, -0.0151, -0.9023,  ...,  1.1719, -0.0427,  0.0253],
        [-0.9492,  0.3457,  0.1748,  ..., -0.1221, -0.0830, -0.0232]],
       device='cuda:0', requires_grad=True)
prompt_model.plm.encoder.block.7.layer.0.SelfAttention.o.weight
Parameter containing:
tensor([[-1.3984, -0.0542,  0.4102,  ...,  0.7852,  0.1484, -0.3145],
        [-0.7148,  0.0060, -0.1777,  ..., -0.0537,  0.8164,  1.2266],
        [-0.7578,  0.0557,  0.1060,  ...,  0.0131,  1.2266, -1.0156],
        ...,
        [-0.3301, -1.5781,  1.1797,  ..., -0.9570,  1.2969,  0.9336],
        [ 0.2090,  0.4453, -0.7695,  ..., -0.3223,  0.1094,  0.3711],
        [ 0.2031, -0.1445,  0.2734,  ...,  0.9219, 

Parameter containing:
tensor([1.3047, 1.1875, 1.1641, 1.2500, 1.3906, 1.3281, 0.3242, 1.3281, 1.5234,
        1.4062, 1.1719, 1.1562, 1.2578, 1.2812, 1.3438, 1.3281, 1.2578, 1.2734,
        0.8672, 1.3438, 1.3750, 1.3516, 3.5781, 1.2656, 1.4062, 1.2891, 1.3594,
        1.2109, 1.3594, 1.3203, 1.3516, 1.2891, 1.3516, 1.3203, 1.2188, 1.3516,
        1.3828, 1.2656, 1.3359, 1.7578, 1.2109, 1.3203, 0.9062, 1.3359, 3.1875,
        1.3984, 1.1250, 1.3203, 1.3125, 0.5117, 1.2422, 1.3125, 1.4922, 1.3281,
        1.3047, 1.2500, 2.9062, 1.3125, 1.4219, 1.4297, 1.3125, 1.3984, 1.3594,
        1.3125, 1.3906, 1.2266, 1.3516, 1.3281, 1.3281, 1.3047, 1.3438, 1.0000,
        1.3203, 1.3672, 1.3438, 1.3984, 1.3203, 1.2734, 1.3203, 1.3672, 1.2812,
        1.2656, 1.3281, 1.2656, 1.3203, 1.3281, 1.2266, 1.3281, 1.3203, 1.3281,
        1.3906, 1.3438, 1.2969, 1.3516, 1.4844, 1.3594, 1.3594, 1.3203, 1.3516,
        1.3828, 1.2578, 1.2734, 0.7852, 1.3047, 1.2969, 1.5859, 1.3828, 1.4141,
        1.3438, 2.

Parameter containing:
tensor([[-0.1079, -0.0205, -0.1611,  ..., -0.2637,  0.2891,  0.5078],
        [ 0.5234, -0.0415,  0.2754,  ..., -0.2461,  0.0265,  0.2520],
        [ 0.2178,  0.3945, -0.1143,  ..., -0.1035,  0.3125, -0.2178],
        ...,
        [ 0.0981, -0.0078,  0.2734,  ...,  0.3105, -0.0913,  0.0894],
        [-0.1309,  0.0859,  0.2969,  ...,  0.1289,  0.1064, -0.0581],
        [-0.0874,  0.3125, -0.1533,  ..., -0.2461, -0.0952, -0.2275]],
       device='cuda:0', requires_grad=True)
prompt_model.plm.decoder.block.0.layer.1.EncDecAttention.o.weight
Parameter containing:
tensor([[ 0.1187, -0.2188, -0.2305,  ...,  0.0669,  0.2246,  0.3125],
        [-0.9453,  1.1875,  0.1533,  ...,  1.7969, -0.5039, -0.7578],
        [-1.3125,  0.2539, -0.1641,  ...,  1.7969,  0.6719,  1.8203],
        ...,
        [ 0.4902, -0.1592,  0.2695,  ...,  0.1069,  0.4551,  0.0830],
        [-0.6602, -0.2109, -0.0574,  ..., -0.3379, -0.4355,  0.0923],
        [ 0.0991,  0.0610,  0.2812,  ...,  0.5039

Parameter containing:
tensor([ 7.3730e-02,  2.8442e-02,  3.5645e-02,  1.1328e-01,  8.1543e-02,
         5.9326e-02,  8.0566e-02,  9.1309e-02,  6.1035e-02,  8.2031e-02,
         6.1523e-02,  9.4238e-02,  6.8848e-02,  5.9814e-02,  7.6660e-02,
         8.3496e-02,  7.2266e-02,  5.4932e-02,  7.5684e-02,  9.7656e-02,
         8.2520e-02,  8.0078e-02,  8.5449e-02,  8.8867e-02,  8.3984e-02,
         6.3477e-02,  8.1543e-02,  5.7861e-02,  7.9590e-02,  8.6914e-02,
         8.2520e-02,  4.1748e-02,  8.8379e-02,  8.3496e-02,  8.9844e-02,
         6.0303e-02,  9.5703e-02,  6.5918e-02,  7.7637e-02,  8.9844e-02,
         9.7168e-02,  8.0078e-02,  7.3730e-02,  8.0566e-02,  4.1992e-02,
         7.7637e-02,  1.0449e-01,  8.1055e-02,  9.0820e-02,  8.4961e-02,
         8.0566e-02,  7.6172e-02,  7.2266e-02,  7.9590e-02,  1.4551e-01,
         6.4453e-02,  6.9824e-02,  8.1055e-02,  8.0078e-02,  8.4473e-02,
         7.1289e-02,  8.4961e-02,  8.0566e-02,  7.9102e-02,  8.8867e-02,
         6.4453e-02,  8.0566e

Parameter containing:
tensor([ 2.3438,  1.7578,  1.6406,  2.2656,  2.3750,  2.6719,  2.0469,  2.6719,
         1.8672,  2.1250,  3.7500,  2.5312,  1.9453,  2.8594,  2.2969,  2.3750,
         2.2656,  1.7969,  2.2969,  2.3594,  2.3438,  2.4219,  2.2812,  2.3438,
         2.5000,  1.9297,  2.3438,  2.5000,  2.3281,  2.4062,  2.5000,  1.6406,
         2.5469,  2.2344,  2.5000,  1.9375,  2.3438,  1.4922,  2.1719,  2.4844,
         2.0781,  2.3125,  2.2969,  2.4531,  1.5938,  2.1719,  2.2812,  2.4688,
         2.3438,  2.6094,  2.1562,  2.3281,  2.3750,  2.2969,  2.9062,  2.5000,
         2.2031,  2.3281,  2.2500,  2.3594,  2.1406,  2.5156,  2.3281,  2.4062,
         2.4375,  1.1016,  2.2969,  2.3281,  1.9609,  2.2812,  2.1875,  2.2031,
         2.1406,  2.2812,  2.3281,  2.1875,  2.4844,  2.4531,  2.3125,  2.2500,
         2.2500,  2.2656,  2.4688,  1.8516,  2.1406,  2.3750,  2.3750,  2.3906,
         2.3750,  2.4844,  2.0469,  2.1875,  2.2969,  2.5625,  2.5312,  2.3750,
         2.3438,  

Parameter containing:
tensor([[ 0.0566,  0.0062, -0.0300,  ...,  0.0083, -0.0011,  0.0172],
        [ 0.0043,  0.0137,  0.0447,  ...,  0.0593,  0.0157,  0.0173],
        [-0.0260, -0.0359,  0.0728,  ...,  0.0197,  0.0190,  0.0071],
        ...,
        [-0.0469, -0.0192,  0.0009,  ...,  0.0292, -0.0024, -0.0425],
        [-0.0635, -0.0259, -0.0065,  ...,  0.0048, -0.0047, -0.0267],
        [-0.0082,  0.0199, -0.0588,  ...,  0.0267, -0.0192,  0.0049]],
       device='cuda:0', requires_grad=True)
prompt_model.plm.decoder.block.7.layer.0.SelfAttention.k.weight
Parameter containing:
tensor([[ 0.2373, -0.3965,  0.0195,  ..., -0.1230,  0.3672, -0.1426],
        [-0.1289, -0.0874,  0.0143,  ...,  0.0491,  0.0615,  0.1035],
        [-0.2930,  0.3594, -0.1797,  ...,  0.0262,  0.1631,  0.1089],
        ...,
        [ 0.0219, -0.4336, -0.0070,  ...,  0.0801,  0.0270, -0.3086],
        [-0.2832, -0.0549, -0.1602,  ..., -0.0859, -0.3633,  0.1138],
        [-0.3867, -0.8398, -0.0098,  ...,  0.0640, 

Parameter containing:
tensor([[ 1.6641, -1.5234, -0.0693,  ...,  1.0859,  0.6016, -0.7461],
        [ 0.7539,  0.8242,  0.0082,  ...,  0.1377, -0.9570, -0.9688],
        [-0.8789,  0.5195,  0.4844,  ..., -0.7695, -0.2910,  0.7344],
        ...,
        [-0.0762, -0.9023, -0.8086,  ...,  0.6797, -0.1543,  0.2246],
        [ 0.4492, -0.3848, -0.7617,  ...,  0.7305,  0.6992,  0.2031],
        [-0.8242, -0.2617,  0.2793,  ...,  0.0601,  0.2227,  0.1914]],
       device='cuda:0', requires_grad=True)
prompt_model.plm.decoder.block.9.layer.0.SelfAttention.o.weight
Parameter containing:
tensor([[-0.0457, -0.8672, -0.0591,  ...,  0.6641, -0.4688,  0.2734],
        [-0.1377, -0.3984,  0.5234,  ...,  0.8281,  0.4707,  0.4199],
        [ 0.0684,  1.0469,  0.9570,  ..., -0.1660,  0.7188, -0.3027],
        ...,
        [ 1.5938,  0.7578,  0.2539,  ..., -0.8906, -1.4453, -0.0952],
        [ 0.4180,  0.4121,  1.3906,  ...,  0.1426,  0.5312,  0.8633],
        [-1.6641,  1.6484,  0.9883,  ...,  0.7812, 

Parameter containing:
tensor([[ 1.7891, -0.3184, -1.1250,  ..., -1.0156,  0.3418,  0.9883],
        [ 0.0255, -0.3301, -0.0742,  ..., -0.3633,  0.1689,  0.4473],
        [-0.1230,  1.1875,  0.2090,  ...,  0.7539, -0.6367, -0.1426],
        ...,
        [-0.1357, -0.0898,  0.6523,  ...,  1.3438, -0.5586,  0.4648],
        [-0.5977,  0.3359, -0.2109,  ..., -0.8359,  0.1250,  0.2969],
        [-0.7852, -0.1226,  0.2969,  ...,  0.9258,  0.0732,  1.1719]],
       device='cuda:0', requires_grad=True)
prompt_model.plm.decoder.block.11.layer.0.layer_norm.weight
Parameter containing:
tensor([ 3.1641e-01,  3.5352e-01,  6.1328e-01,  2.8711e-01,  3.2812e-01,
         3.5742e-01,  4.0039e-01,  3.4961e-01,  4.5117e-01,  3.8086e-01,
         3.3594e-01,  3.4961e-01,  3.6523e-01,  4.0430e-01,  3.3008e-01,
         3.2617e-01,  3.8086e-01,  4.1211e-01,  4.5312e-01,  3.8867e-01,
         3.1445e-01,  3.2227e-01,  4.8438e-01,  3.5742e-01,  3.1055e-01,
         4.9219e-01,  3.1641e-01,  5.2344e-01,  3.632

In [27]:
# Now the training is standard
from transformers import  AdamW, get_linear_schedule_with_warmup
loss_func = torch.nn.CrossEntropyLoss()
no_decay = ['bias', 'LayerNorm.weight']
# it's always good practice to set no decay to biase and LayerNorm parameters
# 避免过拟合
optimizer_grouped_parameters = [
    {'params': [p for n, p in prompt_model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in prompt_model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=1e-4)

for epoch in range(10):
    tot_loss = 0
    for step, inputs in enumerate(train_dataloader):
        if use_cuda:
            inputs = inputs.cuda()
        logits = prompt_model(inputs)
        labels = inputs['label']
        loss = loss_func(logits, labels)
        loss.backward()
        tot_loss += loss.item()
        optimizer.step()
        optimizer.zero_grad()
        if step %100 ==1:
            print("Epoch {}, average loss: {}".format(epoch, tot_loss/(step+1)), flush=True)

# Evaluate
validation_dataloader = PromptDataLoader(dataset=dataset["validation"], template=mytemplate, tokenizer=tokenizer,
    tokenizer_wrapper_class=WrapperClass, max_seq_length=256, decoder_max_length=3,
    batch_size=4,shuffle=False, teacher_forcing=False, predict_eos_token=False,
    truncate_method="head")

allpreds = []
alllabels = []
for step, inputs in enumerate(validation_dataloader):
    if use_cuda:
        inputs = inputs.cuda()
    logits = prompt_model(inputs)
    labels = inputs['label']
    alllabels.extend(labels.cpu().tolist())
    allpreds.extend(torch.argmax(logits, dim=-1).cpu().tolist())

acc = sum([int(i==j) for i,j in zip(allpreds, alllabels)])/len(allpreds)
print(acc)



Epoch 0, average loss: 0.8800722360610962
Epoch 1, average loss: 0.031409017741680145
Epoch 2, average loss: 0.0015051163209136575
Epoch 3, average loss: 0.0034735492590698414
Epoch 4, average loss: 0.00035082946124020964
Epoch 5, average loss: 0.00021342094260035083
Epoch 6, average loss: 0.0001429102776455693
Epoch 7, average loss: 0.0002875708451028913
Epoch 8, average loss: 0.00012032450831611641
Epoch 9, average loss: 5.591935769189149e-05


tokenizing: 56it [00:00, 274.23it/s]


0.9464285714285714


In [59]:
print(allpreds)

[2, 2, 0, 1, 1, 0, 0, 0, 2, 1, 1, 1, 1, 0, 1, 2, 1, 1, 0, 0, 0, 1, 0, 0, 1, 2, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 2, 1, 1, 0]


In [58]:
for step, inputs in enumerate(train_dataloader):
    if use_cuda:
        inputs = inputs.cuda()
    print("inputs: ", inputs["input_ids"].shape)
    print(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0]))
    logits = prompt_model(inputs)
    labels = inputs['label']
    print("logits: ",logits)
    print("labels: ",labels)
    print("pred:", torch.argmax(logits[0], dim=-1), "")

inputs:  torch.Size([4, 256])
['▁B', ':', '▁U', 'h', '-', 'huh', '.', '▁It', ',', '▁I', '▁mean', ',', '▁I', '▁don', "'", 't', '▁know', ',', '▁I', '▁don', "'", 't', '▁think', '▁George', '▁Bush', '▁will', '▁make', '▁the', '▁American', '▁people', '▁happy', '▁with', '▁nine', 't', 'y', '-', 's', 'even', '▁cent', 's', '▁', 'a', '▁week', '.', '▁A', ':', '▁No', ',', '▁no', ',', '▁not', '▁at', '▁all', '.', '▁B', ':', '▁I', '▁just', '▁don', "'", 't', '▁think', '▁it', '▁was', '▁', 'a', '▁well', '▁thought', '▁out', '▁incentive', '.', '▁Question', ':', '▁it', '▁was', '▁', 'a', '▁well', '▁thought', '▁out', '▁incentive', '▁', '?', '▁I', 's', '▁it', '▁correct', '?', '<extra_id_0>', '▁', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '

logits:  tensor([[-3.6956e-05, -1.1279e+01, -1.0627e+01],
        [-1.2712e+01, -5.2452e-06, -1.2995e+01],
        [-1.1394e+01, -1.6213e-05, -1.2220e+01],
        [-4.6492e-06, -1.3113e+01, -1.2839e+01]], device='cuda:0',
       grad_fn=<DivBackward0>)
labels:  tensor([0, 1, 1, 0], device='cuda:0')
pred: tensor(0, device='cuda:0') 
inputs:  torch.Size([4, 256])
['▁It', '▁seemed', '▁impossible', '▁that', '▁anyone', '▁could', '▁endure', '▁such', '▁pain', '▁for', '▁so', '▁long', ',', '▁but', '▁at', '▁last', '▁the', '▁doors', '▁of', '▁the', '▁Renault', '▁', 's', 'l', 'am', 'med', '▁and', '▁there', '▁was', '▁comparativ', 'e', '▁silence', '.', '▁The', '▁engine', '▁was', '▁started', '▁up', ',', '▁rev', 'ving', '▁violent', 'ly', '▁as', '▁the', '▁car', '▁was', '▁turned', '▁round', '▁on', '▁the', '▁narrow', '▁road', '.', '▁John', '▁could', '▁tell', '▁that', '▁it', '▁was', '▁being', '▁driven', '▁back', '▁up', '▁the', '▁hill', '▁towards', '▁Put', 'n', 'a', '.', '▁Question', ':', '▁the', '▁car', '

logits:  tensor([[-4.6254e-05, -1.0225e+01, -1.1517e+01],
        [-1.3861e+01, -1.0729e-06, -1.5554e+01],
        [-2.0981e-05, -1.2154e+01, -1.1062e+01],
        [-7.9158e-05, -1.1160e+01, -9.6419e+00]], device='cuda:0',
       grad_fn=<DivBackward0>)
labels:  tensor([0, 1, 0, 0], device='cuda:0')
pred: tensor(0, device='cuda:0') 
inputs:  torch.Size([4, 256])
['▁Just', '▁when', '▁you', '▁think', '▁you', '▁', "'", 've', '▁got', '▁it', '▁straight', ',', '▁along', '▁comes', '▁the', '▁Fool', '▁with', '▁his', '▁', 'pig', "'", 's', '▁bladder', '▁and', '▁who', 'p', 's', '▁you', '▁on', '▁the', '▁nose', '.', '▁By', '▁the', '▁way', ',', '▁I', "'", 'm', '▁no', '▁idiot', '.', '▁I', '▁could', '▁tell', '▁G', 'illian', '▁and', '▁Stuart', '▁weren', "'", 't', '▁thrilled', '▁to', '▁see', '▁me', '▁at', '▁the', '▁airport', '.', '▁Question', ':', '▁G', 'illian', '▁and', '▁Stuart', '▁weren', "'", 't', '▁thrilled', '▁to', '▁see', '▁her', '▁at', '▁the', '▁airport', '▁', '?', '▁I', 's', '▁it', '▁correct', '

logits:  tensor([[-1.7047e-05, -1.2677e+01, -1.1182e+01],
        [-1.2875e-05, -1.2688e+01, -1.1532e+01],
        [-1.4245e+01, -1.6689e-06, -1.3844e+01],
        [-3.8267e-05, -1.1655e+01, -1.0428e+01]], device='cuda:0',
       grad_fn=<DivBackward0>)
labels:  tensor([0, 0, 1, 0], device='cuda:0')
pred: tensor(0, device='cuda:0') 
inputs:  torch.Size([4, 256])
['▁I', '▁can', "'", 't', '▁afford', '▁to', '▁get', '▁bo', 'gged', '▁down', '▁in', '▁the', '▁', 'weed', 's', '.', '▁But', '▁at', '▁least', '▁you', '▁know', '▁she', '▁did', '▁leave', '.', '▁Maybe', '▁', 'a', '▁coincidence', '▁maybe', '▁the', '▁two', '▁girls', '▁talked', '▁on', '▁the', '▁phone', '▁decided', '▁they', '▁', "'", 'd', '▁both', '▁had', '▁enough', '.', '▁Question', ':', '▁the', '▁two', '▁girls', '▁had', '▁both', '▁had', '▁enough', '▁', '?', '▁I', 's', '▁it', '▁correct', '?', '<extra_id_0>', '▁', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad

logits:  tensor([[-1.3585e+01, -1.5497e-06, -1.5157e+01],
        [-1.1540e-04, -9.9900e+00, -9.5738e+00],
        [-2.1577e-05, -1.1003e+01, -1.2222e+01],
        [-1.2656e+01, -5.1260e-06, -1.3124e+01]], device='cuda:0',
       grad_fn=<DivBackward0>)
labels:  tensor([1, 0, 0, 1], device='cuda:0')
pred: tensor(1, device='cuda:0') 
inputs:  torch.Size([4, 256])
['▁B', ':', '▁I', '▁did', ',', '▁too', '.', '▁A', ':', '▁I', '▁mean', ',', '▁it', '▁was', '▁just', '▁more', '▁for', '▁my', '▁money', '.', '▁B', ':', '▁Yeah', '.', '▁I', '▁didn', "'", 't', '▁think', '▁it', '▁was', '▁too', '▁long', '▁at', '▁all', '.', '▁Question', ':', '▁it', '▁was', '▁too', '▁long', '▁', '?', '▁I', 's', '▁it', '▁correct', '?', '<extra_id_0>', '▁', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pa

logits:  tensor([[-1.5378e-05, -1.2125e+01, -1.1529e+01],
        [-1.2241e+01, -7.0334e-06, -1.3033e+01],
        [-1.3784e+01, -1.1921e-06, -1.5892e+01],
        [-3.1949e-05, -1.0845e+01, -1.1297e+01]], device='cuda:0',
       grad_fn=<DivBackward0>)
labels:  tensor([0, 1, 1, 0], device='cuda:0')
pred: tensor(0, device='cuda:0') 
inputs:  torch.Size([4, 256])
['▁This', '▁was', '▁', 'a', '▁sheer', '▁waste', '▁of', '▁time', '.', '▁He', '▁would', '▁probably', '▁land', '▁and', '▁then', '▁tell', '▁them', '▁to', '▁walk', '▁back', '.', '▁When', '▁she', '▁glance', 'd', '▁at', '▁him', '▁again', '▁', 'he', '▁looked', '▁very', '▁grim', '▁and', '▁she', '▁wondered', '▁', 'if', '▁she', '▁should', '▁have', '▁told', '▁Mit', 'ch', '▁that', '▁', 'he', '▁might', '▁well', '▁have', '▁', 'a', '▁lot', '▁of', '▁opportunity', '▁to', '▁photograph', '▁Spain', '▁', '-', '▁on', '▁foot', '▁as', '▁', 'he', '▁', 'walked', '▁back', '▁to', '▁Mal', 'aga', '.', '▁Question', ':', '▁Mit', 'ch', '▁might', '▁well', '▁have

logits:  tensor([[-1.2699e+01, -4.1723e-06, -1.3653e+01],
        [-1.2912e+01, -2.8610e-06, -1.4903e+01],
        [-9.3716e+00, -9.4909e+00, -1.6071e-04],
        [-1.1087e-05, -1.2281e+01, -1.1951e+01]], device='cuda:0',
       grad_fn=<DivBackward0>)
labels:  tensor([1, 1, 2, 0], device='cuda:0')
pred: tensor(1, device='cuda:0') 
inputs:  torch.Size([4, 256])
['▁B', ':', '▁But', ',', '▁', 'u', 'h', ',', '▁', 'if', '▁the', '▁wind', '▁comes', '▁basically', '▁from', '▁the', '▁south', '▁it', '▁can', '▁be', '▁really', '▁bad', '.', '▁A', ':', '▁U', 'h', '-', 'huh', '.', '▁B', ':', '▁U', 'h', ',', '▁the', '▁State', '▁of', '▁Wisconsin', ',', '▁as', '▁', 'a', '▁matter', '▁of', '▁fact', ',', '▁', 'u', 'h', ',', '▁started', '▁some', '▁litigation', '▁against', '▁Illinois', '▁because', '▁of', '▁the', '▁air', '▁pollution', '▁we', '▁were', '▁getting', '.', '▁A', ':', '▁U', 'h', '-', 'huh', '.', '▁B', ':', '▁U', 'h', ',', '▁I', '▁don', "'", 't', '▁think', '▁it', "'", 's', '▁going', '▁to', '▁go', '▁

logits:  tensor([[-8.8164e+00, -9.6814e+00, -2.1072e-04],
        [-1.2239e+01, -1.3828e-05, -1.1620e+01],
        [-1.4133e+01, -8.3447e-07, -1.5847e+01],
        [-1.2320e+01, -5.3644e-06, -1.3920e+01]], device='cuda:0',
       grad_fn=<DivBackward0>)
labels:  tensor([2, 1, 1, 1], device='cuda:0')
pred: tensor(2, device='cuda:0') 
inputs:  torch.Size([4, 256])
['▁A', ':', '▁That', '▁is', '▁the', '▁reason', ',', '▁I', '▁don', "'", 't', '▁play', '▁over', '▁there', '.', '▁B', ':', '▁Yeah', '.', '▁A', ':', '▁I', '▁like', '▁the', '▁course', ',', '▁but', '▁I', '▁don', "'", 't', '▁play', '▁over', '▁there', '▁because', ',', '▁they', '▁don', "'", 't', ',', '▁', 'u', 'h', ',', '▁you', '▁know', '▁don', "'", 't', '▁allow', '▁you', '▁to', '▁pull', '▁', 'a', '▁cart', '.', '▁B', ':', '▁Right', '.', '▁A', ':', '▁And', ',', '▁I', '▁don', "'", 't', '▁think', '▁', 'a', '▁cart', '▁damages', '▁the', '▁turf', '.', '▁Question', ':', '▁', 'a', '▁cart', '▁damages', '▁the', '▁turf', '▁', '?', '▁I', 's', '▁it'

logits:  tensor([[-1.1765e+01, -8.5831e-06, -1.4028e+01],
        [-7.4732e+00, -5.7786e-04, -1.1552e+01],
        [-6.6044e-05, -1.0101e+01, -1.0597e+01],
        [-1.2281e+01, -4.8876e-06, -1.5094e+01]], device='cuda:0',
       grad_fn=<DivBackward0>)
labels:  tensor([1, 1, 0, 1], device='cuda:0')
pred: tensor(1, device='cuda:0') 
inputs:  torch.Size([4, 256])
['▁Jean', '▁was', '▁tough', '▁and', '▁liked', '▁to', '▁drink', '.', '▁She', '▁would', '▁endure', '▁for', '▁', 'a', '▁long', '▁while', '▁yet', '.', '▁But', '▁what', '▁would', '▁she', '▁do', '▁when', '▁she', '▁realized', '▁that', '▁with', '▁things', '▁as', '▁they', '▁were', '▁she', '▁was', '▁on', '▁', 'a', '▁life', '▁sentence', '▁not', '▁just', '▁', 'a', '▁temporary', '▁suspension', '▁of', '▁essential', '▁pleasure', '?', '▁Question', ':', '▁Jean', '▁was', '▁on', '▁', 'a', '▁life', '▁sentence', '▁', '?', '▁I', 's', '▁it', '▁correct', '?', '<extra_id_0>', '▁', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad

logits:  tensor([[-1.1275e+01, -1.3113e-05, -1.4811e+01],
        [-1.3518e+01, -1.6689e-06, -1.4950e+01],
        [-1.3544e+01, -1.6689e-06, -1.4895e+01],
        [-3.4571e-06, -1.3214e+01, -1.3310e+01]], device='cuda:0',
       grad_fn=<DivBackward0>)
labels:  tensor([1, 1, 1, 0], device='cuda:0')
pred: tensor(1, device='cuda:0') 
inputs:  torch.Size([4, 256])
['▁A', ':', '▁Well', ',', '▁I', '▁don', "'", 't', '▁know', ',', '▁', 'u', 'h', ',', '▁I', '▁have', '▁', 'a', '▁hard', '▁time', '▁getting', ',', '▁', 'u', 'h', ',', '▁people', '▁on', '▁the', '▁telephone', '.', '▁B', ':', '▁Oh', '▁really', '.', '▁A', ':', '▁U', 'h', '-', 'huh', ',', '▁getting', '▁through', '▁to', '▁anybody', '.', '▁Sometimes', '▁I', '▁call', '▁off', '▁and', '▁on', '▁all', '▁day', ',', '▁B', ':', '▁Hu', 'h', '.', '▁A', ':', '▁but', '▁anyway', ',', '▁', 'u', 'h', ',', '▁I', '▁guess', '▁we', "'", 're', '▁supposed', '▁to', '▁be', '▁talking', '▁about', '▁family', '▁reunion', 's', '▁are', 'n', "'", 't', '▁we', '.', '▁Q

logits:  tensor([[-6.2229e-05, -1.0044e+01, -1.0884e+01],
        [-1.0772e+01, -2.6584e-05, -1.2087e+01],
        [-1.2323e+01, -8.9407e-06, -1.2322e+01],
        [-7.1409e-05, -1.0639e+01, -9.9551e+00]], device='cuda:0',
       grad_fn=<DivBackward0>)
labels:  tensor([0, 1, 1, 0], device='cuda:0')
pred: tensor(0, device='cuda:0') 
inputs:  torch.Size([4, 256])
['▁A', ':', '▁and', '▁', 'if', '▁they', '▁weren', "'", 't', '▁spending', '▁all', '▁the', '▁money', '▁on', '▁drug', '▁testing', ',', '▁people', '▁could', '▁have', '▁got', '▁', 'a', '▁raise', '.', '▁So', ',', '▁see', ',', '▁you', '▁know', ',', '▁there', "'", 's', '▁different', ',', '▁I', '▁think', '▁that', "'", 's', '▁more', '▁of', '▁', 'a', '▁personal', '▁view', '▁of', '▁mine', '▁other', '▁than', '▁', 'a', '▁yes', ',', '▁si', 'r', ',', '▁we', '▁should', '▁have', '▁drug', '▁testing', '▁because', '▁there', "'", 's', '▁really', '▁', 'a', '▁problem', '▁B', ':', '▁U', 'h', '-', 'huh', '.', '▁A', ':', '▁and', '▁I', '▁know', '▁that', '

In [1]:
# 1.1 mixed_template  url:https://github.com/thunlp/OpenPrompt/blob/main/tutorial/1.1_mixed_template.py

from datasets import load_dataset
raw_dataset = load_dataset('super_glue', 'cb', cache_dir="../datasets/.cache/huggingface_datasets")
raw_dataset['train'][0]


  utils.DeprecatedIn35,
Reusing dataset super_glue (../datasets/.cache/huggingface_datasets/super_glue/cb/1.0.2/d040c658e2ddef6934fdd97deb45c777b6ff50c524781ea434e7219b56a428a7)


  0%|          | 0/3 [00:00<?, ?it/s]

{'premise': 'It was a complex language. Not written down but handed down. One might say it was peeled down.',
 'hypothesis': 'the language was peeled down',
 'idx': 0,
 'label': 0}

In [2]:
from openprompt.data_utils import InputExample

dataset = {}
for split in ['train', 'validation', 'test']:
    dataset[split] = []
    for data in raw_dataset[split]:
        input_example = InputExample(text_a = data['premise'], text_b = data['hypothesis'], label=int(data['label']), guid=data['idx'])
        dataset[split].append(input_example)
print(dataset['train'][0])

from openprompt.plms import load_plm

plm, tokenizer, model_config, WrapperClass = load_plm("t5", "t5-base")

{
  "guid": 0,
  "label": 0,
  "meta": {},
  "text_a": "It was a complex language. Not written down but handed down. One might say it was peeled down.",
  "text_b": "the language was peeled down",
  "tgt_text": null
}



  utils.DeprecatedIn35,


In [3]:
# 使用MixedTemplate，可以用{soft}来表示一个可调的template 具体相关概念可以自行搜索
from openprompt.prompts import MixedTemplate

mytemplate1 = MixedTemplate(model=plm, tokenizer=tokenizer, text='{"placeholder":"text_a"} {"soft": "Question:"} {"placeholder":"text_b"}? Is it correct? {"mask"}.')

mytemplate = MixedTemplate(model=plm, tokenizer=tokenizer, text='{"placeholder":"text_a"} {"soft"} {"soft"} {"soft"} {"placeholder":"text_b"} {"soft"} {"mask"}.')


wrapped_example = mytemplate.wrap_one_example(dataset['train'][0])
print(wrapped_example)


[[{'text': 'It was a complex language. Not written down but handed down. One might say it was peeled down.', 'soft_token_ids': 0, 'loss_ids': 0, 'shortenable_ids': 1}, {'text': '', 'soft_token_ids': 1, 'loss_ids': 0, 'shortenable_ids': 0}, {'text': '', 'soft_token_ids': 2, 'loss_ids': 0, 'shortenable_ids': 0}, {'text': '', 'soft_token_ids': 3, 'loss_ids': 0, 'shortenable_ids': 0}, {'text': ' the language was peeled down', 'soft_token_ids': 0, 'loss_ids': 0, 'shortenable_ids': 1}, {'text': '', 'soft_token_ids': 4, 'loss_ids': 0, 'shortenable_ids': 0}, {'text': '<mask>', 'soft_token_ids': 0, 'loss_ids': 1, 'shortenable_ids': 0}, {'text': '.', 'soft_token_ids': 0, 'loss_ids': 0, 'shortenable_ids': 0}], {'guid': 0, 'label': 0}]


In [4]:
wrapped_t5tokenizer = WrapperClass(max_seq_length=128, decoder_max_length=3, tokenizer=tokenizer,truncate_method="head")

from openprompt import PromptDataLoader

train_dataloader = PromptDataLoader(dataset=dataset["train"], template=mytemplate, tokenizer=tokenizer,
    tokenizer_wrapper_class=WrapperClass, max_seq_length=256, decoder_max_length=3,
    batch_size=4,shuffle=True, teacher_forcing=False, predict_eos_token=False,
    truncate_method="head")

from openprompt.prompts import ManualVerbalizer
import torch

# for example the verbalizer contains multiple label words in each class
myverbalizer = ManualVerbalizer(tokenizer, num_classes=3,
                        label_words=[["yes"], ["no"], ["maybe"]])

print(myverbalizer.label_words_ids)
logits = torch.randn(2,len(tokenizer)) # creating a pseudo output from the plm
myverbalizer.process_logits(logits)

tokenizing: 250it [00:00, 343.73it/s]


Parameter containing:
tensor([[[4273]],

        [[ 150]],

        [[2087]]])


tensor([[-1.6852, -1.4578, -0.5415],
        [-2.2715, -1.1212, -0.5605]])

In [5]:
from openprompt import PromptForClassification

use_cuda = True
prompt_model = PromptForClassification(plm=plm,template=mytemplate, verbalizer=myverbalizer, freeze_plm=False)
if use_cuda:
    prompt_model=  prompt_model.cuda()

# ## below is standard training


from transformers import  AdamW, get_linear_schedule_with_warmup
loss_func = torch.nn.CrossEntropyLoss()

no_decay = ['bias', 'LayerNorm.weight']

# it's always good practice to set no decay to biase and LayerNorm parameters
optimizer_grouped_parameters1 = [
    {'params': [p for n, p in prompt_model.plm.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in prompt_model.plm.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

# Using different optimizer for prompt parameters and model parameters
optimizer_grouped_parameters2 = [
    {'params': [p for n,p in prompt_model.template.named_parameters() if "raw_embedding" not in n]}
]

optimizer1 = AdamW(optimizer_grouped_parameters1, lr=1e-4)
optimizer2 = AdamW(optimizer_grouped_parameters2, lr=1e-3)

for epoch in range(10):
    tot_loss = 0
    for step, inputs in enumerate(train_dataloader):
        if use_cuda:
            inputs = inputs.cuda()
        logits = prompt_model(inputs)
        labels = inputs['label']
        loss = loss_func(logits, labels)
        loss.backward()
        tot_loss += loss.item()
        optimizer1.step()
        optimizer1.zero_grad()
        optimizer2.step()
        optimizer2.zero_grad()
        print(tot_loss/(step+1))

# ## evaluate

# %%
validation_dataloader = PromptDataLoader(dataset=dataset["validation"], template=mytemplate, tokenizer=tokenizer,
    tokenizer_wrapper_class=WrapperClass, max_seq_length=256, decoder_max_length=3,
    batch_size=4,shuffle=False, teacher_forcing=False, predict_eos_token=False,
    truncate_method="head")


allpreds = []
alllabels = []
for step, inputs in enumerate(validation_dataloader):
    if use_cuda:
        inputs = inputs.cuda()
    logits = prompt_model(inputs)
    labels = inputs['label']
    alllabels.extend(labels.cpu().tolist())
    allpreds.extend(torch.argmax(logits, dim=-1).cpu().tolist())

acc = sum([int(i==j) for i,j in zip(allpreds, alllabels)])/len(allpreds)
print(acc)



1.413600206375122
1.0612258911132812
0.9850967725118002
0.8783328384160995
0.9662164092063904
0.8626028199990591
0.9454956821032933
0.9703408256173134
0.9662239419089423
0.9114614963531494
0.90981140461835
0.8689070468147596
0.852993939931576
0.805360204407147
0.7953401148319245
0.7784849796444178
0.7391081965144943
0.7070387274854713
0.6926711268330875
0.7425214562565088
0.711621394824414
0.7300253263251348
0.7242054884200511
0.7027535975600282
0.7184759750962257
0.7274311497234381
0.7077179642187225
0.6892181499195951
0.6964782304291067
0.7047956181069215
0.6896308028409558
0.6769438742194325
0.6646323034709151
0.6552177661043757
0.6772534466215543
0.6588325233509144
0.6463159014647072
0.640822395486267
0.6251856757280154
0.61884994097054
0.6257700327692962
0.6146987985287394
0.6026702864572059
0.5908679230646654
0.5786888536479738
0.5664924554850744
0.5552989737467563
0.5444123907169948
0.534322806538976
0.5239627437293529
0.5141876362556336
0.5330581286062415
0.5259003191483471
0.5

0.00018176948486308737
0.00018554589981552758
0.00017689396386581042
0.00016976114394700353
0.00017831943284060496
0.00017092569737542362
0.00016163558029802516
0.00016304835420063077
0.00015548325618989397
0.00015201828800854856
0.00015284216325588723
0.0001498038715329709
0.00014956789219349562
0.0001459471675389068
0.00014307973040558864
0.0001465957427801466
0.00014672870862274
0.00014594501362807932
0.00014639515706974259
0.00014241830655616164
0.00014201326033094853
0.0001430023199304742
0.00014438112113125635
0.0001475645609817173
0.00014459156419824076
0.00014104864981668975
0.00014095789879934887
0.00014842931580604177
0.0001450651156912313
0.0001416992552776719
0.0001434205883126152
0.00014460303960038887
0.00014172809496389968
0.0001402530534629808
0.00014273148184050418
0.0001430776384515174
0.00014695941681179162
0.00015202984243008663
0.00015062522320198585
0.0001480728526621533
0.000152251768685899
0.00014963275839363632
0.00014882734287208942
0.0001486930839039107
0.000

tokenizing: 56it [00:00, 316.11it/s]


0.9285714285714286


In [1]:
# 2.1 conditonal_generation
# https://github.com/thunlp/OpenPrompt/blob/main/tutorial/2.1_conditional_generation.py
from openprompt.data_utils.conditional_generation_dataset import WebNLGProcessor
dataset = {}
dataset['train'] = WebNLGProcessor().get_train_examples("/root/datasets/CondGen/webnlg_2017/")
dataset['validation'] = WebNLGProcessor().get_dev_examples("/root/datasets/CondGen/webnlg_2017/")
dataset['test'] = WebNLGProcessor().get_test_examples("/root/datasets/CondGen/webnlg_2017/")

In [2]:
from openprompt.plms import load_plm
plm, tokenizer, model_config, WrapperClass = load_plm('t5', 't5-base')

  utils.DeprecatedIn35,


In [3]:
# 使用prefix_tuning_template
from openprompt.prompts.prefix_tuning_template import PrefixTuningTemplate
mytemplate = PrefixTuningTemplate(model=plm,  tokenizer=tokenizer, text=' {"placeholder":"text_a"} {"special": "<eos>"} {"mask"} ', using_decoder_past_key_values=False)

In [4]:
# 取一个训练数据作为样例观察template如何wrap
wrapped_example = mytemplate.wrap_one_example(dataset['train'][0])
print(wrapped_example)

[[{'text': '  | Aarhus_Airport : cityServed : "Aarhus, Denmark"', 'loss_ids': 0, 'shortenable_ids': 1}, {'text': '<eos>', 'loss_ids': 0, 'shortenable_ids': 0}, {'text': '<mask>', 'loss_ids': 1, 'shortenable_ids': 0}], {'guid': '0', 'tgt_text': 'The Aarhus is the airport of Aarhus, Denmark.'}]


In [10]:
from openprompt import PromptDataLoader
train_dataloader = PromptDataLoader(dataset=dataset["train"], template=mytemplate, tokenizer=tokenizer,
    tokenizer_wrapper_class=WrapperClass, max_seq_length=256, decoder_max_length=256,
    batch_size=5,shuffle=True, teacher_forcing=True, predict_eos_token=True, # be sure to pass predict_eos_token=True if your template doesn't contain one, or you model may fail to stop generation.
    truncate_method="head")

validation_dataloader = PromptDataLoader(dataset=dataset["validation"], template=mytemplate, tokenizer=tokenizer,
    tokenizer_wrapper_class=WrapperClass, max_seq_length=256, decoder_max_length=256,
    batch_size=5,shuffle=False, teacher_forcing=False, predict_eos_token=True,
    truncate_method="head")

test_dataloader = PromptDataLoader(dataset=dataset["test"], template=mytemplate, tokenizer=tokenizer,
    tokenizer_wrapper_class=WrapperClass, max_seq_length=256, decoder_max_length=256,
    batch_size=5,shuffle=False, teacher_forcing=False, predict_eos_token=True,
    truncate_method="head")

tokenizing: 18025it [00:38, 468.04it/s]
tokenizing: 872it [00:01, 579.13it/s]
tokenizing: 1862it [00:03, 578.81it/s]


In [11]:
# load the pipeline model PromptForGeneration.
from openprompt import PromptForGeneration
use_cuda = True
prompt_model = PromptForGeneration(plm=plm,template=mytemplate, freeze_plm=True,tokenizer=tokenizer, plm_eval_mode=False)
if use_cuda:
    prompt_model=  prompt_model.cuda()

In [12]:
from transformers import AdamW
# Follow PrefixTuning（https://github.com/XiangLi1999/PrefixTuning), we also fix the language model
# only include the template's parameters in training.

no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
{
    "params": [p for n, p in mytemplate.named_parameters() if (not any(nd in n for nd in no_decay)) and p.requires_grad],
    "weight_decay": 0.0,
},
{
    "params": [p for n, p in mytemplate.named_parameters() if any(nd in n for nd in no_decay) and p.requires_grad],
    "weight_decay": 0.0,
},
]


optimizer = AdamW(optimizer_grouped_parameters, lr=5e-5, eps=1e-8)



In [9]:
from transformers.optimization import get_linear_schedule_with_warmup
import torch

tot_step  = len(train_dataloader)*5
scheduler = get_linear_schedule_with_warmup(optimizer, 0, tot_step)

# We provide generation a generation metric, you can also define your own. Note that it's not directly comparable to WebNLG's scripts evaluation.
from openprompt.utils.metrics import generation_metric
# Define evaluate function
def evaluate(prompt_model, dataloader):
    generated_sentence = []
    groundtruth_sentence = []
    prompt_model.eval()

    for step, inputs in enumerate(dataloader):
        if use_cuda:
            inputs = inputs.cuda()
        _, output_sentence = prompt_model.generate(inputs, **generation_arguments)
        generated_sentence.extend(output_sentence)
        groundtruth_sentence.extend(inputs['tgt_text'])
    score = generation_metric(generated_sentence, groundtruth_sentence, "sentence_bleu")
    print("test_score", score, flush=True)
    return generated_sentence




generation_arguments = {
    "max_length": 512,
    "max_new_tokens": None,
    "min_length": 5,
    "temperature": 1.0,
    "do_sample": False,
    "top_k": 0,
    "top_p": 0.9,
    "repetition_penalty": 1.0,
    "num_beams": 5,
    "bad_words_ids": [[628], [198]]
}

# training and generation.
global_step = 0
tot_loss = 0
log_loss = 0
for epoch in range(5):
    prompt_model.train()
    for step, inputs in enumerate(train_dataloader):
        global_step +=1
        if use_cuda:
            inputs = inputs.cuda()
        loss = prompt_model(inputs)
        loss.backward()
        tot_loss += loss.item()
        torch.nn.utils.clip_grad_norm_(mytemplate.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        if global_step %5 ==0:
            print("Epoch {}, global_step {} average loss: {} lr: {}".format(epoch, global_step, (tot_loss-log_loss)/500, scheduler.get_last_lr()[0]), flush=True)
            log_loss = tot_loss

generated_sentence = evaluate(prompt_model, test_dataloader)
with open(f"Generated_sentence_webnlg_gpt2_False.txt",'w') as f:
    for i in generated_sentence:
        f.write(i+"\n")

TypeError: where(): argument 'other' (position 3) must be Tensor, not int

NameError: name 'torch' is not defined