In [1]:
import numpy as np
import torch
import transformers
import re
import sys

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the poem corpus from disk; later cells iterate over it and treat each entry as one poem string.
data = np.load("data/clear_data.npy")

In [None]:
# Build one string per poem of the form "<length of first segment><head characters>-><poem>".
hidden_poem_data = []
for poem in data:
    # Split the poem into segments on ASCII/full-width punctuation.
    segments = re.split(r'[,.，。！？]', poem)
    first_len = len(segments[0])
    # First character of every other segment (indices 0, 2, 4, ...);
    # an empty segment contributes an empty string.
    heads = ''.join(segment[:1] for segment in segments[::2])
    hidden_poem_data.append(f'{first_len}{heads}->{poem}')


In [None]:
# Spot-check one generated entry (index 1000) to eyeball the "<prefix>-><poem>" format.
hidden_poem_data[1000]

In [None]:
# Persist the generated strings to disk with np.save.
np.save("data/hidden_poems.npy", hidden_poem_data)

In [None]:
# Toy inputs for the ROUGE computation below; characters are separated by spaces.
candidate_texts = [
    "床 前 明 月 光", # first poem generated by the model
    "白 日 依 山 尽 黄 河 入 海 流", # second poem generated by the model
    # ... more generated poems
]

# References are aligned by position with candidate_texts.
reference_texts = [
    "床 前 明 月 光 疑 是 地 上 霜", # reference poem for the first candidate
    "白 日 依 山 尽 黄 河 入 海 流 欲 穷 千 里 目 更 上 一 层 楼", # reference poem for the second candidate
    # ... more reference poems
]

In [3]:
# NOTE(review): the recorded run of this cell failed with an ImportError because a local
# `datasets.py` shadows the `datasets` package that `evaluate` imports — rename that file.
import evaluate

# Load the ROUGE metric.
rouge_metric = evaluate.load('rouge')

def char_tokenizer(text):
    """Return the characters of ``text`` as a list, with every space removed."""
    return [char for char in text if char != " "]

# Character-level token lists for the candidates and the references.
# They are not passed to the metric below, which tokenizes the raw strings itself
# through `char_tokenizer`; they are kept for inspection.
tokenized_predictions_char = [char_tokenizer(text) for text in candidate_texts]
tokenized_references_char = [char_tokenizer(text) for text in reference_texts]

# Character-level ROUGE.
# `compute` receives the raw strings; the `tokenizer` argument tells it how to split them.
# NOTE(review): do not rely on the metric's default tokenizer for Chinese text —
# it appears to be designed for English; TODO confirm how it treats non-ASCII characters.

print("--- 字级别 ROUGE ---")
results_char = rouge_metric.compute(
    tokenizer=char_tokenizer,
    references=reference_texts,
    predictions=candidate_texts,
)
rouge1_score = results_char['rouge1']
rouge2_score = results_char['rouge2']
print(f"字级别 ROUGE-1: {rouge1_score:.4f}") # usually read as the F1 score
print(f"字级别 ROUGE-2: {rouge2_score:.4f}")
print(results_char) # full result dict (exact contents depend on the `evaluate` version — TODO confirm)


ImportError: cannot import name 'Dataset' from 'datasets' (/home/wuwen/project_python/poem-generate/datasets.py)

In [None]:
# Inspect the first poem in the corpus.
data[0]

In [38]:
POEM_NUMBER = 100  # how many poems are taken from the start of `data`
prompts1, prompts2, prompts3 = [], [], []
candidate_texts = []  # the actual poems
# NOTE(review): reference_texts is reset here and never filled in this cell.
reference_texts = []


for poem in data[:POEM_NUMBER]:
    segments = re.split(r'[,.!，。！？]', poem)
    if len(segments) >= 3:
        # Cut positions: the first 1, 2 and 3 segments, each followed by one delimiter character.
        cut1 = len(segments[0]) + 1
        cut2 = cut1 + len(segments[1]) + 1
        cut3 = cut2 + len(segments[2]) + 1
        prompts1.append(poem[:cut1])
        prompts2.append(poem[:cut2])
        prompts3.append(poem[:cut3])
        candidate_texts.append(poem)

In [None]:
# Print the three prompts and the full poem for one sample index.
i = 3
for texts in (prompts1, prompts2, prompts3, candidate_texts):
    print(texts[i])

In [None]:
# Result lists, one per decoding strategy; the generation loop appends one
# [output1, output2, output3] triple per poem to each of them.
greedy_poems = []
beam_poems = []
top_k_poems = []
top_p_poems = []


# Fall back to the CPU when CUDA is unavailable so the notebook still runs on CPU-only machines.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# map_location keeps a checkpoint saved on a GPU loadable on the selected device.
# NOTE(review): weights_only=False unpickles arbitrary Python objects — only load trusted checkpoints.
saved = torch.load('checkpoints/20_20-0.001-sonnet.pt', map_location=device, weights_only=False)
# Accessed through the already-imported `transformers` module: the bare name
# `BertTokenizer` is never imported in the visible cells and fails on a fresh kernel.
tokenizer = transformers.BertTokenizer.from_pretrained("cache/bert-tokenizer", local_files_only=True)
# NOTE(review): PoemGPT is not imported in any visible cell — presumably a project-local
# class; add its import to the imports cell so Restart & Run All works.
model = PoemGPT(saved['args'], tokenizer)
model.load_state_dict(saved['model'])
model = model.to(device)
# Switch to evaluation mode; as the last expression this also displays the model.
model.eval()


Trainable tensors: 200/200 (100.00%)
Trainable parameters: 118906760/118906760 (100.00%)


PoemGPT(
  (gpt): GPT2Model(
    (word_embedding): Embedding(21128, 768, padding_idx=0)
    (pos_embedding): Embedding(1024, 768)
    (embed_dropout): Dropout(p=0.1, inplace=False)
    (gpt_layers): ModuleList(
      (0-11): 12 x GPT2Layer(
        (self_attention): CausalSelfAttention(
          (query): Linear(in_features=768, out_features=768, bias=True)
          (key): Linear(in_features=768, out_features=768, bias=True)
          (value): Linear(in_features=768, out_features=768, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (attention_dense): Linear(in_features=768, out_features=768, bias=True)
        (attention_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attention_dropout): Dropout(p=0.1, inplace=False)
        (interm_dense): Linear(in_features=768, out_features=2304, bias=True)
        (out_dense): Linear(in_features=2304, out_features=768, bias=True)
        (out_layer_norm): LayerNorm((768,), eps=1e-05, elem

In [None]:
# Decoding strategies, in the order they are run for every poem:
# (result list to append to, generation method, keyword arguments).
decoding_runs = [
    (top_k_poems, model.generate_top_k, {'temperature': 1.2, 'k_size': 10}),  # top-k sampling
    (top_p_poems, model.generate_top_q, {'temperature': 1.2, 'top_p': 0.9}),  # top-p sampling
    (greedy_poems, model.generate_greedy_search, {}),                         # greedy search
    (beam_poems, model.generate_beam_search, {}),                             # beam search
]

# Iterate over the prompts that were actually built instead of range(POEM_NUMBER):
# the prompt-building cell skips poems with fewer than three segments, so the prompt
# lists can be shorter than POEM_NUMBER and indexing by range would raise IndexError.
# NOTE: re-running this cell appends to the result lists again; re-run the cell that
# creates them first to start from empty lists.
for i, prompt_group in enumerate(zip(prompts1, prompts2, prompts3)):
    # Encode the 1-, 2- and 3-segment prompts of this poem.
    prompt_ids = []
    for prompt in prompt_group:
        # (bs, sl) token-index stage, per the original comment.
        encoding = tokenizer(prompt, return_tensors='pt', padding=False, truncation=True).to(device)
        # Drop the special token ([SEP]) that the tokenizer appends at the end.
        prompt_ids.append(encoding['input_ids'][:, :-1])

    # Run every strategy on all three prompts; each call returns (token ids, generated output)
    # and only the generated output is kept.
    for results, generate, kwargs in decoding_runs:
        outputs = []
        for ids in prompt_ids:
            _token_ids, generated_output = generate(ids, **kwargs)
            outputs.append(generated_output)
        results.append(outputs)