In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import time
import random
import json
from matplotlib import pyplot as plt

In [2]:
# Hyper-parameters shared by the rest of the notebook.
num_epochs = 500
batch_size = 128
max_length = 128

# Pick the compute device once; everything below keys off it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed every random number generator the notebook touches so that shuffling
# and sampling are repeatable.
seed = 9
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if device.type == "cuda":
    torch.cuda.manual_seed_all(seed)

print("now using", device)

now using cuda


In [3]:
# Read the corpus and the token -> id vocabulary from disk.
with open("poems.json", "r", encoding="utf-8") as poems_file:
    poems = json.load(poems_file)

with open("vocab.json", "r", encoding="utf-8") as vocab_file:
    word_to_index = json.load(vocab_file)

# Reverse lookup (id -> token) used when decoding model output.
index_to_word = dict(zip(word_to_index.values(), word_to_index))
vocab_size = len(word_to_index)

print("VOCAB_SIZE:", vocab_size)
print("data_size", len(poems))

# Split each poem into a list of tokens and append the end-of-poem marker.
poems = [[*poem, "<EOP>"] for poem in poems]

# Pre-built one-element LongTensor per vocabulary entry, keyed by token.
index_tensors = {
    token: torch.LongTensor([token_id]) for token, token_id in word_to_index.items()
}

VOCAB_SIZE: 3482
data_size 1287


In [4]:
def generate_sample(sequence):
    """Build a next-token training pair from one token sequence.

    The input is every token except the last; the target is every token
    except the first, so ``target[i]`` is the token that follows ``input[i]``.

    Args:
        sequence: indexable sequence of tokens, each a key of the
            module-level ``index_tensors`` mapping.

    Returns:
        Tuple ``(encoded_inputs, encoded_outputs)`` of 1-D tensors of token
        ids, each of length ``len(sequence) - 1``.
    """
    source_ids = [index_tensors[token] for token in sequence[:-1]]
    target_ids = [index_tensors[token] for token in sequence[1:]]

    # Join the per-token one-element tensors into flat id tensors.
    return torch.cat(source_ids), torch.cat(target_ids)


class PoetryDataset(Dataset):
    """Dataset yielding one ``(input, target)`` pair per poem.

    Args:
        poems: sequence of token sequences.
        transform: optional callable applied to the input tensor only; the
            target tensor is returned untouched.
    """

    def __init__(self, poems, transform=None):
        self.poems = poems
        self.transform = transform

    def __len__(self):
        """Return the number of poems."""
        return len(self.poems)

    def __getitem__(self, index):
        """Return the encoded ``(input, target)`` pair for poem ``index``."""
        source, target = generate_sample(self.poems[index])
        if not self.transform:
            return source, target
        return self.transform(source), target


def custom_collate_fn(batch):
    """Collate variable-length ``(input, target)`` pairs into padded batches.

    Both inputs and targets are right-padded to the longest sequence in the
    batch using the id of the ``"<START>"`` token as the fill value.

    Args:
        batch: iterable of ``(input_tensor, target_tensor)`` pairs.

    Returns:
        Tuple ``(padded_sequences, padded_targets)``, each batch-first.
    """
    input_seqs, target_seqs = zip(*batch)
    pad_id = word_to_index["<START>"]

    # Pad inputs and targets the same way so they share one length per batch.
    padded_sequences, padded_targets = (
        nn.utils.rnn.pad_sequence(group, batch_first=True, padding_value=pad_id)
        for group in (input_seqs, target_seqs)
    )
    return padded_sequences, padded_targets


# Wrap the tokenised poems and serve them as shuffled, padded mini-batches.
dataset = PoetryDataset(poems)
data_loader = DataLoader(
    dataset=dataset,
    batch_size=batch_size,
    collate_fn=custom_collate_fn,
    shuffle=True,
)

In [5]:
class RNN(nn.Module):
    """Single-step recurrent cell: ``h_t = tanh(W [x_t ; h_{t-1}] + b)``.

    Args:
        input_dim: size of the per-step input vector.
        hidden_dim: size of the hidden state.
    """

    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        # The new hidden state depends on both the current input and the
        # previous hidden state, so one linear layer consumes their
        # concatenation.
        self.input_to_hidden = nn.Linear(input_dim + hidden_dim, hidden_dim)
        self.tanh = nn.Tanh()

    def forward(self, input, hidden):
        """Advance the cell by one time step.

        Args:
            input: tensor concatenated with ``hidden`` along dimension 1.
            hidden: previous hidden state.

        Returns:
            The next hidden state.
        """
        joined = torch.cat([input, hidden], dim=1)
        pre_activation = self.input_to_hidden(joined)
        return self.tanh(pre_activation)

In [6]:
class PoetryModel(nn.Module):
    """Token-level language model: embedding -> recurrent cell -> projection.

    Args:
        vocab_size: number of tokens; also the size of the output layer.
        embedding_dim: width of the token embeddings.
        hidden_dim: width of the recurrent hidden state.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = RNN(embedding_dim, hidden_dim)
        self.linear1 = nn.Linear(hidden_dim, vocab_size)
        self.softmax = nn.LogSoftmax(dim=-1)

    def forward(self, input, hidden):
        """Run the model over a batch of token-id sequences.

        Args:
            input: token ids indexed as ``(batch, step)``.
            hidden: initial hidden state, e.g. from :meth:`initHidden`.

        Returns:
            Tuple ``(output, hidden)``: ``output`` holds log-probabilities
            with shape ``(batch, steps, vocab)``; ``hidden`` is the hidden
            state after the final step.
        """
        embedded = self.embeddings(input)
        n_batch, n_steps, _ = embedded.size()

        # Unroll the cell over the time dimension, keeping every state.
        step_states = []
        for step_input in embedded.unbind(dim=1):
            hidden = self.rnn(step_input, hidden)
            step_states.append(hidden)
        states = torch.stack(step_states, dim=1)

        # Project all steps at once on a flattened (batch * steps) view.
        flat_states = states.reshape(-1, self.hidden_dim)
        log_probs = self.softmax(self.linear1(F.relu(flat_states)))
        return log_probs.view(n_batch, n_steps, -1), hidden

    def initHidden(self, device, batch_size=1):
        """Return an all-zero hidden state of shape ``(batch_size, hidden_dim)`` on ``device``."""
        return torch.zeros(batch_size, self.hidden_dim, device=device)

In [7]:
def train(model, num_epochs, data_loader, optimizer, criterion, scheduler, vocab_size):
    """Train ``model`` and return per-epoch loss and perplexity.

    Args:
        model: network exposing ``initHidden(device, batch_size)`` and
            ``forward(inputs, hidden) -> (output, hidden)``.
        num_epochs: number of passes over ``data_loader``.
        data_loader: yields ``(inputs, targets)`` batches of token ids.
        optimizer: optimizer over the model parameters.
        criterion: loss applied to ``(batch * steps, vocab)`` scores and
            ``(batch * steps,)`` targets.
        scheduler: stepped once per epoch with the epoch's average loss.
        vocab_size: size of the model's output dimension.

    Returns:
        Dict with keys ``"train_loss_per_epoch"`` and
        ``"train_perplexity_per_epoch"``; each maps to a list holding one
        Python float per epoch.

    Side effects:
        Moves ``model`` to the module-level ``device``, prints progress, and
        writes the final weights to ``model_state_dict.pth``.
    """
    log_dict = {
        "train_loss_per_epoch": [],
        "train_perplexity_per_epoch": [],
    }
    model.train()
    model.to(device)
    start_time = time.time()
    for epoch in range(num_epochs):
        current_lr = optimizer.param_groups[0]["lr"]
        print(
            f"Epoch: {epoch+1:03d}/{num_epochs:03d} | Current Learning Rate: {current_lr:.6f}"
        )
        total_loss = 0.0
        for batch_idx, (inputs, targets) in enumerate(data_loader):
            # Move the batch to the training device once, up front.
            inputs = inputs.to(device)
            targets = targets.to(device)

            model.zero_grad()
            hidden = model.initHidden(device=device, batch_size=inputs.size(0))
            output, hidden = model(inputs, hidden)

            # output is (batch, steps, vocab) and targets is (batch, steps);
            # flatten both so every time step contributes one prediction
            # paired with one target id.
            loss = criterion(output.view(-1, vocab_size), targets.view(-1))
            loss.backward()
            optimizer.step()

            batch_loss = loss.item()
            # Weight each batch by its size so a short final batch does not
            # count as much as a full one in the epoch average.
            total_loss += batch_loss * inputs.size(0)

            if batch_idx % 50 == 0:
                print(
                    f"Epoch: {epoch + 1:03d}/{num_epochs:03d} | Batch {batch_idx + 1:05d}/{len(data_loader):05d} | Loss: {batch_loss:.4f}"
                )

        avg_loss = total_loss / len(data_loader.dataset)
        scheduler.step(avg_loss)
        # Store a plain float (not a 0-dim tensor) so the log has a uniform
        # element type and can be serialised or plotted directly.
        perplexity = torch.exp(torch.tensor(avg_loss)).item()
        log_dict["train_loss_per_epoch"].append(avg_loss)
        log_dict["train_perplexity_per_epoch"].append(perplexity)

        print(f"Time elapsed: {(time.time() - start_time) / 60:.2f} min")

    torch.save(model.state_dict(), "model_state_dict.pth")
    print(f"Total Training Time: {(time.time() - start_time)/ 60:.2f} min")
    return log_dict

In [8]:
def plot_training_stats(log_dict):
    """Plot per-epoch training loss and perplexity side by side.

    Args:
        log_dict: mapping with the keys ``"train_loss_per_epoch"`` and
            ``"train_perplexity_per_epoch"``, each a per-epoch sequence.

    Side effects:
        Saves the figure to ``training_stats.svg`` and shows it.
    """
    # (log key, panel title / line label, y-axis label) for each panel.
    panels = (
        ("train_loss_per_epoch", "Training Loss", "Loss"),
        ("train_perplexity_per_epoch", "Training Perplexity", "Perplexity"),
    )

    plt.figure(figsize=(10, 6))
    for position, (key, title, y_label) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(log_dict[key], label=title)
        plt.xlabel("Epoch")
        plt.ylabel(y_label)
        plt.title(title)
        plt.grid(True)

    plt.savefig("training_stats.svg")
    plt.show()

model = PoetryModel(vocab_size=len(word_to_index), embedding_dim=256, hidden_dim=512)

optimizer = optim.RMSprop(model.parameters(), lr=0.001, weight_decay=0.0001)
# "<START>" is also the padding value used when batching, so padded target
# positions are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=word_to_index["<START>"], reduction="mean")
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode="min", factor=0.5, patience=9, verbose=True
)
# Training is disabled in favour of loading the saved checkpoint below;
# uncomment to retrain from scratch (this rewrites model_state_dict.pth).
# log_dict = train(
#     model, num_epochs, data_loader, optimizer, criterion, scheduler, vocab_size
# )
# plot_training_stats(log_dict)

# map_location remaps the stored tensors onto the active device, so a
# checkpoint saved on a GPU still loads on a CPU-only machine.
model.load_state_dict(torch.load("model_state_dict.pth", map_location=device))
model.to(device)

PoetryModel(
  (embeddings): Embedding(3482, 256)
  (rnn): RNN(
    (input_to_hidden): Linear(in_features=768, out_features=512, bias=True)
    (tanh): Tanh()
  )
  (linear1): Linear(in_features=512, out_features=3482, bias=True)
  (softmax): LogSoftmax(dim=-1)
)

In [11]:
def generate_text(start_word="<START>", top_k=1, log=False):
    """Generate text with the module-level ``model``, continuing ``start_word``.

    The prompt is fed to the model in one pass; after that the model is fed
    one sampled token at a time, carrying the hidden state forward.  At each
    step the next token is drawn from the ``top_k`` most likely candidates,
    with their probabilities renormalised to sum to one.  Generation stops
    when ``"<EOP>"`` is drawn or the text reaches ``max_length`` characters.

    Args:
        start_word: prompt.  Normally a string whose characters are each a
            vocabulary token.  A multi-character string that is itself a
            vocabulary entry (such as the default ``"<START>"``) is fed as a
            single token and is left out of the returned text.
        top_k: number of candidates to sample from; values above the
            vocabulary size are clamped to it.
        log: when true, print the candidates with their renormalised
            probabilities and the text so far at every step.

    Returns:
        The generated text with surrounding whitespace stripped.

    Raises:
        ValueError: if ``start_word`` is empty or ``top_k`` is below 1.
        KeyError: if the prompt contains a token missing from the vocabulary.
    """
    if not start_word:
        raise ValueError("start_word must contain at least one token")
    if top_k < 1:
        raise ValueError(f"top_k must be >= 1, got {top_k}")

    # Iterating a special token such as "<START>" character by character
    # would look up '<', 'S', ... individually; feed it as one token instead.
    if len(start_word) > 1 and start_word in index_tensors:
        pending_tokens = [start_word]
        generated_text = ""
    else:
        pending_tokens = list(start_word)
        generated_text = start_word

    unknown_tokens = [token for token in pending_tokens if token not in index_tensors]
    if unknown_tokens:
        raise KeyError(f"tokens not in vocabulary: {unknown_tokens}")

    hidden_state = model.initHidden(device=device)
    with torch.no_grad():
        for _ in range(max_length - len(generated_text)):
            # Each index_tensors entry is a one-element LongTensor; join them
            # and add a leading batch dimension of size 1.
            input_tensor = (
                torch.cat([index_tensors[token] for token in pending_tokens])
                .unsqueeze(0)
                .to(device)
            )

            output, hidden_state = model(input_tensor, hidden_state)
            # Only the prediction after the last fed token matters.
            last_log_probs = output[:, -1, :].view(-1)
            candidate_count = min(top_k, last_log_probs.numel())
            top_values, top_indices = last_log_probs.topk(candidate_count)

            # The model emits log-probabilities; exponentiate and renormalise
            # over the kept candidates.
            probabilities_np = torch.exp(top_values).cpu().numpy()
            probabilities_np = probabilities_np / probabilities_np.sum()
            indices_np = top_indices.cpu().numpy()
            if log:
                for index, prob in zip(indices_np, probabilities_np):
                    print(f"{index_to_word[int(index)]}: {prob:.4f}")

            selected_index = int(np.random.choice(indices_np, p=probabilities_np))

            next_word = index_to_word[selected_index]
            if next_word == "<EOP>":
                break
            if log:
                print(generated_text)
            # The hidden state already summarises the history, so only the
            # newly sampled token is fed on the next step.
            pending_tokens = [next_word]
            generated_text += next_word

    return generated_text.strip()


# Sample poems from several prompts; a larger top_k gives more varied output.
for prompt, k in [("长安一片月", 1), ("江", 3), ("月", 3), ("泉", 3), ("日", 30)]:
    print(generate_text(prompt, top_k=k))

# Same again with step-by-step logging of the candidate tokens.
print(generate_text("风", top_k=3, log=True))

长安一片月，万户捣衣声。秋风吹不尽，总是玉关情。何日平胡虏，良人罢远征。
江海多豪气，朝廷有直声。何言马蹄下，一旦是佳城。
月皎昭阳殿，霜清长信宫。天行乘玉辇，飞燕与君同。
泉眼不清光殿人来，绮中有酒声。一餐不平意，无情终与期。清景既为山，天地同沙石。虎当风日好，云外遶碧峰。至今还山上，黄叶落岩霜。路向高僧望，云绕万重风。
日暮景太粉，花房春水流。天涯一挥桃，发我欲南襟。十二吟轻命，楚臣亦满阴。君王不可见，志宠有松列。吾将相交去，回首醉酒船。每不能老手，百下泪赏东。出入无花好，宛人来尚游。二年非玉手，白日夜月色。江风调醉日，流粉同归心。昔时亦无心，缅然皆乡天。一壶余阴起，万古
烟: 0.6155
吹: 0.2331
露: 0.1514
风
纪: 0.8845
里: 0.0661
起: 0.0494
风烟
南: 0.9528
江: 0.0261
海: 0.0212
风烟纪
城: 0.9897
山: 0.0065
都: 0.0038
风烟纪南
，: 0.9954
。: 0.0034
头: 0.0012
风烟纪南城
尘: 0.9759
水: 0.0173
旌: 0.0069
风烟纪南城，
土: 0.9593
水: 0.0221
户: 0.0186
风烟纪南城，尘
荆: 0.9791
青: 0.0123
今: 0.0086
风烟纪南城，尘土
门: 0.9935
青: 0.0041
城: 0.0024
风烟纪南城，尘土荆
路: 0.9905
城: 0.0054
东: 0.0041
风烟纪南城，尘土荆门
。: 1.0000
，: 0.0000
劒: 0.0000
风烟纪南城，尘土荆门路
天: 0.9942
相: 0.0031
江: 0.0027
风烟纪南城，尘土荆门路。
寒: 0.9707
山: 0.0196
河: 0.0097
风烟纪南城，尘土荆门路。天
多: 0.5725
猎: 0.3919
不: 0.0356
风烟纪南城，尘土荆门路。天寒
猎: 0.8564
兽: 0.0836
谷: 0.0600
风烟纪南城，尘土荆门路。天寒多
骑: 0.9712
字: 0.0154
会: 0.0134
风烟纪南城，尘土荆门路。天寒多猎
，: 1.0000
全: 0.0000
门: 0.0000
风烟纪南城，尘土荆门路。天寒多猎骑
走: 0.9347
海: 0.0377
桥: 0.027