In [6]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import json
import re
from pathlib import Path
import opencc
from torch.utils.data import Dataset, DataLoader

In [7]:
converter = opencc.OpenCC("t2s")

def sentenceParse(para):
    para = re.sub(r"（.*?）", "", para)
    para = re.sub(r"{.*?}", "", para)
    para = re.sub(r"《.*?》", "", para)
    para = re.sub(r"[\[\]]", "", para)
    para = "".join([s for s in para if s not in "0123456789-"])
    para = re.sub(r"。。", "。", para)
    para = converter.convert(para)
    if "𫗋" in para:
        return ""
    return para


def parseRawData(author=None, constrain=None):
    def handleJson(file_path):
        with open(file_path, "r", encoding="utf-8") as file:
            data = json.load(file)

        rst = []
        for poetry in data:
            if author and poetry.get("author") != author:
                continue

            paragraphs = poetry.get("paragraphs")
            if any(
                len(tr) != constrain and len(tr) != 0
                for s in paragraphs
                for tr in re.split("[，！。]", s)
                if constrain is not None
            ):
                continue

            pdata = "".join(paragraphs)
            pdata = sentenceParse(pdata)
            if pdata:
                rst.append(pdata)
        return rst

    data = []
    src_path = Path("./data/chinese-poetry-master/全唐诗/")
    for file_path in src_path.glob("poet.tang*"):
        data.extend(handleJson(file_path))
    # for file_path in src_path.glob("poet.song*"):
        # data.extend(handleJson(file_path))
    return data

In [8]:
data = parseRawData()  # All if author=None
# Uncomment the next line for specific author data and shuffle
# data = dataHandler.parseRawData(author="李白", constrain=5); random.shuffle(data)
# for s in data:
# print(s)
word_to_idx = {}

for sent in data:
    for word in sent:
        if word not in word_to_idx:
            word_to_idx[word] = len(word_to_idx)
word_to_idx["<EOP>"] = len(word_to_idx)
word_to_idx["<START>"] = len(word_to_idx)

VOCAB_SIZE = len(word_to_idx)

idx_to_word = {v: k for k, v in word_to_idx.items()}

print("VOCAB_SIZE:", VOCAB_SIZE)
print("data_size", len(data))
print("data[0]:", data[0])

VOCAB_SIZE: 8936
data_size 57595
data[0]: 秦川雄帝宅，函谷壮皇居。绮殿千寻起，离宫百雉余。连甍遥接汉，飞观迥凌虚。云日隐层阙，风烟出绮疎。


In [14]:
print(data[:1])

['秦川雄帝宅，函谷壮皇居。绮殿千寻起，离宫百雉余。连甍遥接汉，飞观迥凌虚。云日隐层阙，风烟出绮疎。']


In [15]:
file_path = "./data/train.txt"
with open(file_path, "r", encoding="utf-8") as f:
    text = f.read()

paragraphs = [para.strip() for para in text.split("<|endoftext|>") if para.strip()]

print(paragraphs[:1])

['菩萨蛮 其一\n小山重叠金明灭，鬓云欲度香腮雪。\n懒起画蛾眉，弄妆梳洗迟。\n照花前后镜，花面交相映。\n新帖绣罗襦，双双金鹧鸪。']


In [None]:
# char_to_id = {}
# id_to_char = {}


# # 遍历数据，更新字符映射
# chars = sorted(set(text))
# char_to_id = {ch: i + 2 for i, ch in enumerate(chars)}
# id_to_char = {i + 2: ch for i, ch in enumerate(chars)}

# char_to_id["<pad>"] = 0
# char_to_id["<eos>"] = 1
# id_to_char[0] = "<pad>"
# id_to_char[1] = "<eos>"

# vocab_size = len(char_to_id)
# print("字典大小: {}".format(vocab_size))

In [10]:
# # df["char_id_list"] = df["Comment"].apply(
# # lambda text: [char_to_id[char] for char in list(text)] + [char_to_id["<eos>"]]
# # )
# # df.head()

char_id_lists = []
for item in data:
    char_ids = [word_to_idx[char] for char in item] + [word_to_idx["<EOP>"]]
    char_id_lists.append(char_ids)

print(char_id_lists[:5])

[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 5, 17, 18, 19, 20, 21, 11, 22, 23, 24, 25, 26, 5, 27, 28, 29, 30, 31, 11, 32, 33, 34, 35, 36, 5, 37, 38, 39, 12, 40, 11, 8934], [41, 42, 43, 44, 45, 5, 46, 47, 48, 49, 50, 11, 51, 52, 53, 54, 55, 5, 56, 57, 58, 59, 60, 11, 61, 62, 63, 64, 65, 5, 66, 67, 68, 69, 70, 11, 71, 72, 73, 74, 75, 5, 76, 77, 28, 78, 79, 11, 8934], [80, 81, 39, 82, 83, 5, 84, 85, 86, 87, 88, 11, 89, 90, 91, 92, 93, 5, 94, 95, 96, 97, 98, 11, 99, 100, 101, 31, 102, 5, 103, 104, 105, 106, 107, 11, 108, 109, 110, 111, 112, 5, 113, 114, 73, 115, 116, 11, 8934], [117, 118, 119, 120, 121, 5, 122, 123, 124, 125, 126, 11, 106, 127, 128, 129, 102, 5, 130, 131, 132, 133, 134, 11, 135, 59, 136, 137, 138, 5, 139, 140, 141, 142, 143, 11, 144, 114, 145, 146, 147, 5, 148, 149, 150, 151, 152, 11, 8934], [125, 153, 154, 155, 156, 5, 157, 158, 159, 111, 160, 11, 161, 162, 163, 26, 164, 5, 165, 166, 25, 32, 167, 11, 38, 168, 169, 34, 170, 5, 171, 172, 173, 174, 175, 11, 1

In [11]:
batch_size = 8
epochs = 5
embed_dim = 50
hidden_dim = 30
lr = 0.001
grad_clip = 1
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("now using device: ", device)

now using device:  cuda


In [12]:
class Dataset(Dataset):
    def __init__(self, sequences):
        self.sequences = sequences

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, index):
        # x = self.sequences.iloc[index][:-1]
        # y = self.sequences.iloc[index][1:]
        x = self.sequences[index][:-1]
        y = self.sequences[index][1:]
        return x, y


def collate_fn(batch):
    batch_x = [torch.tensor(data[0]) for data in batch]
    batch_y = [torch.tensor(data[1]) for data in batch]
    batch_x_lens = torch.LongTensor([len(x) for x in batch_x])
    batch_y_lens = torch.LongTensor([len(y) for y in batch_y])

    pad_batch_x = nn.utils.rnn.pad_sequence(
        batch_x, batch_first=True, padding_value=word_to_idx["<START>"]
    )

    pad_batch_y = nn.utils.rnn.pad_sequence(
        batch_y, batch_first=True, padding_value=word_to_idx["<START>"]
    )

    return pad_batch_x, pad_batch_y, batch_x_lens, batch_y_lens

In [13]:
# dataset = Dataset(df["char_id_list"])
dataset = Dataset(char_id_lists)

In [14]:
data_loader = DataLoader(
    dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn
)
for batch in data_loader:
    print(type(batch))
    print()
    # print("batch:", batch)

<class 'tuple'>

<class 'tuple'>

<class 'tuple'>

<class 'tuple'>

<class 'tuple'>

<class 'tuple'>

<class 'tuple'>

<class 'tuple'>

<class 'tuple'>

<class 'tuple'>

<class 'tuple'>

<class 'tuple'>

<class 'tuple'>

<class 'tuple'>

<class 'tuple'>

<class 'tuple'>

<class 'tuple'>

<class 'tuple'>

<class 'tuple'>

<class 'tuple'>

<class 'tuple'>

<class 'tuple'>

<class 'tuple'>

<class 'tuple'>

<class 'tuple'>

<class 'tuple'>

<class 'tuple'>

<class 'tuple'>

<class 'tuple'>

<class 'tuple'>

<class 'tuple'>

<class 'tuple'>

<class 'tuple'>

<class 'tuple'>

<class 'tuple'>

<class 'tuple'>

<class 'tuple'>

<class 'tuple'>

<class 'tuple'>

<class 'tuple'>

<class 'tuple'>

<class 'tuple'>

<class 'tuple'>

<class 'tuple'>

<class 'tuple'>

<class 'tuple'>

<class 'tuple'>

<class 'tuple'>

<class 'tuple'>

<class 'tuple'>

<class 'tuple'>

<class 'tuple'>

<class 'tuple'>

<class 'tuple'>

<class 'tuple'>

<class 'tuple'>

<class 'tuple'>

<class 'tuple'>

<class 'tuple'

In [None]:
class RNN:
    def __init__(self, input_size, hidden_size, output_size):
        # initialize the weights
        self.W_xh = np.random.randn(hidden_size, input_size)
        self.W_hh = np.random.randn(hidden_size, hidden_size)
        self.W_hy = np.random.randn(output_size, hidden_size)
        # initialize the hidden state
        self.h = np.zeros((hidden_size, 1))

    def step(self, x):
        # update the hidden state
        self.h = np.tanh(np.dot(self.W_hh, self.h) + np.dot(self.W_xh, x))
        # compute the output vector
        y = np.dot(self.W_hy, self.h)
        return y

In [15]:
class CharRNN(torch.nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim):
        super(CharRNN, self).__init__()

        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embed_dim,
            padding_idx=word_to_idx["<START>"],
        )

        self.rnn_layer1 = nn.LSTM(
            input_size=embed_dim, hidden_size=hidden_dim, batch_first=True
        )

        self.rnn_layer2 = nn.LSTM(
            input_size=hidden_dim, hidden_size=hidden_dim, batch_first=True
        )

        self.linear = nn.Sequential(
            nn.Linear(in_features=hidden_dim, out_features=hidden_dim),
            nn.ReLU(),
            nn.Linear(in_features=hidden_dim, out_features=vocab_size),
        )

    def forward(self, batch_x, batch_x_lens):
        return self.encoder(batch_x, batch_x_lens)

    def encoder(self, batch_x, batch_x_lens):
        batch_x = self.embedding(batch_x)

        batch_x_lens = batch_x_lens.cpu()
        batch_x = nn.utils.rnn.pack_padded_sequence(
            batch_x, batch_x_lens, batch_first=True, enforce_sorted=False
        )

        batch_x, _ = self.rnn_layer1(batch_x)
        batch_x, _ = self.rnn_layer2(batch_x)

        batch_x, _ = nn.utils.rnn.pad_packed_sequence(batch_x, batch_first=True)

        batch_x = self.linear(batch_x)

        return batch_x

    def generator(self, start_str, max_len=50, top_n=5):
        char_list = [word_to_idx[char] for char in list(start_str)]
        next_char = None

        while len(char_list) < max_len:
            x = torch.LongTensor(char_list).unsqueeze(0)
            x = self.embedding(x)
            _, (ht, _) = self.rnn_layer1(x)
            _, (ht, _) = self.rnn_layer2(ht)
            y = self.linear(ht.squeeze(0))

            # 获取前 top_n 大的字符的索引
            top_n_values, top_n_indices = torch.topk(y, top_n)
            top_n_indices = top_n_indices.cpu().numpy()

            if top_n > 1:
                next_char = np.random.choice(top_n_indices[0])
            else:
                next_char = top_n_indices[0][0]

            if next_char == word_to_idx["<EOP>"]:
                break

            char_list.append(next_char)

        return [idx_to_word[ch_id] for ch_id in char_list]

In [16]:
torch.manual_seed(2)
model = CharRNN(VOCAB_SIZE, embed_dim, hidden_dim)
criterion = torch.nn.CrossEntropyLoss(
    ignore_index=word_to_idx["<START>"], reduction="mean"
)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [17]:
def train(model, num_epochs, data_loader, optimizer, criterion, vocab_size, grad_clip=1.0):
    ###################
    # 训练 #
    ###################
    min_loss = np.Inf
    model.train()
    for epoch in range(1, epochs + 1):
        model = model.to(device)
        for batch_idx, (batch_x, batch_y, batch_x_lens, batch_y_lens) in enumerate(data_loader):
            optimizer.zero_grad()

            # 将数据移动到GPU
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)
            # batch_x_lens = batch_x_lens.to(device)
            # batch_y_lens = batch_y_lens.to(device)

            batch_pred_y = model(batch_x, batch_x_lens)

            batch_pred_y = batch_pred_y.view(-1, vocab_size)
            batch_y = batch_y.view(-1)

            loss = criterion(batch_pred_y, batch_y)
            loss.backward()
            torch.nn.utils.clip_grad_value_(model.parameters(), grad_clip)
            optimizer.step()

            if not batch_idx % 100:
                print(
                    f"Epoch: {epoch:03d}/{num_epochs:03d} | Batch {batch_idx:05d}/{len(data_loader):05d} | Loss: {loss:.4f}"
                )

        torch.save(model.state_dict(), "char_rnn_model.pth")
        # 每个epoch结束后进行生成测试
        with torch.no_grad():
            model.eval()
            model.cpu()
            generated_text = model.generator("月")
            print("".join(generated_text))
            model.train()

        torch.cuda.empty_cache()


train(model, epochs, data_loader, optimizer, criterion, VOCAB_SIZE)
# model.load_state_dict(torch.load("char_rnn_model.pth"))

Epoch: 001/005 | Batch 0000/7200 | Loss: 9.0922
Epoch: 001/005 | Batch 0100/7200 | Loss: 6.7448
Epoch: 001/005 | Batch 0200/7200 | Loss: 6.7190
Epoch: 001/005 | Batch 0300/7200 | Loss: 6.3978
Epoch: 001/005 | Batch 0400/7200 | Loss: 6.4915


OutOfMemoryError: CUDA out of memory. Tried to allocate 814.00 MiB (GPU 0; 3.80 GiB total capacity; 1.62 GiB already allocated; 107.75 MiB free; 1.67 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [22]:
with torch.no_grad():
    for i in range(10):
        print("".join(model.generator("月", top_n=1)))
        print()
    for i in range(10):
        print("".join(model.generator("我")))
        print()
    for i in range(10):
        print("".join(model.generator("日")))
        print()

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)

In [None]:
torch.save(model.state_dict(), "char_rnn_model.pth")