In [1]:

import torch
import torch.nn as nn
import numpy as np
from torch.nn.utils import clip_grad_norm_
from torch.utils.data import DataLoader
import os

# 语料


## 创建语料处理对象


In [81]:
# 从语料中创建词典
class Dictionary:
    def __init__(self):
        self.word2idx = {}
        self.idx2word = {}
        self.idx = 0

    def add_word(self, word):
        if not word in self.word2idx:
            self.word2idx[word] = self.idx
            self.idx2word[self.idx] = word
            self.idx += 1

    def __len__(self):
        return len(self.word2idx)


# 语料处理
class Corpus:
    def __init__(self):
        self.dictionary = Dictionary()

    def get_data(self, path, batch_size=20):
        # 从语料创建词典
        with open(path, 'r') as f:
            num_tokens = 0
            for line in f:
                words = line.split() + ['<eos>']
                num_tokens += len(words)
                for word in words:
                    self.dictionary.add_word(word)

        # 将语料向量化
        vector = torch.LongTensor(num_tokens)
        ind = 0
        with open(path, 'r') as f:
            for line in f:
                words = line.split() + ['<eos>']
                for word in words:
                    vector[ind] = self.dictionary.word2idx[word]
                    ind += 1
        num_batches = vector.size(0) // batch_size
        vector = vector[:num_batches * batch_size]
        return vector.view(batch_size, -1)

## 创建输入数据

In [82]:
corpus = Corpus()
datasets = corpus.get_data('../datasets/train.txt', batch_size=20)


# 词汇表的大小
vocab_size = len(corpus.dictionary)

# 数据批次
num_batches = len(data_loader)

In [83]:
len(datasets)

20

In [85]:
sample = datasets[1, :10]
sample
for ind in sample.numpy():
    print(f"{ind} --> {corpus.dictionary.idx2word[ind]}")

93 --> that
718 --> would
590 --> so
1569 --> load
35 --> a
4979 --> carrier
95 --> up
87 --> with
507 --> debt
93 --> that


# 创建模型

In [2]:
# 参数
embed_size = 128
hidden_size = 1024
num_layers = 1
num_epochs = 5
num_samples = 1000  # number of words to be sampled
batch_size = 20
seq_length = 30
learning_rate = 0.002

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [4]:
# 定义模型
class RNNLM(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
        super(RNNLM, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size,
                            hidden_size,
                            num_layers,
                            batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, h):
        x = self.embed(x)
        out, (h, c) = self.lstm(x, h)
        batch, seq, feature = out.shape
        out = out.reshape(batch * seq, feature)
        out = self.linear(out)
        return out, (h, c)
    


In [6]:
vocab_size = 1000
model = RNNLM(vocab_size, embed_size, hidden_size, num_layers).to(device)
model

RNNLM(
  (embed): Embedding(1000, 128)
  (lstm): LSTM(128, 1024, batch_first=True)
  (linear): Linear(in_features=1024, out_features=1000, bias=True)
)

In [94]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [95]:
def detach(states):
    return [state.detach() for state in states]

In [96]:
for epoch in range(num_epochs):
    states = [
        torch.zeros(num_layers, batch_size, hidden_size).to(device),
        torch.zeros(num_layers, batch_size, hidden_size).to(device)
    ]
    for i in range(0, datasets.size(1) - seq_length, seq_length):
        inputs = datasets[:, i:i + seq_length].to(device)
        targets = datasets[:, (i + 1):(i + 1 + seq_length)].to(device)

        states = detach(states)
        outputs, states = model(inputs, states)
        loss = criterion(outputs, targets.reshape(-1))

        optimizer.zero_grad()
        loss.backward()
        clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        step = (i + 1) // seq_length
        if step % 100 == 0:
            print(
                'Epoch [{}/{}], Step[{}/{}], Loss: {:.4f}, Perplexity: {:5.2f}'
                .format(epoch + 1, num_epochs, step, num_batches, loss.item(),
                        np.exp(loss.item())))

Epoch [1/5], Step[0/46479], Loss: 9.2147, Perplexity: 10043.44
Epoch [1/5], Step[100/46479], Loss: 5.9966, Perplexity: 402.08
Epoch [1/5], Step[200/46479], Loss: 5.8979, Perplexity: 364.28
Epoch [1/5], Step[300/46479], Loss: 5.7438, Perplexity: 312.26
Epoch [1/5], Step[400/46479], Loss: 5.6847, Perplexity: 294.33
Epoch [1/5], Step[500/46479], Loss: 5.1114, Perplexity: 165.90
Epoch [1/5], Step[600/46479], Loss: 5.1789, Perplexity: 177.50
Epoch [1/5], Step[700/46479], Loss: 5.3557, Perplexity: 211.82
Epoch [1/5], Step[800/46479], Loss: 5.1976, Perplexity: 180.83
Epoch [1/5], Step[900/46479], Loss: 5.0826, Perplexity: 161.19
Epoch [1/5], Step[1000/46479], Loss: 5.0941, Perplexity: 163.06
Epoch [1/5], Step[1100/46479], Loss: 5.2947, Perplexity: 199.27
Epoch [1/5], Step[1200/46479], Loss: 5.1567, Perplexity: 173.60
Epoch [1/5], Step[1300/46479], Loss: 5.0658, Perplexity: 158.50
Epoch [1/5], Step[1400/46479], Loss: 4.8253, Perplexity: 124.62
Epoch [1/5], Step[1500/46479], Loss: 5.1224, Perpl

In [97]:
# Test the model
with torch.no_grad():
# with open('sample.txt', 'w') as f:
    # Set intial hidden ane cell states
    state = (torch.zeros(num_layers, 1, hidden_size).to(device),
             torch.zeros(num_layers, 1, hidden_size).to(device))

    # Select one word id randomly
    prob = torch.ones(vocab_size)
    input = torch.multinomial(prob, num_samples=1).unsqueeze(1).to(device)

    for i in range(num_samples):
        # Forward propagate RNN 
        output, state = model(input, state)

        # Sample a word id
        prob = output.exp()
        word_id = torch.multinomial(prob, num_samples=1).item()

        # Fill input with sampled word id for the next time step
        input.fill_(word_id)

        # File write
        word = corpus.dictionary.idx2word[word_id]
        word = '\n' if word == '<eos>' else word + ' '
        print(word)
#         f.write(word)

#         if (i+1) % 100 == 0:
#             print('Sampled [{}/{}] words and save to {}'.format(i+1, num_samples, 'sample.txt'))

produce 
gains 
enough 
for 
one-third 
the 
next 
buy-out 
purchased 


a 
spokesman 
for 
the 
<unk> 
the 
head 
of 
a 
new 
york 
financier 
ronald 
<unk> 
tv 
<unk> 
diseases 
a 
potential 
block 
corp. 
<unk> 
conn 
to 
become 
hurt 
<unk> 
knight 
co. 
which 
stood 
at 
ford 
motor 
united 
press 
over 
the 
<unk> 
islands 


picks 
how 
can 
be 
used 
in 
a 
company 
controlled 
by 
osha 
for 
the 
international 
and 
<unk> 
projects 


osha 
led 
to 
a 
bid 
by 
$ 
N 
million 
an 
equity 
loss 
incurred 
to 
about 
$ 
N 
million 
from 
<unk> 
assets 
a 
year 
earlier 


the 
company 
is 
once 
we 
see 
it 
works 
would 
be 
owned 
by 
workers 
and 
will 
start 
to 
workers 
who 
do 
n't 
belong 
on 
the 
sessions 
he 
said 


anheuser 
said 
the 
new 
structure 
would 
continue 
home 
the 
fed 
holds 
down 
at 
least 
for 
them 
<unk> 
if 
other 
tests 
had 
had 
to 
egg 
the 
changes 


goodson 
also 
said 
that 
soviet 
<unk> 
could 
n't 
press 
a 
bad 
trade 


we 
believe 


# 数据预处理

In [1]:
import os
import json
import re
import random
import numpy as np

In [5]:
# 查看数据文件
data_path = "../../H/datasets/chinese_potery/"

# 随机选择宋词文件
file_name = ''
while not file_name.startswith('poet.tang'):
    file_name = random.choice(os.listdir(data_path))
    print(file_name)
sample_path = data_path + file_name

sample = json.loads(open(sample_path, 'r').read())
print(type(sample), len(sample))  # 读取的文件形式为字典组成的列表
print(random.choice(sample))  # 键 paragraphs 对应的值为目标数据

poet.song.173000.json
poet.tang.12000.json
<class 'list'> 1000
{'author': '錢起', 'paragraphs': ['煙渚復煙渚，畫屏休畫屏。', '引愁天末去，數點暮山青。'], 'title': '江行無題一百首 三十六', 'id': '955b7180-969b-4698-a5ef-d2a5c96451d2'}


## 文本预处理

In [None]:
class Corpus:
    def __init__(self, config):
        self.config = config
        self.data_path = config.data_path
        self.category = config.category
        self.author = config.author
        self.constarin = config.constrain
        
        self.raw_data = self.parse_raw_data()
        self.word2idx = {}
        self.idx2word = {}

        
    ############ 获取文件中文本，并预处理 #########################################
    def parse_raw_data(self):
        data = []
        for file in os.listdir(self.data_path):
            if file.startswith(self.category):
                data.extend(self.parse_file(file, self.author, self.constrain))
        return data

    def parse_file(self, file, author, constrain):
        res = []
        data = json.loads(open(file).read())
        for poetry in data:
            content = " "
            if author is not None and poetry['author'] != author:
                continue

            para = poetry['paragraphs']  # 将诗歌行组成的列表组成完整的长串
            for line in para:
                content += line

            content = self.parse_poetry(content)
            if content != " " and len(content) > 1:
                res.append(content)
        return res

    def parse_poetry(self, content):
        # 去掉诗歌内容括号中的部分，即注释
        # para = "-181-村橋路不端，數里就迴湍。積壤連涇脉，高林上笋竿。早嘗甘蔗淡，生摘琵琶酸。（「琵琶」，嚴壽澄校《張祜詩集》云：疑「枇杷」之誤。）好是去塵俗，煙花長一欄。"
        result, number = re.subn("（.*）", "", para)
        result, number = re.subn("{.*}", "", result)
        result, number = re.subn("《.*》", "", result)
        result, number = re.subn("《.*》", "", result)
        result, number = re.subn("[\]\[]", "", result)

        # 去掉数字
        r = ""
        for s in result:
            if s not in [
                    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-'
            ]:
                r += s
        # 处理两个句号为1个句号
        r, number = re.subn("。。", "。", r)

        # 返回预处理好的文本
        return r
    
    ############ 文本向量化并处理成等长 #########################################
    def vectorize(self):
        data = self.raw_data
        # 创建字典
        chars = {c for line in data for c in line}
        self.word2idx = {char: ix for ix, char in enumerate(chars)}
        self.word2idx['<EOP>'] = len(self.word2idx)
        self.word2idx['<START>'] = len(self.word2idx)
        self.word2idx['</s>'] = len(self.word2idx)
        self.idx2word = {idx: char for char, idx in self.word2idx.items()}

        # 添加起始标志
        for i in range(0, len(self.raw_data)):
            data[i] = ['<START>'] + list(data[i]) + ['<EOP>']

        # 文本数值化，并等长处理
        data_id = [[self.word2idx[w] for w in line] for line in data]
        pad_data = pad_sequences(data_id,
                                 maxlen=config.poetry_maxlen,
                                 padding='pre',
                                 truncating='post',
                                 value=len(char_to_ix) - 1)
        np.savez_compressed(config.processed_data_path,
                            data=pad_data,
                            word2ix=char2ix,
                            ix2word=ix_to_chars)
        return pad_data


In [56]:
def parse_raw_data(data_path, category, author, constrain):
    def sentence_parse(para):
        # 去掉括号中的部分
        # para = "-181-村橋路不端，數里就迴湍。積壤連涇脉，高林上笋竿。早嘗甘蔗淡，生摘琵琶酸。（「琵琶」，嚴壽澄校《張祜詩集》云：疑「枇杷」之誤。）好是去塵俗，煙花長一欄。"
        result, number = re.subn("（.*）", "", para)
        result, number = re.subn("{.*}", "", result)
        result, number = re.subn("《.*》", "", result)
        result, number = re.subn("《.*》", "", result)
        result, number = re.subn("[\]\[]", "", result)

        # 去掉数字
        r = ""
        for s in result:
            if s not in [
                    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-'
            ]:
                r += s
        # 处理两个句号为1个句号
        r, number = re.subn("。。", "。", r)

        # 返回预处理好的文本
        return r

    def handle_json(file):
        rst = []
        data = json.loads(open(file).read())
        for poetry in data:
            pdata = " "

            # 获取指定作者
            if author is not None and poetry.get("author") != author:
                continue

            p = poetry.get("paragraphs")
            flag = False
            for s in p:
                sp = re.split("[，！。]", s)
                for tr in sp:
                    if constrain is not None and len(tr) != constrain and len(
                            tr) != 0:
                        flag = True
                        break
                    if flag:
                        break
            if flag:
                continue
            for sentence in poetry.get("paragraphs"):
                pdata += sentence
            pdata = sentence_parse(pdata)
            if pdata != "" and len(pdata) > 1:
                rst.append(pdata)
        return rst

    data = []
    for filename in os.listdir(data_path):
        if filename.startswith(category):
            data.extend(handle_json(data_path + filename))
    return data

In [57]:
data_path = "../../H/datasets/chinese_potery/"
category = "poet.tang"
author = None
constrain = None

data = parse_raw_data(data_path, category, author, constrain)
data

[' 歌敲玉唾壺，醉擊珊瑚枝。石羊妙善街，甘露平泉碑。捫苔想豪傑，剔蘚看文詞。歸來北固山，水檻光參差。',
 ' 去年今夜江南別，鴛鴦翅冷飛蓬爇。今年今夜江北邊，鯉魚腸斷音書絕。男兒心事無了時，出門上馬不自知。',
 ' 勝景天然別，精神入畫圖。一山分四頂，三面瞰平湖。過夏僧無熱，凌冬草不枯。遊人來至此，願剃髪和鬚。',
 ' 臨塘古廟一神仙，繡幌花容色儼然。爲逐朝雲來此地，因隨暮雨不歸天。眉分初月湖中鑑，香散餘風竹上煙。借問邑人沈水事，已經秦漢幾千年。',
 ' 朱邸平臺隔禁闈，貴遊陳跡尚依稀。雲低雍畤祈年去，雨細長楊從獵歸。申白賓朋傳道義，應劉文彩寄音徽。承平舊物惟君盡，猶寫雕鞍伴六飛。',
 ' 破暗長明世代深，煙和香氣兩沈沈。不知初點何人在，秪見當年火至今。曉似紅蓮開沼面，夜如寒月鎮潭心。孤光自有龍神護，雀戲蛾飛不敢侵。',
 ' 艱難別離久，中外往還深。已改當時髪，空餘舊日心。',
 ' 累年無的信，每夜望邊城。袖掩千行淚，書封一尺金。',
 ' 自到西川住，惟君別有情。常逢對門遠，又隔一重城。',
 ' 巧畫蛾眉獨出羣，當時人道便承恩。經年不見君王面，落日黃昏空掩門。',
 ' 涇溪石險人競懼，終歲不聞傾覆人。却是平流無石處，時時聞說有沈淪。',
 ' 楚水悠悠浸楚亭，楚南天地兩無情。忍交孫武重泉下，不見時人說用兵。',
 ' 十二三年就試期，五湖煙月奈相違。何如買取胡孫弄，一笑君王便著緋。',
 ' 呂望當年展廟謨，直鉤釣國更誰如。若教生在西湖上，也是須供使宅魚。',
 ' 也知有意吹噓切，爭奈人間善惡分。但是粃糠細微物，等閑擡舉到青雲。',
 ' 牆下濃陰對此君，小山尖險玉爲羣。夜來解凍風雖急，不向寒城減一分。',
 ' 暖氣潛催次第春，梅花已謝杏花新。半開半落閑園裏，何異榮枯世上人。',
 ' 簷前飛雪扇前塵，千里移添上苑春。他日丁寧柿林院，莫宣恩澤與閑人。',
 ' 餘聲宛宛拂庭梅，通濟渠邊去又回。若使煬皇魂魄在，爲君應合過江來。',
 ' 黃土原邊狡兔肥，犬如流電馬如飛。灞陵老將無功業，猶憶當時夜獵歸。',
 ' 夏窗七葉連陰暗。賴家橋上潏河邊。細看月輪真有意，已知青桂近嫦娥。一箇禰衡容不得，思量黃祖謾英雄。張華謾出如丹語，不及劉侯一紙書。山雨霏微宿上亭，雨中因想雨淋鈴。老僧齋罷關門睡，不管波濤四面生。',
 ' 姓字看侵尺五天，芳菲占斷百花鮮

In [60]:
def pad_sequences(sequences,
                  maxlen=None,
                  dtype='int32',
                  padding='pre',
                  truncating='pre',
                  value=0.):
    if not hasattr(sequences, '__len__'):
        raise ValueError("sequences must be iterable")
    lengths = []
    for x in sequences:
        if not hasattr(x, '__len__'):
            raise ValueError(
                'sequences must be a list of iterables. Found non-iterable: ' +
                str(x))
        lengths.append(len(x))

    num_samples = len(sequences)
    if maxlen is None:
        maxlen = max(lengths)

    sample_shape = tuple()
    for s in sequences:
        if len(s) > 0:
            sample_shape = np.asarray(s).shape[1:]
            break

    x = (np.ones((num_samples, maxlen) + sample_shape) * value).astype(dtype)
    for idx, s in enumerate(sequences):
        if not len(s):
            continue
        if truncating == 'pre':
            trunc = s[-maxlen:]
        elif truncating == 'post':
            trunc = s[:maxlen]
        else:
            raise ValueError('Truncating type "%s" not understood' %
                             truncating)

        trunc = np.asarray(trunc, dtype=dtype)
        if trunc.shape[1:] != sample_shape:
            raise ValueError(
                "shape of sample %s of sequence at position %s is different from expected shape %s"
                % (trunc.shape[1:], idx, sample_shape))

        if padding == 'post':
            x[idx, :len(trunc)] = trunc
        elif padding == 'pre':
            x[idx, -len(trunc):] = trunc
        else:
            raise ValueError("padding type %s not understood" % padding)
    return x

In [None]:
def get_data(config):
    data = parse_raw_data(config.data_path, config.category, config.authot,
                          config.constrain)

    chars = {c for line in data for c in line}
    char_to_ix = {char: ix for ix, char in enumerate(chars)}
    char_to_ix['<EOP>'] = len(char_to_ix)
    char_to_ix['<START>'] = len(char_to_ix)
    char_to_ix['</s>'] = len(char_to_ix)
    ix_to_chars = {ix: char for char, ix in list(char_to_ix.items())}

    for i in range(0, len(data)):
        data[i] = ['<START>'] + list(data[i]) + ['<EOP>']

    data_id = [[char_to_ix[w] for w in line] for line in data]

    pad_data = pad_sequences(data_id,
                             maxlen=config.poetry_maxlen,
                             padding='pre',
                             truncating='post',
                             value=len(char_to_ix) - 1)
    np.savez_compressed(config.processed_data_path,
                        data=pad_data,
                        word2ix=char2ix,
                        ix2word=ix_to_chars)
    return pad_data, char_to_ix, ix_to_chars

In [None]:
class Config(object):
    data_path = "../../H/datasets/chinese_potery/"
    category = "poet.tang"
    author = None
    constrain = None
    poetry_max_len = 125
    sample_max_len = poetry_max_len - 1
    processed_data_path = "data/tang.npz"
    word_dict_path = 'wordDic'
    
    model_path = 'model/tang_200.pth'
    model_prefix = 'model/tang'

    batch_size = 128
    epoch_num = 201

    embedding_dim = 256
    hidden_dim = 256
    layer_num = 2  # rnn的层数
    lr = 0.01
    weight_decay = 1e-4

    plot_every = 2
    debug_file = '/tmp/debugp'
    env = 'poetry'

    use_gpu = False

    max_gen_len = 200  # 生成诗歌最长长度
    sentence_max_len = 4  # 生成诗歌的最长句子

    prefix_words = '细雨鱼儿出,微风燕子斜。'  # 不是诗歌的组成部分，用来控制生成诗歌的意境
    start_words = '闲云潭影日悠悠'  # 诗歌开始
    acrostic = False  # 是否是藏头诗

In [None]:
config = Config()
pad_data, char_to_ix, ix_to_char = get_data(config)

for l in pad_data[:10]:
    print(l)
class Config(object):
    data_path = "./json/"
    category = "poet.tang"
    author = None
    constrain = None
    poetry_max_len = 125
    sample_max_len = poetry_max_len-1
    processed_data_path = "data/tang.npz"
    word_dict_path = 'wordDic'
    model_path = 'model/tang_200.pth'
    model_prefix = 'model/tang'

    batch_size = 128
    epoch_num = 201

    embedding_dim = 256
    hidden_dim = 256
    layer_num = 2  # rnn的层数
    lr = 0.01
    weight_decay = 1e-4

    plot_every = 2
    debug_file = '/tmp/debugp'
    env = 'poetry'

    use_gpu = False

    max_gen_len = 200  # 生成诗歌最长长度
    sentence_max_len = 4 # 生成诗歌的最长句子

    prefix_words = '细雨鱼儿出,微风燕子斜。'  # 不是诗歌的组成部分，用来控制生成诗歌的意境
    start_words = '闲云潭影日悠悠'  # 诗歌开始
    acrostic = False  # 是否是藏头诗

In [None]:
config = Config()
pad_data, char_to_ix, ix_to_chars = get_data(config)
for l in pad_data[:10]:
    print(l)

n = 0
for k, v in char_to_ix.items():
    print(k, v)
    if n > 10:
        break
    n += 1

n = 0
for k, v in ix_to_chars.items():
    print(k, v)
    if n > 10:
        break
    n += 1

# 创建模型

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

In [25]:
class LM(nn.Module):
    def __init__(vocab_size, embedding_dim, hidden_size, num_layers, device):
        """
        embedding --> lstm(dropout) --> linear(dropout) --> relu
        """
        super(LM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_size,
            num_layers=num_layers,
            batch_first=False,  # batch维度默认位于第二位，需要手动指定
        )
        self.linear = nn.Linear(hidden_size, vocab_size)
        self.dropout = nn.Dropout(0.2)  
        self.device = device

    def forward(self, inputs, hidden):
        seq_len, batch_size = inputs.size()  # batch维度不位于第一位

        embeds = self.embedding(inputs)  # seq,batch,embed_dim

        output, hidden = self.lstm(embeds, hidden)  # output:seq,batch,hidden
        output = output.view(seq_len * batch_size, -1)  # seq*batch,hidden
        
        output = self.dropout(output) # 线性层之前使用 dropout

        output = F.relu(self.linear(output))  # seq*batch,vocab_size
        return output, hidden

    def init_hidden(self, num_layers, batch_size):
        return (Variable(torch.zeros(num_layers, batch_size,
                                     self.hidden_size)),
                Variable(torch.zeros(num_layers, batch_size,
                                     self.hidden_size)))

# 训练模型

In [80]:
class TrainModel:
    def __init__(self):
        os.environ['CUDA_VISIBLE_DEVICES'] = '0'
        self.config = Config()
        self.device = torch.device(
            'cuda') if self.config.use_gpu else torch.device('cpu')

    def train(self, data_loader, model, optimizer, criterion, char_to_ix,
              ix_to_chars):
        for epoch in range(self.config.num_epoches):
            for step, x in enumerate(data_loader):
                x = x.long().transpose(1, 0).contiguous()
                x = x.to(self.device)
                optimizer.zero_grad()
                input_, target = x[:-1, :], x[1:, :]
                target = target.view(-1)

                hidden = model.initial_hidden(self.config.num_layers,
                                              x.size()[1])

                output, _ = model(input_, hidden)
                loss = criterion(output, target)

                loss.backward()

                optimizer.step()

                if step % 200 == 0:
                    print("epoch: %d, loss: %f" % (epoch, loss.data))

            if epoch % 1 == 0:
                torch.save(model.state_dict(),
                           "%s_%s.pth" % (self.config.model_prefix, epoch))

                word = '床前明月光'
                gen_poetry = ''.join(
                    self.generate_head_test(model, word, char_to_ix,
                                            ix_to_chars))
                print(gen_poetry)

    def run(self):
        data, char_to_ix, ix_to_chars = get_data(self.config)
        vocab_size = len(char_to_ix)
        print("样本数：%d" % len(data))
        print("字典大小：%d" % vocab_size)

        data = torch.from_numpy(data)
        data_loader = Data.DataLoader(data,
                                      batch_size=self.config.batch_size,
                                      shuffle=True,
                                      num_works=1)

        model = LM(vocab_size=vocab_size,
                   embedding_dim=self.config.embedding_dim,
                   hidden_size=self.config.hidden_size,
                   num_layers=self.config.num_layers,
                   device=self.device)
        model.to(self.device)

        optimizer = nn.optim.Adam(model.parameters(),
                                  lr=self.config.lr,
                                  weight_decay=self.config.weight_decay)
        criterion = nn.CrossEntropyLoss()

        self.train(data_loader, model, optimizer, criterion, char_to_ix,
                   ix_to_chars)

    def generate_head(self, model, head_sentence, word_to_ix, ix_to_word):
        poetry = []
        head_char_len = len(head_sentence)
        sentence_len = 0
        pre_char = '<START>'

        input_ = (torch.Tensor([word_to_ix['START']
                                ]).view(1, 1).long()).to(self.device)
        hidden = model.init_hidden(self.config.layer_num, 1)

        top_index = output.data[0].topk(1)[1][0].item()
        char = ix_to_word[top_index]

        for i in range(self.config.max_gen_len):
            # 前向计算出概率最大的当前词
            output, hidden = model(input, hidden)
            top_index = output.data[0].topk(1)[1][0].item()
            char = ix_to_word[top_index]

            # 句首的字用藏头字代替
            if pre_char in ['。', '！', '<START>']:
                if sentence_len == head_char_len:
                    break
                else:
                    char = head_sentence[sentence_len]
                    sentence_len += 1
                    input = (input.data.new([word_to_ix[char]])).view(1, 1)
            else:
                input = (input.data.new([top_index])).view(1, 1)

            poetry.append(char)
            pre_char = char

        return poetry
    
model = TrainModel()
model.run()

# 利用模型

In [None]:
class Sample(object):
    def __init__(self):
        self.config = Config()
        self.device = torch.device(
            'cuda') if self.config.use_gpu else torch.device('cpu')

        self.processed_data_path = self.config.processed_data_path
        self.model_path = self.config.model_path
        self.max_len = self.config.max_gen_len
        self.sentence_max_len = self.config.sentence_max_len

        self.load_data()
        self.load_model()

    def load_data(self):
        if os.path.exists(self.processed_data_path):
            data = np.load(self.processed_data_path)
            self.data, self.word_to_ix, self.ix_to_word = data['data'], data[
                'word2ix'].item(), data['ix2word'].item()

    def load_model(self):
        model = PoetryModel(len(self.word_to_ix), self.config.embedding_dim,
                            self.config.hidden_dim, self.device,
                            self.config.layer_num)
        map_location = lambda s, l: s
        state_dict = torch.load(self.config.model_path,
                                map_location=map_location)
        model.load_state_dict(state_dict)
        model.to(self.device)
        self.model = model

    def generate_random(self, start_words='<START>'):
        """自由生成一首诗歌"""
        poetry = []
        sentence_len = 0

        input = (torch.Tensor([self.word_to_ix[start_words]
                               ]).view(1, 1).long()).to(self.device)
        hidden = self.model.init_hidden(self.config.layer_num, 1)

        for i in range(self.max_len):
            # 前向计算出概率最大的当前词
            output, hidden = self.model(input, hidden)
            top_index = output.data[0].topk(1)[1][0].item()
            char = self.ix_to_word[top_index]

            # 遇到终结符则输出
            if char == '<EOP>':
                break

            # 有8个句子则停止预测
            if char in ['。', '！']:
                sentence_len += 1
                if sentence_len == 8:
                    poetry.append(char)
                    break

            input = (input.data.new([top_index])).view(1, 1)
            poetry.append(char)

        return poetry

    def generate_head(self, head_sentence):
        """生成藏头诗"""
        poetry = []
        head_char_len = len(head_sentence)  # 要生成的句子的数量
        sentence_len = 0  # 当前句子的数量
        pre_char = '<START>'  # 前一个已经生成的字

        # 准备第一步要输入的数据
        input = (torch.Tensor([self.word_to_ix['<START>']
                               ]).view(1, 1).long()).to(self.device)
        hidden = self.model.init_hidden(self.config.layer_num, 1)

        for i in range(self.max_len):
            # 前向计算出概率最大的当前词
            output, hidden = self.model(input, hidden)
            top_index = output.data[0].topk(1)[1][0].item()
            char = self.ix_to_word[top_index]

            # 句首的字用藏头字代替
            if pre_char in ['。', '！', '<START>']:
                if sentence_len == head_char_len:
                    break
                else:
                    char = head_sentence[sentence_len]
                    sentence_len += 1
                    input = (input.data.new([self.word_to_ix[char]
                                             ])).view(1, 1)
            else:
                input = (input.data.new([top_index])).view(1, 1)

            poetry.append(char)
            pre_char = char

        return poetry

    def generate_poetry(self, mode=1, head_sentence=None):
        """
        模式一：随机生成诗歌
        模式二：生成藏头诗
        模式三：给定首句生成诗
        :return:
        """
        poetry = ''
        if mode == 1 or (mode == 2 and head_sentence is None):
            poetry = ''.join(self.generate_random())
        if mode == 2 and head_sentence is not None:
            head_sentence = head_sentence.replace(',', u'，').replace(
                '.', u'。').replace('?', u'？')
            poetry = ''.join(self.generate_head(head_sentence))

        return poetry