In [1]:
import time
import math
import numpy as np 
import torch
import torch.nn as nn 
import torch.nn.functional as F 
import d2lzh as d2l

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

corpus_indices,char_to_idx,idx_to_char,vocab_size = d2l.load_data_jay_lyrics()

print(idx_to_char[char_to_idx["分"]])
print(idx_to_char[25])

分
壕


## one-hot向量
为了将**词**表示成向量输入到网络中，一个简单的办法市使用one-hot向量。　假设词典中不同字符的个数为$N$（即词典的大小），每个字符已经和一个从0到$N-1$的连续整数索引一一对应。如果一个字符的索引市整数$i$，那么可以创建一个全０的长度为$N$的向量，并将其位置为$i$的元素设置为１．　该向量就是原字符的one-hot向量.

In [2]:
def one_hot(x,n_class,dtype=torch.float32):
    x = x.long()
    res = torch.zeros(x.shape[0],n_class,dtype=dtype,device=x.device)
    res.scatter_(1,x.view(-1,1),1)
    return res

x = torch.tensor([0,2])
one_hot(x,vocab_size)

tensor([[1., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 1.,  ..., 0., 0., 0.]])

In [3]:
def to_onehot(x,n_class):
    # x shape:(batch,seq_len),
    # output: seq_len elements of (batch,n_class)

    return [one_hot(x[:,i],n_class) for i in range(x.shape[1])]

x = torch.arange(10).view(2,5)
inputs = to_onehot(x,vocab_size)
print(len(inputs),inputs[0].shape)

5 torch.Size([2, 1447])


In [4]:
num_inputs,num_hiddens,num_outputs = vocab_size,256,vocab_size
print("use ",device)

def get_params():

    def _one(shape):
        ts = torch.tensor(np.random.normal(0,0.01,size=shape),device=device,dtype=torch.float32)
        return torch.nn.Parameter(ts,requires_grad=True)
    
    w_xh = _one((num_inputs,num_hiddens))
    w_hh = _one((num_hiddens,num_hiddens))
    b_h = torch.nn.Parameter(torch.zeros(num_hiddens,device=device,requires_grad=True))

    #　输出层
    w_hq = _one((num_hiddens,num_outputs))
    b_q = torch.nn.Parameter(torch.zeros(num_outputs,device=device,requires_grad=True))

    return nn.ParameterList([w_xh,w_hh,b_h,w_hq,b_q])

use  cuda


In [5]:
def init_rnn_state(batch_size,num_hiddens,device):
    return (torch.zeros((batch_size,num_hiddens),device=device),)

def rnn(inputs,state,params):
    w_xh,w_hh,b_h,w_hq,b_q = params 
    H, = state
    outputs = []
    for x in inputs:
        H = torch.tanh(torch.matmul(x,w_xh) + torch.matmul(H,w_hh) + b_h)
        y = torch.matmul(H,w_hq) + b_q
        outputs.append(y)
    return outputs,(H,)

state = init_rnn_state(x.shape[0],num_hiddens,device)
inputs = to_onehot(x.to(device),vocab_size)
params = get_params()
outputs,state_new = rnn(inputs,state,params)
print(len(outputs),outputs[0].shape,state_new[0].shape)

5 torch.Size([2, 1447]) torch.Size([2, 256])


## 预测函数
基于前缀`prefix`（含有数个字符的字符串）来预测接下来的`num_chars`个字符。

In [6]:
def predict_rnn(prefix,num_chars,rnn,params,init_rnn_state,num_hiddens,vocab_size,device,idx_to_char,char_to_idx):

    state = init_rnn_state(1,num_hiddens,device)
    output = [char_to_idx[prefix[0]]]
    for t in range(num_chars + len(prefix) - 1):
        # 将上一个时间步的输出作为下一个时间步的输入
        x = to_onehot(torch.tensor([[output[-1]]],device=device),vocab_size)
        (y,state) = rnn(x,state,params) #计算和更新隐藏状态
        #下一个时间步的输入是prefix里的字符或者当前的最佳预测字符
        if t < len(prefix)-1:
            output.append(char_to_idx[prefix[t+1]])
        else:
            output.append(int(y[0].argmax(dim=1).item())) 
    return " ".join([idx_to_char[i] for i in output])

In [7]:
predict_rnn("分开",10,rnn,params,init_rnn_state,num_hiddens,vocab_size,device,idx_to_char,char_to_idx)


'分 开 练 散 载 至 腾 仪 箱 记 熬 屉'

In [8]:
def grad_clipping(params,theta,device):
    norm = torch.tensor([0.0],device=device)
    for param in params:
        norm += (param.grad.data ** 2).sum()
    norm = norm.sqrt().item()
    if norm > theta:
        for param in params:
            param.grad.data *= (theta/norm)


In [9]:
def train_and_predict_rnn(rnn,get_params,init_rnn_state,num_hiddens,vocab_size,device,corpus_indices,idx_to_char,char_to_idx,is_random_iter,num_epochs,num_steps,lr,clipping_theta,batch_size,pred_period,pred_len,prefixes):
    if is_random_iter:
        data_iter_fn = d2l.data_iter_random
    else:
        data_iter_fn = d2l.data_iter_consecutive

    params = get_params()
    loss = nn.CrossEntropyLoss()

    for epoch in range(num_epochs):
        if not is_random_iter: # 使用相邻采样，在epoch开始时初始化隐藏层
            state = init_rnn_state(batch_size,num_hiddens,device)
        l_sum,n,start = 0.0,0,time.time()
        data_iter = data_iter_fn(corpus_indices,batch_size,num_steps,device)
        for x,y in data_iter:
            #　随机采样，在每个小批量更新钱初始化隐藏状态
            if is_random_iter:
                state = init_rnn_state(batch_size,num_hiddens,device) 
            else:
                for s in state:
                    s.detach_()
            inputs = to_onehot(x,vocab_size)
            (outputs,state) = rnn(inputs,state,params)
            outputs = torch.cat(outputs,dim=0)
            y = torch.transpose(x,0,1).contiguous().view(-1)
            l = loss(outputs,y.long())

            if params[0].grad is not None:
                for param in params:
                    param.grad.data.zero_()
            l.backward()
            grad_clipping(params,clipping_theta,device)
            d2l.sgd(params,lr,1)

            l_sum += l.item() * y.shape[0]
            n += y.shape[0]

        if (epoch + 1) % pred_period == 0:
            print("epoch %d,perplexity %f,time %.2f sec" % (epoch + 1,math.exp(l_sum / n),time.time() - start))

            for prefix in prefixes:
                print(" -",predict_rnn(prefix,pred_len,rnn,params,init_rnn_state,num_hiddens,vocab_size,device,idx_to_char,char_to_idx))

In [10]:
num_epochs, num_steps, batch_size, lr, clipping_theta = 250, 35, 32, 1e2, 1e-2
pred_period, pred_len, prefixes = 50, 10, ['分开']

train_and_predict_rnn(rnn, get_params, init_rnn_state, num_hiddens,
                      vocab_size, device, corpus_indices, idx_to_char,
                      char_to_idx, True, num_epochs, num_steps, lr,
                      clipping_theta, batch_size, pred_period, pred_len,
                      prefixes)

epoch 50,perplexity 3.565956,time 0.23 sec
 - 分 开 开 始 口 边 拜 淡 油 啦 愿 东
epoch 100,perplexity 1.346147,time 0.24 sec
 - 分 开 开 始 口 边 弥 淡 淡 … 产 鸣
epoch 150,perplexity 1.091614,time 0.23 sec
 - 分 开 开 始 口 语 比 方 方 平 产 得
epoch 200,perplexity 1.037620,time 0.23 sec
 - 分 开 开 始 口 边 天 天 天 天 天 天
epoch 250,perplexity 1.016120,time 0.23 sec
 - 分 开 开 始 口 口 比 方 方 安 和 和
