#准备数据

In [2]:
import torch
import zipfile
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
print(torch.__version__)

cuda
1.8.0


In [3]:
with zipfile.ZipFile("../../data/jaychou_lyrics.txt.zip")as zin:
    with zin.open("jaychou_lyrics.txt") as f:
        lyrics=f.read().decode("utf-8")
lyrics=lyrics.replace("\n"," ").replace("\r"," ")
lyrics=lyrics[0:20000]

In [4]:
char_set=list(set(lyrics))
vocab_size=len(char_set)
char_indices=dict([(char,i)for i,char in enumerate(char_set)])
vocab_indices=[char_indices[char] for char in lyrics]

In [5]:
import numpy as np
import random

def Dataset_iter(num_steps,batch_size,data,mode=None):
    def get_data(pos,num_steps):
        return data[pos:pos+num_steps]
    num_data=len(data)
    num_samples=(num_data-1)//num_steps
    num_batch=num_samples//batch_size
    example_list=list(range(num_samples))
    if mode=="nearest":#相邻采样
        indices=np.array(example_list[0:num_batch*batch_size]).reshape(batch_size,num_batch)
        for i in range(num_batch):
            x=[get_data(idx*num_steps,num_steps)for idx in indices[:,i]]
            y=[get_data(idx*num_steps+1,num_steps)for idx in indices[:,i]]
            yield torch.from_numpy(np.array(x)),torch.from_numpy(np.array(y))        
    if mode=="random":#随机采样
        random.shuffle(example_list) 
        for epoch in range(num_batch):
            batch_pos=batch_size*epoch
            batch_indices=example_list[batch_pos:batch_pos+batch_size]
            x=[get_data(idx*num_steps,num_steps)for idx in batch_indices]
            y=[get_data(idx*num_steps+1,num_steps)for idx in batch_indices]
            yield torch.from_numpy(np.array(x)),torch.from_numpy(np.array(y)) 

In [6]:
def one_hot(x,length,dtype=torch.float32):
    x=x.long()
    zeros=torch.zeros((1,length),dtype=torch.float32,device=device)
    zeros[0,x]=1
    return zeros
def to_onehot(x,length,dtype=torch.float32):
    batch_size,num_steps=x.shape
    x=x.T
    res=[]
    for row in x:
        for t in row:
            res.append(one_hot(x,length,dtype=dtype))
    return torch.cat(res).reshape(num_steps,batch_size,length)

In [7]:
import torch.nn as nn
class rnn_model(nn.Module):
    def __init__(self,hidden_size,vocab_size,num_steps,*args):
        super(rnn_model,self).__init__()
        self.hidden_size=hidden_size
        self.vocab_size=vocab_size
        
        #此处h->vocab_size x hidden_size
        #RNN默认输入数据的格式是num_steps x batch_size x vocab_size
        #每次计算时取
        self.rnn=nn.RNN(input_size=vocab_size,hidden_size=hidden_size)
        self.state=None
        self.linear=nn.Linear(self.hidden_size,self.vocab_size)
    def forward(self,inputs,state):
        Y,self.state=self.rnn(inputs,state)
        Y=self.linear(Y.view(-1,Y.shape[2]))
        return Y,self.state

In [8]:
def grad_clipping(params, theta, device):
    norm = torch.tensor([0.0], device=device)
    for param in params:
        norm += (param.grad.data ** 2).sum()
    norm = norm.sqrt().item()#.item从具有一个元素的张量中取出元素值。
    if norm > theta:
        for param in params:
            param.grad.data *= (theta / norm)

In [9]:
def rnn_prediction(prefix,num_chars,model,vocab_size,device,char_set,char_indices):
    state=None
    output=[char_indices[prefix[0]]]
    for t in range(len(prefix)+num_chars):
        X=one_hot(torch.tensor([output[-1]]),vocab_size)
        if state is not None:
            if isinstance(state,tuple):
                state=(state[0].to(device),state[1].to(device))
            else:
                state=state.to(device)
        X=X.reshape(1,X.shape[0],X.shape[1])
        Y,state=model(X,state)
        if t < len(prefix) - 1:
            output.append(char_indices[prefix[t + 1]])
        else:
            output.append(int(Y.argmax(dim=1).item()))
    return ''.join([char_set[i] for i in output])

In [10]:
def predict_rnn(prefix,num_chars,params,state,rnn,num_hiddens,vocab_size,char_indices,char_set):
    output=[char_indices[prefix[0]]]
    params=get_params()
    for t in range(num_chars+len(prefix)):
        params=get_params()
        X=one_hot(torch.tensor([output[0]]),vocab_size)
        Y,state=rnn(x,state,params)
        if t<len(prefix)-1:
            output.append(char_indices[prefix[t+1]])
        else:
            output.append(torch.argmax(Y[0]).item())
    return ''.join([char_set[i] for i in output])

In [11]:
import math
import time
import torch.optim as optim
def train_and_predict_rnn(epoches,rnn_model,batch_size,num_steps,hidden_size,mode,theta,
                          device,vocab_indices,vocab_size,char_indices,char_set,prefixes,lr,momentum,num_chars):
    rnn=rnn_model(hidden_size,vocab_size,num_steps).to(device)
    state=None
    Loss=nn.CrossEntropyLoss()
    optimizer=optim.Adam(rnn.parameters(),lr=lr,weight_decay=0.0001)
    
    for epoch in range(1,epoches+1):
        start=time.time()
        data_iter=Dataset_iter(num_steps,batch_size,vocab_indices,mode)
        sum_loss=0.0
        n=0.0
        for x,y in data_iter:
            
            if state is not None:
                # 使用detach函数从计算图分离隐藏状态, 这是为了
                # 使模型参数的梯度计算只依赖一次迭代读取的小批量序列(防止梯度计算开销太大)
                if isinstance (state, tuple): # LSTM, state:(h, c)  
                    state = (state[0].detach(), state[1].detach())
                else:   
                    state = state.detach()
            optimizer.zero_grad()
            x=to_onehot(x,vocab_size)
            
            #当state==None时，此时会生成返回初始的state
            output,state=rnn(x.to(device),state)
            y=torch.transpose(y,0,1).contiguous().view(-1).to(device)
            
            #交叉熵损失得到的loss默认是平均的loss
            loss=Loss(output,y.long())
            #赋予参数各自的梯度
            loss.backward()
            #进行梯度裁剪（防止出现梯度爆炸）
            grad_clipping(rnn.parameters(), theta, device)
            optimizer.step()
            sum_loss+=loss.item()*y.shape[0]
            n+=y.shape[0]
           
        try:
            perplexity = math.exp(sum_loss/ n)
        except OverflowError:
            perplexity = float('inf')
        if (epoch + 1) % 50 == 0:
            print('epoch %d, perplexity %f, time %.2f sec' % (
                epoch + 1, perplexity, time.time() - start))
            for prefix in prefixes:
                print(' -',rnn_prediction(prefix,num_chars,rnn,vocab_size,device,char_set,char_indices))

In [None]:
##seting superparameters
epoches=1000
batch_size=2
hidden_size=256
lr=1e-3
momentum=0.9
wight_decay=1e-2
mode="nearest"
theta=1e-2
prefixes=["分开","不分开"]
num_steps=32
num_chars=50
train_and_predict_rnn(epoches,rnn_model,batch_size,num_steps,hidden_size,mode,theta,
                          device,vocab_indices,vocab_size,char_indices,char_set,prefixes,lr,momentum,num_chars)

epoch 50, perplexity 31.472763, time 2.63 sec
 - 分开    话 所火  的的     武  在的  度  上  的火  说  的雕     快  切   
 - 不分开   上  在木     身    火 快      我上 在的    事   的火  快      
epoch 100, perplexity 24.127290, time 2.58 sec
 - 分开      我眼  的我时防七涌 泡的茶  在加的杂 加油学人  的脚  说擦眼 的剔棍忍唱着滴一永头
 - 不分开  的带 生 完板武  切的的   不惯忠像生切箱牵忆  对起       当的  我吸 现对 更一 
epoch 150, perplexity 25.837876, time 2.58 sec
 - 分开   不   念地脏感壶买轻引脚门擦休 诗窗窗油匠蒙阻滚买  波波擦壶 地度拳打脏现感元哈轻雨 今  
 - 不分开   忧    板诗装苛怀考问 做仗 尘部 哪 脚 装幅落落念 度 脏刻阻南 的  得欣  念弥再停 
epoch 200, perplexity 38.771297, time 2.58 sec
 - 分开    的 国武      的     漆着不     的 演 一          哮 望     
 - 不分开       又 跟 天的           河的                望的      形
epoch 250, perplexity 49.150232, time 2.58 sec
 - 分开                           不               道      烛
 - 不分开                                              式    
epoch 300, perplexity 56.602271, time 2.59 sec
 - 分开                                      切岁     坏     
 - 不分开                                           膀       
epoch 350, perplexity 70.0618