In [13]:
# 1. 读取文件
# 2. 词元化 将文文本数据拆分成字符或者单词
# 3. 建立词表  词表包含单词或者字符的频率， 从id到单词或者字符的索引 数组表示，从单词或者字符的索引到单词 字典表示
#              文章的长度__len__  __getitem__ 返回token所在的索引  根据索引返回对应的单词或者字符 用<unk>表示空

import re 

def readfile():
    lines=[]
    with open("../data/timemachine.txt") as f:
        for line in f:
            lines.append(re.sub("[^A-Za-z]+",' ',line).strip().lower())
    return lines

lines=readfile()
for i in range(2):
    print(lines[i])


the time machine by h g wells



In [14]:
def tokenize(lines,type='word'):
    if type=='word':
        return [ line.split() for line in lines]
    elif type=='char':
        return [list(line) for line in lines]
    else:
        return None

tokens=tokenize(lines)
for i in range(11):
    print(tokens[i])

['the', 'time', 'machine', 'by', 'h', 'g', 'wells']
[]
[]
[]
[]
['i']
[]
[]
['the', 'time', 'traveller', 'for', 'so', 'it', 'will', 'be', 'convenient', 'to', 'speak', 'of', 'him']
['was', 'expounding', 'a', 'recondite', 'matter', 'to', 'us', 'his', 'grey', 'eyes', 'shone', 'and']
['twinkled', 'and', 'his', 'usually', 'pale', 'face', 'was', 'flushed', 'and', 'animated', 'the']


In [15]:
import collections

class Vocab():
    def __init__(self,tokens):
        words=[word for token in tokens for word in token]
        counter=collections.Counter(words)
        self._token_freq=sorted(counter.items(),key=lambda x: x[1],reverse=True)
        
        self.idx_to_token=['<unk>']
        self.token_to_idx={0:'<unk>'}
        for token,freq in self._token_freq:
            if token not in self.token_to_idx:
                self.idx_to_token.append(token)
                self.token_to_idx[token]=len(self.idx_to_token)-1
    
    def __len__(self):
        return len(self.idx_to_token)
    
    def __getitem__(self,tokens):
        if not isinstance(tokens,(list,tuple)):
            return self.token_to_idx.get(tokens,self.unk)
        return [self.token_to_idx.get(token,self.unk) for token in tokens]

    def to_tokens(self,indices):
        return [ self.idx_to_token[index] for index in indices]

    @property
    def unk(self):
        return 0

    @property
    def token_freq(self):
        return self._token_freq
    
vocab=Vocab(tokens)
print(vocab.token_freq[:10])
print(list(vocab.token_to_idx.items())[:10])
for i in range(10):
    print("文本：",tokens[i])
    print("索引：",vocab[tokens[i]])


[('the', 2261), ('i', 1267), ('and', 1245), ('of', 1155), ('a', 816), ('to', 695), ('was', 552), ('in', 541), ('that', 443), ('my', 440)]
[(0, '<unk>'), ('the', 1), ('i', 2), ('and', 3), ('of', 4), ('a', 5), ('to', 6), ('was', 7), ('in', 8), ('that', 9)]
文本： ['the', 'time', 'machine', 'by', 'h', 'g', 'wells']
索引： [1, 19, 50, 40, 2183, 2184, 400]
文本： []
索引： []
文本： []
索引： []
文本： []
索引： []
文本： []
索引： []
文本： ['i']
索引： [2]
文本： []
索引： []
文本： []
索引： []
文本： ['the', 'time', 'traveller', 'for', 'so', 'it', 'will', 'be', 'convenient', 'to', 'speak', 'of', 'him']
索引： [1, 19, 71, 16, 37, 11, 115, 42, 680, 6, 586, 4, 108]
文本： ['was', 'expounding', 'a', 'recondite', 'matter', 'to', 'us', 'his', 'grey', 'eyes', 'shone', 'and']
索引： [7, 1420, 5, 2185, 587, 6, 126, 25, 330, 127, 439, 3]


In [16]:
def load_corpus_time_machine(maxtokens=10000):
    lines=readfile()
    tokens=tokenize(lines,'char')
    vocab=Vocab(tokens)

    corpus=[vocab[ch] for token in tokens for ch in token]
    if maxtokens>0:
        return corpus[:maxtokens],vocab
    return corpus,vocab

corpus,vocab=load_corpus_time_machine()
len(corpus),len(vocab)

(10000, 28)

In [17]:
import random
import torch

def seq_data_iter_random(corpus,batch_size,num_steps):

    corpus=corpus[random.randint(0,num_steps-1):]

    num_subseq=(len(corpus)-1) // num_steps

    seq_indices=list(range(0,num_subseq*num_steps,num_steps))

    random.shuffle(seq_indices)

    def data(pos):
        return corpus[pos:pos+num_steps]

    num_batch= len(seq_indices) // batch_size
    for i in range(0,num_batch*batch_size,batch_size):

        seq_sub_indices=seq_indices[i:i+batch_size]
        X=[data(j) for j in seq_sub_indices]
        Y=[data(j+1) for j in seq_sub_indices]
        yield torch.tensor(X),torch.tensor(Y)

    
myseq=list(range(35))
data_iter=seq_data_iter_random(myseq,2,5)
for x,y in data_iter:
    print(x,'\n',y)
           

tensor([[24, 25, 26, 27, 28],
        [ 4,  5,  6,  7,  8]]) 
 tensor([[25, 26, 27, 28, 29],
        [ 5,  6,  7,  8,  9]])
tensor([[29, 30, 31, 32, 33],
        [19, 20, 21, 22, 23]]) 
 tensor([[30, 31, 32, 33, 34],
        [20, 21, 22, 23, 24]])
tensor([[14, 15, 16, 17, 18],
        [ 9, 10, 11, 12, 13]]) 
 tensor([[15, 16, 17, 18, 19],
        [10, 11, 12, 13, 14]])


In [18]:
def seq_data_iter_seq(corpus,batch_size,num_steps):
    offset=random.randint(0,num_steps)
    num_total=((len(corpus)-offset-1)//batch_size)*batch_size
    xs=torch.tensor(corpus[offset:num_total+offset])
    ys=torch.tensor(corpus[offset+1:num_total+offset+1])
    xs,ys=xs.reshape(batch_size,-1),ys.reshape(batch_size,-1)
    num_batch=xs.shape[1]//num_steps
    for i in range(0,num_batch*num_steps,num_steps):
        x=xs[:,i:i+num_steps]
        y=ys[:,i:i+num_steps]
        yield x,y

data_iter=seq_data_iter_seq(myseq,2,5)
for x,y in data_iter:
    print(x,'\n',y)


tensor([[ 5,  6,  7,  8,  9],
        [19, 20, 21, 22, 23]]) 
 tensor([[ 6,  7,  8,  9, 10],
        [20, 21, 22, 23, 24]])
tensor([[10, 11, 12, 13, 14],
        [24, 25, 26, 27, 28]]) 
 tensor([[11, 12, 13, 14, 15],
        [25, 26, 27, 28, 29]])


In [19]:
class SeqData:
    def __init__(self,batch_size,num_steps,use_random_iter=False,maxtokens=10000):
        if use_random_iter:
            self.load_data_fn=seq_data_iter_random
        else:
            self.load_data_fn=seq_data_iter_seq
        self.batch_size=batch_size
        self.num_steps=num_steps
        self.corpus,self.vocab=load_corpus_time_machine(maxtokens)

    def __iter__(self):
        return self.load_data_fn(self.corpus,self.batch_size,self.num_steps)


data_iter=SeqData(32,35,)
len(list(data_iter))

8

In [20]:
from torch.nn import functional as F

X=torch.arange(10).reshape((2,5))
print(X)
test=F.one_hot(X.T,len(vocab))
print(test.shape)
print(test)

tensor([[0, 1, 2, 3, 4],
        [5, 6, 7, 8, 9]])
torch.Size([5, 2, 28])
tensor([[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0]],

        [[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0]],

        [[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0]],

        [[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0]],

        [[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [21]:
#0 参数设定 隐藏层255 词大小28 步长5
#1 初始化模型参数 x 2*28  w_xh 28*255 w_hh 255*255 b_h 255  h_t 2*255   w_hq 255*28  b_q 28 
#2 初始化 h_0 参数  
#3 实现模型  h_t=tanh(x*w_xh+h_t-1*w_hh+b_h)  o=h_t*W_hq+b_q  
   # 原本x是批次大小*时间序列  转换成时间序列 批次大小 词表大小（独热编码的形式），的数据 在通过循环时间序列计算，最终将结果组合并拼接
class RNNModel:
   def __init__(self,vocab_size,hiddens_num):
      self.vocab_size=vocab_size
      self.inputs_num=self.vocab_size
      self.hiddens_num=hiddens_num
      self.outputs_num=self.vocab_size
      self.params=self.init_params()

   def init_params(self):
      #隐藏层
      w_xh=torch.randn(size=(self.inputs_num,self.hiddens_num))*0.01
      w_hh=torch.randn(size=(self.hiddens_num,self.hiddens_num))*0.01
      b_h=torch.zeros(self.hiddens_num)
      #输出层
      w_hq=torch.randn(size=(self.hiddens_num,self.outputs_num))*0.01
      b_q=torch.zeros(self.outputs_num)

      params=[w_xh,w_hh,b_h,w_hq,b_q]
      for p in params:
         p.requires_grad_(True)
      return params

   def begin_state(self,batch_size):
      return torch.zeros((batch_size,self.hiddens_num))
   
   def __call__(self,x,state):# 32*35 -> 35 * 32 *28
      xhot=F.one_hot(x.T,self.vocab_size).type(torch.float32)
      w_xh,w_hh,b_h,w_hq,b_q=self.params
      h=state
      outputs=[]
      for inx in xhot:
         #print(inx.shape,w_xh.shape,h.shape,w_hh.shape,b_h.shape)
         h=torch.tanh(torch.mm(inx,w_xh)+torch.mm(h,w_hh)+b_h)
         y=torch.mm(h,w_hq)+b_q
         outputs.append(y)
      #print(outputs)
      return torch.cat(outputs,dim=0),h


# batch_size=32
# num_steps=35
# hiddens_num=256
# train_iter,vocab=load_data_time_machine(batch_size,num_steps)
# xs,ys=next(iter(train_iter))
# print(xs.shape,ys.shape)
# net=RNNModel(len(vocab),hiddens_num)
# state=net.begin_state(batch_size)
# outputs,state=net(xs,state)
# print(outputs.shape,state.shape)


In [22]:
def pred_ch8(prefix,pred_num,net,vocab):
    state=net.begin_state(1)
    outputs=[vocab[prefix[0]]]
    get_input=lambda: torch.tensor([outputs[-1]]).reshape(1,1)
    for y in prefix[1:]: #预热
        _,state=net(get_input(),state)
        outputs.append(vocab[y])
    for _ in range(pred_num): #预测
        y,state=net(get_input(),state)
        outputs.append(int(y.argmax(dim=1).reshape(1)))
    return ''.join([vocab.idx_to_token[i] for i in outputs])

#pred_ch8("time traveller",10,net,vocab)


In [23]:
# 训练
# 1. 定义损失函数
# 2  定义梯度更新方式
# 3  预热数据
# 4  训练
from torch import nn

# 梯度裁剪  g<-min(1,theta/||g||)g
def grad_clipping(net,theta):
    params=net.params
    norm=torch.sqrt(sum(torch.sum((p.grad**2)) for p in params))
    if norm>theta:
        for p in params:
            p.grad[:]*= theta/norm

def sgd(params,lr):
    with torch.no_grad():
        for p in params:
            p-=lr*p.grad
            p.grad.zero_()

def train_epoch_ch8(net,loss,updater,train_iter,lr):
    state=None
    for X,Y in train_iter:
        if state==None:
            state=net.begin_state(X.shape[0])
        else:
            state.detach_()  #分离state
        y=Y.T.reshape(-1)
        y_hat,state=net(X,state)
        l=loss(y_hat,y.long()).mean()
        l.backward()
        grad_clipping(net,1)
        updater(net.params,lr)

def train_ch8(net,vocab,lr,epoch,train_iter):
    loss=nn.CrossEntropyLoss()
    updater=sgd
    predict=lambda prefix: pred_ch8(prefix,50,net,vocab)
    for _ in range(epoch):
        train_epoch_ch8(net,loss,updater,train_iter,lr)
    print(predict("time traveller"))
    print(predict("traveller"))


In [24]:
batch_size=32
num_steps=35
hiddens_num=512
lr=1
epoch=500
train_iter=SeqData(batch_size,num_steps)
#print(len(list(train_iter)))
vocab=train_iter.vocab
net=RNNModel(len(vocab),hiddens_num)
train_ch8(net,vocab,lr,epoch,train_iter)

time traveller for so it will be convenient to speak of himwas e
travelleryou can show black is white by argument said filby
