In [22]:
# 1. 读取文件
# 2. 词元化 将文文本数据拆分成字符或者单词
# 3. 建立词表  词表包含单词或者字符的频率， 从id到单词或者字符的索引 数组表示，从单词或者字符的索引到单词 字典表示
#              文章的长度__len__  __getitem__ 返回token所在的索引  根据索引返回对应的单词或者字符 用<unk>表示空

import re 

def readfile():
    lines=[]
    with open("../data/timemachine.txt") as f:
        for line in f:
            lines.append(re.sub("[^A-Za-z]+",' ',line).strip().lower())
    return lines

lines=readfile()
for i in range(2):
    print(lines[i])


the time machine by h g wells



In [23]:
def tokenize(lines,type='word'):
    if type=='word':
        return [ line.split() for line in lines]
    elif type=='char':
        return [list(line) for line in lines]
    else:
        return None

tokens=tokenize(lines)
for i in range(11):
    print(tokens[i])

['the', 'time', 'machine', 'by', 'h', 'g', 'wells']
[]
[]
[]
[]
['i']
[]
[]
['the', 'time', 'traveller', 'for', 'so', 'it', 'will', 'be', 'convenient', 'to', 'speak', 'of', 'him']
['was', 'expounding', 'a', 'recondite', 'matter', 'to', 'us', 'his', 'grey', 'eyes', 'shone', 'and']
['twinkled', 'and', 'his', 'usually', 'pale', 'face', 'was', 'flushed', 'and', 'animated', 'the']


In [24]:
import collections

class Vocab():
    def __init__(self,tokens):
        words=[word for token in tokens for word in token]
        counter=collections.Counter(words)
        self._token_freq=sorted(counter.items(),key=lambda x: x[1],reverse=True)
        
        self.idx_to_token=['<unk>']
        self.token_to_idx={0:'<unk>'}
        for token,freq in self._token_freq:
            if token not in self.token_to_idx:
                self.idx_to_token.append(token)
                self.token_to_idx[token]=len(self.idx_to_token)-1
    
    def __len__(self):
        return len(self.idx_to_token)
    
    def __getitem__(self,tokens):
        return [self.token_to_idx.get(token,self.unk) for token in tokens]

    def to_tokens(self,indices):
        return [ self.idx_to_token[index] for index in indices]

    @property
    def unk(self):
        return 0

    @property
    def token_freq(self):
        return self._token_freq
    
vocab=Vocab(tokens)
print(vocab.token_freq[:10])
print(list(vocab.token_to_idx.items())[:10])
for i in range(10):
    print("文本：",tokens[i])
    print("索引：",vocab[tokens[i]])


[('the', 2261), ('i', 1267), ('and', 1245), ('of', 1155), ('a', 816), ('to', 695), ('was', 552), ('in', 541), ('that', 443), ('my', 440)]
[(0, '<unk>'), ('the', 1), ('i', 2), ('and', 3), ('of', 4), ('a', 5), ('to', 6), ('was', 7), ('in', 8), ('that', 9)]
文本： ['the', 'time', 'machine', 'by', 'h', 'g', 'wells']
索引： [1, 19, 50, 40, 2183, 2184, 400]
文本： []
索引： []
文本： []
索引： []
文本： []
索引： []
文本： []
索引： []
文本： ['i']
索引： [2]
文本： []
索引： []
文本： []
索引： []
文本： ['the', 'time', 'traveller', 'for', 'so', 'it', 'will', 'be', 'convenient', 'to', 'speak', 'of', 'him']
索引： [1, 19, 71, 16, 37, 11, 115, 42, 680, 6, 586, 4, 108]
文本： ['was', 'expounding', 'a', 'recondite', 'matter', 'to', 'us', 'his', 'grey', 'eyes', 'shone', 'and']
索引： [7, 1420, 5, 2185, 587, 6, 126, 25, 330, 127, 439, 3]


In [25]:
def load_corpus_time_machine():
    lines=readfile()
    tokens=tokenize(lines,'char')
    vocab=Vocab(tokens)

    corpus=[vocab[ch] for token in tokens for ch in token]
    return corpus,vocab

corpus,vocab=load_corpus_time_machine()
len(corpus),len(vocab)

(170580, 28)

In [26]:
import random
import torch

def seq_data_iter_random(corpus,batch_size,num_steps):

    corpus=corpus[random.randint(0,num_steps-1):]

    num_subseq=(len(corpus)-1) // num_steps

    seq_indices=list(range(0,num_subseq*num_steps,num_steps))

    random.shuffle(seq_indices)

    def data(pos):
        return corpus[pos:pos+num_steps]

    num_batch= len(seq_indices) // batch_size
    for i in range(0,num_batch*batch_size,batch_size):

        seq_sub_indices=seq_indices[i:i+batch_size]
        X=[data(j) for j in seq_sub_indices]
        Y=[data(j+1) for j in seq_sub_indices]
        yield torch.tensor(X),torch.tensor(Y)

    
myseq=list(range(35))
data_iter=seq_data_iter_random(myseq,2,5)
for x,y in data_iter:
    print(x,'\n',y)
           

tensor([[26, 27, 28, 29, 30],
        [ 1,  2,  3,  4,  5]]) 
 tensor([[27, 28, 29, 30, 31],
        [ 2,  3,  4,  5,  6]])
tensor([[16, 17, 18, 19, 20],
        [ 6,  7,  8,  9, 10]]) 
 tensor([[17, 18, 19, 20, 21],
        [ 7,  8,  9, 10, 11]])
tensor([[21, 22, 23, 24, 25],
        [11, 12, 13, 14, 15]]) 
 tensor([[22, 23, 24, 25, 26],
        [12, 13, 14, 15, 16]])


In [29]:
def seq_data_iter_seq(corpus,batch_size,num_steps):
    offset=random.randint(0,num_steps)
    num_total=((len(corpus)-offset-1)//batch_size)*batch_size
    xs=torch.tensor(corpus[offset:num_total+offset])
    ys=torch.tensor(corpus[offset+1:num_total+offset+1])
    xs,ys=xs.reshape(batch_size,-1),ys.reshape(batch_size,-1)
    num_batch=xs.shape[1]//num_steps
    for i in range(0,num_batch*num_steps,num_steps):
        x=xs[:,i:i+num_steps]
        y=ys[:,i:i+num_steps]
        yield x,y

data_iter=seq_data_iter_seq(myseq,2,5)
for x,y in data_iter:
    print(x,'\n',y)


tensor([[ 3,  4,  5,  6,  7],
        [18, 19, 20, 21, 22]]) 
 tensor([[ 4,  5,  6,  7,  8],
        [19, 20, 21, 22, 23]])
tensor([[ 8,  9, 10, 11, 12],
        [23, 24, 25, 26, 27]]) 
 tensor([[ 9, 10, 11, 12, 13],
        [24, 25, 26, 27, 28]])
tensor([[13, 14, 15, 16, 17],
        [28, 29, 30, 31, 32]]) 
 tensor([[14, 15, 16, 17, 18],
        [29, 30, 31, 32, 33]])


In [33]:
def load_data_time_machine(batch_size,num_steps,use_random_iter=False):
    corpus,vocab=load_corpus_time_machine()
    if use_random_iter:
        return seq_data_iter_random(corpus,batch_size,num_steps),vocab
    else:
        return seq_data_iter_seq(corpus,batch_size,num_steps),vocab

data_iter,vocab=load_data_time_machine(2,5)
i=0
for x,y in data_iter:
    if i==5:
        break
    else:
        print(x,'\n',y)
        i=i+1

tensor([[3, 9, 2, 1, 3],
        [3, 9, 4, 3, 1]]) 
 tensor([[9, 2, 1, 3, 5],
        [9, 4, 3, 1, 3]])
tensor([[ 5, 13,  2,  1, 13],
        [ 3,  9,  5,  8,  1]]) 
 tensor([[13,  2,  1, 13,  4],
        [ 9,  5,  8,  1, 21]])
tensor([[ 4, 15,  9,  5,  6],
        [21, 12,  2,  4, 15]]) 
 tensor([[15,  9,  5,  6,  2],
        [12,  2,  4, 15,  9]])
tensor([[ 2,  1, 21, 19,  1],
        [ 9,  2, 11,  7, 21]]) 
 tensor([[ 1, 21, 19,  1,  9],
        [ 2, 11,  7, 21,  8]])
tensor([[ 9,  1, 18,  1, 17],
        [ 8, 15,  2,  6,  2]]) 
 tensor([[ 1, 18,  1, 17,  2],
        [15,  2,  6,  2,  1]])


In [37]:
from torch.nn import functional as F

X=torch.arange(10).reshape((2,5))
print(X)
test=F.one_hot(X.T,len(vocab))
print(test.shape)
print(test)

tensor([[0, 1, 2, 3, 4],
        [5, 6, 7, 8, 9]])
torch.Size([5, 2, 28])
tensor([[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0]],

        [[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0]],

        [[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0]],

        [[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0]],

        [[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [None]:
#1 初始化模型参数 x 2*28  w_xh 28*255 w_hh 255*255 w_b 255  h_t 2*255   w_hq 255*28  b_q 28 
#2 初始化 h_0 参数  
#3 实现模型 隐藏层255 词大小28 h_t=tanh(x*w_xh+h_t-1*w_hh+w_b)  o=h_t*W_hq+b_q  
   # 原本x是批次大小*时间序列  转换成时间序列 批次大小 词表大小（独热编码的形式），的数据 在通过循环时间序列计算，最终将结果组合并拼接
    


In [41]:
a=torch.arange(24).reshape((2,3,4))
print(a)
b=torch.arange(24).reshape((2,3,4))
print(b)
c=torch.cat([a,b],dim=2)
print(c)

tensor([[[ 0,  1,  2,  3],
         [ 4,  5,  6,  7],
         [ 8,  9, 10, 11]],

        [[12, 13, 14, 15],
         [16, 17, 18, 19],
         [20, 21, 22, 23]]])
tensor([[[ 0,  1,  2,  3],
         [ 4,  5,  6,  7],
         [ 8,  9, 10, 11]],

        [[12, 13, 14, 15],
         [16, 17, 18, 19],
         [20, 21, 22, 23]]])
tensor([[[ 0,  1,  2,  3,  0,  1,  2,  3],
         [ 4,  5,  6,  7,  4,  5,  6,  7],
         [ 8,  9, 10, 11,  8,  9, 10, 11]],

        [[12, 13, 14, 15, 12, 13, 14, 15],
         [16, 17, 18, 19, 16, 17, 18, 19],
         [20, 21, 22, 23, 20, 21, 22, 23]]])
