In [31]:
import torch
import torch.nn as nn
import re
import os
import unicodedata
import numpy as np

device = torch.device("cpu")
MAX_LENGTH = 10

PAD_token = 0# Used for padding short sentences
SOS_token = 1# Start-of-sentence token
EOS_token = 2# End-of-sentence token

class Voc :
    def __init__(self, name) :
        self.name = name
        self.trimmed = False
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token : 'PAD', SOS_token : 'SOS', EOS_token : 'EOS'}
        self.num_words = 3
        
    def addSentence(self, sentence) :
        for word in sentence.split() :
            self.addWord(word) 
    def addWord(self, word) :
        if word not in self.word2index :
            self.word2index[word] = self.num_words
            self.word2count[word] = 1
            self.index2word[self.num_words] = word
            self.num_words += 1
        else :
            self.word2count[word] += 1
    def trim(self, min_count) :
        if self.trimmed :
            return 
        self.trimmed = True
        keep_words = []
        # Remove words below a certain count threshold
        for k, v in self.word2count.items() :
            if v >= min_count :
                keep_words.append(k)
        print(f'keep_words {len(keep_words)} / {len(word2index)} = {len(keep_words) / len(word2index)}')
        # Reinitialize dictionaries
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token : 'PAD', SOS_token : 'SOS', EOS_token : 'EOS'}
        self.num_words = 3
        for word in keep_words :
            self.addWord(word)
# 小写并删除非字母字符
def normalizeString(s) :
    s = s.lower()
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

def indexesFromSentence(voc, sentence) :
    return [voc.word2index[word] for word in sentence.split(' ')] + [EOS_token]

class EncoderRnn(nn.Module) :
    def __init__(self, hidden_size, embedding, n_layers = 1, dropout = 0) :
        super(EncoderRnn, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size 
        self.embbeding = embedding
        # 初始化GRU;input_size和hidden_size参数都设置为'hidden_size'
        # 因为我们输入的大小是一个有多个特征的词向量== hidden_size
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout = (0 if n_layers == 1 else dropout), bidirectional = True)
        
    def forward(self, input_seq, input_lengths, hidden = None) :
        # 将单词索引转换为向量
        embedded = self.embedding(input_seq)
        # 为RNN模块填充批次序列
        packed = torch.nn.utils.rnn.pack_padded_sequence()
        # 正向通过GRU
        outputs, hidden = self.gru(packed, hidden)

[1, 2, 3, 2]


In [46]:
import torch.nn
import numpy as np
gru = torch.nn.GRU(input_size = 10, hidden_size = 20, num_layers = 1)
data = torch.randn(100, 32, 10)
h_0 = torch.randn(1, 32, 20)
out, hidden = gru(data, h_0)
print(out.shape, hidden.shape)
packed = torch.nn.utils.rnn.pack_padded_sequence(data, lengths=[])
print(packed.shape)

torch.Size([100, 32, 20]) torch.Size([1, 32, 20])


RuntimeError: Expected `len(lengths)` to be equal to batch_size, but got 2 (batch_size=32)