# Assignment
目前我们可以有
## RNN Class
RNNs 很容易实现，接受一个$x$ vector作为输入并返回一个$y$ vector。 只不过输出的内容不仅仅与当前的输入有关，还与过去的输入是相关的。那么我们可以定义一个RNN的class，通过以下调用方式来实现一次迭代：

In [1]:
# rnn = RNN()
# y = rnn.step(x) # x is an input vector, y is the RNN's output vector

每调用一次`step`， state向量 $h$ 就会被更新一次， 请同学们根据课上所讲内容，完成RNN的定义，并构建一个多层RNN

In [2]:
import numpy as np

In [3]:
class RNNBlock:
    def __init__(self, input_size, hidden_size, output_size):
        self.Wxh = np.random.randn(hidden_size, input_size)*0.01 # input to hidden
        self.Whh = np.random.randn(hidden_size, hidden_size)*0.01 # hidden to hidden
        self.Why = np.random.randn(output_size, hidden_size)*0.01 # hidden to output
        self.bh = np.zeros((hidden_size, 1)) # hidden bias
        self.by = np.zeros((output_size, 1)) # output bias
        self.h = np.zeros((hidden_size,1))
        
    # ...
    def step(self, x):
        # update the hidden state
        self.h = np.tanh(np.dot(self.Wxh, x) + np.dot(self.Whh, self.h) + self.bh)
        # compute the output vectors
        y = np.dot(self.Why, self.h) + self.by
        return y

In [4]:
hidden_size = 100
x =[1,3,4,5,6,0]
y =[0,1]
rnn = RNNBlock(1, hidden_size, len(y))
for i in x:
    y=rnn.step(i)
    print(y)


[[-0.00155438]
 [-0.00186172]]
[[-0.00454833]
 [-0.00560241]]
[[-0.00588445]
 [-0.00749679]]
[[-0.0073429 ]
 [-0.00937057]]
[[-0.00879182]
 [-0.01124574]]
[[ 6.16119728e-04]
 [-9.81538329e-05]]


## 结合课堂代码，自己实现一个character-level 的RNN model

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

### 训练数据准备

In [2]:
training_data = [
    ("The dog ate the apple".split(), ["DET", "NN", "V", "DET", "NN"]),
    ("Everybody read that book".split(), ["NN", "V", "DET", "NN"]),
    ("I like that food".split(), ["DET", "V", "DET", "NN"]),
    ("The boy played the football".split(), ["DET", "NN", "V", "DET",'NN'])
]

In [3]:
word_to_ix = {}
for sent, tags in training_data:
    for word in sent:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)

In [4]:
print(word_to_ix)

{'The': 0, 'dog': 1, 'ate': 2, 'the': 3, 'apple': 4, 'Everybody': 5, 'read': 6, 'that': 7, 'book': 8, 'I': 9, 'like': 10, 'food': 11, 'boy': 12, 'played': 13, 'football': 14}


In [5]:
tag_to_ix = {"DET": 0, "NN": 1, "V": 2}

### 数据转换

In [6]:
def wordToIndex(word):
    return word in word_to_ix.keys()

# Just for demonstration, turn a word into a <1 x n_words> Tensor
def wordToTensor(word):
    tensor = torch.zeros(1, len(word_to_ix))
    if wordToIndex(word)==True:
        tensor[0][word_to_ix[word]] = 1
    return tensor

# Turn a sentence into a <sentence_length x 1 x n_words>,
# or an array of one-hot word vectors
def sentenceToTensor(sentence):
    tensor = torch.zeros(len(sentence), 1, len(word_to_ix))
    for li, word in enumerate(sentence):
        if wordToIndex(word)==True:
            tensor[li][0][word_to_ix[word]] = 1
    return tensor

def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

### 训练次数（30次）

In [7]:
epochs=30

### 定义RNN模型

In [8]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        combined = torch.cat((input, hidden), 1)
        hidden = self.i2h(combined)
        output = self.i2o(combined)
        output = self.softmax(output)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, self.hidden_size)

### 运行RNN模型

In [9]:
n_hidden =6
rnn = RNN(len(word_to_ix), n_hidden, len(tag_to_ix))
hidden =torch.zeros(1, n_hidden)
with torch.no_grad():
    inputs =sentenceToTensor(training_data[0][0])
    outputs = torch.zeros(inputs.shape[0], len(tag_to_ix))
    for i in range(inputs.shape[0]):
        output, hidden = rnn(inputs[0],hidden)
        outputs[i] = output
    print('output:',outputs)
    predict=[]
    for i in range(outputs.shape[0]):
        top_n, top_i = outputs[i].topk(1)
        predict.append([k for k, v in tag_to_ix.items() if v == top_i[0].item()][0])
        #print('characteristic:',[k for k, v in tag_to_ix.items() if v == top_i[0].item()])
    print('未经过训练输出的词性：',predict)
    print('正确的词性：',training_data[0][1])

output: tensor([[-1.0089, -1.0143, -1.2993],
        [-0.9909, -0.9943, -1.3517],
        [-0.9849, -0.9931, -1.3622],
        [-0.9859, -0.9901, -1.3652],
        [-0.9864, -0.9894, -1.3654]])
未经过训练输出的词性： ['DET', 'DET', 'DET', 'DET', 'DET']
正确的词性： ['DET', 'NN', 'V', 'DET', 'NN']


In [10]:
rnn = RNN(len(word_to_ix), n_hidden, len(tag_to_ix))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(rnn.parameters(), lr=0.1)
# learning_rate = 0.005

In [11]:
for epoch in range(epochs):  # again, normally you would NOT do 300 epochs, it is toy data
    for sentence, tags in training_data:
        hidden = rnn.initHidden()
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        rnn.zero_grad()
        
        inputs =sentenceToTensor(sentence)
        outputs = torch.zeros(inputs.shape[0], len(tag_to_ix))
        targets = prepare_sequence(tags, tag_to_ix)
        for i in range(inputs.shape[0]):
            output, hidden = rnn(inputs[0],hidden)
            outputs[i] = output
        loss = loss_function(outputs, targets)
        loss.backward()
#       retain_graph=True
        optimizer.step()
        

### 训练后运行RNN模型输出结果

In [12]:
with torch.no_grad():
    hidden = rnn.initHidden()
    inputs =sentenceToTensor(training_data[0][0])
    outputs = torch.zeros(inputs.shape[0], len(tag_to_ix))
#     print(outputs)
    for i in range(inputs.shape[0]):
#         print(i)
#         print(inputs[i])
        output, hidden = rnn(inputs[0],hidden)
        outputs[i] = output
        #print('output tensor',output)
        #top_n, top_i = output.topk(1)
#     print(outputs)
    print('output:',outputs)
    predict=[]
    for i in range(outputs.shape[0]):
        top_n, top_i = outputs[i].topk(1)
        predict.append([k for k, v in tag_to_ix.items() if v == top_i[0].item()][0])
        #print('characteristic:',[k for k, v in tag_to_ix.items() if v == top_i[0].item()])
    print('训练',epochs,'次后输出的词性：',predict)
    print('正确的词性：',training_data[0][1])

output: tensor([[-0.6288, -1.2339, -1.7396],
        [-1.1723, -0.8296, -1.3700],
        [-1.1103, -0.8283, -1.4536],
        [-1.1807, -0.7532, -1.5048],
        [-1.1991, -0.7319, -1.5254]])
训练 30 次后输出的词性： ['DET', 'NN', 'NN', 'NN', 'NN']
正确的词性： ['DET', 'NN', 'V', 'DET', 'NN']


## LSTM Class （Optional）
自定义一个LSTM网络并进行训练， 对比simple RNN的效果

### 定义LSTM模型

In [13]:
class LSTMTagger(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

### 未训练直接运行LSTM模型输出结果

In [14]:
EMBEDDING_DIM = 6
HIDDEN_DIM = 6
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

In [15]:
with torch.no_grad():
    inputs = prepare_sequence(training_data[0][0], word_to_ix)
    tag_scores = model(inputs)
    predict=[]
    print('output:',tag_scores)
    for i in range(tag_scores.shape[0]):
        top_n, top_i = tag_scores[i].topk(1)
        predict.append([k for k, v in tag_to_ix.items() if v == top_i[0].item()][0])
        #print('characteristic:',[k for k, v in tag_to_ix.items() if v == top_i[0].item()])
    print('未经过训练输出的词性：',predict)
    print('正确的词性：',training_data[0][1])

output: tensor([[-0.8931, -1.4152, -1.0563],
        [-0.8670, -1.3930, -1.1043],
        [-0.9267, -1.3788, -1.0433],
        [-1.0442, -1.3711, -0.9309],
        [-0.9040, -1.4114, -1.0462]])
未经过训练输出的词性： ['DET', 'DET', 'DET', 'V', 'DET']
正确的词性： ['DET', 'NN', 'V', 'DET', 'NN']


### 训练LSTM模型

In [20]:
for epoch in range(epochs):  # again, normally you would NOT do 300 epochs, it is toy data
    for sentence, tags in training_data:
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Tensors of word indices.
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = prepare_sequence(tags, tag_to_ix)

        # Step 3. Run our forward pass.
        tag_scores = model(sentence_in)

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()

### 训练后运行LSTM模型输出结果

In [21]:
with torch.no_grad():
    inputs = prepare_sequence(training_data[0][0], word_to_ix)
    tag_scores = model(inputs)
    predict=[]
    print('output:',tag_scores)
    for i in range(tag_scores.shape[0]):
        top_n, top_i = tag_scores[i].topk(1)
        predict.append([k for k, v in tag_to_ix.items() if v == top_i[0].item()][0])
        #print('characteristic:',[k for k, v in tag_to_ix.items() if v == top_i[0].item()])
    print('训练',epochs,'次后输出的词性：',predict)
    print('正确的词性：',training_data[0][1])

    # The sentence is "the dog ate the apple".  i,j corresponds to score for tag j
    # for word i. The predicted tag is the maximum scoring tag.
    # Here, we can see the predicted sequence below is 0 1 2 0 1
    # since 0 is index of the maximum value of row 1,
    # 1 is the index of maximum value of row 2, etc.
    # Which is DET NOUN VERB DET NOUN, the correct sequence!

output: tensor([[-0.2606, -1.8586, -2.6098],
        [-2.7619, -0.2537, -1.8269],
        [-1.6505, -1.3407, -0.6044],
        [-0.2897, -2.6597, -1.7063],
        [-1.8611, -0.4811, -1.4854]])
训练 30 次后输出的词性： ['DET', 'NN', 'V', 'DET', 'NN']
正确的词性： ['DET', 'NN', 'V', 'DET', 'NN']


## 结论

同样的训练次数，HIDDEN_DIM，loss_function和optimizer,LSTM的准确率比SimpleRNN的要高