In [21]:
import torch
import torch.nn as nn
from torch.optim import SGD 
import numpy as np

# Упражнение, для реализации "Ванильной" RNN
* Попробуем обучить сеть восстанавливать слово hello по первой букве. т.е. построим charecter-level модель

In [22]:
a = torch.ones((3,3))*3
b = torch.ones((3,3))*5

In [23]:
a @ b

tensor([[45., 45., 45.],
        [45., 45., 45.],
        [45., 45., 45.]])

In [24]:
a * b

tensor([[15., 15., 15.],
        [15., 15., 15.],
        [15., 15., 15.]])

In [25]:
# word = 'ololoasdasddqweqw123456789'
word = 'hello'

## Датасет. 
Позволяет:
* Закодировать символ при помощи one-hot
* Делать итератор по слову, которыей возвращает текущий символ и следующий как таргет

In [26]:
class WordDataSet:
    
    def __init__(self, word):
        self.chars2idx = {}
        self.indexs  = []
        for c in word: 
            if c not in self.chars2idx:
                self.chars2idx[c] = len(self.chars2idx)
                
            self.indexs.append(self.chars2idx[c])
            
        self.vec_size = len(self.chars2idx)
        self.seq_len  = len(word)
        
    def get_one_hot(self, idx):
        x = torch.zeros(self.vec_size)
        x[idx] = 1
        return x
    
    def __iter__(self):
        return zip(self.indexs[:-1], self.indexs[1:])
    
    def __len__(self):
        return self.seq_len
    
    def get_char_by_id(self, id):
        for c, i in self.chars2idx.items():
            if id == i: return c
        return None

## Реализация базовой RNN
<br/>
Скрытый элемент
$$ h_t= tanh⁡ (W_{ℎℎ} h_{t−1}+W_{xh} x_t) $$
Выход сети

$$ y_t = W_{hy} h_t $$

In [27]:
class VanillaRNN(nn.Module):
    
    def __init__(self, in_size=5, hidden_size=3, out_size=5):
        super(VanillaRNN, self).__init__()        
        self.x2hidden    = nn.Linear(in_features=in_size, out_features=hidden_size)
        self.hidden      = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        self.activation  = nn.Tanh()
        self.outweight   = nn.Linear(in_features=hidden_size, out_features=out_size)
    
    def forward(self, x, prev_hidden):
        hidden = self.activation(self.x2hidden(x) + self.hidden(prev_hidden))
#         Версия без активации - может происходить gradient exploding
#         hidden = self.x2hidden(x) + self.hidden(prev_hidden)
        output = self.outweight(hidden)
        return output, hidden

## Инициализация переменных 

In [70]:
ds = WordDataSet(word=word)
rnn = VanillaRNN(in_size=ds.vec_size, hidden_size=3, out_size=ds.vec_size)
criterion = nn.CrossEntropyLoss()
e_cnt     = 500
optim     = SGD(rnn.parameters(), lr = 0.1, momentum=0.9)

# Обучение

In [46]:
CLIP_GRAD = True

for epoch in range(e_cnt):
    hh = torch.zeros(rnn.hidden.in_features)
    loss = 0
    optim.zero_grad()
    for sample, next_sample in ds:
        x = ds.get_one_hot(sample).unsqueeze(0)
        target =  torch.LongTensor([next_sample])

        y, hh = rnn(x, hh)
        
        loss += criterion(y, target)
     

    loss.backward()
    
    if epoch % 10 == 0:
        print (loss.data.item())
        if CLIP_GRAD: print("Clip gradient : ", torch.nn.utils.clip_grad_norm_(rnn.parameters(), max_norm=5))
    else: 
        if CLIP_GRAD: torch.nn.utils.clip_grad_norm_(rnn.parameters(), max_norm=1)
            
#     print("Params : ")
#     num_params = 0
#     for item in rnn.parameters():
#         num_params += 1
#         print(item.grad)
#     print("NumParams :", num_params)
#     print("Optimize")
    
    optim.step()

72.22099304199219
Clip gradient :  5.336664428366323
53.484554290771484
Clip gradient :  6.0637077022784664
33.196739196777344
Clip gradient :  7.760874881240009
23.45627212524414
Clip gradient :  6.269695517837148
16.426959991455078
Clip gradient :  5.490024063175561
12.70097541809082
Clip gradient :  5.671535515397334
11.786163330078125
Clip gradient :  10.215879641553132
10.311821937561035
Clip gradient :  5.137266996127918
8.896644592285156
Clip gradient :  4.418323277850166
7.8319172859191895
Clip gradient :  4.13546127992571


# Тестирование

In [30]:
rnn.eval()
hh = torch.zeros(rnn.hidden.in_features)
id = 0
softmax  = nn.Softmax(dim=1)
predword = ds.get_char_by_id(id)
for c in enumerate(word[:-1]):
    x = ds.get_one_hot(id).unsqueeze(0)
    y, hh = rnn(x, hh)
    y = softmax(y)
    m, id = torch.max(y, 1)
    id = id.data[0]
    predword += ds.get_char_by_id(id)
print ('Prediction:\t' , predword)
print("Original:\t", word)
assert(predword == word)

Prediction:	 hello
Original:	 hello


# ДЗ
Реализовать LSTM и GRU модули, обучить их предсказывать тестовое слово

In [108]:
#тестовое слово
word = 'ololoasdasddqweqw123456789'

## Реализовать LSTM

In [72]:
#Написать реализацию LSTM и обучить предсказывать слово
class my_LSTM(nn.Module):
    def __init__(self, in_size = 5, hidden_size = 3, out_size = 5):
        super (my_LSTM,self).__init__()
        self.activation = nn.Tanh()
        self.hidden1 = nn.Linear(in_features=in_size, out_features=hidden_size)
        self.hidden2 = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        self.hidden3 = nn.Linear(in_features=in_size, out_features=hidden_size)
        self.hidden4 = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        self.hidden5 = nn.Linear(in_features=in_size, out_features=hidden_size)
        self.hidden6 = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        self.hidden7 = nn.Linear(in_features=in_size, out_features=hidden_size)
        self.hidden8 = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        self.act = nn.Sigmoid()
        self.outweight   = nn.Linear(in_features=hidden_size, out_features=out_size)
        self.c = 1
        
    def forward(self, x, prev_hidden):
        inp = self.act(self.hidden1(x) + self.hidden2(prev_hidden))
        forget = self.act(self.hidden3(x) + self.hidden4(prev_hidden))
        outp = self.act(self.hidden5(x) + self.hidden6(prev_hidden))
        candidate = self.activation(self.hidden7(x) + self.hidden8(prev_hidden))
        self.c = forget * self.c + inp * candidate
        hidden = outp * self.activation(self.c)
        y = self.outweight(hidden)
        return y, hidden

In [75]:
CLIP_GRAD = True
ds = WordDataSet(word=word)
rnn = my_LSTM(in_size=ds.vec_size, hidden_size=3, out_size=ds.vec_size)
criterion = nn.CrossEntropyLoss()
e_cnt     = 500
optim     = SGD(rnn.parameters(), lr = 0.1, momentum=0.9)

for epoch in range(e_cnt):
    hh = torch.zeros(rnn.hidden2.in_features)
    loss = 0
    optim.zero_grad()
    for sample, next_sample in ds:
        x = ds.get_one_hot(sample).unsqueeze(0)
        target =  torch.LongTensor([next_sample])

        y, hh = rnn(x, hh)
        
        loss += criterion(y, target)
     

    loss.backward(retain_graph=True)
    
    if epoch % 10 == 0:
        print (loss.data.item())
        if CLIP_GRAD: print("Clip gradient : ", torch.nn.utils.clip_grad_norm_(rnn.parameters(), max_norm=5))
    else: 
        if CLIP_GRAD: torch.nn.utils.clip_grad_norm_(rnn.parameters(), max_norm=1)
            
#     print("Params : ")
#     num_params = 0
#     for item in rnn.parameters():
#         num_params += 1
#         print(item.grad)
#     print("NumParams :", num_params)
#     print("Optimize")
    
    optim.step()

72.35601806640625
Clip gradient :  3.638524761829421
67.72064971923828
Clip gradient :  2.6554060415142775
51.04287338256836
Clip gradient :  10.773632401020107
37.76627731323242
Clip gradient :  7.6381911556386015
36.466697692871094
Clip gradient :  13.534629251986109
57.38458251953125
Clip gradient :  289.3133207750561
24.829696655273438
Clip gradient :  8.112080340346823
35.39712905883789
Clip gradient :  18.44751070512961
72.78892517089844
Clip gradient :  40.53078834535044
50.70270919799805
Clip gradient :  37.61302017453754
41.362430572509766
Clip gradient :  33.535560683172804
33.36905288696289
Clip gradient :  67.50821384937122
39.76215744018555
Clip gradient :  24.718746808045672
29.332265853881836
Clip gradient :  4.486298770500658
23.87061309814453
Clip gradient :  8.021148938498106
23.724761962890625
Clip gradient :  24.799826249609538
25.477750778198242
Clip gradient :  11.715265105275524
33.39739227294922
Clip gradient :  23.37700325446712
23.16580581665039
Clip gradient 

In [76]:
rnn.eval()
hh = torch.zeros(rnn.hidden2.in_features)
id = 0
softmax  = nn.Softmax(dim=1)
predword = ds.get_char_by_id(id)
for c in enumerate(word[:-1]):
    x = ds.get_one_hot(id).unsqueeze(0)
    y, hh = rnn(x, hh)
    y = softmax(y)
    m, id = torch.max(y, 1)
    id = id.data[0]
    predword += ds.get_char_by_id(id)
print ('Prediction:\t' , predword)
print("Original:\t", word)
assert(predword == word)

Prediction:	 ololoasdasddqweqw123456789
Original:	 ololoasdasddqweqw123456789


## Реализовать GRU

In [121]:
#Написать реализацию GRU и обучить предсказывать слово
#Написать реализацию LSTM и обучить предсказывать слово
class my_GRU(nn.Module):
    def __init__(self, in_size = 5, hidden_size = 3, out_size = 5):
        super (my_GRU,self).__init__()
        self.activation = nn.Tanh()
        self.hidden1 = nn.Linear(in_features=in_size, out_features=hidden_size)
        self.hidden2 = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        self.hidden3 = nn.Linear(in_features=in_size, out_features=hidden_size)
        self.hidden4 = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        self.hidden5 = nn.Linear(in_features=in_size, out_features=hidden_size)
        self.hidden6 = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        self.act = nn.Sigmoid()
        self.outweight   = nn.Linear(in_features=hidden_size, out_features=out_size)

        
    def forward(self, x, prev_hidden):
        update = self.act(self.hidden1(x) + self.hidden2(prev_hidden))
        reset = self.act(self.hidden3(x) + self.hidden4(prev_hidden))
        candidate = self.activation(self.hidden5(x) + self.hidden6(reset * prev_hidden))
        h = (1- update) * candidate + update * prev_hidden
        y = self.outweight(h)
        return y, h

In [126]:
CLIP_GRAD = True
ds = WordDataSet(word=word)
rnn = my_GRU(in_size=ds.vec_size, hidden_size=10, out_size=ds.vec_size)
criterion = nn.CrossEntropyLoss()
e_cnt     = 300
optim     = SGD(rnn.parameters(), lr = 0.1, momentum=0.9)

for epoch in range(e_cnt):
    hh = torch.zeros(rnn.hidden2.in_features)
    loss = 0
    optim.zero_grad()
    for sample, next_sample in ds:
        x = ds.get_one_hot(sample).unsqueeze(0)
        target =  torch.LongTensor([next_sample])

        y ,hh = rnn(x, hh)
        
        loss += criterion(y, target)
     

    loss.backward(retain_graph=True)
    
    if epoch % 10 == 0:
        print (loss.data.item())
        if CLIP_GRAD: print("Clip gradient : ", torch.nn.utils.clip_grad_norm_(rnn.parameters(), max_norm=5))
    else: 
        if CLIP_GRAD: torch.nn.utils.clip_grad_norm_(rnn.parameters(), max_norm=1)
            
#     print("Params : ")
#     num_params = 0
#     for item in rnn.parameters():
#         num_params += 1
#         print(item.grad)
#     print("NumParams :", num_params)
#     print("Optimize")
    
    optim.step()

71.98716735839844
Clip gradient :  4.988471649295691
58.69424819946289
Clip gradient :  5.933613025097945
23.030250549316406
Clip gradient :  4.588978697611059
6.0640974044799805
Clip gradient :  3.428807601401312
2.248671054840088
Clip gradient :  3.3912346983126307
0.9059104919433594
Clip gradient :  1.6446052162852034
0.24412250518798828
Clip gradient :  0.23546975280878418
0.11685466766357422
Clip gradient :  0.09932252023226443
0.07844161987304688
Clip gradient :  0.05890358912067171
0.05939197540283203
Clip gradient :  0.03783246439918696
0.048636436462402344
Clip gradient :  0.025589646201160485
0.042057037353515625
Clip gradient :  0.021814562185764358
0.03733539581298828
Clip gradient :  0.01905125975721383
0.033677101135253906
Clip gradient :  0.01717678641427834
0.030701637268066406
Clip gradient :  0.015689288738526964
0.028219223022460938
Clip gradient :  0.014483432413952245
0.026108741760253906
Clip gradient :  0.013484172846603415
0.024278640747070312
Clip gradient :  0

In [127]:
rnn.eval()
hh = torch.zeros(rnn.hidden2.in_features)
id = 0
softmax  = nn.Softmax(dim=1)
predword = ds.get_char_by_id(id)
for c in enumerate(word[:-1]):
    x = ds.get_one_hot(id).unsqueeze(0)
    y, hh = rnn(x, hh)
    y = softmax(y)
    m, id = torch.max(y, 1)
    id = id.data[0]
    predword += ds.get_char_by_id(id)
print ('Prediction:\t' , predword)
print("Original:\t", word)
assert(predword == word)

Prediction:	 ololoasdasddqweqw123456789
Original:	 ololoasdasddqweqw123456789
