In [1]:
import torch
import torch.nn as nn
from torch.optim import SGD 
import numpy as np

# Упражнение, для реализации "Ванильной" RNN
* Попробуем обучить сеть восстанавливать слово hello по первой букве. т.е. построим charecter-level модель

In [2]:
a = torch.ones((3,3))*3
b = torch.ones((3,3))*5

In [3]:
a @ b

tensor([[45., 45., 45.],
        [45., 45., 45.],
        [45., 45., 45.]])

In [4]:
a * b

tensor([[15., 15., 15.],
        [15., 15., 15.],
        [15., 15., 15.]])

In [5]:
# word = 'ololoasdasddqweqw123456789'
word = 'hello'

## Датасет. 
Позволяет:
* Закодировать символ при помощи one-hot
* Делать итератор по слову, которыей возвращает текущий символ и следующий как таргет

In [6]:
class WordDataSet:
    
    def __init__(self, word):
        self.chars2idx = {}
        self.indexs  = []
        for c in word: 
            if c not in self.chars2idx:
                self.chars2idx[c] = len(self.chars2idx)
                
            self.indexs.append(self.chars2idx[c])
            
        self.vec_size = len(self.chars2idx)
        self.seq_len  = len(word)
        
    def get_one_hot(self, idx):
        x = torch.zeros(self.vec_size)
        x[idx] = 1
        return x
    
    def __iter__(self):
        return zip(self.indexs[:-1], self.indexs[1:])
    
    def __len__(self):
        return self.seq_len
    
    def get_char_by_id(self, id):
        for c, i in self.chars2idx.items():
            if id == i: return c
        return None

# Реализация базовой RNN
<br/>
Скрытый элемент
$$ h_t= tanh⁡ (W_{ℎℎ} h_{t−1}+W_{xh} x_t) $$
Выход сети

$$ y_t = W_{hy} h_t $$

In [7]:
class VanillaRNN(nn.Module):
    
    def __init__(self, in_size=5, hidden_size=3, out_size=5):
        super(VanillaRNN, self).__init__()        
        self.x2hidden    = nn.Linear(in_features=in_size, out_features=hidden_size)
        self.hidden      = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        self.activation  = nn.Tanh()
        self.outweight   = nn.Linear(in_features=hidden_size, out_features=out_size)
    
    def forward(self, x, prev_hidden):
        hidden = self.activation(self.x2hidden(x) + self.hidden(prev_hidden))
#         Версия без активации - может происходить gradient exploding
#         hidden = self.x2hidden(x) + self.hidden(prev_hidden)
        output = self.outweight(hidden)
        return output, hidden

In [11]:
def filter_output(*kwargs):
    return (kwargs[0], kwargs[1:])

## Инициализация переменных 

In [14]:
ds = WordDataSet(word=word)
rnn = VanillaRNN(in_size=ds.vec_size, hidden_size=7, out_size=ds.vec_size)
vanilla_start_params = (torch.zeros(rnn.hidden.in_features), )

## Обучение

In [58]:
def learning(ds, net, start_hidden_params, e_cnt=100):
    CLIP_GRAD = True
    criterion = nn.CrossEntropyLoss()
    optim     = SGD(net.parameters(), lr = 0.1, momentum=0.9)
    
    for epoch in range(e_cnt):
        hidden_params = start_hidden_params
        loss = 0
        optim.zero_grad()
        
        for sample, next_sample in ds:
            x = ds.get_one_hot(sample).unsqueeze(0)
            target =  torch.LongTensor([next_sample])
            
            y, hidden_params = filter_output(*net(x, *hidden_params))
            
            loss += criterion(y, target)


        loss.backward( retain_graph=True)

        if epoch % 10 == 0:
            print (loss.data.item())
            if CLIP_GRAD: print("Clip gradient : ", torch.nn.utils.clip_grad_norm_(net.parameters(), max_norm=5))
        else: 
            if CLIP_GRAD: torch.nn.utils.clip_grad_norm_(net.parameters(), max_norm=1)

    #     print("Params : ")
    #     num_params = 0
    #     for item in rnn.parameters():
    #         num_params += 1
    #         print(item.grad)
    #     print("NumParams :", num_params)
    #     print("Optimize")

        optim.step()

In [16]:
learning(ds, rnn, vanilla_start_params)

5.372060775756836
Clip gradient :  2.825483806701919
1.3506267070770264
Clip gradient :  1.5056248964000374
0.0091094970703125
Clip gradient :  0.027584052052596172
0.0014925003051757812
Clip gradient :  0.004743617317189838
0.0007624626159667969
Clip gradient :  0.0022821125541011636
0.0005841255187988281
Clip gradient :  0.0017153544780453974
0.0005168914794921875
Clip gradient :  0.001496709232124883
0.00048160552978515625
Clip gradient :  0.0013742137434327164
0.000457763671875
Clip gradient :  0.001285835996576162
0.0004405975341796875
Clip gradient :  0.0012167375987139465


## Тестирование

In [19]:
def test(ds, net, start_hidden_params):
    net.eval()
    softmax  = nn.Softmax(dim=1)
    
    id = 0
    predword = ds.get_char_by_id(id)
    hidden_params = start_hidden_params
    for c in range(len(ds) - 1):
        x = ds.get_one_hot(id).unsqueeze(0)
        
        y, hidden_params = filter_output(*net(x, *hidden_params))
        y = softmax(y)
        
        m, id = torch.max(y, 1)
        id = id.data[0]
        predword += ds.get_char_by_id(id)
    
    print ('Prediction:\t' , predword)
    print("Original:\t", word)
    assert(predword == word)

In [20]:
test(ds, rnn, vanilla_start_params)

Prediction:	 hello
Original:	 hello


# ДЗ
Реализовать LSTM и GRU модули, обучить их предсказывать тестовое слово
Сохранить ноутбук с предсказанием и пройденным assert и прислать на почту a.murashev@corp.mail.ru
c темой:


[МФТИ\_2019\_1] ДЗ №8 ФИО

In [38]:
#тестовое слово
word = 'ololoasdasddqweqw123456789'

# Реализовать LSTM

In [139]:
class LSTM(nn.Module):
    def __init__(self, in_size=5, hidden_size=3, out_size=5):
        super(LSTM, self).__init__()        
        
        self.x2c    = nn.Linear(in_features=in_size, out_features=hidden_size)
        self.x2i    = nn.Linear(in_features=in_size, out_features=hidden_size)
        self.x2f    = nn.Linear(in_features=in_size, out_features=hidden_size)
        self.x2o    = nn.Linear(in_features=in_size, out_features=hidden_size)
        
        self.h2c    = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        self.h2i    = nn.Linear(in_features=hidden_size, out_features=hidden_size)        
        self.h2f    = nn.Linear(in_features=hidden_size, out_features=hidden_size)        
        self.h2o    = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        
        self.activation  = nn.Tanh()
        self.sigmoid = nn.Sigmoid()
        self.outweight   = nn.Linear(in_features=hidden_size, out_features=out_size)
    
    def forward(self, x, prev_hidden, prev_cell):
        candidate_cell = self.activation(self.x2c(x) + self.h2c(prev_hidden))
        input_gate  = self.sigmoid(self.x2i(x) + self.h2i(prev_hidden))
        forget_gate = self.sigmoid(self.x2f(x) + self.h2f(prev_hidden))
        output_gate = self.sigmoid(self.x2o(x) + self.h2o(prev_hidden))
        
        cell = forget_gate * prev_cell + input_gate * candidate_cell
        next_hidden = output_gate * self.activation(cell)
        output = self.outweight(next_hidden)
        return output, next_hidden, cell

## Инициализация переменных 

In [150]:
word = 'ololoasdasddqweqw123456789'

In [141]:
ds = WordDataSet(word=word)
lstm = LSTM(in_size=ds.vec_size, hidden_size=7, out_size=ds.vec_size)
base_params = torch.zeros(lstm.h2c.in_features)
lstm_start_params = (base_params, base_params.clone())

## Обучение

In [142]:
learning(ds, lstm, lstm_start_params, e_cnt=200)

71.75397491455078
Clip gradient :  3.3991802167386043
65.08412170410156
Clip gradient :  3.959230754248246
44.83808517456055
Clip gradient :  7.456596479112804
31.306686401367188
Clip gradient :  10.027044319855715
64.57586669921875
Clip gradient :  50.0484574782862
42.71078872680664
Clip gradient :  21.355588776551382
36.351959228515625
Clip gradient :  7.400320848877463
27.535188674926758
Clip gradient :  6.011776114960499
16.64202308654785
Clip gradient :  9.047759056270557
13.88967514038086
Clip gradient :  5.758831329798922
8.740039825439453
Clip gradient :  3.9905241632740185
5.665721893310547
Clip gradient :  2.482155003409084
4.398443698883057
Clip gradient :  1.4455499902300755
3.7307419776916504
Clip gradient :  1.9734192203858738
2.577390670776367
Clip gradient :  3.035892327691981
1.3400287628173828
Clip gradient :  0.8521444207883518
0.6303787231445312
Clip gradient :  0.35416556645571345
0.3552837371826172
Clip gradient :  0.16336651252914072
0.2448263168334961
Clip gradi

## Тестирование

In [143]:
test(ds, lstm, lstm_start_params)

Prediction:	 ololoasdasddqweqw123456789
Original:	 ololoasdasddqweqw123456789


## Реализовать GRU

In [147]:
class GRU(nn.Module):
    def __init__(self, in_size=5, hidden_size=3, out_size=5):
        super(GRU, self).__init__()        
        self.x2u    = nn.Linear(in_features=in_size, out_features=hidden_size)
        self.x2r    = nn.Linear(in_features=in_size, out_features=hidden_size)
        self.x2h    = nn.Linear(in_features=in_size, out_features=hidden_size)
        
        self.h2u    = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        self.h2r    = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        self.h2h    = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        
        self.activation  = nn.Tanh()
        self.sigmoid = nn.Sigmoid()
        self.outweight   = nn.Linear(in_features=hidden_size, out_features=out_size)
    
    def forward(self, x, prev_hidden):
        update_gate = self.sigmoid(self.x2u(x) + self.h2u(prev_hidden))
        reset_gate = self.sigmoid(self.x2r(x) + self.h2r(prev_hidden))
        hiddent_state = self.activation(self.x2h(x) + self.h2h(reset_gate * prev_hidden))
        
        next_hidden = (1 - update_gate) * hiddent_state + update_gate * prev_hidden
        output = self.outweight(next_hidden)
        return output, next_hidden

## Инициализация переменных 

In [149]:
word = 'ololoasdasddqweqw123456789'

'ololoasdasddqweqw123456789'

In [152]:
ds = WordDataSet(word=word)
gru = GRU(in_size=ds.vec_size, hidden_size=7, out_size=ds.vec_size)
gru_start_params = (torch.zeros(lstm.h2c.in_features), )

## Обучение

In [153]:
learning(ds, gru, gru_start_params, e_cnt=200)

71.4969711303711
Clip gradient :  4.218776590248741
58.83040237426758
Clip gradient :  7.7946891066138635
40.53915023803711
Clip gradient :  7.973134748697141
24.969440460205078
Clip gradient :  6.404975025110897
11.901238441467285
Clip gradient :  3.4857794909960202
4.732065200805664
Clip gradient :  2.4398715475321637
1.970916748046875
Clip gradient :  2.479642894744235
0.6413145065307617
Clip gradient :  1.1579704408184015
0.26062965393066406
Clip gradient :  0.34073289102231424
0.1478433609008789
Clip gradient :  0.16045513693172309
0.10283279418945312
Clip gradient :  0.15107458643146382
0.08113670349121094
Clip gradient :  0.06945719459598802
0.06879997253417969
Clip gradient :  0.03879333147955721
0.06056690216064453
Clip gradient :  0.02646288436612143
0.054451942443847656
Clip gradient :  0.022159373576912937
0.049620628356933594
Clip gradient :  0.02013358908544194
0.04564380645751953
Clip gradient :  0.018460227564917817
0.04229164123535156
Clip gradient :  0.017042698814062

## Тестирование

In [154]:
test(ds, gru, gru_start_params)

Prediction:	 ololoasdasddqweqw123456789
Original:	 ololoasdasddqweqw123456789
