In [1]:
import torch
import torch.nn as nn
from torch.optim import SGD 
import numpy as np

# Упражнение, для реализации "Ванильной" RNN
* Попробуем обучить сеть восстанавливать слово hello по первой букве. т.е. построим charecter-level модель

In [2]:
a = torch.ones((3,3))*3
b = torch.ones((3,3))*5

In [3]:
a @ b

tensor([[45., 45., 45.],
        [45., 45., 45.],
        [45., 45., 45.]])

In [4]:
a * b

tensor([[15., 15., 15.],
        [15., 15., 15.],
        [15., 15., 15.]])

In [5]:
# word = 'ololoasdasddqweqw123456789'
word = 'hello'

## Датасет. 
Позволяет:
* Закодировать символ при помощи one-hot
* Делать итератор по слову, которыей возвращает текущий символ и следующий как таргет

In [6]:
class WordDataSet:
    
    def __init__(self, word):
        self.chars2idx = {}
        self.indexs  = []
        for c in word: 
            if c not in self.chars2idx:
                self.chars2idx[c] = len(self.chars2idx)
                
            self.indexs.append(self.chars2idx[c])
            
        self.vec_size = len(self.chars2idx)
        self.seq_len  = len(word)
        
    def get_one_hot(self, idx):
        x = torch.zeros(self.vec_size)
        x[idx] = 1
        return x
    
    def __iter__(self):
        return zip(self.indexs[:-1], self.indexs[1:])
    
    def __len__(self):
        return self.seq_len
    
    def get_char_by_id(self, id):
        for c, i in self.chars2idx.items():
            if id == i: return c
        return None

## Реализация базовой RNN
<br/>
Скрытый элемент
$$ h_t= tanh⁡ (W_{ℎℎ} h_{t−1}+W_{xh} x_t) $$
Выход сети

$$ y_t = W_{hy} h_t $$

In [7]:
class VanillaRNN(nn.Module):
    
    def __init__(self, in_size=5, hidden_size=3, out_size=5):
        super(VanillaRNN, self).__init__()        
        self.x2hidden    = nn.Linear(in_features=in_size, out_features=hidden_size)
        self.hidden      = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        self.activation  = nn.Tanh()
        self.outweight   = nn.Linear(in_features=hidden_size, out_features=out_size)
    
    def forward(self, x, prev_hidden):
        hidden = self.activation(self.x2hidden(x) + self.hidden(prev_hidden))
#         Версия без активации - может происходить gradient exploding
#         hidden = self.x2hidden(x) + self.hidden(prev_hidden)
        output = self.outweight(hidden)
        return output, hidden

## Инициализация переменных 

In [8]:
ds = WordDataSet(word=word)
rnn = VanillaRNN(in_size=ds.vec_size, hidden_size=3, out_size=ds.vec_size)
criterion = nn.CrossEntropyLoss()
e_cnt     = 100
optim     = SGD(rnn.parameters(), lr = 0.1, momentum=0.9)

# Обучение

In [9]:
CLIP_GRAD = True

for epoch in range(e_cnt):
    hh = torch.zeros(rnn.hidden.in_features)
    loss = 0
    optim.zero_grad()
    for sample, next_sample in ds:
        x = ds.get_one_hot(sample).unsqueeze(0)
        target =  torch.LongTensor([next_sample])

        y, hh = rnn(x, hh)
        
        loss += criterion(y, target)
     

    loss.backward()
    
    if epoch % 10 == 0:
        print (loss.data.item())
        if CLIP_GRAD: print("Clip gradient : ", torch.nn.utils.clip_grad_norm_(rnn.parameters(), max_norm=5))
    else: 
        if CLIP_GRAD: torch.nn.utils.clip_grad_norm_(rnn.parameters(), max_norm=1)
            
#     print("Params : ")
#     num_params = 0
#     for item in rnn.parameters():
#         num_params += 1
#         print(item.grad)
#     print("NumParams :", num_params)
#     print("Optimize")
    
    optim.step()

5.408415794372559
Clip gradient :  2.0042740962377965
2.020293712615967
Clip gradient :  0.8624764671994417
0.20903992652893066
Clip gradient :  0.8241411322144939
0.012166976928710938
Clip gradient :  0.020220904906245327
0.004754543304443359
Clip gradient :  0.007251908339518487
0.003390789031982422
Clip gradient :  0.009994445892807095
0.0028543472290039062
Clip gradient :  0.005906933152756055
0.0025954246520996094
Clip gradient :  0.0036425964752553803
0.002453327178955078
Clip gradient :  0.0033535327426986858
0.0023436546325683594
Clip gradient :  0.003012177951629932


# Тестирование

In [10]:
rnn.eval()
hh = torch.zeros(rnn.hidden.in_features)
id = 0
softmax  = nn.Softmax(dim=1)
predword = ds.get_char_by_id(id)
for c in enumerate(word[:-1]):
    x = ds.get_one_hot(id).unsqueeze(0)
    y, hh = rnn(x, hh)
    y = softmax(y)
    m, id = torch.max(y, 1)
    id = id.data[0]
    predword += ds.get_char_by_id(id)
print ('Prediction:\t' , predword)
print("Original:\t", word)
assert(predword == word)

Prediction:	 hello
Original:	 hello


# ДЗ
Реализовать LSTM и GRU модули, обучить их предсказывать тестовое слово
Сохранить ноутбук с предсказанием и пройденным assert и прислать на почту a.murashev@corp.mail.ru
c темой:


[МФТИ\_2019\_1] ДЗ №8 ФИО

In [11]:
#тестовое слово
word = 'ololoasdasddqweqw123456789'

## Реализовать LSTM

In [12]:
class Gate(nn.Module):
    
    def __init__(self, in_size, gate_size, activation=nn.Tanh):
        super(Gate, self).__init__()
        self.ig = nn.Linear(in_features=in_size, out_features=gate_size)
        self.gg = nn.Linear(in_features=gate_size, out_features=gate_size)
        self.activation = activation()
        
    def forward(self, x, prev_gate):
        return self.activation(self.ig(x) + self.gg(prev_gate))

In [13]:
class LSTM(nn.Module):
    def __init__(self, in_size=5, hidden_size=3, out_size=5):
        super(LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.input_gate = Gate(in_size, hidden_size, activation=nn.Sigmoid)
        self.forget_gate = Gate(in_size, hidden_size, activation=nn.Sigmoid)
        self.output_gate = Gate(in_size, hidden_size, activation=nn.Sigmoid)
        self.state_candidate = Gate(in_size, hidden_size, activation=nn.Tanh)
        
        self.outweight   = nn.Linear(in_features=hidden_size, out_features=out_size)
    
    def forward(self, x, prev_hidden, prev_state):
        i = self.input_gate(x, prev_hidden)
        f = self.forget_gate(x, prev_hidden)
        o = self.output_gate(x, prev_hidden)
        state_candidate = self.state_candidate(x, prev_hidden)
        
        state = f*prev_state + i*state_candidate 
        hidden = o * torch.tanh(state)
#         Версия без активации - может происходить gradient exploding
#         hidden = self.x2hidden(x) + self.hidden(prev_hidden)
        output = self.outweight(hidden)
        return output, hidden, state

In [14]:
ds = WordDataSet(word=word)
lstm = LSTM(in_size=ds.vec_size, hidden_size=15, out_size=ds.vec_size)
criterion = nn.CrossEntropyLoss()
e_cnt     = 130
optim     = SGD(lstm.parameters(), lr = 0.1, momentum=0.9)

In [15]:
CLIP_GRAD = True

for epoch in range(e_cnt):
    hh = torch.zeros(lstm.hidden_size)
    cc = torch.zeros(lstm.hidden_size)
    loss = 0
    optim.zero_grad()
    for sample, next_sample in ds:
        x = ds.get_one_hot(sample).unsqueeze(0)
        target =  torch.LongTensor([next_sample])

        y, hh, cc = lstm(x, hh, cc)
        loss += criterion(y, target)

    loss.backward()

    if epoch % 10 == 0:
        print (loss.data.item())
        if CLIP_GRAD: print("Clip gradient : ", torch.nn.utils.clip_grad_norm_(lstm.parameters(), max_norm=5))
    else: 
        if CLIP_GRAD: torch.nn.utils.clip_grad_norm_(lstm.parameters(), max_norm=1)
    
    optim.step()

70.8961181640625
Clip gradient :  3.0303555744452684
64.54291534423828
Clip gradient :  3.837456558405693
38.116920471191406
Clip gradient :  6.981557016215022
19.705001831054688
Clip gradient :  6.355532440275047
12.815622329711914
Clip gradient :  93.45949444645372
12.74820327758789
Clip gradient :  17.39515470429785
11.610527992248535
Clip gradient :  39.45884801595361
6.562996864318848
Clip gradient :  3.7001899876446354
1.489328384399414
Clip gradient :  0.9010558594136936
0.2856416702270508
Clip gradient :  0.22849998888146236
0.10794353485107422
Clip gradient :  0.07526698765522338
0.06428337097167969
Clip gradient :  0.03721807877439411
0.04860115051269531
Clip gradient :  0.026141645408993514


In [16]:
lstm.eval()
hh = torch.zeros(lstm.hidden_size)
cc = torch.zeros(lstm.hidden_size)
id = 0
softmax  = nn.Softmax(dim=1)
predword = ds.get_char_by_id(id)
for c in enumerate(word[:-1]):
    x = ds.get_one_hot(id).unsqueeze(0)
    y, hh, cc = lstm(x, hh, cc)
    y = softmax(y)
    m, id = torch.max(y, 1)
    id = id.data[0]
    predword += ds.get_char_by_id(id)
print ('Prediction:\t' , predword)
print("Original:\t", word)
assert(predword == word)

Prediction:	 ololoasdasddqweqw123456789
Original:	 ololoasdasddqweqw123456789


## Реализовать GRU

In [17]:
class GRU(nn.Module):
    def __init__(self, in_size=5, hidden_size=3, out_size=5):
        super(GRU, self).__init__()
        self.hidden_size = hidden_size
        self.update_gate = Gate(in_size, hidden_size, activation=nn.Sigmoid)
        self.reset_gate = Gate(in_size, hidden_size, activation=nn.Sigmoid)
        self.xh = nn.Linear(in_features=in_size, out_features=hidden_size)
        self.hh = nn.Linear(in_features=hidden_size, out_features=hidden_size) 
        
        self.outweight   = nn.Linear(in_features=hidden_size, out_features=out_size)
    
    def forward(self, x, prev_hidden):
        u = self.update_gate(x, prev_hidden)
        r = self.reset_gate(x, prev_hidden)
        hidden_candidate = torch.tanh(self.xh(x) + self.hh(r*prev_hidden))
        hidden = (1 - u)*hidden_candidate + u*prev_hidden
#         Версия без активации - может происходить gradient exploding
#         hidden = self.x2hidden(x) + self.hidden(prev_hidden)
        output = self.outweight(hidden)
        return output, hidden

In [18]:
ds = WordDataSet(word=word)
gru = GRU(in_size=ds.vec_size, hidden_size=15, out_size=ds.vec_size)
criterion = nn.CrossEntropyLoss()
e_cnt     = 130
optim     = SGD(gru.parameters(), lr = 0.1, momentum=0.9)

In [19]:
CLIP_GRAD = True

for epoch in range(e_cnt):
    hh = torch.zeros(gru.hidden_size)
    loss = 0
    optim.zero_grad()
    for sample, next_sample in ds:
        x = ds.get_one_hot(sample).unsqueeze(0)
        target =  torch.LongTensor([next_sample])

        y, hh = gru(x, hh)
        loss += criterion(y, target)

    loss.backward()
    
    if epoch % 10 == 0:
        print (loss.data.item())
        if CLIP_GRAD: print("Clip gradient : ", torch.nn.utils.clip_grad_norm_(gru.parameters(), max_norm=5))
    else: 
        if CLIP_GRAD: torch.nn.utils.clip_grad_norm_(gru.parameters(), max_norm=1)
    
    optim.step()

70.77413177490234
Clip gradient :  4.45148456459335
55.90251922607422
Clip gradient :  6.2066648684466195
22.960649490356445
Clip gradient :  7.322970524713729
8.149758338928223
Clip gradient :  5.941270894291109
2.211825370788574
Clip gradient :  5.5564409059663795
1.1613740921020508
Clip gradient :  2.893966523587728
0.2276744842529297
Clip gradient :  0.259984476017209
0.09227275848388672
Clip gradient :  0.08825876663328469
0.05027008056640625
Clip gradient :  0.044313663493241284
0.03474235534667969
Clip gradient :  0.027627586873717542
0.02749919891357422
Clip gradient :  0.018901070864829266
0.02359485626220703
Clip gradient :  0.01602984498162859
0.02093982696533203
Clip gradient :  0.01383102659805706


In [20]:
gru.eval()
hh = torch.zeros(lstm.hidden_size)
id = 0
softmax  = nn.Softmax(dim=1)
predword = ds.get_char_by_id(id)
for c in enumerate(word[:-1]):
    x = ds.get_one_hot(id).unsqueeze(0)
    y, hh = gru(x, hh)
    y = softmax(y)
    m, id = torch.max(y, 1)
    id = id.data[0]
    predword += ds.get_char_by_id(id)
print ('Prediction:\t' , predword)
print("Original:\t", word)
assert(predword == word)

Prediction:	 ololoasdasddqweqw123456789
Original:	 ololoasdasddqweqw123456789
