In [3]:
import random
import urllib
import urllib.request
from time import time

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
import sklearn
import torch
import torch.nn.functional as F
from torchtext.data import Field, BucketIterator
from torchtext.datasets import Multi30k

In [25]:
# Reserved vocabulary indices. Language.index2word maps them to the strings
# "SOS" and "EOS", and regular words are numbered from 2 upwards.
SOS_token = 0
EOS_token = 1

class Language:
    """Word-level vocabulary with index lookup tables and occurrence counts.

    Attributes:
        name: Label given by the caller.
        word2index: Maps each known word to its integer id.
        word2count: Number of times each word was added.
        index2word: Reverse mapping; ids SOS_token and EOS_token are pre-filled.
        n_words: Next free id (starts at 2 because of the two reserved ids).
    """

    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {SOS_token: "SOS", EOS_token: "EOS"}
        self.n_words = 2

    def addSentence(self, sentence):
        """Register every space-separated token of `sentence`.

        Empty tokens (from repeated spaces) and lone commas are ignored.
        """
        tokens = (token for token in sentence.split(' ') if token not in (',', ''))
        for token in tokens:
            self.addWord(token)

    def addWord(self, word):
        """Count `word`, assigning it the next free id on first sight."""
        if word in self.word2index:
            self.word2count[word] += 1
            return

        new_id = self.n_words
        self.word2index[word] = new_id
        self.word2count[word] = 1
        self.index2word[new_id] = word
        self.n_words = new_id + 1

In [26]:
# Download the corpus. The response is used as a context manager so the
# connection is always closed, even if reading or decoding fails.
with urllib.request.urlopen("https://raw.githubusercontent.com/ahmadsalimi/DataAnalysisInternship/master/RNN/ferdosi.txt") as resource:
    # get_content_charset() returns None when the server declares no charset;
    # fall back to UTF-8 instead of passing None to decode().
    charset = resource.headers.get_content_charset() or "utf-8"
    content = resource.read().decode(charset)

In [27]:
# One list entry per newline-separated line of the downloaded text.
verses = content.split('\n')
len(verses)

49610

In [28]:
# Build the vocabulary by feeding every line through Language.addSentence
# (which skips empty tokens and lone commas).
language = Language('Persian')
for verse in verses:
    language.addSentence(verse)

In [29]:
# Vocabulary size, including the two reserved SOS/EOS ids.
language.n_words

17660

In [30]:
class Encoder(torch.nn.Module):
    """Two-layer LSTM encoder that also produces the decoder's initial state.

    Args:
        lang_size: Vocabulary size of the embedding table.
        emb_dim: Embedding width.
        enc_hid_dim: Hidden width of the encoder LSTM.
        dec_hid_dim: Hidden width expected by the decoder.
        dropout: Dropout probability applied to the embeddings.
    """

    def __init__(self, lang_size, emb_dim, enc_hid_dim, dec_hid_dim, dropout):
        super().__init__()
        nn = torch.nn

        def make_bridge():
            # Linear + ReLU projection from encoder width to decoder width.
            return nn.Sequential(
                nn.Linear(enc_hid_dim, dec_hid_dim),
                nn.ReLU(inplace=True),
            )

        self.embedding = nn.Sequential(
            nn.Embedding(lang_size, emb_dim),
            nn.Dropout(dropout),
        )
        self.rnn = nn.LSTM(emb_dim, enc_hid_dim, num_layers=2)
        self.hidden_fc = make_bridge()
        self.cell_fc = make_bridge()

    def forward(self, src):
        """Encode `src` (token ids, sequence-first).

        Returns:
            (outputs, hidden, cell): the per-step LSTM outputs, and the final
            hidden / cell states of both layers after projection through
            `hidden_fc` / `cell_fc`.
        """
        outputs, (last_hidden, last_cell) = self.rnn(self.embedding(src))
        return outputs, self.hidden_fc(last_hidden), self.cell_fc(last_cell)

In [31]:
class Attention(torch.nn.Module):
    """Additive attention scored by a small MLP.

    The query is the concatenation of the decoder's two per-layer hidden
    states, so the MLP input width is `enc_hid_dim + 2 * dec_hid_dim`.
    """

    def __init__(self, enc_hid_dim, dec_hid_dim):
        super().__init__()

        scorer_in = enc_hid_dim + 2 * dec_hid_dim
        self.attn = torch.nn.Sequential(
            torch.nn.Linear(scorer_in, dec_hid_dim),
            torch.nn.ReLU(),
            torch.nn.Linear(dec_hid_dim, 1, bias = False)
        )

    def forward(self, hidden, encoder_outputs):
        """Return attention weights of shape (B, L) that sum to 1 over L.

        hidden            2   B   dec_hid_dim
        encoder_outputs   L   B   enc_hid_dim
        """
        src_len = encoder_outputs.shape[0]

        # keys              B   L   enc_hid_dim
        keys = encoder_outputs.permute(1, 0, 2)

        # Join both layers' hidden states and copy them to every source step.
        # query             B   L   dec_hid_dim*2
        query = torch.cat((hidden[0], hidden[1]), dim=1)
        query = query.unsqueeze(1).repeat(1, src_len, 1)

        # scores            B   L
        scores = self.attn(torch.cat((keys, query), dim=2)).squeeze(2)
        return F.softmax(scores, dim=1)

In [32]:
class Decoder(torch.nn.Module):
    """Single-step, two-layer LSTM decoder with attention over encoder outputs.

    Args:
        lang_size: Vocabulary size (embedding rows and output classes).
        emb_dim: Embedding width.
        enc_hid_dim: Width of the encoder outputs being attended over.
        dec_hid_dim: Hidden width of the decoder LSTM.
        dropout: Dropout probability applied to the input embeddings.
        attention: Callable `(hidden, encoder_outputs) -> weights` returning
            a (B, L) tensor of attention weights.
    """

    def __init__(self, lang_size, emb_dim, enc_hid_dim, dec_hid_dim, dropout, attention):
        super().__init__()

        self.attention = attention

        self.embedding = torch.nn.Sequential(
            torch.nn.Embedding(lang_size, emb_dim),
            # Use the constructor argument; it was previously ignored in
            # favour of a hard-coded 0.5.
            torch.nn.Dropout(dropout)
        )

        self.rnn = torch.nn.LSTM(emb_dim + enc_hid_dim, dec_hid_dim, num_layers=2)

        self.fc = torch.nn.Sequential(
            torch.nn.Linear(emb_dim + enc_hid_dim + dec_hid_dim, lang_size),
            torch.nn.LogSoftmax(dim=2)
        )

    def forward(self, input, encoder_outputs, hidden, cell):
        """Decode one time step.

        Returns:
            (pred, hidden, cell): `pred` holds log-probabilities over the
            vocabulary with shape (1, B, lang_size); `hidden` and `cell` are
            the updated LSTM states.
        """
        # input             1   B
        # encoder_outputs   L   B   enc_hid_dim
        # hidden            2   B   dec_hid_dim
        # cell              2   B   dec_hid_dim

        embedded = self.embedding(input)
        # embedded          1   B   emb_dim

        weights = self.attention(hidden, encoder_outputs).unsqueeze(1)
        # weights           B   1   L

        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        # encoder_outputs   B   L   enc_hid_dim

        # Attention-weighted sum of the encoder outputs (the context vector).
        weighted = torch.bmm(weights, encoder_outputs).permute(1, 0, 2)
        # weighted          1   B   enc_hid_dim

        lstm_input = torch.cat((embedded, weighted), dim=2)
        # lstm_input        1   B   emb_dim + enc_hid_dim

        lstm_output, (hidden, cell) = self.rnn(lstm_input, (hidden, cell))
        # lstm_output       1   B   dec_hid_dim
        # hidden            2   B   dec_hid_dim
        # cell              2   B   dec_hid_dim

        # The output layer sees the embedding and context as well as the
        # LSTM output.
        fc_input = torch.cat((lstm_input, lstm_output), dim=2)
        # fc_input          1   B       emb_dim + enc_hid_dim + dec_hid_dim

        pred = self.fc(fc_input)
        # pred              1   B       lang_size

        return pred, hidden, cell

In [74]:
def index_random_choice(p:torch.Tensor):
    """Sample one column index per row of `p` by inverse-CDF sampling.

    Args:
        p: 2-D tensor of non-negative weights, one distribution per row.
            Rows need not sum exactly to 1; the uniform draw is scaled by
            each row's total. Pass probabilities, not log-probabilities.

    Returns:
        1-D integer tensor on `p.device` with one sampled index per row.
    """
    c = p.detach().cumsum(dim=1).cpu().numpy()
    # Scale the draw by the row total so it can never exceed the last
    # cumulative value. Otherwise float32 rounding (or unnormalised input)
    # makes every comparison False and argmax silently returns index 0.
    u = np.random.rand(len(c), 1) * c[:, -1:]
    hit = u < c
    idx = hit.argmax(axis=1)
    # Rows with no hit at all (e.g. all-zero weights) fall back to the last
    # column instead of being mistaken for a genuine draw of index 0.
    idx[~hit.any(axis=1)] = c.shape[1] - 1
    return torch.tensor(idx, device=p.device)

In [76]:
class Seq2Seq(torch.nn.Module):
    """Encoder-decoder wrapper with a training pass and beam-search decoding.

    Args:
        encoder: Callable `src -> (encoder_outputs, hidden, cell)`.
        decoder: Callable `(input, encoder_outputs, hidden, cell) ->
            (log_probs, hidden, cell)` that decodes one time step.
        device: Stored on the instance; working tensors are created on the
            device of the encoder outputs.
        lang_size: Vocabulary size of the decoder output.
    """

    def __init__(self, encoder, decoder, device, lang_size):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder

        self.device = device
        self.lang_size = lang_size

    def forward(self, source, target=None, k=1, teacher_forcing_ratio=0.5, train=True):
        """Encode `source`, then decode in training or beam-search mode.

        With `train=True` returns log-probabilities of shape
        (L, B, lang_size); otherwise returns the `k` best token sequences as
        an integer tensor of shape (L, B, k).
        """
        # source            L   B
        # target            L   B

        encoder_outputs, hidden, cell = self.encoder(source)
        # encoder_outputs   L   B   enc_hid_dim
        # hidden            2   B   dec_hid_dim
        # cell              2   B   dec_hid_dim

        if train:
            return self.train_forward(source, target, teacher_forcing_ratio, encoder_outputs, hidden, cell)
        return self.kbeams_forward(source, k, encoder_outputs, hidden, cell)

    def train_forward(self, source, target, teacher_forcing_ratio, encoder_outputs, hidden, cell):
        """Decode step by step, mixing teacher forcing with sampled inputs.

        Row 0 of the result is left at zero because position 0 (SOS) is
        never predicted. When `target` is None, teacher forcing is disabled
        and the source length is used as the number of steps.
        """
        # source            L   B
        # target            L   B

        B = source.shape[1]
        # The outputs are compared against the target, so follow its length.
        L = source.shape[0] if target is None else target.shape[0]
        device = encoder_outputs.device

        outputs = torch.zeros(L, B, self.lang_size, device=device)
        # outputs           L   B   lang_size

        input = torch.full((B,), SOS_token, dtype=torch.long, device=device)
        # input             B

        for i in range(1, L):

            output, hidden, cell = self.decode(input, encoder_outputs, hidden, cell)
            # output        B   lang_size   (log-probabilities)
            # hidden        2   B   dec_hid_dim
            # cell          2   B   dec_hid_dim

            outputs[i] = output

            teacher_force = target is not None and random.random() < teacher_forcing_ratio

            if teacher_force:
                input = target[i]
            else:
                # The decoder emits LOG-probabilities; exponentiate before
                # sampling. Sampling from the raw log values always yielded
                # index 0.
                input = torch.multinomial(output.detach().exp(), num_samples=1).squeeze(1)
            # input         B

        return outputs

    def kbeams_forward(self, source, k, encoder_outputs, hidden, cell):
        """Beam search with beam width `k`.

        Beams are folded into the batch dimension (row `b * k + j` is beam
        `j` of sentence `b`), so every time step needs one decoder call.
        Hypotheses are scored by summed log-probability and beam 0 is the
        best one. Finished hypotheses are not treated specially.

        Returns:
            Integer tensor (L, B, k) of token ids; row 0 is SOS_token.

        Raises:
            ValueError: if `k` is not in `[1, lang_size]`.
        """
        # source            L   B

        if not 1 <= k <= self.lang_size:
            raise ValueError(f'k must be between 1 and {self.lang_size}, got {k}')

        B = source.shape[1]
        L = source.shape[0]
        device = encoder_outputs.device

        # Token ids feed the embedding layer, so the buffer must be integer.
        outputs = torch.full((L, B, k), SOS_token, dtype=torch.long, device=device)
        # outputs           L   B   k

        if L < 2:
            return outputs

        input = torch.full((B,), SOS_token, dtype=torch.long, device=device)
        # input             B

        log_p, hidden, cell = self.decode(input, encoder_outputs, hidden, cell)
        # log_p             B   lang_size

        scores, tokens = log_p.topk(k=k, dim=1)
        # scores, tokens    B   k
        outputs[1] = tokens

        # Give every beam its own copy of the recurrent state and encoder
        # outputs.
        hidden = hidden.repeat_interleave(k, dim=1)
        cell = cell.repeat_interleave(k, dim=1)
        beam_encoder_outputs = encoder_outputs.repeat_interleave(k, dim=1)
        # hidden, cell              2   B*k   dec_hid_dim
        # beam_encoder_outputs      L   B*k   enc_hid_dim

        row_offsets = torch.arange(B, device=device).unsqueeze(1) * k
        # row_offsets       B   1

        for i in range(2, L):
            log_p, hidden, cell = self.decode(outputs[i - 1].reshape(-1), beam_encoder_outputs, hidden, cell)
            # log_p         B*k   lang_size
            vocab = log_p.shape[-1]

            # Log-probabilities add along a hypothesis; rank all k * vocab
            # continuations of each sentence together.
            totals = (scores.unsqueeze(2) + log_p.reshape(B, k, vocab)).reshape(B, k * vocab)
            scores, flat = totals.topk(k=k, dim=1)
            # scores, flat  B   k

            parent = flat // vocab      # beam each winner extends
            tokens = flat % vocab       # token appended to that beam

            # Make each surviving beam's history that of its parent.
            history_index = parent.unsqueeze(0).expand(i, B, k)
            outputs[:i] = outputs[:i].gather(dim=2, index=history_index)
            outputs[i] = tokens

            # The recurrent state must follow the same reordering.
            rows = (row_offsets + parent).reshape(-1)
            hidden = hidden.index_select(1, rows)
            cell = cell.index_select(1, rows)

        return outputs

    def decode(self, input, encoder_outputs, hidden, cell):
        """Run the decoder for one step on a flat batch of token ids."""
        # input             B
        # encoder_outputs   L   B
        # hidden            2   B   dec_hid_dim
        # cell              2   B   dec_hid_dim
        output, hidden, cell = self.decoder(input.view(1, -1), encoder_outputs, hidden, cell)
        return output.squeeze(0), hidden, cell

In [5]:
# Small dimensions used by the tensor-shape experiments in the cells below.
L = 10
B = 20
k = 5

In [15]:
# Scratch tensor with the (L, B, k) layout, used only to inspect shapes.
a = torch.ones(L, B, k)
a.shape

torch.Size([10, 20, 5])

In [23]:
# Scratch check: a (B, k) index tensor tiled along a new leading axis of length L.
indices = torch.ones(B, k, dtype=torch.long).unsqueeze(0).repeat(L, 1, 1)
indices.shape

torch.Size([10, 20, 5])

In [24]:
# Scratch check: gather along dim 2 with a same-shaped index keeps the shape.
indices.gather(dim=2, index=indices).shape

torch.Size([10, 20, 5])

In [19]:
# Scratch check: a full slice leaves the shape unchanged.
a[:].shape

torch.Size([10, 20, 5])

In [14]:
# NOTE(review): `i` is not assigned at top level in any visible cell, so this
# presumably shows leftover kernel state and would fail on a fresh run.
i

tensor([[258, 310],
        [829, 224],
        [820, 589],
        [207,  32],
        [436, 258],
        [442, 144],
        [185, 337],
        [642, 269],
        [285, 321],
        [601, 452],
        [525,  68],
        [662, 699],
        [933, 204],
        [351,   0],
        [315, 742],
        [257, 884],
        [894, 704],
        [527, 259],
        [ 32, 522],
        [854, 691],
        [ 47, 476],
        [828, 240],
        [721, 274],
        [246, 563],
        [ 48, 673],
        [  1, 795],
        [ 99, 646],
        [  3, 588],
        [418, 274],
        [ 58,  81]])

In [27]:
# NOTE(review): `u` and `c` only appear as locals of index_random_choice in the
# visible cells; this scratch cell presumably relies on leftover kernel state.
(u < c)

array([[False,  True,  True],
       [ True,  True,  True],
       [False,  True,  True]])

In [28]:
# Scratch check of the first-True-per-row trick used by index_random_choice
# (same `u` / `c` leftover-state caveat as the previous cell).
(u < c).argmax(axis=1)

array([1, 0, 1])

In [78]:
# NOTE(review): get_batches, source_train and target_train are all defined in
# cells further down, so this cell fails on a top-to-bottom run.
batches = list(get_batches(source_train, target_train, 1, 64))

In [79]:
# Each item from get_batches is (batch index, source slice, target slice);
# element 1 is the source slice.
batch_source = batches[0][1]
batch_source.shape

torch.Size([15, 64])

In [36]:
# Element 2 of a get_batches item is the target slice.
batch_target = batches[0][2]
batch_target.shape

torch.Size([15, 64])

In [80]:
# Early experiment: emb_dim=100, enc_hid_dim=200, dec_hid_dim=300, dropout=0.5.
# `encoder` is reassigned by the later configuration cell.
encoder = Encoder(language.n_words, 100, 200, 300, 0.5)

In [81]:
# Attention sized to match the experimental encoder/decoder dimensions above.
attention = Attention(200, 300)

In [82]:
# Early experiment decoder; `decoder` is reassigned by the later configuration cell.
decoder = Decoder(language.n_words, 100, 200, 300, 0.5, attention)

In [162]:
# NOTE(review): `device` is only defined in a later cell, and .cuda() is
# called regardless of the CPU fallback that `device` allows for.
seq2seq = Seq2Seq(encoder, decoder, device, language.n_words).cuda()

In [163]:
# Optimizer for the experimental `seq2seq` model; `optimizer` is rebound later
# to the parameters of `model`.
optimizer = torch.optim.Adam(seq2seq.parameters())

In [35]:
def indexesFromSentence(lang, sentence, max_length):
    """Encode `sentence` as a fixed-length list of vocabulary ids.

    The result is `[SOS, w1, ..., wn, EOS, ..., EOS]` with exactly
    `max_length` entries (for `max_length >= 2`). Sentences with more than
    `max_length - 2` words are truncated so at least one EOS always fits.

    Args:
        lang: Object exposing a `word2index` mapping.
        sentence: Space-separated words.
        max_length: Length of the returned list.

    Raises:
        KeyError: if a word is missing from `lang.word2index`.
    """
    # Skip the tokens Language.addSentence never adds to the vocabulary
    # (empty strings from repeated spaces, lone commas); looking them up
    # would raise KeyError.
    words = [lang.word2index[word] for word in sentence.split(' ') if word not in ('', ',')]
    # Truncate so every row has the same length; over-long rows would make
    # the tensor construction in getSentences fail on ragged input.
    words = words[:max(max_length - 2, 0)]
    return [SOS_token] + words + [EOS_token] * (max_length - len(words) - 1)

def getSentences(lang, sentences, max_length):
    """Turn comma-separated line pairs into source/target id tensors.

    Each non-blank line must contain exactly one comma: the part before it
    becomes the source, the part after it the target. Blank lines are
    skipped.

    Returns:
        (sources, targets): two long tensors, transposed so that the
        sequence dimension comes first.
    """
    encoded_pairs = []

    for raw_line in sentences:
        line = raw_line.strip()
        if not line:
            continue

        first_half, second_half = line.split(',')
        encoded_pairs.append((
            indexesFromSentence(lang, first_half.strip(), max_length),
            indexesFromSentence(lang, second_half.strip(), max_length),
        ))

    sources = [src for src, _ in encoded_pairs]
    targets = [tgt for _, tgt in encoded_pairs]
    return torch.tensor(sources, dtype=torch.long).T, torch.tensor(targets, dtype=torch.long).T

In [36]:
# Use the GPU when available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [37]:
# Fixed length of every encoded sequence, including the SOS token and EOS padding.
max_length = 15

In [38]:
# Encode the whole corpus; both tensors are laid out as (sequence, sample).
sources, targets = getSentences(language, verses, max_length)
sources.shape, targets.shape

(torch.Size([15, 49609]), torch.Size([15, 49609]))

In [39]:
def train_test_split(X, y, test_size, axis):
    """Randomly split `X` and `y` along `axis` using one shared permutation.

    Args:
        X, y: Tensors with the same size along `axis`.
        test_size: Fraction of the samples assigned to the test part.
        axis: Dimension that indexes the samples.

    Returns:
        Tuple `(X_train, X_test, y_train, y_test)`.
    """
    n_samples = X.shape[axis]
    order = torch.tensor(np.random.permutation(n_samples))
    cut = int(n_samples * (1 - test_size))
    train_idx, test_idx = order[:cut], order[cut:]

    return tuple(
        tensor.index_select(axis, idx)
        for tensor in (X, y)
        for idx in (train_idx, test_idx)
    )

In [40]:
# Hold out 20% of the samples for testing, then 20% of the remainder for validation.
# The second call rebinds source_train / target_train, so re-running only that
# line shrinks the training set again.
source_train, source_test, target_train, target_test = train_test_split(sources, targets, test_size=0.2, axis=1)
source_train, source_val, target_train, target_val = train_test_split(source_train, target_train, test_size=0.2, axis=1)

In [41]:
# Sanity check of the resulting train / test / validation sizes.
source_train.shape, source_test.shape, source_val.shape

(torch.Size([15, 31749]), torch.Size([15, 9922]), torch.Size([15, 7938]))

In [80]:
# Hyper-parameters of the model that is actually trained below.
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
ENC_HID_DIM = 512
DEC_HID_DIM = 512
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

# Rebinds `encoder` / `decoder` from the earlier experimental cells.
attn = Attention(ENC_HID_DIM, DEC_HID_DIM)
encoder = Encoder(language.n_words, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT)
decoder = Decoder(language.n_words, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, attn)
model = Seq2Seq(encoder, decoder, device, language.n_words).to(device)

In [81]:
# Adam with default settings over all parameters of `model`.
optimizer = torch.optim.Adam(model.parameters())

In [61]:
def get_batches(source, target, axis, batch_size):
    """Yield consecutive mini-batches of `source` and `target` along `axis`.

    Args:
        source, target: Arrays/tensors with the same size along `axis`.
        axis: Dimension that indexes the samples.
        batch_size: Maximum number of samples per batch; the last batch may
            be smaller.

    Yields:
        `(batch_index, source_batch, target_batch)` tuples. Nothing is
        yielded when there are no samples.
    """
    axis = axis % source.ndim
    total = source.shape[axis]
    num_batches = -(-total // batch_size)  # integer ceiling division

    # Slice along `axis` itself. The original always sliced axis 1, which
    # disagreed with the batch count whenever axis != 1.
    leading = (slice(None),) * axis

    for batch in range(num_batches):
        # Slicing clamps at the end of the data, so no explicit min() is needed.
        window = leading + (slice(batch * batch_size, (batch + 1) * batch_size),)
        yield batch, source[window], target[window]

In [62]:
def train(model, source_train, target_train, optimizer, batch_size):
    """Run one training epoch and return the mean batch loss.

    Args:
        model: Module called as `model(source, target)` and returning
            log-probabilities of shape (L, B, vocab).
        source_train, target_train: Long tensors of shape (L, N).
        optimizer: Optimizer over `model`'s parameters.
        batch_size: Number of samples per batch.

    Returns:
        Average of the per-batch losses (0 when there are no batches).

    Note:
        The loss is the mean negative log-likelihood over ALL positions,
        including position 0 and the EOS padding, exactly as before.
    """
    model.train()

    # Follow the model's own device instead of assuming CUDA, so the
    # notebook's CPU fallback keeps working.
    device = next(model.parameters()).device

    epoch_loss = 0.0
    n_batches = 0

    for i, b_source, b_target in get_batches(source_train, target_train, 1, batch_size):
        optimizer.zero_grad()

        b_source = b_source.to(device)
        b_target = b_target.to(device)

        pred = model(b_source, b_target)

        # b_target      L   B
        # pred          L   B   lang_size

        # Use the vocabulary size from the prediction itself rather than the
        # global `language` object.
        flat_target = b_target.reshape(-1)
        flat_pred = pred.reshape(-1, pred.shape[-1])

        # flat_target   L*B
        # flat_pred     L*B lang_size

        # pred holds log-probabilities, so NLL with mean reduction equals
        # -mean(flat_pred[arange, flat_target]).
        loss = F.nll_loss(flat_pred, flat_target)
        loss.backward()

        optimizer.step()

        epoch_loss += float(loss)
        n_batches += 1

        if i % 100 == 0:
            print(f'[Train] loss: {epoch_loss / (i + 1):.3e}')

    return epoch_loss / max(n_batches, 1)

In [63]:
def evaluate(model, source_eval, target_eval, batch_size, k=1):
    """Evaluate `model` without gradient tracking.

    The loss comes from the model's log-probability pass with teacher
    forcing disabled, so the decoder is fed its own sampled tokens. It uses
    the same mean negative log-likelihood as `train`. The beam-search output
    contains token ids only and cannot provide a loss.

    Args:
        model: Module supporting `model(src, tgt, teacher_forcing_ratio=0,
            train=True)` -> log-probabilities (L, B, vocab) and
            `model(src, k=k, train=False)` -> token ids (L, B, k).
        source_eval, target_eval: Long tensors of shape (L, N).
        batch_size: Number of samples per batch.
        k: Beam width for the returned predictions. With the default `k=1`
            the predictions are the per-step argmax of the log-probabilities
            and beam search is not invoked.

    Returns:
        (predictions, loss): `predictions` is an integer array of shape
        (L, N) holding the best hypothesis per sample; `loss` is the mean of
        the per-batch losses (0.0 when there are no batches).
    """
    model.eval()

    # Follow the model's own device instead of assuming CUDA.
    device = next(model.parameters()).device

    epoch_loss = 0.0
    n_batches = 0

    predictions = []

    with torch.no_grad():

        for i, b_source, b_target in get_batches(source_eval, target_eval, 1, batch_size):
            b_source = b_source.to(device)
            b_target = b_target.to(device)

            # `train=True` only selects the log-probability decoding path;
            # dropout stays off because of model.eval() above.
            log_probs = model(b_source, b_target, teacher_forcing_ratio=0, train=True)

            # b_target      L   B
            # log_probs     L   B   lang_size

            loss = F.nll_loss(log_probs.reshape(-1, log_probs.shape[-1]), b_target.reshape(-1))

            if k == 1:
                tokens = log_probs.argmax(dim=2)
            else:
                # Beam 0 of the beam-search output is taken as the best hypothesis.
                tokens = model(b_source, k=k, train=False)[:, :, 0].long()
            # tokens        L   B

            predictions.append(tokens.cpu().numpy())

            epoch_loss += loss.item()
            n_batches += 1

            if i % 100 == 0:
                print(f'[Eval]  loss: {epoch_loss / (i + 1):.3e}')

    if not predictions:
        return np.empty((source_eval.shape[0], 0), dtype=np.int64), 0.0

    return np.concatenate(predictions, axis=1), epoch_loss / n_batches

In [64]:
def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into whole minutes and seconds.

    Returns:
        `(minutes, seconds)` as integers, each truncated towards zero.
    """
    span = end_time - start_time
    whole_minutes = int(span / 60)
    leftover_seconds = int(span - (whole_minutes * 60))
    return whole_minutes, leftover_seconds

In [None]:
# Training driver: one train + validation pass per epoch.
N_EPOCHS = 2
batch_size = 128

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time()
    
    train_loss = train(model, source_train, target_train, optimizer, batch_size)
    # Validation uses k=5; the returned predictions are discarded here.
    _, valid_loss = evaluate(model, source_val, target_val, batch_size, 5)
    
    end_time = time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # Checkpoint only when the validation loss improves.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'rnn-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    # PPL is reported as exp(loss).
    print(f'\tTrain Loss: {train_loss:.3e} | Train PPL: {np.exp(train_loss):.3e}')
    print(f'\t Val. Loss: {valid_loss:.3e} |  Val. PPL: {np.exp(valid_loss):.3e}')

[Train] loss: 9.239e+00


In [308]:
# Decode 40 random test samples and print source (>), prediction (<) and target (=).
# NOTE(review): this call passes four arguments, while the `evaluate` defined
# above takes a fifth parameter `k` with no default. The loop below also
# assumes `pred` is laid out as (sequence, sample) -- confirm against evaluate.
indices = np.random.choice(np.arange(source_test.shape[1]), size=40, replace=False)
source = source_test[:, indices]
target = target_test[:, indices]
pred, valid_loss = evaluate(model, source, target, batch_size)

print(pred.shape)

for index in range(pred.shape[1]):
    print('>', ' '.join(list(map(lambda i: language.index2word[int(i)], source[:, index]))))
    print('<', ' '.join(list(map(lambda i: language.index2word[int(i)], pred[:, index]))))
    print('=', ' '.join(list(map(lambda i: language.index2word[int(i)], target[:, index]))))
    print('-----------------')



[Eval]  loss: 2.808e+00
(15, 40)
> SOS بماند سخن چین ودوروی دیو EOS EOS EOS EOS EOS EOS EOS EOS EOS
< SOS که از و غریو غریو EOS EOS EOS EOS EOS EOS EOS EOS EOS
= SOS بریده دل از بیم کیهان خدیو EOS EOS EOS EOS EOS EOS EOS EOS
-----------------
> SOS که از من پس از مرگ ماند نشان EOS EOS EOS EOS EOS EOS
< SOS به از از گردنکشان گردنکشان EOS EOS EOS EOS EOS EOS EOS EOS EOS
= SOS ز گنج شهنشاه گردنکشان EOS EOS EOS EOS EOS EOS EOS EOS EOS EOS
-----------------
> SOS بد اندیش یاران او را براند EOS EOS EOS EOS EOS EOS EOS EOS
< SOS به پیش درشگفتی او براند EOS EOS EOS EOS EOS EOS EOS EOS EOS
= SOS جز از شاه و پیروز خسرو نماند EOS EOS EOS EOS EOS EOS EOS
-----------------
> SOS برانم که او را سوی خان خویش EOS EOS EOS EOS EOS EOS EOS
< SOS به شمع با او و خویش خویش EOS EOS EOS EOS EOS EOS EOS
= SOS برم تا بدارمش چون جان خویش EOS EOS EOS EOS EOS EOS EOS EOS
-----------------
> SOS چو گاو از سر کوه بنداختند EOS EOS EOS EOS EOS EOS EOS EOS
< SOS ز لشکر روستا و EOS EOS EOS EOS EOS EOS EOS EOS EOS EOS
= 

In [267]:
def asMinutes(s):
    """Format a duration of `s` seconds as the string '<minutes>m <seconds>s'."""
    whole_minutes = np.floor(s / 60)
    remaining = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, remaining)


def timeSince(since, percent):
    """Return 'elapsed (- remaining)' for a task started at `since`.

    `percent` is the completed fraction; the remaining time is extrapolated
    as `elapsed / percent - elapsed`.
    """
    elapsed = time() - since
    projected_total = elapsed / (percent)
    remaining = projected_total - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))

In [268]:
def showPlot(points):
    """Plot `points` as a line with y-axis ticks every 0.2 units.

    Args:
        points: Sequence of y-values, plotted against their index.
    """
    # plt.subplots() already creates the figure. The extra plt.figure()
    # call that used to precede it left a stray empty figure behind.
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    # Draw on the axes that were just configured rather than relying on
    # pyplot's notion of the current axes.
    ax.plot(points)

In [None]:
def evaluateRandomly(encoder, decoder, n=10):
    # Prints `n` verses drawn with np.random.choice, each followed by the
    # decoded output.
    # NOTE(review): this calls evaluate(encoder, decoder, verse) and unpacks
    # (output_words, attentions). That does not match the evaluate defined in
    # this notebook, which takes (model, source_eval, target_eval, batch_size, k).
    # Presumably left over from an earlier version -- confirm before use.
    for verse in np.random.choice(verses, size=n):
        print('>', verse)
        output_words, attentions = evaluate(encoder, decoder, verse)
        output_sentence = ' '.join(output_words)
        print('<', output_sentence)
        print('')

In [None]:
# Duplicate of the earlier device-selection cell.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# NOTE(review): EncoderRNN, AttnDecoderRNN and trainIters are not defined in
# any visible cell; this looks like a leftover from an earlier version of the
# notebook and would fail on a fresh run -- confirm or remove.
hidden_size = 256

encoder1 = EncoderRNN(language.n_words, hidden_size, device).to(device)
attn_decoder1 = AttnDecoderRNN(hidden_size, language.n_words, device).to(device)

trainIters(encoder1, attn_decoder1, int(len(verses) * 0.8), print_every=1)

KeyboardInterrupt: ignored