In [1]:
import numpy as np
import torch
import torch.nn.functional as F
import sklearn
from time import time
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import urllib
import random

from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator

In [56]:
SOS_token = 0
EOS_token = 1

class Language:

    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {SOS_token: "SOS", EOS_token: "EOS"}
        self.n_words = 2

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            if word != ',' and word != '':
                self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [57]:
resource = urllib.request.urlopen("https://raw.githubusercontent.com/ahmadsalimi/DataAnalysisInternship/master/RNN/ferdosi.txt")
content = resource.read().decode(resource.headers.get_content_charset())

In [58]:
verses = content.split('\n')
len(verses)

49610

In [59]:
language = Language('Persian')
for verse in verses:
    language.addSentence(verse)

In [60]:
language.n_words

17660

In [7]:
class Encoder(torch.nn.Module):
    def __init__(self, lang_size, emb_dim, enc_hid_dim, dec_hid_dim, dropout):
        super().__init__()
        
        self.embedding = torch.nn.Sequential(
            torch.nn.Embedding(lang_size, emb_dim),
            torch.nn.Dropout(dropout)
        )
        
        self.rnn = torch.nn.LSTM(emb_dim, enc_hid_dim, num_layers=2)

        self.hidden_fc = torch.nn.Sequential(
            torch.nn.Linear(enc_hid_dim, dec_hid_dim),
            torch.nn.ReLU(inplace=True)
        )

        self.cell_fc = torch.nn.Sequential(
            torch.nn.Linear(enc_hid_dim, dec_hid_dim),
            torch.nn.ReLU(inplace=True)
        )
        
    def forward(self, src):
        
        embedded = self.embedding(src)
                
        outputs, (hidden, cell) = self.rnn(embedded)

        hidden = self.hidden_fc(hidden)
        cell = self.cell_fc(cell)
        
        return outputs, hidden, cell

In [34]:
class Attention(torch.nn.Module):
    def __init__(self, enc_hid_dim, dec_hid_dim):
        super().__init__()

        self.attn = torch.nn.Sequential(
            torch.nn.Linear(enc_hid_dim + 2 * dec_hid_dim, dec_hid_dim),
            torch.nn.ReLU(),
            torch.nn.Linear(dec_hid_dim, 1, bias = False)
        )
        
    def forward(self, hidden, encoder_outputs):
        # hidden            2   B   dec_hid_dim
        # encoder_outputs   L   B   enc_hid_dim
        
        batch_size = encoder_outputs.shape[1]
        src_len = encoder_outputs.shape[0]
        
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        # encoder_outputs   B   L   enc_hid_dim

        #repeat decoder hidden state src_len times
        hidden = torch.cat((hidden[0], hidden[1]), dim=1).unsqueeze(1).repeat(1, src_len, 1)
        # hidden            B   L   dec_hid_dim*2

        return F.softmax(self.attn(torch.cat((encoder_outputs, hidden), dim=2)).squeeze(2), dim=1)

In [52]:
class Decoder(torch.nn.Module):
    def __init__(self, lang_size, emb_dim, enc_hid_dim, dec_hid_dim, dropout, attention):
        super().__init__()

        self.attention = attention
        
        self.embedding = torch.nn.Sequential(
            torch.nn.Embedding(lang_size, emb_dim),
            torch.nn.Dropout(0.5)
        )

        self.rnn = torch.nn.LSTM(emb_dim + enc_hid_dim, dec_hid_dim, num_layers=2)
        
        self.fc = torch.nn.Sequential(
            torch.nn.Linear(emb_dim + enc_hid_dim + dec_hid_dim, lang_size),
            torch.nn.LogSoftmax(dim=2)
        )
    
    def forward(self, input, encoder_outputs, hidden, cell):
        # input             1   B
        # encoder_outputs   L   B   enc_hid_dim
        # hidden            2   B   dec_hid_dim
        # cell              2   B   dec_hid_dim

        embedded = self.embedding(input)
        # embedded          1   B   emb_dim

        weights = self.attention(hidden, encoder_outputs).unsqueeze(1)
        # weights           B   1   L

        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        # encoder_outputs   B   L   enc_hid_dim

        weighted = torch.bmm(weights, encoder_outputs).permute(1, 0, 2)
        # weighted          1   B   enc_hid_dim

        lstm_input = torch.cat((embedded, weighted), dim=2)
        # lstm_input        1   B   emb_dim + enc_hid_dim

        lstm_output, (hidden, cell) = self.rnn(lstm_input, (hidden, cell))
        # lstm_output       1   B   dec_hid_dim
        # hidden            2   B   dec_hid_dim
        # cell              2   B   dec_hid_dim

        fc_input = torch.cat((lstm_input, lstm_output), dim=2)
        # fc_input          1   B       emb_dim + enc_hid_dim + dec_hid_dim

        pred = self.fc(fc_input)
        # pred              1   B       lang_size

        return pred, hidden, cell

In [88]:
class Seq2Seq(torch.nn.Module):
    def __init__(self, encoder, decoder, device, lang_size):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder

        self.device = device
        self.lang_size = lang_size

    def forward(self, source, target=None, k=1, teacher_forcing_ratio=0.5):
        # source            L   B
        # target            L   B

        if target == None:
            teacher_forcing_ratio = 0

        B = source.shape[1]
        L = source.shape[0]

        outputs = torch.zeros(L, B, self.lang_size).to(self.device)
        # outputs           L   B   lang_size

        encoder_outputs, hidden, cell = self.encoder(source)
        # encoder_outputs   L   B   enc_hid_dim
        # hidden            2   B   dec_hid_dim
        # cell              2   B   dec_hid_dim

        input = torch.tensor([SOS_token] * B, dtype=torch.long, device=self.device)
        # input             B
        for i in range(1, L):

            output, hidden, cell = self.decoder(input.view(1, -1), encoder_outputs, hidden, cell)
            # output        1   B   lang_size
            # hidden        2   B   dec_hid_dim
            # cell          2   B   dec_hid_dim

            output = output.squeeze(0)
            # output        B   lang_size            
            outputs[i] = output

            teacher_force = random.random() < teacher_forcing_ratio

            # top_probs, top_indices = torch.topk(output, k, )
            top = torch.argmax(output, dim=1)
            # top           B

            input = target[i] if teacher_force else top
        
        return outputs


In [78]:
batches = list(get_batches(source_train, target_train, 1, 64))

In [79]:
batch_source = batches[0][1]
batch_source.shape

torch.Size([15, 64])

In [36]:
batch_target = batches[0][2]
batch_target.shape

torch.Size([15, 64])

In [80]:
encoder = Encoder(language.n_words, 100, 200, 300, 0.5)

In [81]:
attention = Attention(200, 300)

In [82]:
decoder = Decoder(language.n_words, 100, 200, 300, 0.5, attention)

In [162]:
seq2seq = Seq2Seq(encoder, decoder, device, language.n_words).cuda()

In [163]:
optimizer = torch.optim.Adam(seq2seq.parameters())

In [214]:
optimizer.zero_grad()
pred = seq2seq(batch_source.cuda(), batch_target.cuda())
pred.shape

torch.Size([15, 64, 17660])

In [215]:
pred = pred.view(-1, language.n_words)

# pred          L*B lang_size

loss = -1 * torch.mean(pred[torch.arange(pred.shape[0]), batch_target.flatten(0)])
loss

tensor(1.7878, device='cuda:0', grad_fn=<MulBackward0>)

In [216]:
loss.backward()
optimizer.step()

In [63]:
def indexesFromSentence(lang, sentence, max_length):
    words = [lang.word2index[word] for word in sentence.split(' ')]
    return [SOS_token] + words + [EOS_token] * (max_length - len(words) - 1) 

def getSentences(lang, sentences, max_length):
    sources = []
    targets = []

    for sentence in sentences:
        sentence = sentence.strip()
        if sentence == '':
            continue

        source, target = sentence.split(',')
        sources.append(indexesFromSentence(lang, source.strip(), max_length))
        targets.append(indexesFromSentence(lang, target.strip(), max_length))
    return torch.tensor(sources, dtype=torch.long).T, torch.tensor(targets, dtype=torch.long).T

In [64]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [65]:
max_length = 15

In [66]:
sources, targets = getSentences(language, verses, max_length)
sources.shape, targets.shape

(torch.Size([15, 49609]), torch.Size([15, 49609]))

In [67]:
def train_test_split(X, y, test_size, axis):
    indices = torch.tensor(np.random.permutation(X.shape[axis]))
    train_indices, test_indices = indices[:int(X.shape[axis] * (1 - test_size))], indices[int(X.shape[axis] * (1 - test_size)):]
    return (X.index_select(axis, train_indices),
            X.index_select(axis, test_indices),
            y.index_select(axis, train_indices),
            y.index_select(axis, test_indices))

In [68]:
source_train, source_test, target_train, target_test = train_test_split(sources, targets, test_size=0.2, axis=1)
source_train, source_val, target_train, target_val = train_test_split(source_train, target_train, test_size=0.2, axis=1)

In [69]:
source_train.shape, source_test.shape, source_val.shape

(torch.Size([15, 31749]), torch.Size([15, 9922]), torch.Size([15, 7938]))

In [177]:
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
ENC_HID_DIM = 512
DEC_HID_DIM = 512
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

attn = Attention(ENC_HID_DIM, DEC_HID_DIM)
enc = Encoder(language.n_words, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT)
dec = Decoder(language.n_words, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, attn)

In [178]:
optimizer = torch.optim.Adam(model.parameters())

NameError: ignored

In [224]:
criterion = torch.nn.CrossEntropyLoss()

In [14]:
def get_batches(source, target, axis, batch_size):
    num_batches = int(np.ceil(source.shape[axis] * 1.0 / batch_size))

    for batch in range(num_batches):
        yield batch, source[:, batch * batch_size: (batch + 1) * batch_size], target[:, batch * batch_size: (batch + 1) * batch_size]

In [176]:
def train(encoder, decoder, source_train, target_train, optimizer, criterion, batch_size):
    
    model.train()
    
    epoch_loss = 0

    i = 0
    
    for i, b_source, b_target in get_batches(source_train, target_train, 1, batch_size):
        optimizer.zero_grad()
        
        encoder_outputs, hidden, cell = encoder(b_source.to(device))
        pred, _, _ = decoder(b_target, encoder_outputs, hidden, cell)
        
        # b_target      L   B
        # pred          L   B   lang_size

        b_target = batch_target.flatten(0)
        pred = pred.view(-1, language.n_words)
        
        # b_target      L*B
        # pred          L*B lang_size

        loss = -1 * torch.mean(pred[torch.arange(pred.shape[0]), b_target])        
        loss.backward()

        optimizer.step()
        
        epoch_loss += float(loss)
        
    return epoch_loss / (i + 1)

In [226]:
def evaluate(model, source_eval, target_eval, criterion, batch_size):
    
    model.eval()
    
    epoch_loss = 0
    
    with torch.no_grad():
    
        for i, b_source, b_target in get_batches(source_eval, target_eval, 1, batch_size):

            b_source = batch.b_source
            b_target = batch.b_target

            output = model(b_source.to(device), b_target.to(device), 0) #turn off teacher forcing

            #b_target = [b_target len, batch size]
            #output = [b_target len, batch size, output dim]

            output_dim = output.shape[-1]
            
            output = output[1:].view(-1, output_dim)
            b_target = b_target[1:].view(-1)

            #b_target = [(b_target len - 1) * batch size]
            #output = [(b_target len - 1) * batch size, output dim]

            loss = criterion(output, b_target)

            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [227]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [271]:
N_EPOCHS = 10
batch_size = 128

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time()
    
    train_loss = train(model, source_train, target_train, optimizer, criterion, batch_size)
    valid_loss = evaluate(model, source_val, target_val, criterion, batch_size)
    
    end_time = time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'rnn-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

trg_len: 15
input: torch.Size([15, 128])
embedded: torch.Size([15, 128, 256])
outputs: torch.Size([15, 128, 1024])
hidden: torch.Size([2, 128, 512])
hidden: torch.Size([128, 512])
torch.Size([1792, 17660]) torch.Size([1792, 17660])


RuntimeError: ignored

In [261]:
torch.tensor([1, 2, 3, 4, 5, 6]).numel()

6

In [None]:
class AttnDecoderRNN(torch.nn.Module):
    def __init__(self, hidden_size, lang_size, device, dropout_p=0.1, max_length=20):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.lang_size = lang_size
        self.dropout_p = dropout_p
        self.max_length = max_length
        self.device = device

        self.embedding = torch.nn.Embedding(self.lang_size, self.hidden_size)
        self.attn = torch.nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = torch.nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = torch.nn.Dropout(self.dropout_p)
        self.gru = torch.nn.GRU(self.hidden_size, self.hidden_size)
        self.out = torch.nn.Linear(self.hidden_size, self.lang_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input)                                                        #   B   H
        embedded = self.dropout(embedded)

        attn_weights = F.softmax(self.attn(torch.cat((embedded, hidden), dim=1)), dim=-1)       #   B   M
        attn_applied = torch.bmm(attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0))       

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=-1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=self.device)

In [None]:
teacher_forcing_ratio = 0.5

In [None]:
def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=20):
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    loss = 0

    print(f'input length: {input_length}')

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]
    
    print('encoder done.')

    decoder_input = torch.tensor([[SOS_token]], device=device)

    decoder_hidden = encoder_hidden

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]  # Teacher forcing

    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input

            loss += criterion(decoder_output, target_tensor[di])
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [None]:
def asMinutes(s):
    m = np.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [None]:
def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

In [None]:
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    start = time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = torch.optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = torch.optim.SGD(decoder.parameters(), lr=learning_rate)
    training_verses = [tensorFromSentence(language, np.random.choice(verses)) for i in range(n_iters)]
    criterion = torch.nn.NLLLoss()

    for iter in range(1, n_iters + 1):
        input_tensor = training_verses[iter - 1]
        target_tensor = input_tensor

        loss = train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters), iter, iter / n_iters * 100, print_loss_avg))

        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)

In [None]:
def evaluate(encoder, decoder, sentence, max_length=20):
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]

        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

        decoder_hidden = encoder_hidden

        decoded_words = []
        decoder_attentions = torch.zeros(max_length, max_length)

        for di in range(max_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            decoder_attentions[di] = decoder_attention.data
            topv, topi = decoder_output.data.topk(1)
            if topi.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(output_lang.index2word[topi.item()])

            decoder_input = topi.squeeze().detach()

        return decoded_words, decoder_attentions[:di + 1]

In [None]:
def evaluateRandomly(encoder, decoder, n=10):
    for verse in np.random.choice(verses, size=n):
        print('>', verse)
        output_words, attentions = evaluate(encoder, decoder, verse)
        output_sentence = ' '.join(output_words)
        print('<', output_sentence)
        print('')

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
hidden_size = 256

encoder1 = EncoderRNN(language.n_words, hidden_size, device).to(device)
attn_decoder1 = AttnDecoderRNN(hidden_size, language.n_words, device).to(device)

trainIters(encoder1, attn_decoder1, int(len(verses) * 0.8), print_every=1)

KeyboardInterrupt: ignored