In [1]:
import re
import math
import psutil
import time
import datetime
from io import open
import random
from random import shuffle
import argparse
import numpy as np
import matplotlib.pyplot as plt

import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
import torch.cuda
import sys; sys.argv=['']; del sys

In [2]:

def filterPair(p, max_length):
    filtered = len(p[0].split(' ')) < max_length and len(p[1].split(' ')) < max_length 
    return filtered

def filterPairs(pairs, max_length):
    return [pair for pair in pairs if filterPair(pair, max_length)]


In [3]:

SOS_token = 0
EOS_token = 1
UNK_token = 2

class Lang:
    def __init__(self, language):
        self.language_name = language
        self.word_to_index = {"SOS":0, "EOS":1, "<UNK>":2}
        self.word_to_count = {}
        self.index_to_word = {0: "SOS", 1: "EOS", 2: "<UNK>"}
        self.vocab_size = 3
        self.cutoff_point = -1


    def countSentence(self, sentence):
        for word in sentence.split(' '):
            self.countWords(word)

    
    def countWords(self, word):
        if word not in self.word_to_count:
            self.word_to_count[word] = 1
        else:
            self.word_to_count[word] += 1

    
    def createCutoff(self, max_vocab_size):
        word_freqs = list(self.word_to_count.values())
        word_freqs.sort(reverse=True)
        if len(word_freqs) > max_vocab_size:
            self.cutoff_point = word_freqs[max_vocab_size]


    def addSentence(self, sentence):
        new_sentence = ''
        for word in sentence.split(' '):
            unk_word = self.addWord(word)
            if not new_sentence:
                new_sentence =unk_word
            else:
                new_sentence = new_sentence + ' ' + unk_word
        return new_sentence

    def addWord(self, word):
        if self.word_to_count[word] > self.cutoff_point:
            if word not in self.word_to_index:
                self.word_to_index[word] = self.vocab_size
                self.index_to_word[self.vocab_size] = word
                self.vocab_size += 1
            return word
        else:
            return self.index_to_word[2]

In [4]:

def prepareLangs(lang1, lang2, file_path):
    lines = open(file_path[0],'rt', encoding='utf-8').read().strip().split('\n')
    pairs = [[s for s in l.split('\t')] for l in lines]
    input_lang = Lang(lang1)
    output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

In [5]:

def prepareData(lang1, lang2, file_path, max_vocab_size,trim, perc_train_set):
    
    input_lang, output_lang, pairs = prepareLangs(lang1, lang2, 
                                                  file_path)
    
    print("Total %s sentence pairs" % len(pairs))
    
    
    if trim != 0:   wwwwwwww
        pairs = filterPairs(pairs, trim)
        print("Trimmed to %s sentence pairs" % len(pairs))

    print("Counting word frequency...")
    for pair in pairs:
        input_lang.countSentence(pair[0])
        output_lang.countSentence(pair[1])


    input_lang.createCutoff(max_vocab_size)
    output_lang.createCutoff(max_vocab_size)

    pairs = [(input_lang.addSentence(pair[0]),output_lang.addSentence(pair[1])) 
             for pair in pairs]

    shuffle(pairs)
    
    train_pairs = pairs[:math.ceil(perc_train_set*len(pairs))]
    test_pairs = pairs[math.ceil(perc_train_set*len(pairs)):]

    print("Train pairs: %s" % (len(train_pairs)))
    print("Test pairs: %s" % (len(test_pairs)))
    print("%s, %s -> %s" % (input_lang.language_name, len(input_lang.word_to_count),
                            input_lang.vocab_size,))
    print("%s, %s -> %s" % (output_lang.language_name, len(output_lang.word_to_count), 
                            output_lang.vocab_size))
    print()
    print(pairs)
   
    return input_lang, output_lang, train_pairs, test_pairs

In [6]:
def indexesFromSentence(lang, sentence):
    indexes = []
    for word in sentence.split(' '):
        try:
            indexes.append(lang.word_to_index[word])
        except:
            indexes.append(lang.word_to_index["<UNK>"])
    return indexes


def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    result = torch.LongTensor(indexes).view(-1)
    return result
      
def tensorsFromPair(input_lang, output_lang, pair):
    input_variable = tensorFromSentence(input_lang, pair[0])
    target_variable = tensorFromSentence(output_lang, pair[1])
    return (input_variable, target_variable)
  

def sentenceFromTensor(lang, tensor):
    raw = tensor.data
    words = []
    for num in raw:
        words.append(lang.index_to_word[num.item()])
    return ' '.join(words)

In [7]:
def batchify(data, input_lang, output_lang, batch_size, shuffle_data=True):
    if shuffle_data == True:
        shuffle(data)
    number_of_batches = len(data) // batch_size
    batches = list(range(number_of_batches))
    longest_elements = list(range(number_of_batches))
    
    for batch_number in range(number_of_batches):
        longest_input = 0
        longest_target = 0
        input_variables = list(range(batch_size))
        target_variables = list(range(batch_size))
        index = 0      
        for pair in range((batch_number*batch_size),((batch_number+1)*batch_size)):
            input_variables[index], target_variables[index] = tensorsFromPair(input_lang, output_lang, data[pair])
            if len(input_variables[index]) >= longest_input:
                longest_input = len(input_variables[index])
            if len(target_variables[index]) >= longest_target:
                longest_target = len(target_variables[index])
            index += 1
        batches[batch_number] = (input_variables, target_variables)
        longest_elements[batch_number] = (longest_input, longest_target)
    return batches , longest_elements, number_of_batches


def pad_batch(batch):
    padded_inputs = torch.nn.utils.rnn.pad_sequence(batch[0],padding_value=EOS_token)
    padded_targets = torch.nn.utils.rnn.pad_sequence(batch[1],padding_value=EOS_token)
    return (padded_inputs, padded_targets)

In [8]:
class EncoderRNN(nn.Module):
    def __init__(self,input_size,hidden_size,layers=1,dropout=0.1,
               bidirectional=True):
        super(EncoderRNN, self).__init__()

        if bidirectional:
            self.directions = 2
        else:
            self.directions = 1
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = layers
        self.dropout = dropout
        self.embedder = nn.Embedding(input_size,hidden_size)
        self.dropout = nn.Dropout(dropout)
        self.lstm = nn.LSTM(input_size=hidden_size,hidden_size=hidden_size,
                        num_layers=layers,dropout=dropout,
                        bidirectional=bidirectional,batch_first=False)
        self.fc = nn.Linear(hidden_size*self.directions, hidden_size)

    def forward(self, input_data, h_hidden, c_hidden):
        embedded_data = self.embedder(input_data)
        embedded_data = self.dropout(embedded_data)
        hiddens, outputs = self.lstm(embedded_data, (h_hidden, c_hidden))

        return hiddens, outputs

    def create_init_hiddens(self, batch_size):
        h_hidden = Variable(torch.zeros(self.num_layers*self.directions, 
                                    batch_size, self.hidden_size))
        c_hidden = Variable(torch.zeros(self.num_layers*self.directions, 
                                    batch_size, self.hidden_size))
        if torch.cuda.is_available():
            return h_hidden.cuda(), c_hidden.cuda()
        else:
            return h_hidden, c_hidden

In [9]:
class DecoderAttn(nn.Module):
    def __init__(self, hidden_size, output_size, layers=1, dropout=0.1, bidirectional=True):
        super(DecoderAttn, self).__init__()

        if bidirectional:
            self.directions = 2
        else:
            self.directions = 1
        self.output_size = output_size
        self.hidden_size = hidden_size
        self.num_layers = layers
        self.dropout = dropout
        self.embedder = nn.Embedding(output_size,hidden_size)
        self.dropout = nn.Dropout(dropout)
        self.score_learner = nn.Linear(hidden_size*self.directions, 
                                   hidden_size*self.directions)
        self.lstm = nn.LSTM(input_size=hidden_size,hidden_size=hidden_size,
                        num_layers=layers,dropout=dropout,
                        bidirectional=bidirectional,batch_first=False)
        self.context_combiner = nn.Linear((hidden_size*self.directions)
                                      +(hidden_size*self.directions), hidden_size)
        self.tanh = nn.Tanh()
        self.output = nn.Linear(hidden_size, output_size)
        self.soft = nn.Softmax(dim=1)
        self.log_soft = nn.LogSoftmax(dim=1)


    def forward(self, input_data, h_hidden, c_hidden, encoder_hiddens):

        embedded_data = self.embedder(input_data)
        embedded_data = self.dropout(embedded_data)	
        batch_size = embedded_data.shape[1]
        hiddens, outputs = self.lstm(embedded_data, (h_hidden, c_hidden))	
        top_hidden = outputs[0].view(self.num_layers,self.directions,
                                 hiddens.shape[1],
                                 self.hidden_size)[self.num_layers-1]
        top_hidden = top_hidden.permute(1,2,0).contiguous().view(batch_size,-1, 1)

        prep_scores = self.score_learner(encoder_hiddens.permute(1,0,2))
        scores = torch.bmm(prep_scores, top_hidden)
        attn_scores = self.soft(scores)
        con_mat = torch.bmm(encoder_hiddens.permute(1,2,0),attn_scores)
        h_tilde = self.tanh(self.context_combiner(torch.cat((con_mat,
                                                         top_hidden),dim=1)
                                              .view(batch_size,-1)))
        pred = self.output(h_tilde)
        pred = self.log_soft(pred)


        return pred, outputs

In [10]:

def train_batch(input_batch, target_batch, encoder, decoder, 
                encoder_optimizer, decoder_optimizer, loss_criterion):
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    loss = 0
    enc_h_hidden, enc_c_hidden = encoder.create_init_hiddens(input_batch.shape[1])

    enc_hiddens, enc_outputs = encoder(input_batch, enc_h_hidden, enc_c_hidden)

    decoder_input = Variable(torch.LongTensor(1,input_batch.shape[1]).
                           fill_(output_lang.word_to_index.get("SOS")).cuda()) if use_cuda \
                    else Variable(torch.LongTensor(1,input_batch.shape[1]).
                        fill_(output_lang.word_to_index.get("SOS")))

    dec_h_hidden = enc_outputs[0]
    dec_c_hidden = enc_outputs[1]

    for i in range(target_batch.shape[0]):
        pred, dec_outputs = decoder(decoder_input, dec_h_hidden, 
                                dec_c_hidden, enc_hiddens)

        decoder_input = target_batch[i].view(1,-1)
        dec_h_hidden = dec_outputs[0]
        dec_c_hidden = dec_outputs[1]

        loss += loss_criterion(pred,target_batch[i])


    loss.backward()

    torch.nn.utils.clip_grad_norm_(encoder.parameters(),args.clip)
    torch.nn.utils.clip_grad_norm_(decoder.parameters(),args.clip)

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_batch.shape[0]

In [11]:

def train(train_batches, encoder, decoder, encoder_optimizer, decoder_optimizer, loss_criterion):

    round_loss = 0
    i = 1
    for batch in train_batches:
        i += 1
        (input_batch, target_batch) = pad_batch(batch)
        batch_loss = train_batch(input_batch, target_batch, encoder, decoder, encoder_optimizer, decoder_optimizer, loss_criterion)
        round_loss += batch_loss

    return round_loss / len(train_batches)

In [12]:

def test_batch(input_batch, target_batch, encoder, decoder, loss_criterion):

    loss = 0

    enc_h_hidden, enc_c_hidden = encoder.create_init_hiddens(input_batch.shape[1])

    enc_hiddens, enc_outputs = encoder(input_batch, enc_h_hidden, enc_c_hidden)

    decoder_input = Variable(torch.LongTensor(1,input_batch.shape[1]).
                           fill_(output_lang.word_to_index.get("SOS")).cuda()) if use_cuda \
                    else Variable(torch.LongTensor(1,input_batch.shape[1]).
                        fill_(output_lang.word_to_index.get("SOS")))
    dec_h_hidden = enc_outputs[0]
    dec_c_hidden = enc_outputs[1]

    for i in range(target_batch.shape[0]):
        pred, dec_outputs = decoder(decoder_input, dec_h_hidden, dec_c_hidden, enc_hiddens)

        topv, topi = pred.topk(1,dim=1)
        ni = topi.view(1,-1)

        decoder_input = ni
        dec_h_hidden = dec_outputs[0]
        dec_c_hidden = dec_outputs[1]

        loss += loss_criterion(pred,target_batch[i])

    return loss.item() / target_batch.shape[0]

In [13]:

def test(test_batches, encoder, decoder, loss_criterion):

    with torch.no_grad():
        test_loss = 0

        for batch in test_batches:
            (input_batch, target_batch) = pad_batch(batch)
            batch_loss = test_batch(input_batch, target_batch, encoder, decoder, loss_criterion)
            test_loss += batch_loss

    return test_loss / len(test_batches)

In [14]:

def evaluate(encoder, decoder, sentence, cutoff_length):
    with torch.no_grad():
        input_variable = tensorFromSentence(input_lang, sentence)
        input_variable = input_variable.view(-1,1)
        enc_h_hidden, enc_c_hidden = encoder.create_init_hiddens(1)

        enc_hiddens, enc_outputs = encoder(input_variable, enc_h_hidden, enc_c_hidden)

        decoder_input = Variable(torch.LongTensor(1,1).fill_(output_lang.word_to_index.get("SOS")).cuda()) if use_cuda \
                        else Variable(torch.LongTensor(1,1).fill_(output_lang.word_to_index.get("SOS")))
        dec_h_hidden = enc_outputs[0]
        dec_c_hidden = enc_outputs[1]

        decoded_words = []

        for di in range(cutoff_length):
            pred, dec_outputs = decoder(decoder_input, dec_h_hidden, dec_c_hidden, enc_hiddens)

            topv, topi = pred.topk(1,dim=1)
            ni = topi.item()
            if ni == output_lang.word_to_index.get("EOS"):
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(output_lang.index_to_word[ni])

            decoder_input = Variable(torch.LongTensor(1,1).fill_(ni).cuda()) if use_cuda \
                            else Variable(torch.LongTensor(1,1).fill_(ni))
            dec_h_hidden = dec_outputs[0]
            dec_c_hidden = dec_outputs[1]

        output_sentence = ' '.join(decoded_words)
    
        return output_sentence

In [15]:

def evaluate_randomly(encoder, decoder, pairs, n=2, trim=100):
    for i in range(n):
        pair = random.choice(pairs)
        print('>', pair[0])
        print('=', pair[1])
        output_sentence = evaluate(encoder, decoder, pair[0],cutoff_length=trim)
        print('<', output_sentence)
        print('')    
        if create_txt:
            f = open(print_to, 'a')
            f.write("\n \
                > %s \n \
                = %s \n \
                < %s \n" % (pair[0], pair[1], output_sentence))
            f.close()

In [16]:
def showPlot(times, losses, fig_name):
    x_axis_label = 'Minutes'
    colors = ('red','blue')
    if max(times) >= 120:
        times = [mins/60 for mins in times]
        x_axis_label = 'Hours'
    i = 0
    for key, losses in losses.items():
        if len(losses) > 0:
            plt.plot(times, losses, label=key, color=colors[i])
            i += 1
    plt.legend(loc='upper left')
    plt.xlabel(x_axis_label)
    plt.ylabel('Loss')
    plt.title('Training Results')
    plt.savefig(fig_name+'.png')
    plt.close('all')
    
def mem():
    if use_cuda:
        mem = torch.cuda.memory_allocated()/1e7
    else:
        mem = psutil.cpu_percent()
    print('Current mem usage:')
    print(mem)
    return "Current mem usage: %s \n" % (mem)

def asHours(s):
    m = math.floor(s / 60)
    h = math.floor(m / 60)
    s -= m * 60
    m -= h * 60
    return '%dh %dm %ds' % (h, m, s)

In [17]:

def train_and_test(epochs, test_eval_every, plot_every, learning_rate, 
                   lr_schedule, train_pairs, test_pairs, input_lang, 
                   output_lang, batch_size, test_batch_size, encoder, decoder, 
                   loss_criterion, trim, save_weights):

    times = []
    losses = {'train set':[], 'test set': []}

    test_batches, longest_seq, n_o_b = batchify(test_pairs, input_lang, 
                                              output_lang, test_batch_size, 
                                              shuffle_data=False)

    start = time.time()
    for i in range(1,epochs+1):
    
        if i in lr_schedule.keys():
            learning_rate /= lr_schedule.get(i)


        encoder.train()
        decoder.train()

        encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
        decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)

        batches, longest_seq, n_o_b = batchify(train_pairs, input_lang, 
                                           output_lang, batch_size, 
                                           shuffle_data=True)
        train_loss = train(batches, encoder, decoder, encoder_optimizer, 
                       decoder_optimizer, loss_criterion)
        
        now = time.time()
        print("Iter: %s \nLearning Rate: %s \nTime: %s \nTrain Loss: %s \n" 
          % (i, learning_rate, asHours(now-start), train_loss))
        
        
        if i % test_eval_every == 0:
            if test_pairs:
                test_loss = test(test_batches, encoder, decoder, criterion)
                print("Test set loss: %s" % (test_loss))
        
        if i % plot_every == 0:
            times.append((time.time()-start)/60)
            losses['train set'].append(train_loss)
            if test_pairs:
                losses['test set'].append(test_loss)
            showPlot(times, losses, output_file_name)
            if save_weights:
                torch.save(encoder.state_dict(), output_file_name+'_enc_weights.pt')
                torch.save(decoder.state_dict(), output_file_name+'_dec_weights.pt')
        

In [18]:
ls

 Volume in drive C has no label.
 Volume Serial Number is 1A75-6536

 Directory of C:\Users\Lenovo\Desktop\major mid eval code old

14-12-2019  23:45    <DIR>          .
14-12-2019  23:45    <DIR>          ..
14-12-2019  11:33    <DIR>          .ipynb_checkpoints
12-12-2018  19:21           332,098 DATASET.txt
14-12-2019  23:45           300,472 encode_preprocess.ipynb
09-02-2018  17:24           332,483 hin.txt
02-10-2019  23:04               203 result.txt
04-12-2019  12:40                95 testdata.orig_trim.10_vocab.20000_directions.2_layers.2_hidden.440_dropout.0.2_learningrate.1_batch.32_epochs.5.txt
14-12-2019  22:44    <DIR>          we
04-12-2019  17:51            16,574 weights.png
14-12-2019  23:44                97 weights.txt
04-12-2019  17:51        46,595,873 weights_dec_weights.pt
04-12-2019  17:51        37,774,063 weights_enc_weights.pt
               9 File(s)     85,351,958 bytes
               4 Dir(s)  25,384,628,224 bytes free


In [19]:
input_lang_name = 'en'
output_lang_name = 'hi'

raw_data_file_path = ('DATASET.txt',)
dataset = 'orig'

trim = 10

max_vocab_size= 20000



perc_train_set = 0.7

In [20]:
test_eval_every = 1

plot_every = 1

create_txt = True

save_weights = True 

In [21]:
bidirectional = True
if bidirectional:
    directions = 2
else:
    directions = 1

layers = 2

hidden_size = 440

dropout = 0.2

batch_size = 32

test_batch_size = 32

epochs = 100

learning_rate= 1

lr_schedule = {}

criterion = nn.NLLLoss()

In [22]:
use_cuda = torch.cuda.is_available()

plt.switch_backend('agg')

output_file_name = "weights"
if create_txt:
    print_to = output_file_name+'.txt'
    with open(print_to, 'w', encoding="utf-8") as f:
        f.write("Starting Training \n")
else:
    print_to = None

input_lang, output_lang, train_pairs, test_pairs = prepareData(
    input_lang_name, output_lang_name, raw_data_file_path, 
    max_vocab_size=max_vocab_size, trim=trim, perc_train_set=perc_train_set) #changes made by ishan
print('Train Pairs #')
print(len(train_pairs))

parser = argparse.ArgumentParser(description='PyTorch Wikitext-2 RNN/LSTM Language Model')
parser.add_argument('--clip', type=float, default=0.25,
                    help='gradient clipping')
args = parser.parse_args()

mem()

encoder = EncoderRNN(input_lang.vocab_size, hidden_size, layers=layers, 
                     dropout=dropout, bidirectional=bidirectional)

decoder = DecoderAttn(hidden_size, output_lang.vocab_size, layers=layers, 
                      dropout=dropout, bidirectional=bidirectional)

print('Encoder and Decoder Created')
mem()

if use_cuda:
    print('Cuda being used')
    encoder = encoder.cuda()
    decoder = decoder.cuda()

print('Number of epochs: '+str(epochs))

if create_txt:
    with open(print_to, 'a') as f:
        f.write('Encoder and Decoder Created\n')
        f.write(mem())
        f.write("Number of epochs %s \n" % (epochs))
'''
train_and_test(epochs, test_eval_every, plot_every, learning_rate, lr_schedule, 
              train_pairs, test_pairs, input_lang, output_lang, batch_size, 
               test_batch_size, encoder, decoder, criterion, trim, save_weights)
'''

Total 2867 sentence pairs
Trimmed to 2396 sentence pairs
Counting word frequency...
Train pairs: 1678
Test pairs: 718
en, 2945 -> 2948
hi, 2653 -> 2656



[('She left the baby crying.', 'उसने बच्चे को रोते हुए छोड़ दिया।'), ('The thief is certain to be caught eventually.', 'चोर अंत में तो पकड़ा ही जाएगा।'), ("Don't be absurd.", 'पागल मत बनो।'), ('She warned him not to go alone.', 'उसने उसे अकेले न जाने की चेतावनी दी।'), ('Who made this pie?', 'यह पाय किसने बनाई है?'), ("I'm an atheist.", 'मैं भगवान में यकीन नहीं करता।'), ("Won't you come to dine with us?", 'क्या आप हमारे साथ खाना खाने नहीं आएंगे?'), ('I have not seen him in months.', 'मैंने उसे महीनों से नहीं देखा है।'), ('I have been writing letters all morning.', 'मैं पूरी सुबह से चिट्ठियाँ लिख रही हूँ।'), ('There are many old temples in Kyoto.', 'क्योटो में बहुत सारे पुराने मंदिर हैं।'), ("It's too expensive!", 'बहुत महंगी है!'), ('My eyes are tired.', 'मेरी आँखें थक गईं हैं।'), ("I'll make you happy.", 'मैं तुम्हें खुश करूँगा।'), ('This car was made in Japan.', 'यह गाड़ी जापान में बनी थी।'), ("I've quit drinking beer.", 'मैंने बीयर पीना छोड़ दिया है।'), ('Insert it wherever you like.

Train Pairs #
1678
Current mem usage:
19.5
Encoder and Decoder Created
Current mem usage:
63.3
Number of epochs: 100
Current mem usage:
75.0


'\ntrain_and_test(epochs, test_eval_every, plot_every, learning_rate, lr_schedule, \n              train_pairs, test_pairs, input_lang, output_lang, batch_size, \n               test_batch_size, encoder, decoder, criterion, trim, save_weights)\n'

In [23]:
enc = torch.load("weights_enc_weights.pt")
dec = torch.load('weights_dec_weights.pt')
encoder.load_state_dict(enc)
decoder.load_state_dict(dec)
outside_sent = "How are you?"
evaluate(encoder, decoder, outside_sent, cutoff_length=10)

'तुम कैसी हो? <EOS>'

In [24]:
outside_sent = "Have fun."
evaluate(encoder, decoder, outside_sent, cutoff_length=10)

'मज़े करना। <EOS>'

In [29]:
outside_sent = "What are you doing?"
evaluate(encoder, decoder, outside_sent, cutoff_length=10)

'तुम क्या कर रहे हो? <EOS>'

In [40]:
outside_sent = "I am studying."
evaluate(encoder, decoder, outside_sent, cutoff_length=10)

'मैं पढ़ रही हूँ। <EOS>'

In [35]:
outside_sent = "Congratulations"
evaluate(encoder, decoder, outside_sent, cutoff_length=10)

'मुबारक हो! <EOS>'