In [1]:
import os
import pandas as pd
import torch
from collections import Counter
import torch.nn as nn
from torch.autograd import Variable
from torchvision import transforms, utils
from torch.utils.data.sampler import SubsetRandomSampler
import argparse
import os
import numpy as np
import pdb
import random

In [2]:
class DataHandler:
    
    
    def read_glove_vecs(self, glove_file):
        with open(glove_file, 'r') as f:
            words = set()
            word_to_vec_map = {}
            for line in f:
                line = line.strip().split()
                curr_word = line[0]
                words.add(curr_word)
                word_to_vec_map[curr_word] = np.array(line[1:], dtype=np.float64)

            i = 1
            words_to_index = {}
            index_to_words = {}
            for w in sorted(words):
                words_to_index[w] = i
                index_to_words[i] = w
                i = i + 1
        return words_to_index, index_to_words, word_to_vec_map


    def load_train(self, folder="./data/", rows=-1):
        for file in os.listdir(folder):
            file_path = os.path.join(os.path.abspath(folder), file)
            if file_path.__contains__("train"):
                if file_path.endswith("en"):
                    file_en = open(file_path)
                    dataset_en = self._read_file(file_en)
                elif file_path.endswith("vi"):
                    file_vi = open(file_path)
                    dataset_vi = self._read_file(file_vi)
        if rows != -1:
            return dataset_en.sample(rows), dataset_vi.sample(rows)
        return dataset_en, dataset_vi
    
    def sentences_to_indices(self, X, word_to_index, max_len):
    
        m = X.shape[0]                                   # number of training examples

        X_indices = np.zeros((m, max_len))

        for i in range(m):                               # loop over training examples

            sentence_words =X[i].lower().split()

            j = 0

            for w in sentence_words:
                X_indices[i, j] = word_to_index[w]
                j = j+1

        ### END CODE HERE ###

        return X_indices

    @staticmethod
    def _read_file(file):

        lines = file.readlines()
        lst_lines = [x.strip() for x in lines]
        return pd.DataFrame(lst_lines)
    
    @staticmethod
    def _tokenize(line, normalize_digits=True):
        
        line = re.sub('<u>', '', line)
        line = re.sub('</u>', '', line)
        line = re.sub('\[', '', line)
        line = re.sub('\]', '', line)
        words = []
        _WORD_SPLIT = re.compile("([.,!?\"'-<>:;)(])")
        _DIGIT_RE = re.compile(r"\d")
        for fragment in line.strip().lower().split():
            for token in re.split(_WORD_SPLIT, fragment):
                if not token:
                    continue
                if normalize_digits:
                    token = re.sub(_DIGIT_RE, '#', token)
                words.append(token)
        return words
    
    @staticmethod
    def _encode_seq(sample, vocab):
        for i, token in enumerate(sample):
            # pdb.set_trace()
            if token in self.vocab:

                sample[i] = self.vocab[token]
            else:
                sample[i] = self.vocab['UNK']
        return sample
    
    def _pad(sample, maxlen):
        if len(sample) > self.maxlen:
            return sample[:self.maxlen]
        pad_required = (self.maxlen - len(sample))*[0]
        return sample + pad_required
    
    def _to_tensor(sample):
        return torch.LongTensor(sample)

In [3]:
from torch.utils.data import Dataset, DataLoader
class LanguageDataset(Dataset):
    def __init__(self, lang_file, root_dir, transform=None):
        with open(os.path.join(root_dir, lang_file), 'r') as f:
            self.sequences = f.readlines()[:50000]
        self.root_dir = root_dir
        self.transform = transform
    def __len__(self):
        return len(self.sequences)
    def __getitem__(self, idx):
        return self.transform(self.sequences[idx]) if self.transform else self.sequences[idx]

class Tokenize(object):
    def compose(self, func1, func2):
        return lambda x:func1(func2(x))
    def __call__(self, sample):
        lower_and_strip = self.compose(str.rstrip, str.lower)
        sample = lower_and_strip(sample)
        return sample.split()

class EncodeSeq(object):
    def __init__(self, vocab):
        self.vocab = vocab

    def __call__(self, sample):
        for i, token in enumerate(sample):
            # pdb.set_trace()
            if token in self.vocab:
                
                sample[i] = self.vocab[token]
            else:
                sample[i] = self.vocab['UNK']
        return sample

class Pad(object):
    def __init__(self, maxlen):
        self.maxlen = maxlen
    def __call__(self, sample):
        if len(sample) > self.maxlen:
            return sample[:self.maxlen]
        pad_required = (self.maxlen - len(sample))*[0]
        return sample + pad_required
class ToTensor(object):
    def __call__(self, sample):
        return torch.LongTensor(sample)


def train(train_x):
    for i_batch, sample_batched in enumerate(train_x):
        x = sample_batched
        print(x.size())


In [4]:

class Lang():
    def __init__(self, sequences, n_words):
        self.sequences = sequences
        self.n_words = n_words
        super(Lang, self).__init__()
    def compose(self, func1, func2):
        return lambda x:func1(func2(x))
    def tokenize(self, sequence):
        lower_and_strip = self.compose(str.rstrip, str.lower)
        sample = lower_and_strip(sequence)
        return sample.split()
    def build_vocab(self):
        self.all = []
        counter = Counter()
        def pool(tokens):
            self.all.extend(tokens)
        tokenize_and_pool = self.compose(pool, self.tokenize)
        for i in range(len(self.sequences)):
            tokenize_and_pool(self.sequences[i])
        return [word for word, _ in Counter(self.all).most_common(self.n_words - 3)]
        # return list(set(self.all))
    def word2index(self):
        index={}
        self.vocab = self.build_vocab()
        self.vocab.insert(0, 'zero'); self.vocab.extend(['UNK','$'])
        for word in self.vocab:
            index[word] = len(index)
        return index
    def index2word(self):
        self.index = self.word2index()
        return {k:i for i, k in self.index.items()}

In [5]:
class EncoderRnn(nn.Module):
    def __init__(self, vocab_size, hidden_size, dropout, use_cuda):
        super(EncoderRnn, self).__init__()
        self.hidden_size = hidden_size
        self.use_cuda = use_cuda
        self.embedding = nn.Embedding(vocab_size, hidden_size)
#         self.embedding.weight.data.copy_(torch.from_numpy(emb_matrix))
        self.lstm = nn.LSTMCell(hidden_size, hidden_size)
        self.drop = nn.Dropout(dropout)
    def forward(self, inputs):
        embedded = self.drop(self.embedding(inputs))
        h_t = Variable(torch.zeros(inputs.size(0), self.hidden_size))
        c_t = Variable(torch.zeros(inputs.size(0), self.hidden_size))
        if self.use_cuda:
            h_t = h_t.cuda()
            c_t = c_t.cuda()
        for i, input_t in enumerate(embedded.chunk(embedded.size(1), dim=1)):
            h_t, c_t = self.lstm(torch.squeeze(input_t), (h_t, c_t))
        return h_t

class DecoderRnn(nn.Module):
    def __init__(self, vocab_size, hidden_size, dropout, use_cuda):
        super(DecoderRnn, self).__init__()                  
        self.hidden_size = hidden_size
        self.use_cuda = use_cuda
        self.vocab_size = vocab_size
        self.embedding = nn.Embedding(vocab_size, hidden_size)
        self.lstm = nn.LSTMCell(hidden_size, hidden_size)
        self.lstm2 = nn.LSTMCell(hidden_size, hidden_size)
        self.drop = nn.Dropout(dropout)
        self.linear = nn.Linear(hidden_size, vocab_size)
    def forward(self, inputs, hiddens, target_sequences, use_tf):
        embedded = self.drop(self.embedding(inputs))
        target_len = target_sequences.size(1)
        output= []
        h_t = hiddens
        c_t = Variable(torch.zeros(inputs.size(0), self.hidden_size))
        h_t2 = Variable(torch.zeros(inputs.size(0), self.hidden_size))
        c_t2 = Variable(torch.zeros(inputs.size(0), self.hidden_size))
        if self.use_cuda:
            c_t = c_t.cuda()
            h_t2 = h_t2.cuda()
            c_t2 = c_t2.cuda() 	
        batch_size = embedded.size(1)
        for i in range(target_len):
            h_t, c_t = self.lstm(torch.squeeze(embedded), (h_t, c_t))
            h_t2, c_t2 = self.lstm2(h_t, (h_t2, c_t2))
            if use_tf:
                embedded = self.drop(self.embedding(target_sequences.permute(1,0)[i]))
            else:
                out = F.softmax(self.linear(h_t2)).max(1)[1]
                embedded = self.drop(self.embedding(out))
            output.append(F.softmax(self.linear(h_t2)))
        output = torch.stack(output)
        return output, h_t

In [6]:

# def pretrained_embedding_layer(word_to_vec_map, word_to_index):
     
#     vocab_len = len(word_to_index) + 1                  
#     emb_dim = word_to_vec_map["cucumber"].shape[0]      
    
#     emb_matrix = np.zeros((vocab_len, emb_dim))
    
#     for word, index in word_to_index.items():
#         emb_matrix[index, :] = word_to_vec_map[word]
    
#     embedding_layer = nn.Embedding(vocab_len, emb_dim)
    
#     embedding_layer.weight.data.copy_(torch.from_numpy(emb_matrix))
    
    
#     return embedding_layer

In [7]:


teacher_forcing_ratio=0.5
def train(batch_input_sequences, batch_target_sequences, 
    criterion, encoder, decoder, lr,  eos_tok, use_cuda):
    encoder_optimizer = torch.optim.Adam(encoder.parameters(), lr=lr)
    decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=lr)
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    use_tf = True if random.random() < teacher_forcing_ratio else False
    if use_cuda:
        batch_input_sequences = (batch_input_sequences).cuda()
        batch_target_sequences = (batch_target_sequences).cuda()
    batch_input_sequences = Variable(batch_input_sequences)
    batch_target_sequences = Variable(batch_target_sequences)
    encoder.train(); decoder.train()
    encoder_hiddens = encoder(batch_input_sequences)
    decoder_inputs = np.zeros((len(batch_input_sequences), 1))
    for i in range(len(batch_input_sequences)):
        decoder_inputs[i] = eos_tok
    decoder_inputs = torch.from_numpy(decoder_inputs).long()
    if use_cuda:
        decoder_inputs = decoder_inputs.cuda()
    decoder_inputs = Variable(decoder_inputs)
    decoder_hiddens = encoder_hiddens
            
    decoder_outputs, decoder_hiddens= decoder(decoder_inputs, decoder_hiddens, batch_target_sequences, use_tf)
    loss = 0
    for i, d_o in enumerate(decoder_outputs):
        loss += criterion(d_o, batch_target_sequences.permute(1, 0)[i])
    loss.backward()
    encoder_optimizer.step()
    decoder_optimizer.step()
    return loss.data[0]
        
def validate(batch_input_sequences, batch_target_sequences, 
    criterion, encoder, decoder, lr,  eos_tok, use_cuda):
    if use_cuda:
        batch_input_sequences = (batch_input_sequences).cuda()
        batch_target_sequences = (batch_target_sequences).cuda()
    batch_input_sequences = Variable(batch_input_sequences)
    batch_target_sequences = Variable(batch_target_sequences)
    encoder.eval(); decoder.eval()
    encoder_hiddens = encoder(batch_input_sequences)
    decoder_inputs = np.zeros((len(batch_input_sequences), 1))
    for i in range(len(batch_input_sequences)):
        decoder_inputs[i] = eos_tok
    decoder_inputs = torch.from_numpy(decoder_inputs).long()
    if use_cuda:
        decoder_inputs = decoder_inputs.cuda()
    decoder_inputs = Variable(decoder_inputs)
    decoder_hiddens = encoder_hiddens
            
    decoder_outputs, decoder_hiddens= decoder(decoder_inputs, decoder_hiddens, batch_target_sequences, 0)
    loss = 0
    # pdb.set_trace()
    for i, d_o in enumerate(decoder_outputs):
        loss += criterion(d_o, batch_target_sequences.permute(1, 0)[i])
    return loss.data[0]


# ap = argparse.ArgumentParser()
# ap.add_argument('-max_len', type=int, default=100)
# ap.add_argument('-vocab_size', type=int, default=100)
# ap.add_argument('-batch_size', type=int, default=128)
# ap.add_argument('-hidden_dim', type=int, default=300)
# ap.add_argument('-dropout', type=float, default=0.3)
# ap.add_argument('-nb_epoch', type=int, default=25)
# ap.add_argument('-learning_rate', type=int, default=0.01)
# ap.add_argument('-log_step', type=int, default=5)
# args = vars(ap.parse_args())

MAX_LEN = 100
VOCAB_SIZE = 50000
BATCH_SIZE =8
HIDDEN_DIM = 300
NB_EPOCH = 25
LEARNING_RATE = 0.001
DROPOUT = 0.3
dh = DataHandler()
def save(model_ft, filename):
    save_filename = filename
    torch.save(model_ft, save_filename)
    print('Saved as %s' % save_filename)
# creating vocabulary
def create_vocab(filename):
    sentences = LanguageDataset(lang_file=filename, root_dir='data')
    lang_obj = Lang(sentences, VOCAB_SIZE)
    return lang_obj.word2index()
# pair wise matching
def pairwise(input_sequences, target_sequences, indices):
    inp = torch.stack([input_sequences[indices[i]] for i in range(len(indices))])
    tar = torch.stack([target_sequences[indices[i]] for i in range(len(indices))])
    return inp, tar


vocab_eng = create_vocab('train.en')
print(len(vocab_eng))
vocab_vi = create_vocab('train.vi')

# words_to_index, index_to_words, word_to_vec_map = dh.read_glove_vecs('data/glove.6B.50d.txt')


#Loading Data
input_sequences = LanguageDataset(lang_file='train.en', root_dir='data',
                                transform=transforms.Compose([Tokenize(),EncodeSeq(vocab_eng),
                                                                Pad(MAX_LEN),
                                                                ToTensor()]))
target_sequences = LanguageDataset(lang_file='train.vi', root_dir='data',
                                transform=transforms.Compose([Tokenize(),EncodeSeq(vocab_vi),
                                                                Pad(MAX_LEN),
                                                                ToTensor()]))


29491


In [8]:
indices = list(range(len(input_sequences)))
split_index = int(0.8*len(indices))
np.random.shuffle(indices)
train_idx, valid_idx = indices[:split_index], indices[split_index:]


train_input_sequences, train_target_sequences = pairwise(input_sequences, target_sequences, train_idx)
val_input_sequences, val_target_sequences = pairwise(input_sequences, target_sequences, valid_idx)
train_size = len(train_idx)
val_size = len(valid_idx)
# pdb.set_trace()
eos_tok = vocab_vi['$']
use_gpu = torch.cuda.is_available()
print("loading encoder and decoder models")
use_gpu = 0
encoder = EncoderRnn(len(vocab_eng), HIDDEN_DIM, DROPOUT, use_gpu)
decoder = DecoderRnn(len(vocab_vi), HIDDEN_DIM, DROPOUT, use_gpu)
if use_gpu:
    print("using GPU")
    encoder = encoder.cuda()
    decoder = decoder.cuda()
criterion = nn.CrossEntropyLoss()
total_step = (train_size/BATCH_SIZE) +1

for epoch in  range(NB_EPOCH):
    step = 0
    try:
        for i in range(0, train_size, BATCH_SIZE):
            step+=1
            num_samples = min(BATCH_SIZE, train_size - i)
            print(num_samples)
            print(len(train_input_sequences))
            train_batch_input_sequences = train_input_sequences[i:i+num_samples]
            train_batch_target_sequences = train_target_sequences[i:i+num_samples]
            train_loss = train(train_batch_input_sequences, train_batch_target_sequences, 
                        criterion, encoder, decoder,
                        LEARNING_RATE, eos_tok, use_gpu)
            print(train_loss)
            val_loss=0.0; count = 0
            for j in range(0, val_size, BATCH_SIZE):
                count+=1	
                num_samples = min(BATCH_SIZE, val_size - j)
                val_batch_input_sequences = val_input_sequences[j:j+num_samples]
                val_batch_target_sequences = val_target_sequences[j:j+num_samples]
                val_loss += validate(val_batch_input_sequences, val_batch_target_sequences, 
                            criterion, encoder, decoder,
                            LEARNING_RATE, eos_tok, use_gpu)
            if step % args['log_step']:
                print('Epoch [%d/%d], Step [%d/%d], Train Loss: %.4f \n'%(epoch+1, NB_EPOCH, step, 
                total_step, train_loss))
                print('Validation Loss %.4f'%(val_loss/float(count)))
    except KeyboardInterrupt:
        print("Saving before quit...")
        save(encoder, 'encoder_[%d/%d].pkl'%(step, total_step))
        save(decoder, 'decoder_[%d/%d].pkl'%(step, total_step))
save(encoder, 'encoder.pkl')
save(decoder, 'decoder.pkl')

loading encoder and decoder models
8
40000


RuntimeError: index out of range at /pytorch/aten/src/TH/generic/THTensorEvenMoreMath.cpp:191