In [1]:
import re
import os
import math
import glob
import time
import torch
import random
import itertools
import numpy as np
import unicodedata
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.autograd import Variable

In [2]:
# global_config.py
MODEL_PREFIX = 'seq2seq_translate'
CHECKPOINT_DIR = './checkpoints'
MAX_LENGTH = 10

use_cuda = torch.cuda.is_available()
teacher_forcing_ratio = 0.5

In [3]:
# model_util.py
def load_previous_model(encoder, decoder, checkpoint_dir, model_prefix):
    """
    this can generally used in PyTorch to load previous model,
    this function will find max epoch from checkpoints dir, for other models
    just change model load format.
    :param encoder:
    :param decoder:
    :param checkpoint_dir:
    :param model_prefix:
    :return:
    """
    f_list = glob.glob(os.path.join(checkpoint_dir, model_prefix) + '-*.pth')
    start_epoch = 1
    if len(f_list) >= 1:
        epoch_list = [int(i.split('-')[-1].split('.')[0]) for i in f_list]
        last_checkpoint = f_list[np.argmax(epoch_list)]
        if os.path.exists(last_checkpoint):
            print('load from {}'.format(last_checkpoint))
            model_state_dict = torch.load(last_checkpoint, map_location=lambda storage, loc: storage)
            encoder.load_state_dict(model_state_dict['encoder'])
            decoder.load_state_dict(model_state_dict['decoder'])
            start_epoch = np.max(epoch_list)
    return encoder, decoder, start_epoch


def save_model(encoder, decoder, checkpoint_dir, model_prefix, epoch, max_keep=5):
    """
    this method can be used in PyTorch to save model,
    this will save model with prefix and epochs.
    :param encoder:
    :param decoder:
    :param checkpoint_dir:
    :param model_prefix:
    :param epoch:
    :param max_keep:
    :return:
    """
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    f_list = glob.glob(os.path.join(checkpoint_dir, model_prefix) + '-*.pth')
    if len(f_list) >= max_keep + 2:
        # this step using for delete the more than 5 and litter one
        epoch_list = [int(i.split('-')[-1].split('.')[0]) for i in f_list]
        to_delete = [f_list[i] for i in np.argsort(epoch_list)[-max_keep:]]
        for f in to_delete:
            os.remove(f)
    name = model_prefix + '-{}.pth'.format(epoch)
    file_path = os.path.join(checkpoint_dir, name)
    model_dict = {
        'encoder': encoder.state_dict(),
        'decoder': decoder.state_dict()
    }
    torch.save(model_dict, file_path)


def as_minutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def time_since(since, percent):
    now = time.time()
    s = now - since
    es = s / percent
    rs = es - s
    return '%s (- %s)' % (as_minutes(s), as_minutes(rs))

In [4]:
# data_loader.py
class PairDataLoader(object):
    """
    this class load raw file and generate pair data.
    """

    def __init__(self):

        self.SOS_token = 0
        self.EOS_token = 1
        self.eng_prefixes = (
            "i am ", "i m ",
            "he is", "he s ",
            "she is", "she s",
            "you are", "you re ",
            "we are", "we re ",
            "they are", "they re "
        )

        self._prepare_data('eng', 'fra')

    class Lang(object):

        def __init__(self, name):
            self.name = name
            self.word2index = {}
            self.word2count = {}
            self.index2word = {0: "SOS", 1: "EOS"}
            self.n_words = 2  # Count SOS and EOS

        def add_sentence(self, sentence):
            for word in sentence.split(' '):
                self.add_word(word)

        def add_word(self, word):
            if word not in self.word2index:
                self.word2index[word] = self.n_words
                self.word2count[word] = 1
                self.index2word[self.n_words] = word
                self.n_words += 1
            else:
                self.word2count[word] += 1

    def filter_pair(self, p):
        return len(p[0].split(' ')) < MAX_LENGTH and \
               len(p[1].split(' ')) < MAX_LENGTH and \
               p[0].startswith(self.eng_prefixes)

    def filter_pairs(self, pairs):
        return [pair for pair in pairs if self.filter_pair(pair)]

    @staticmethod
    def unicode_to_ascii(s):
        return ''.join(
            c for c in unicodedata.normalize('NFD', s)
            if unicodedata.category(c) != 'Mn'
        )

    def normalize_string(self, s):
        s = self.unicode_to_ascii(s).lower().strip()
        s = re.sub(r"([.!?])", r" \1", s)
        s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
        return s

    def read_lang(self, lang1, lang2, reverse=False):
        print("Reading lines...")
        lines = open('../data/interim/%s-%s.txt' % (lang1, lang2), encoding='utf-8'). \
            read().strip().split('\n')
        pairs = [[self.normalize_string(s) for s in l.split('\t')] for l in lines]
        if reverse:
            pairs = [list(reversed(p)) for p in pairs]
            input_lang = self.Lang(lang2)
            output_lang = self.Lang(lang1)
        else:
            input_lang = self.Lang(lang1)
            output_lang = self.Lang(lang2)

        return input_lang, output_lang, pairs

    @staticmethod
    def indexes_from_sentence(lang, sentence):
        return [lang.word2index[word] for word in sentence.split(' ')]

    def variable_from_sentence(self, lang, sentence):
        indexes = self.indexes_from_sentence(lang, sentence)
        indexes.append(self.EOS_token)
        result = Variable(torch.LongTensor(indexes).view(-1, 1))
        if use_cuda:
            return result.cuda()
        else:
            return result

    def _prepare_data(self, lang1, lang2, reverse=False):
        input_lang, output_lang, pairs = self.read_lang(lang1, lang2, reverse)
        print("Read %s sentence pairs" % len(pairs))
        self.pairs = self.filter_pairs(pairs)
        print("Trimmed to %s sentence pairs" % len(self.pairs))
        print("Counting words...")
        for pair in self.pairs:
            input_lang.add_sentence(pair[0])
            output_lang.add_sentence(pair[1])
        self.input_lang = input_lang
        self.output_lang = output_lang
        print("Counted words:")
        print(input_lang.name, input_lang.n_words)
        print(output_lang.name, output_lang.n_words)

    def get_pair_variable(self):
        input_variable = self.variable_from_sentence(self.input_lang, random.choice(self.pairs)[0])
        target_variable = self.variable_from_sentence(self.output_lang, random.choice(self.pairs)[1])
        return input_variable, target_variable

In [5]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, n_layers=1):
        super(EncoderRNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, inputs, hidden):
        embedded = self.embedding(inputs).view(1, 1, -1)
        output = embedded
        for i in range(self.n_layers):
            output, hidden = self.gru(output, hidden)
        return output, hidden

    def init_hidden(self):
        result = Variable(torch.zeros(1, 1, self.hidden_size))
        if use_cuda:
            return result.cuda()
        else:
            return result


class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, n_layers=1):
        super(DecoderRNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax()

    def forward(self, inputs, hidden):
        output = self.embedding(inputs).view(1, 1, -1)
        for i in range(self.n_layers):
            output = F.relu(output)
            output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def init_hidden(self):
        result = Variable(torch.zeros(1, 1, self.hidden_size))
        if use_cuda:
            return result.cuda()
        else:
            return result


class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, n_layers=1, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, inputs, hidden, encoder_output, encoder_outputs):
        embedded = self.embedding(inputs).view(1, 1, -1)
        embedded = self.dropout(embedded)

        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)))
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        for i in range(self.n_layers):
            output = F.relu(output)
            output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]))
        return output, hidden, attn_weights

    def init_hidden(self):
        result = Variable(torch.zeros(1, 1, self.hidden_size))
        if use_cuda:
            return result.cuda()
        else:
            return result

In [16]:
def train_model(data_loader, input_variable, target_variable, encoder, decoder, encoder_optimizer, decoder_optimizer,
                criterion,
                max_length=MAX_LENGTH):
    encoder_hidden = encoder.init_hidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_variable.size()[0]
    print(input_variable.shape)
    target_length = target_variable.size()[0]

    encoder_outputs = Variable(torch.zeros(max_length, encoder.hidden_size))
    encoder_outputs = encoder_outputs.cuda() if use_cuda else encoder_outputs

    loss = 0
    try:
        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(
                input_variable[ei], encoder_hidden)
            encoder_outputs[ei] = encoder_output[0][0]
    except KeyboardInterrupt:
        return

    decoder_input = Variable(torch.LongTensor([[data_loader.SOS_token]]))
    decoder_input = decoder_input.cuda() if use_cuda else decoder_input

    decoder_hidden = encoder_hidden

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        try:
            for di in range(target_length):
                decoder_output, decoder_hidden, decoder_attention = decoder(
                    decoder_input, decoder_hidden, encoder_output, encoder_outputs)
                loss += criterion(decoder_output, target_variable[di])
                decoder_input = target_variable[di]  # Teacher forcing
        except KeyboardInterrupt:
            return

    else:
        # Without teacher forcing: use its own predictions as the next input
        try:
            for di in range(target_length):
                decoder_output, decoder_hidden, decoder_attention = decoder(
                    decoder_input, decoder_hidden, encoder_output, encoder_outputs)
                topv, topi = decoder_output.data.topk(1)
                ni = topi[0][0]

                decoder_input = Variable(torch.LongTensor([[ni]]))
                decoder_input = decoder_input.cuda() if use_cuda else decoder_input
                loss += criterion(decoder_output, target_variable[di])
                if ni == data_loader.EOS_token:
                    break
        except KeyboardInterrupt:
            return

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.data[0] / target_length


def train(data_loader, encoder, decoder, n_epochs, print_every=100, save_every=1000, evaluate_every=100,learning_rate=0.01):
    start = time.time()
    print_loss_total = 0  # Reset every print_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss()

    encoder, decoder, start_epoch = load_previous_model(encoder, decoder, CHECKPOINT_DIR, MODEL_PREFIX)

    for epoch in range(start_epoch, n_epochs + 1):

        input_variable, target_variable = data_loader.get_pair_variable()

        try:
            loss = train_model(data_loader, input_variable, target_variable, encoder,
                               decoder, encoder_optimizer, decoder_optimizer, criterion)
        except KeyboardInterrupt:
            pass
        print_loss_total += loss

        if epoch % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (time_since(start, epoch / n_epochs),
                                         epoch, epoch / n_epochs * 100, print_loss_avg))

        if epoch % save_every == 0:
            save_model(encoder, decoder, CHECKPOINT_DIR, MODEL_PREFIX, epoch)

        if epoch % evaluate_every == 0:
            evaluate_randomly(data_loader, encoder, decoder, n=1)


def evaluate(data_loader, encoder, decoder, sentence, max_length=MAX_LENGTH):
    input_variable = data_loader.variable_from_sentence(data_loader.input_lang, sentence)
    input_length = input_variable.size()[0]
    encoder_hidden = encoder.init_hidden()

    encoder_outputs = Variable(torch.zeros(max_length, encoder.hidden_size))
    encoder_outputs = encoder_outputs.cuda() if use_cuda else encoder_outputs

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(input_variable[ei],
                                                 encoder_hidden)
        encoder_outputs[ei] = encoder_outputs[ei] + encoder_output[0][0]

    decoder_input = Variable(torch.LongTensor([[data_loader.SOS_token]]))  # SOS
    decoder_input = decoder_input.cuda() if use_cuda else decoder_input

    decoder_hidden = encoder_hidden

    decoded_words = []
    decoder_attentions = torch.zeros(max_length, max_length)

    for di in range(max_length):
        decoder_output, decoder_hidden, decoder_attention = decoder(
            decoder_input, decoder_hidden, encoder_output, encoder_outputs)
        decoder_attentions[di] = decoder_attention.data
        topv, topi = decoder_output.data.topk(1)
        ni = topi[0][0]
        if ni == data_loader.EOS_token:
            decoded_words.append('<EOS>')
            break
        else:
            decoded_words.append(data_loader.output_lang.index2word[ni.item()])

        decoder_input = Variable(torch.LongTensor([[ni]]))
        decoder_input = decoder_input.cuda() if use_cuda else decoder_input

    return decoded_words, decoder_attentions[:di + 1]


def evaluate_randomly(data_loader, encoder, decoder, n=10):
    for i in range(n):
        pair = random.choice(data_loader.pairs)
        print('>', pair[0])
        print('=', pair[1])
        output_words, attentions = evaluate(data_loader, encoder, decoder, pair[0])
        output_sentence = ' '.join(output_words)
        print('<', output_sentence)
        print('')

In [17]:
def main():

    pair_data_loader = PairDataLoader()
    hidden_size = 256
    encoder1 = EncoderRNN(pair_data_loader.input_lang.n_words, hidden_size)
    attn_decoder1 = AttnDecoderRNN(hidden_size, pair_data_loader.output_lang.n_words,1, dropout_p=0.1)

    if use_cuda:
        encoder1 = encoder1.cuda()
        attn_decoder1 = attn_decoder1.cuda()
    print('start training...')
    pair_data_loader.get_pair_variable()
    train(pair_data_loader, encoder1, attn_decoder1, 75000)
    evaluate_randomly(pair_data_loader, encoder1, attn_decoder1)

In [18]:
main()

Reading lines...
Read 141382 sentence pairs
Trimmed to 11132 sentence pairs
Counting words...
Counted words:
eng 2953
fra 4540
start training...
load from ./checkpoints\seq2seq_translate-75000.pth
torch.Size([6, 1])
0m 0s (- 0m 0s) (75000 100%) 0.0332
> you re part of the problem .
= vous faites partie du probleme .
< je suis suis . <EOS>

> we re still here .
= nous sommes encore ici .
<



 je suis suis . <EOS>

> i m back .
= je suis revenu .
< je suis suis . <EOS>

> i am going to write a letter tomorrow .
= je vais ecrire une lettre demain .
< je suis suis . <EOS>

> i m not talking to you tom .
= je ne te parle pas tom .
< je suis suis . <EOS>

> we re relaxed .
= nous sommes detendus .
< je suis suis . <EOS>

> he s my hero .
= c est mon heros .
< je suis suis . <EOS>

> i m not joking .
= je ne blague pas .
< je suis suis . <EOS>

> they re boring .
= elles sont ennuyeuses .
< je suis suis . <EOS>

> i m not wearing socks .
= je ne porte pas de chaussettes .
< je suis suis . <EOS>

> you re very direct .
= vous etes tres directe .
< je suis suis . <EOS>



In [10]:
import time, sys

# update_progress() : Displays or updates a console progress bar
## Accepts a float between 0 and 1. Any int will be converted to a float.
## A value under 0 represents a 'halt'.
## A value at 1 or bigger represents 100%
def update_progress(progress):
    barLength = 50 # Modify this to change the length of the progress bar
    status = ""
    if isinstance(progress, int):
        progress = float(progress)
    if not isinstance(progress, float):
        progress = 0
        status = "error: progress var must be float\r\n"
    if progress < 0:
        progress = 0
        status = "Halt...\r\n"
    if progress >= 1:
        progress = 1
        status = "Done...\r\n"
    block = int(round(barLength*progress))
    text = "\rPercent: [{0}] {1}% {2}".format( "█"*block + "_"*(barLength-block), progress*100, status)
    sys.stdout.write(text)
    sys.stdout.flush()

for i in range(100):
    time.sleep(0.1)
    update_progress((i+1)/100.0)

Percent: [██████████████████████████████████████████████████] 100% Done...99999%  


In [19]:
class ChatbotTrainDataset(Dataset):
    class Voc(object):
        def __init__(self, name):
            self.name = name
            self.word2index = {}
            self.word2count = {}
            self.index2word = {0: "SOS", 1: "EOS", 2:"PAD"}
            self.n_words = 3  # Count SOS and EOS

        def add_sentence(self, sentence):
            for word in sentence.split(' '):
                self.add_word(word)

        def add_word(self, word):
            if word not in self.word2index:
                self.word2index[word] = self.n_words
                self.word2count[word] = 1
                self.index2word[self.n_words] = word
                self.n_words += 1
            else:
                self.word2count[word] += 1
                
    def __init__(self,lang1,lang2,MAX_LENGTH = 10,reverse=False):
        self.MAX_LENGTH = MAX_LENGTH
        self.SOS_token = 0
        self.EOS_token = 1
        self.PAD_token = 2
        self.input_voc,self.output_voc,pairs = self.prepare_data(lang1,lang2,reverse)
        input_data = []
        output_data = []
        for pair in pairs:
            input_data.append(self.indexes_from_sentence(self.input_voc, pair[0])+[self.EOS_token])
            output_data.append(self.indexes_from_sentence(self.output_voc, pair[1])+[self.EOS_token])
        self.input_data = self.zeroPadding(input_data,self.PAD_token)
        self.output_data = self.zeroPadding(output_data,self.PAD_token)       
        
    def filter_pair(self, p):
        return len(p[0].split(' ')) < self.MAX_LENGTH and \
               len(p[1].split(' ')) < self.MAX_LENGTH 

    def filter_pairs(self, pairs):
        return [pair for pair in pairs if self.filter_pair(pair)]

    @staticmethod
    def unicode_to_ascii(s):
        return ''.join(
            c for c in unicodedata.normalize('NFD', s)
            if unicodedata.category(c) != 'Mn'
        )

    def normalize_string(self, s):
        s = self.unicode_to_ascii(s.strip()).lower().strip()        
        s = re.sub("([.!?])", " \1", s)
        s = re.sub("[^a-zA-Z.!?s]+", " ", s)
        return s.strip()

    def read_lang(self, lang1, lang2, reverse=False):
        print("Reading lines...")
        # combine every two lines into pairs and normalize
        with open('../data/interim/%s-%s.txt' % (lang1, lang2), encoding='utf-8') as f:
            content = f.readlines()
        lines = [x.strip() for x in content]
        it = iter(lines)
        pairs = [[self.normalize_string(x), self.normalize_string(next(it))] for x in it]
        if reverse:
            pairs = [list(reversed(p)) for p in pairs]
            input_voc = self.Voc(lang2)
            output_voc = self.Voc(lang1)
        else:
            input_voc = self.Voc(lang1)
            output_voc = self.Voc(lang2)
        return input_voc, output_voc, pairs

    def indexes_from_sentence(self, voc, sentence):
        return [voc.word2index[word] for word in sentence.split(' ')]
    
    # batch_first: true -> false, i.e. shape: seq_len * batch
    def zeroPadding(self,data, fillvalue):
        pad = len(max(data, key=len))        
        return np.array([i + [fillvalue]*(pad-len(i)) for i in data])

    def prepare_data(self, lang1, lang2, reverse=False):
        input_voc, output_voc, pairs = self.read_lang(lang1, lang2, reverse)
        print("Read %s sentence pairs" % len(pairs))
        pairs = self.filter_pairs(pairs)
        print("Trimmed to %s sentence pairs" % len(pairs))
        print("Counting words...")
        for pair in pairs:
            input_voc.add_sentence(pair[0])
            output_voc.add_sentence(pair[1])
        print("Counted words:")
        print(input_voc.name, input_voc.n_words)
        print(output_voc.name, output_voc.n_words)
        return input_voc,output_voc,pairs
        
    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        return self.input_data[idx],self.output_data[idx]

NameError: name 'Dataset' is not defined