In [1]:
def _data_train(fn):
    with open(fn, 'r') as fn:
        train = fn.readlines()
    train = [item[:-1] for item in train[:40000]]
    return train


train = _data_train(fn='phap-luat.txt')
print('length of train: {}'.format(len(train)))

length of train: 9453


In [2]:
# encoding=utf8
import codecs
import csv
import re
import sys


# sys.setdefaultencoding('utf8')

def remove_tone_line(utf8_str):
    intab_l = "ạảãàáâậầấẩẫăắằặẳẵóòọõỏôộổỗồốơờớợởỡéèẻẹẽêếềệểễúùụủũưựữửừứíìịỉĩýỳỷỵỹđ"
    intab_u = "ẠẢÃÀÁÂẬẦẤẨẪĂẮẰẶẲẴÓÒỌÕỎÔỘỔỖỒỐƠỜỚỢỞỠÉÈẺẸẼÊẾỀỆỂỄÚÙỤỦŨƯỰỮỬỪỨÍÌỊỈĨÝỲỶỴỸĐ"
    intab = list(intab_l + intab_u)

    outtab_l = "a" * 17 + "o" * 17 + "e" * 11 + "u" * 11 + "i" * 5 + "y" * 5 + "d"
    outtab_u = "A" * 17 + "O" * 17 + "E" * 11 + "U" * 11 + "I" * 5 + "Y" * 5 + "D"
    outtab = outtab_l + outtab_u

    r = re.compile("|".join(intab))
    replaces_dict = dict(zip(intab, outtab))
    return r.sub(lambda m: replaces_dict[m.group(0)], utf8_str)


remove_tone_line('Đi một ngày đàng học 1 sàng khôn')

'Di mot ngay dang hoc 1 sang khon'

In [3]:
# Tách dấu ra khỏi từ
def normalizeString(s):
    # Tách dấu câu nếu kí tự liền nhau
    marks = '[.!?,-${}()]'
    r = "([" + "\\".join(marks) + "])"
    s = re.sub(r, r" \1 ", s)
    # Thay thế nhiều spaces bằng 1 space.
    s = re.sub(r"\s+", r" ", s).strip()
    return s


normalizeString('vui vẻ, hòa đồng, hoạt bát')

  s = re.sub(r, r" \1 ", s)


'vui vẻ , hòa đồng , hoạt bát'

In [4]:
import itertools

train = [normalizeString(item) for item in train]
train_rev_accent = [remove_tone_line(item) for item in train]

print('train top 5:', train[:5])
print('train_rev_accent top 5:', train_rev_accent[:5])

train top 5: ['Phát hiện xe đò buộc hành khách trên mui , phủ bạt ( NLĐ ) - Hồi 9 giờ 30 phút ngày 13 - 2 , trong lúc làm nhiệm vụ trên tuyến Quốc lộ 1A thuộc địa phận thị trấn Đồng Cát , huyện Mộ Đức ( Quảng Ngãi ) , lực lượng cảnh sát giao thông và Thanh tra Giao thông Quảng Ngãi đã phát hiện xe khách mang biển kiểm soát 36L - 6803 do Lê Đình Thịnh ( 1977 ) , thường trú tại Thanh Hóa điều khiển chạy hướng Bắc - Nam chở đến 122 người .', 'Ngoài 115 người chật cứng trong xe , nhà xe còn đưa 7 hành khách khác lên nằm trên mui xe , dùng dây buộc và phủ bạt nhằm qua mắt các lực lượng kiểm soát . Điều đáng nói là số hành khách nằm trên mui xe đi suốt từ Thanh Hóa vào Quảng Ngãi thì mới bị phát hiện . . Lúc 15 giờ 30 phút ngày 13 - 2 , trong lúc đi tuần tra trên tuyến đường Ông Ích Đường , Công an quận Cẩm Lệ , TP Đà Nẵng đã phát hiện chiếc xe khách 38H - 4931 do tài xế Trần Quang Vinh ( sinh 1967 , Hà Tĩnh ) điều khiển , chạy tuyến Hà Tĩnh - TPHCM , trên xe có tới 100 hành khách , trong kh

In [5]:
# Default word tokens
PAD_token = 0  # Used for padding short sentences
SOS_token = 1  # Start-of-sentence token
EOS_token = 2  # End-of-sentence token


class Voc:
    def __init__(self, name):
        self.name = name
        self.trimmed = False
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3  # Count SOS, EOS, PAD

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.num_words
            self.word2count[word] = 1
            self.index2word[self.num_words] = word
            self.num_words += 1
        else:
            self.word2count[word] += 1

    # Remove words below a certain count threshold
    def trim(self, min_count):
        if self.trimmed:
            return
        self.trimmed = True

        keep_words = []

        for k, v in self.word2count.items():
            if v >= min_count:
                keep_words.append(k)

        print('keep_words {} / {} = {:.4f}'.format(
            len(keep_words), len(self.word2index), len(keep_words) / len(self.word2index)
        ))

        # Reinitialize dictionaries
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3  # Count default tokens

        for word in keep_words:
            self.addWord(word)

In [6]:
def _ngram(text, length=4):
    words = text.split()
    grams = []
    if len(words) <= length:
        words = words + ["PAD"] * (length - len(words))
        return [' '.join(words)]
    else:
        for i in range(len(words) - length + 1):
            grams.append(' '.join(words[i:(i + length)]))
        return grams


print(_ngram('mùa đông năm nay không còn lạnh nữa. Vì đã có gấu 37 độ ấm'))
print(_ngram('mùa đông'))

['mùa đông năm nay', 'đông năm nay không', 'năm nay không còn', 'nay không còn lạnh', 'không còn lạnh nữa.', 'còn lạnh nữa. Vì', 'lạnh nữa. Vì đã', 'nữa. Vì đã có', 'Vì đã có gấu', 'đã có gấu 37', 'có gấu 37 độ', 'gấu 37 độ ấm']
['mùa đông PAD PAD']


In [7]:
import itertools

train_grams = list(itertools.chain.from_iterable([_ngram(item) for item in train]))
train_rev_acc_grams = list(itertools.chain.from_iterable([_ngram(item) for item in train_rev_accent]))


In [8]:
corpus = list(zip(train_rev_acc_grams, train_grams))
corpus[:5]

[('Phat hien xe do', 'Phát hiện xe đò'),
 ('hien xe do buoc', 'hiện xe đò buộc'),
 ('xe do buoc hanh', 'xe đò buộc hành'),
 ('do buoc hanh khach', 'đò buộc hành khách'),
 ('buoc hanh khach tren', 'buộc hành khách trên')]

In [9]:
import unicodedata
import os

MAX_LENGTH = 4  # Maximum sentence length to consider


# Turn a Unicode string to plain ASCII, thanks to
# https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )


# Lowercase, trim, and remove non-letter characters
def normalizeString(s):
    # Tách dấu câu nếu kí tự liền nhau
    s = re.sub(r"([.!?,\-\&\(\)\[\]])", r" \1 ", s)
    # Thay thế nhiều spaces bằng 1 space.
    s = re.sub(r"\s+", r" ", s).strip()
    return s


# Read query/response pairs and return a voc object
def readVocs(lines, corpus_name='corpus'):
    # Split every line into pairs and normalize
    pairs = [[normalizeString(str(s)) for s in l] for l in lines]
    voc = Voc(corpus_name)
    return voc, pairs


voc, pairs = readVocs(corpus)


# Returns True iff both sentences in a pair 'p' are under the MAX_LENGTH threshold
def filterPair(p):
    # Input sequences need to preserve the last word for EOS token
    return len(p[0].split(' ')) < MAX_LENGTH and len(p[1].split(' ')) < MAX_LENGTH


# Filter pairs using filterPair condition
def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]


# # Using the functions defined above, return a populated voc object and pairs list
def loadPrepareData(voc, pairs):
    print("Read {!s} sentence pairs".format(len(pairs)))
    # pairs = filterPairs(pairs)
    print("Trimmed to {!s} sentence pairs".format(len(pairs)))
    print("Counting words...")
    for pair in pairs:
        voc.addSentence(pair[0])
        voc.addSentence(pair[1])
    print("Counted words:", voc.num_words)
    return voc, pairs


# Load/Assemble voc and pairs
save_dir = os.path.join("data", "save")
voc, pairs = loadPrepareData(voc, pairs)
# Print some pairs to validate
print("\npairs:")
for pair in pairs[:10]:
    print(pair)

Read 981096 sentence pairs
Trimmed to 981096 sentence pairs
Counting words...
Counted words: 22043

pairs:
['Phat hien xe do', 'Phát hiện xe đò']
['hien xe do buoc', 'hiện xe đò buộc']
['xe do buoc hanh', 'xe đò buộc hành']
['do buoc hanh khach', 'đò buộc hành khách']
['buoc hanh khach tren', 'buộc hành khách trên']
['hanh khach tren mui', 'hành khách trên mui']
['khach tren mui ,', 'khách trên mui ,']
['tren mui , phu', 'trên mui , phủ']
['mui , phu bat', 'mui , phủ bạt']
[', phu bat (', ', phủ bạt (']


In [10]:
MIN_COUNT = 3  # Minimum word count threshold for trimming


def trimRareWords(voc, pairs, MIN_COUNT):
    # Trim words used under the MIN_COUNT from the voc
    voc.trim(MIN_COUNT)
    # Filter out pairs with trimmed words
    keep_pairs = []
    for pair in pairs:
        input_sentence = pair[0]
        output_sentence = pair[1]
        keep_input = True
        keep_output = True
        # Check input sentence
        for word in input_sentence.split(' '):
            if word not in voc.word2index:
                keep_input = False
                break
        # Check output sentence
        for word in output_sentence.split(' '):
            if word not in voc.word2index:
                keep_output = False
                break

        # Only keep pairs that do not contain trimmed word(s) in their input or output sentence
        if keep_input and keep_output:
            keep_pairs.append(pair)

    print("Trimmed from {} pairs to {}, {:.4f} of total".format(len(pairs), len(keep_pairs),
                                                                len(keep_pairs) / len(pairs)))
    return keep_pairs


# Trim voc and pairs
pairs = trimRareWords(voc, pairs, MIN_COUNT)

keep_words 21411 / 22040 = 0.9715
Trimmed from 981096 pairs to 980513, 0.9994 of total


In [11]:
print('EOS_token: ', EOS_token)
print('SOS_token: ', SOS_token)
print('PAD_token: ', PAD_token)

EOS_token:  2
SOS_token:  1
PAD_token:  0


In [12]:
import random
import torch


def indexesFromSentence(voc, sentence):
    return [voc.word2index[word] for word in sentence.split(' ')] + [EOS_token]


# Padding thêm 0 vào list nào có độ dài nhỏ hơn về phía bên phải
def zeroPadding(l, fillvalue=PAD_token):
    return list(itertools.zip_longest(*l, fillvalue=fillvalue))


# Tạo ma trận binary có kích thước như ma trận truyền vào l nhưng giá trị của mỗi phần tử đánh dấu 1 hoặc 0 tương ứng với padding hoặc không padding
def binaryMatrix(l, value=PAD_token):
    m = []
    for i, seq in enumerate(l):
        m.append([])
        for token in seq:
            if token == PAD_token:
                m[i].append(0)
            else:
                m[i].append(1)
    return m


# Returns padded input sequence tensor and lengths
def inputVar(l, voc):
    indexes_batch = [indexesFromSentence(voc, sentence) for sentence in l]
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
    padList = zeroPadding(indexes_batch)
    padVar = torch.LongTensor(padList)
    return padVar, lengths


# Returns padded target sequence tensor, padding mask, and max target length
def outputVar(l, voc):
    indexes_batch = [indexesFromSentence(voc, sentence) for sentence in l]
    max_target_len = max([len(indexes) for indexes in indexes_batch])
    padList = zeroPadding(indexes_batch)
    mask = binaryMatrix(padList)
    mask = torch.ByteTensor(mask)
    padVar = torch.LongTensor(padList)
    return padVar, mask, max_target_len


# Returns all items for a given batch of pairs
def batch2TrainData(voc, pair_batch):
    pair_batch.sort(key=lambda x: len(x[0].split(" ")), reverse=True)
    input_batch, output_batch = [], []
    for pair in pair_batch:
        input_batch.append(pair[0])
        output_batch.append(pair[1])
    inp, lengths = inputVar(input_batch, voc)
    output, mask, max_target_len = outputVar(output_batch, voc)
    return inp, lengths, output, mask, max_target_len


# Example for validation
small_batch_size = 4
batches = batch2TrainData(voc, [random.choice(pairs) for _ in range(small_batch_size)])
input_variable, lengths, target_variable, mask, max_target_len = batches

In [13]:
print("input_variable: \n", input_variable)
print("lengths: \n", lengths)
print("target_variable: \n", target_variable)
print("mask: \n", mask)
print("max_target_len: \n", max_target_len)

input_variable: 
 tensor([[1110,  304,  239,  213],
        [ 188,  592,  213, 1044],
        [4490, 1448, 1044,  619],
        [ 434,  145,  349, 2285],
        [   2,    2,    2,    2]])
lengths: 
 tensor([5, 5, 5, 5])
target_variable: 
 tensor([[1111,  305,  240,  213],
        [ 189,  592,  213, 1226],
        [4491, 1449, 1226, 1237],
        [ 435,  854,  350, 3190],
        [   2,    2,    2,    2]])
mask: 
 tensor([[1, 1, 1, 1],
        [1, 1, 1, 1],
        [1, 1, 1, 1],
        [1, 1, 1, 1],
        [1, 1, 1, 1]], dtype=torch.uint8)
max_target_len: 
 5


In [14]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import torch
from torch.jit import script, trace
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import csv
import random
import re
import os
import unicodedata
import codecs
from io import open
import itertools
import math

USE_CUDA = torch.cuda.is_available()
device = torch.device("cuda" if USE_CUDA else "cpu")

In [15]:
class EncoderRNN(nn.Module):
    def __init__(self, hidden_size, embedding, n_layers=1, dropout=0):
        super(EncoderRNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.embedding = embedding

        # Initialize GRU; the input_size and hidden_size params are both set to 'hidden_size'
        # set bidirectional = True for bidirectional
        # https://pytorch.org/docs/stable/nn.html?highlight=gru#torch.nn.GRU to get more information
        self.gru = nn.GRU(input_size=hidden_size,  # number of expected feature of input x 
                          hidden_size=hidden_size,  # number of expected feature of hidden state 
                          num_layers=n_layers,  # number of GRU layers
                          dropout=(0 if n_layers == 1 else dropout),  # dropout probability apply in encoder network
                          bidirectional=True  # one or two directions.
                          )

    def forward(self, input_seq, input_lengths, hidden=None):
        # Step 1: Convert word indexes to embeddings
        # shape: (max_length , batch_size , hidden_size)
        embedded = self.embedding(input_seq)
        # Pack padded batch of sequences for RNN module. Padding zero when length less than max_length of input_lengths.
        # shape: (max_length , batch_size , hidden_size)
        packed = nn.utils.rnn.pack_padded_sequence(embedded, input_lengths)
        # Step 2: Forward packed through GRU
        # outputs is output of final GRU layer
        # hidden is concatenate of all hidden states corresponding with each time step.
        # outputs shape: (max_length , batch_size , hidden_size x num_directions)
        # hidden shape: (n_layers x num_directions , batch_size , hidden_size)
        outputs, hidden = self.gru(packed, hidden)
        # Unpack padding. Revert of pack_padded_sequence
        # outputs shape: (max_length , batch_size , hidden_size x num_directions)
        outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs)
        # Sum bidirectional GRU outputs to reshape shape into (max_length , batch_size , hidden_size)
        outputs = outputs[:, :, :self.hidden_size] + outputs[:, :, self.hidden_size:]
        # outputs shape:(max_length , batch_size , hidden_size)
        # hidden shape: (n_layers x num_directions , batch_size , hidden_size)
        return outputs, hidden

In [16]:
# Thử nghiệm phrase Encoder bằng cách giả lập 1 mạng Encoder với:
from torch import nn

hidden_size = 3
n_layers = 7
embedding = nn.Embedding(voc.num_words, hidden_size)
print('input_seq: \n', input_variable)
print('input_lengths: \n', lengths)
encoder = EncoderRNN(hidden_size=hidden_size, embedding=embedding, n_layers=n_layers)

print('encoder phrase: \n', encoder)

output, hidden = encoder.forward(input_seq=input_variable, input_lengths=lengths)

input_seq: 
 tensor([[1110,  304,  239,  213],
        [ 188,  592,  213, 1044],
        [4490, 1448, 1044,  619],
        [ 434,  145,  349, 2285],
        [   2,    2,    2,    2]])
input_lengths: 
 tensor([5, 5, 5, 5])
encoder phrase: 
 EncoderRNN(
  (embedding): Embedding(21414, 3)
  (gru): GRU(3, 3, num_layers=7, bidirectional=True)
)


In [17]:
print('output size: ', output.size())
print('hidden size: ', hidden.size())

output size:  torch.Size([5, 4, 3])
hidden size:  torch.Size([14, 4, 3])


In [18]:
class Attn(nn.Module):
    def __init__(self, method, hidden_size):
        super(Attn, self).__init__()
        self.method = method
        if self.method not in ['dot', 'general', 'concat']:
            raise ValueError(self.method, "is not an appropriate attention method.")
        self.hidden_size = hidden_size
        if self.method == 'general':
            self.attn = nn.Linear(self.hidden_size, hidden_size)
        elif self.method == 'concat':
            self.attn = nn.Linear(self.hidden_size * 2, hidden_size)
            self.v = nn.Parameter(torch.FloatTensor(hidden_size))

    def dot_score(self, hidden, encoder_output):
        # encoder_output shape:(max_length , batch_size , hidden_size)
        # hidden shape: (1 , batch_size , hidden_size)
        # return shape: (max_length, batch_size)
        return torch.sum(hidden * encoder_output, dim=2)

    def general_score(self, hidden, encoder_output):
        # encoder_output shape:(max_length , batch_size , hidden_size)
        # hidden shape: (batch_size , hidden_size)
        # energy shape: (max_length , batch_size , hidden_size)
        # return shape: (max_length , batch_size)
        energy = self.attn(encoder_output)
        return torch.sum(hidden * energy, dim=2)

    def concat_score(self, hidden, encoder_output):
        # encoder_output shape:(max_length , batch_size , hidden_size)
        # hidden shape: (batch_size , hidden_size)
        # energy shape: (max_length , batch_size , 2*hidden_size)
        # self.v shape: (hidden_size)
        # return shape: (max_length , batch_size)
        energy = self.attn(torch.cat((hidden.expand(encoder_output.size(0), -1, -1), encoder_output), 2)).tanh()
        return torch.sum(self.v * energy, dim=2)

    def forward(self, hidden, encoder_outputs):
        # Calculate the attention weights (energies) based on the given method
        # attn_energies.shape: (max_length , batch_size)
        if self.method == 'general':
            attn_energies = self.general_score(hidden, encoder_outputs)
        elif self.method == 'concat':
            attn_energies = self.concat_score(hidden, encoder_outputs)
        elif self.method == 'dot':
            attn_energies = self.dot_score(hidden, encoder_outputs)

        # Transpose max_length and batch_size dimensions
        # attn_energies.shape: (batch_size , max_length)
        attn_energies = attn_energies.t()
        # Return the softmax normalized probability scores (with added dimension)
        attn_weights = F.softmax(attn_energies, dim=1).unsqueeze(1)
        # attn_weights shape: (batch_size , 1 , max_length)
        return attn_weights

In [19]:
class LuongAttnDecoderRNN(nn.Module):
    def __init__(self, attn_model, embedding, hidden_size, output_size, n_layers=1, dropout=0.1):
        super(LuongAttnDecoderRNN, self).__init__()

        # Keep for reference
        self.attn_model = attn_model
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = dropout

        # Define layers
        self.embedding = embedding
        self.embedding_dropout = nn.Dropout(dropout)
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=(0 if n_layers == 1 else dropout))
        self.concat = nn.Linear(hidden_size * 2, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

        self.attn = Attn(attn_model, hidden_size)

    def forward(self, input_step, last_hidden, encoder_outputs):
        '''
        input_step: list time step index of batch. shape (1 x batch_size)
        last_hidden: last hidden output of hidden layer (we can take in right direction or left direction upon us) which have shape = (n_layers x batch_size x hidden_size)
        encoder_outputs: output of encoder 
        '''
        #===========================================
        # Step 1: Embedding current sequence index
        # Note: we run this one step (word) at a time
        # Get embedding of current input word
        # embedded shape: 1 x batch_size x hidden_size
        embedded = self.embedding(input_step)
        embedded = self.embedding_dropout(embedded)

        #===========================================
        # Step 2: pass embedded and last hidden into decoder
        # Forward through unidirectional GRU
        # rnn_output shape: 1 x batch_size x hidden_size
        # hidden shape: n_layers x batch_size x hidden_size
        rnn_output, hidden = self.gru(embedded, last_hidden)
        # Calculate attention weights from the current GRU output
        # attn_weights shape: batch_size x 1 x max_length
        attn_weights = self.attn(rnn_output, encoder_outputs)
        # Multiply attention weights to encoder outputs to get new "weighted sum" context vector
        # encoder_outputs shape: max_length x batch_size x hidden_size
        # context shape: batch_size x 1 x hidden_size
        context = attn_weights.bmm(encoder_outputs.transpose(0, 1))
        # Concatenate weighted context vector and GRU output using Luong eq. 5
        # rnn_output shape: batch_size x hidden_size
        rnn_output = rnn_output.squeeze(0)
        # context shape: batch_size x hidden_size
        context = context.squeeze(1)

        #===========================================
        # Step 3: calculate output probability distribution 
        # concat_input shape: batch_size x (2*hidden_size)
        concat_input = torch.cat((rnn_output, context), 1)
        # concat_output shape: batch_size x hidden_size
        concat_output = torch.tanh(self.concat(concat_input))
        # Predict next word using Luong eq. 6
        # output shape: output_size
        output = self.out(concat_output)
        output = F.softmax(output, dim=1)
        # Return output and final hidden state
        return output, hidden

In [20]:
time_step = 0
# Take all index of batch at time step 0. All words are <SOS> mark for start of sentences.
input_step = torch.tensor([SOS_token] * small_batch_size).unsqueeze(0)
n_layers = 7
# take last hidden vector of encoder
last_hidden = hidden[:n_layers]
print('batch_size: ', small_batch_size)
print('input_step.size at time_step 0: ', input_step.size())
print('last_hidden.size: ', last_hidden.size())
attn_model = 'dot'
hidden_size = 3
# Output size of decoder model is size of vocabulary
output_size = len(voc.word2index)

luongAttnDecoderRNN = LuongAttnDecoderRNN(attn_model=attn_model,
                                          embedding=embedding,
                                          hidden_size=hidden_size,
                                          output_size=output_size,
                                          n_layers=n_layers)

print('luongAttnDecoderRNN phrase: \n', luongAttnDecoderRNN)
dec_output, dec_hidden = luongAttnDecoderRNN.forward(input_step=input_step,
                                                     last_hidden=last_hidden,
                                                     encoder_outputs=output)

batch_size:  4
input_step.size at time_step 0:  torch.Size([1, 4])
last_hidden.size:  torch.Size([7, 4, 3])
luongAttnDecoderRNN phrase: 
 LuongAttnDecoderRNN(
  (embedding): Embedding(21414, 3)
  (embedding_dropout): Dropout(p=0.1, inplace=False)
  (gru): GRU(3, 3, num_layers=7, dropout=0.1)
  (concat): Linear(in_features=6, out_features=3, bias=True)
  (out): Linear(in_features=3, out_features=21411, bias=True)
  (attn): Attn()
)


In [21]:
def maskNLLLoss(inp, target, mask):
    nTotal = mask.sum()
    crossEntropy = -torch.log(torch.gather(inp, 1, target.view(-1, 1)).squeeze(1))
    loss = crossEntropy.masked_select(mask).mean()
    loss = loss.to(device)
    return loss, nTotal.item()

In [22]:
def train(input_variable, lengths, target_variable, mask, max_target_len, encoder, decoder, embedding,
          encoder_optimizer, decoder_optimizer, batch_size, clip, max_length=MAX_LENGTH):
    # Zero gradients
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    # Set device options
    input_variable = input_variable.to(device)
    lengths = lengths.to(device)
    target_variable = target_variable.to(device)
    mask = mask.to(device)

    # Initialize variables
    loss = 0
    print_losses = []
    n_totals = 0

    # Forward pass through encoder
    encoder_outputs, encoder_hidden = encoder(input_variable, lengths)

    # Create initial decoder input (start with SOS tokens for each sentence)
    decoder_input = torch.LongTensor([[SOS_token for _ in range(batch_size)]])
    decoder_input = decoder_input.to(device)

    # Set initial decoder hidden state to the encoder's final hidden state
    decoder_hidden = encoder_hidden[:decoder.n_layers]

    # Determine if we are using teacher forcing this iteration
    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    # Forward batch of sequences through decoder one time step at a time
    if use_teacher_forcing:
        for t in range(max_target_len):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden, encoder_outputs
            )
            # Teacher forcing: next input is current target
            decoder_input = target_variable[t].view(1, -1)
            # Calculate and accumulate loss
            mask_loss, nTotal = maskNLLLoss(decoder_output, target_variable[t], mask[t])
            loss += mask_loss
            print_losses.append(mask_loss.item() * nTotal)
            n_totals += nTotal
    else:
        for t in range(max_target_len):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden, encoder_outputs
            )
            # No teacher forcing: next input is decoder's own current output
            _, topi = decoder_output.topk(1)
            decoder_input = torch.LongTensor([[topi[i][0] for i in range(batch_size)]])
            decoder_input = decoder_input.to(device)
            # Calculate and accumulate loss
            mask_loss, nTotal = maskNLLLoss(decoder_output, target_variable[t], mask[t])
            loss += mask_loss
            print_losses.append(mask_loss.item() * nTotal)
            n_totals += nTotal

    # Perform backpropatation
    loss.backward()

    # Clip gradients: gradients are modified in place
    _ = nn.utils.clip_grad_norm_(encoder.parameters(), clip)
    _ = nn.utils.clip_grad_norm_(decoder.parameters(), clip)

    # Adjust model weights
    encoder_optimizer.step()
    decoder_optimizer.step()

    return sum(print_losses) / n_totals

In [23]:
def trainIters(model_name, voc, pairs, encoder, decoder,
               encoder_optimizer, decoder_optimizer,
               embedding, encoder_n_layers, decoder_n_layers,
               save_dir, n_iteration, batch_size, print_every,
               save_every, clip, corpus_name, loadFilename):
    # Load batches for each iteration
    training_batches = [batch2TrainData(voc, [random.choice(pairs) for _ in range(batch_size)])
                        for _ in range(n_iteration)]

    # Initializations
    print('Initializing ...')
    start_iteration = 1
    print_loss = 0
    if loadFilename:
        start_iteration = checkpoint['iteration'] + 1

    # Training loop
    print("Training...")
    for iteration in range(start_iteration, n_iteration + 1):
        training_batch = training_batches[iteration - 1]
        # Extract fields from batch
        input_variable, lengths, target_variable, mask, max_target_len = training_batch
        print(mask)
        # Run a training iteration with batch
        loss = train(input_variable, lengths, target_variable, mask, max_target_len, encoder,
                     decoder, embedding, encoder_optimizer, decoder_optimizer, batch_size, clip)
        print_loss += loss

        # Print progress
        if iteration % print_every == 0:
            print_loss_avg = print_loss / print_every
            print("Iteration: {}; Percent complete: {:.1f}%; Average loss: {:.4f}".format(iteration,
                                                                                          iteration / n_iteration * 100,
                                                                                          print_loss_avg))
            print_loss = 0

        # Save checkpoint
        if iteration % save_every == 0:
            directory = os.path.join(save_dir, model_name, corpus_name,
                                     '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size))
            if not os.path.exists(directory):
                os.makedirs(directory)
            torch.save({
                'iteration': iteration,
                'en': encoder.state_dict(),
                'de': decoder.state_dict(),
                'en_opt': encoder_optimizer.state_dict(),
                'de_opt': decoder_optimizer.state_dict(),
                'loss': loss,
                'voc_dict': voc.__dict__,
                'embedding': embedding.state_dict()
            }, os.path.join(directory, '{}_{}.tar'.format(iteration, 'checkpoint')))

In [24]:
class GreedySearchDecoder(nn.Module):
    def __init__(self, encoder, decoder):
        super(GreedySearchDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, input_seq, input_length, max_length):
        # Forward input through encoder model
        encoder_outputs, encoder_hidden = self.encoder(input_seq, input_length)
        # Prepare encoder's final hidden layer to be first hidden input to the decoder
        decoder_hidden = encoder_hidden[:decoder.n_layers]
        # Initialize decoder input with SOS_token
        decoder_input = torch.ones(1, 1, device=device, dtype=torch.long) * SOS_token
        # Initialize tensors to append decoded words to
        all_tokens = torch.zeros([0], device=device, dtype=torch.long)
        all_scores = torch.zeros([0], device=device)
        # Iteratively decode one word token at a time
        for _ in range(max_length):
            # Forward pass through decoder
            decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden, encoder_outputs)
            # Obtain most likely word token and its softmax score
            decoder_scores, decoder_input = torch.max(decoder_output, dim=1)
            # Record token and score
            all_tokens = torch.cat((all_tokens, decoder_input), dim=0)
            all_scores = torch.cat((all_scores, decoder_scores), dim=0)
            # Prepare current token to be next decoder input (add a dimension)
            decoder_input = torch.unsqueeze(decoder_input, 0)
        # Return collections of word tokens and scores
        return all_tokens, all_scores

In [25]:
def evaluate(encoder, decoder, searcher, voc, sentence, max_length=MAX_LENGTH):
    ### Format input sentence as a batch
    # words -> indexes
    indexes_batch = [indexesFromSentence(voc, sentence)]
    # Create lengths tensor
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
    # Transpose dimensions of batch to match models' expectations
    input_batch = torch.LongTensor(indexes_batch).transpose(0, 1)
    # Use appropriate device
    input_batch = input_batch.to(device)
    lengths = lengths.to(device)
    # Decode sentence with searcher
    tokens, scores = searcher(input_batch, lengths, max_length)
    # indexes -> words
    decoded_words = [voc.index2word[token.item()] for token in tokens]
    return decoded_words


def evaluateInput(encoder, decoder, searcher, voc):
    input_sentence = ''
    while (1):
        try:
            # Get input sentence
            input_sentence = input('> ')
            # Check if it is quit case
            if input_sentence == 'q' or input_sentence == 'quit': break
            # Normalize sentence
            input_sentence = normalizeString(input_sentence)
            # Evaluate sentence
            output_words = evaluate(encoder, decoder, searcher, voc, input_sentence)
            # Format and print response sentence
            output_words[:] = [x for x in output_words if not (x == 'EOS' or x == 'PAD')]
            print('Bot:', ' '.join(output_words))

        except KeyError:
            print("Error: Encountered unknown word.")

In [26]:
# Configure models
model_name = 'correct_spelling_model'
corpus_name = 'corpus_aivivn'
attn_model = 'dot'
#attn_model = 'general'
#attn_model = 'concat'
hidden_size = 500
encoder_n_layers = 2
decoder_n_layers = 2
dropout = 0.1
batch_size = 1000

# Set checkpoint to load from; set to None if starting from scratch
loadFilename = None
checkpoint_iter = 5000

# Load model if a loadFilename is provided
if loadFilename:
    # If loading on same machine the model was trained on
    checkpoint = torch.load(loadFilename)
    # If loading a model trained on GPU to CPU
    #checkpoint = torch.load(loadFilename, map_location=torch.device('cpu'))
    encoder_sd = checkpoint['en']
    decoder_sd = checkpoint['de']
    encoder_optimizer_sd = checkpoint['en_opt']
    decoder_optimizer_sd = checkpoint['de_opt']
    embedding_sd = checkpoint['embedding']
    voc.__dict__ = checkpoint['voc_dict']

print('Building encoder and decoder ...')
# Initialize word embeddings
embedding = nn.Embedding(voc.num_words, hidden_size)
# if loadFilename:
#     embedding.load_state_dict(embedding_sd)
# Initialize encoder & decoder models
encoder = EncoderRNN(hidden_size, embedding, encoder_n_layers, dropout)
decoder = LuongAttnDecoderRNN(attn_model, embedding, hidden_size, voc.num_words, decoder_n_layers, dropout)
if loadFilename:
    encoder.load_state_dict(encoder_sd)
    decoder.load_state_dict(decoder_sd)
# Use appropriate device
encoder = encoder.to(device)
decoder = decoder.to(device)
print('Models built and ready to go!')

Building encoder and decoder ...
Models built and ready to go!


In [27]:
# Configure training/optimization
clip = 50.0
teacher_forcing_ratio = 1.0
learning_rate = 0.0001
decoder_learning_ratio = 5.0
n_iteration = 500
print_every = 100
save_every = 200

# Ensure dropout layers are in train mode
encoder.train()
decoder.train()

# Initialize optimizers
print('Building optimizers ...')
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate * decoder_learning_ratio)
if loadFilename:
    encoder_optimizer.load_state_dict(encoder_optimizer_sd)
    decoder_optimizer.load_state_dict(decoder_optimizer_sd)

# Run training iterations
print("Starting Training!")
trainIters(model_name, voc, pairs, encoder, decoder, encoder_optimizer, decoder_optimizer,
           embedding, encoder_n_layers, decoder_n_layers, save_dir, n_iteration, batch_size,
           print_every, save_every, clip, corpus_name, loadFilename)

Building optimizers ...
Starting Training!
Initializing ...
Training...
tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 0, 0,  ..., 0, 0, 0],
        [1, 0, 0,  ..., 0, 0, 0]], dtype=torch.uint8)


RuntimeError: masked_select: expected BoolTensor for mask

In [None]:
# Set dropout layers to eval mode
encoder.eval()
decoder.eval()

# Initialize search module
searcher = GreedySearchDecoder(encoder, decoder)

# Begin chatting (uncomment and run the following line to begin)
evaluateInput(encoder, decoder, searcher, voc)
