In [3]:
import tensorflow as tf
import numpy as np
import os
import sys
import re
import unicodedata
from sklearn.model_selection import train_test_split
from tensorflow.python.layers.core import Dense
from tqdm import tqdm

In [4]:
path_to_zip = tf.keras.utils.get_file('spa-eng.zip', 
                                      origin='http://download.tensorflow.org/data/spa-eng.zip', 
    extract=True)
path_to_file = os.path.dirname(path_to_zip)+"/spa-eng/spa.txt"

In [5]:
# convert unicode file to ascii
def unicode_to_ascii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s)
                  if unicodedata.category(c) != 'Mn')

In [6]:
def preprocess_sentence(w):
    w = unicode_to_ascii(w.lower().strip())
    # creating a space between a word and the punctuation following it
    # eg: "he is a boy." => "he is a boy ." 
    # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
    w = re.sub(r"([?.!,¿])", r" \1 ", w)
    w = re.sub(r'[" "]+', " ", w)
    # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
    w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)
    w = w.rstrip().strip()
    # adding a start and an end token to the sentence
    # so that the model know when to start and stop predicting.
    #w = '<start> ' + w + ' <end>'
    return w

In [7]:
# 1. Remove the accents
# 2. Clean the sentences
# 3. Return word pairs in the format: [ENGLISH, SPANISH]
def create_dataset(path, num_examples):
    lines = open(path, encoding='UTF-8').read().strip().split('\n')
    word_pairs = [[preprocess_sentence(w) for w in l.split('\t')]  for l in lines[:num_examples]]
    return word_pairs

In [8]:
val =create_dataset(path_to_file, 5)
val

[['go .', 've .'],
 ['go .', 'vete .'],
 ['go .', 'vaya .'],
 ['go .', 'vayase .'],
 ['hi .', 'hola .']]

In [9]:
# This class creates a word -> index mapping (e.g,. "dad" -> 5) and vice-versa 
# (e.g., 5 -> "dad") for each language,
class LanguageIndex():
    def __init__(self, lang, target=False):
        self.lang = lang
        self.target = target
        self.word2idx = {}
        self.idx2word = {}
        self.vocab = set()
        self.create_index()
    def create_index(self):
        for phrase in self.lang:
          self.vocab.update(phrase.split(' '))
    
        self.vocab = sorted(self.vocab)
        
        if self.target:
            self.word2idx['<pad>'] = 0
            self.word2idx['<START>'] = 1
        else:
            self.word2idx['<pad>'] = 0
    
        
        for index, word in enumerate(self.vocab):
          self.word2idx[word] = index + 1
    
        for word, index in self.word2idx.items():
          self.idx2word[index] = word

In [10]:
def max_length(tensor):
    return max(len(t) for t in tensor)

In [11]:
def load_dataset(path, num_examples):
    # creating cleaned input, output pairs
    pairs = create_dataset(path, num_examples)
    
    # index language using the class defined above    
    inp_lang = LanguageIndex(sp for en, sp in pairs)
    targ_lang = LanguageIndex((en for en, sp in pairs),target=True)
    
    # Vectorize the input and target languages
    
    # Spanish sentences
    input_tensor = [[inp_lang.word2idx[s] for s in sp.split(' ')] for en, sp in pairs]
    #print(input_tensor)
    # English sentences
    target_tensor = [[1]+[targ_lang.word2idx[s] for s in en.split(' ')] for en, sp in pairs]
    target_tensor_dec = [[1]+[targ_lang.word2idx[s] for s in en.split(' ')] for en, sp in pairs]
    
    # Calculate max_length of input and output tensor
    # Here, we'll set those to the longest sentence in the dataset
    max_length_inp, max_length_tar, max_length_tar_dec = max_length(input_tensor), max_length(target_tensor),\
    max_length(target_tensor_dec)
    
    # Padding the input and output tensor to the maximum length
    input_tensor = tf.keras.preprocessing.sequence.pad_sequences(input_tensor,
                                                                maxlen=max_length_inp,
                                                                padding='post')
    target_tensor = tf.keras.preprocessing.sequence.pad_sequences(target_tensor,
                                                                 maxlen=max_length_tar,
                                                                 padding='post')
    target_tensor_dec = tf.keras.preprocessing.sequence.pad_sequences(target_tensor_dec,
                                                                     maxlen=max_length_tar+1,
                                                                     padding='post')
    return input_tensor, target_tensor,target_tensor_dec, inp_lang, targ_lang, max_length_inp, max_length_tar,\
max_length_tar_dec

In [12]:
# Try experimenting with the size of that dataset
num_examples = 3000
input_tensor, target_tensor, target_tensor_dec, inp_lang, targ_lang, max_length_inp, max_length_targ, max_length_tar_dec = \
load_dataset(path_to_file, num_examples)

In [13]:
# Creating training and validation sets using an 80-20 split
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = \
train_test_split(input_tensor, target_tensor, test_size=0.2)

# Show length
len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val)

(2400, 2400, 600, 600)

In [14]:
def dataset(data1, data2, data3, buffer_size, batch_size):
    dataset = tf.data.Dataset.from_tensor_slices((data1,data2, data3)).shuffle(buffer_size)
    dataset = dataset.map(lambda x, y, z: (x,y,z, tf.size(x),tf.size(y), tf.size(z)))
    dataset = dataset.batch(batch_size)
    return dataset

In [27]:
class Config:
    
    
    learning_rate = 0.001
    batch_size = 128
    buffer_size = len(input_tensor_train)//batch_size
    unit = 1024
    embedding_dim = 300
    n_epoch = 2
        
    vocab_inp_size = len(inp_lang.word2idx)
    vocab_tar_size = len(targ_lang.word2idx)
    reverse_dict_input = inp_lang.idx2word
    reverse_dict_target = targ_lang.idx2word
    max_dec_inp_len = max_length_targ
        
    cell = tf.nn.rnn_cell.BasicLSTMCell
    optimizer = tf.train.RMSPropOptimizer
    dataset = dataset(input_tensor,target_tensor,target_tensor_dec, buffer_size, batch_size)
    initializer = dataset.make_initializable_iterator()
    input_data, input_data_dec, target_data_dec, inp_len,inp_dec_len,tar_dec_len = \
    initializer.get_next()
        
    #checkpoint Path
    ckpt_dir = './ckpt_dir_seq'

In [28]:
def idx2token(idx, reverse_dict):
    return reverse_dict[idx]

def idx2sent(indices, reverse_dict):
    return ' '.join([idx2token(idx,reverse_dict) for idx in indices])

In [44]:
class seq2seq(object):
    
    def __init__(self, sess, config, mode):
        self.mode = mode
        self.learning_rate = config.learning_rate
        self.batch_size = config.batch_size
        self.n_epoch = config.n_epoch
        
        self.unit = config.unit
        self.embedding_dim = config.embedding_dim
        self.encoder_vocab_size = config.vocab_inp_size
        self.decoder_vocab_size = config.vocab_tar_size
        
        self.cell = config.cell
        self.optimizer = config.optimizer
        
        self.ckpt_path = config.ckpt_dir
        #encoder input 
        self.input_data = config.input_data
        self.inp_len = config.inp_len
        # decoder input
        self.input_data_dec = config.input_data_dec
        self.inp_dec_len = config.inp_dec_len
        self.tar_dec_len = config.tar_dec_len
        self.max_length_targ = config.max_dec_inp_len
        self.dec_reverse_dict = config.reverse_dict_target
        # dataset initializer
        self.initializer = config.initializer
        sess.run(tf.global_variables_initializer())
    
    def encoder(self):
        
        with tf.variable_scope('encoder') as encoder:
            self.encWemb = tf.get_variable(
                'embedding',
                initializer = tf.random_uniform([self.encoder_vocab_size, self.embedding_dim]),
                dtype = tf.float32
            )
        
        self.enc_cell = self.cell(self.unit)
        
        enc_emb_input = tf.nn.embedding_lookup(self.encWemb, self.input_data, name='encoder_input')
        
        self.enc_outputs, self.enc_last_state = tf.nn.dynamic_rnn(
            cell= self.enc_cell,
            inputs = enc_emb_input,
            sequense_length = sel.inp_len,
            time_major = False,
            dtype = tf.float32
        )
    
    def decoder(self):
        
        with tf.variable_scope('decoder') as decoder:
            self.decWemb = tf.get_variable(
                'embedding',
                initializer = tf.random_uniform([self.decoder_vocab_size, self.embedding_dim]),
                dtype = tf.float32
            )
        
        self.dec_cell = self.cell(self.unit)
        
        dec_emb_input = tf.nn.embedding_lookup(self.decWemb, self.input_data_dec)
        
        output_layer = Dense(self.decoder_vocab_size, name='output_projection')
        
        if self.mode == 'training':
            
            train_helper = tf.contrib.seq2seq.TrainingHelper(inputs=dec_emb_input,
                                                             sequence_length=self.inp_dec_len)
            
            train_decoder = tf.contrib.seq2seq.BasicDecoder(cell=self.dec_cell,
                                                            helper=train_helper,
                                                            initial_state=self.enc_last_state,
                                                            output_layer=output_layer)
            train_dec_output, train_dec_last_state, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder=train_decoder,
                impute_finished=True,
                maximum_iterations=self.max_length_targ)
            
            logits = tf.identity(train_dec_output.rnn_output, name='logits')
            
            masks = tf.sequence_mask(self.inp_dec_len, self.max_length_targ, dtype=float32,  name='masks')
            
            self.batch_loss = tf.contrib.seq2seq.sequence_loss(
                logits=logits,
                targets=self.input_data_dec,
                weights=masks,name='batch_loss'
            )
            
            self.valid_predictions = tf.identity(train_dec_output.sample_id, name='valid_preds')
        
        elif self.mode == 'inference':
            
            batch_size = tf.shape(self.input_data)[0:1]
            start_token = tf.ones(batch_size, dtype=tf.int32)
            
            infer_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
                embedding=self.decWemb,
                start_tokens=start_token,
                end_token=0)
            
            infer_decoder = tf.contrib.seq2seq.BasicDecoder(
                cell = self.dec_cell,
                helper=infer_helper, 
                initial_state=self.enc_last_state,
                output_layer=output_layer)
            
            infer_dec_outputs, infer_dec_last_state, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder=infer.decoder,
                impute_finished=True,
                maximum_iterations=self.self.inp_dec_len)
            
            self.predictions = tf.identity(infer_dec_outputs.sample_id, name='predictions')

    def add_training_op(self):
        self.training_op = self.optimizer(self.learning_rate).minimize(self.batch_loss)
    
    
    def saver(self, sess, var_list=None, save_path=None):
        print('Saving model at {0}'.format(save_path))
        if hasattr(self, 'training_variables'):
            var_list = self.training_variables
        saver = tf.train.Saver(var_list=var_list)
        saver.save(sess, save_path, write_meta_graph=False)
        
    def restore(self, sess, var_list=None,save_path=None):
        if hasattr(self, 'training_variables'):
            var_list = self.training_variables
        self.restorer = tf.train.Saver(var_list)
        self.restorer.restore(sess, ckpt_path)
        print('Restore Finished!')
        
    def summary(self):
        summary_writer = tf.summary.FileWriter(
            logdir = self.ckpt_path,
            graph = tf.get_default_graph()
        )
        
    def build(self):
        self.encoder()
        self.decoder()
        
    
    def train(self,from_scratch=False, load_ckpt = None, save_path=None):
        
        if from_scratch is False and os.path.isfile(load_ckpt):
            self.restore(sess, load_ckpt)
        
        
        self.build()
        self.add_training_op()
        
        
        loss_history = []
        for epoch in tqdm(range(self.n_epoch)):
            all_preds = []
            epoch_loss = 0
            sess.run([self.initializer.initializer])
            try:
                while True:
                    batch_pred, batch_loss, _ = sess.run([self.valid_predictions, self.batch_loss, self.training_op])
                    epoch_loss += batch_loss
                    all_preds.append(batch_pred)
            except tf.errors.OutOfRangeError:
                pass
        
            loss_history.append(epoch_loss)
            
            if epoch%2 == 0:
                print('Epoch', epoch)
                for input_batch, target_batch, batch_pred in zip(self.input_data,self.input_data_dec,batch_pred):
                    for input_sent, target_sent, pred in zip(input_batch,target_batch, batch_pred):
                        print('\tinput sent', input_sent)
                        print('\tprediction', idx2sent(pred,self.dec_reverse_dict))
                        print('\tTarget:', target_sent)
                print('\tepoch loss: {epoch_loss:.2f}\n')
        if save_path:
            self.saver(sess,save_path)
        return loss_history
    
    def inference(self,load_ckpt):
        self.restore(sess, save_path=load_ckpt)
        
        batch_preds = []
        batch_tokens = []
        batch_sent_lens = []
        
        batch_pred = sess.run([self.predictions])
        for input_sent, target_sent, pred in zip(input_batch,target_batch, batch_pred):
            print('\tinput sent', input_sent)
            print('\tprediction', idx2sent(pred,self.dec_reverse_dict))
            print('\tTarget:', target_sent)
                        
        
        


In [45]:
tf.reset_default_graph()
with tf.Session() as sess:
    config = Config()
    model = seq2seq(sess, config, mode='training')
    model.train(from_scratch=True, save_path=model.ckpt_path+'epoch_'+str(model.n_epoch))
    print('Training model built!')

ValueError: Tensor("IteratorGetNext:0", shape=(?, 9), dtype=int32) must be from the same graph as Tensor("encoder/embedding:0", shape=(1907, 300), dtype=float32_ref).

In [74]:
sess = tf.Session()

In [75]:
config = Config()

In [80]:
sess.run([config.initializer.initializer])
sess.run([config.input_data])

[array([[ 792,    1,    0, ...,    0,    0,    0],
        [ 406,    1,    0, ...,    0,    0,    0],
        [1821,    3,    0, ...,    0,    0,    0],
        ...,
        [1859,    5,  281, ...,    0,    0,    0],
        [1123,  899, 1183, ...,    0,    0,    0],
        [  11,    5, 1750, ...,    0,    0,    0]])]