In [9]:
import numpy as np
import os
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import tensorflow as tf
import numpy as np
from collections import Counter
import math

def reset_graph(seed=42):
    tf.reset_default_graph()
    tf.set_random_seed(seed)
    np.random.seed(seed)

In [2]:
# Read in ENG and SPAN data

def read_list_of_sentences(filename):
    with open(filename, 'r') as f:
        content = f.readlines()
        content = [line.strip('\n') for line in content]
    return content

eng_list = read_list_of_sentences('../data/raw/europarl-v7.es-en.en')
span_list = read_list_of_sentences('../data/raw/europarl-v7.es-en.es')

In [3]:
import string

def clean_sentence_lists(source_list, target_list, max_len=64):
    source_clean, target_clean = list(), list()
    punctuation_translator = str.maketrans('','',string.punctuation)
    punctuation_translator[191] = None  # to remove inverted question mark
    for source, target in zip(source_list, target_list):
        if len(source.split()) < (max_len-1) and len(target.split()) < (max_len-1):
            if source is not '' and target is not '':
                source = source.translate(punctuation_translator)
                source = source.replace(" s ", "'s ")
                target = target.translate(punctuation_translator)
                target = target.replace(" s ", "'s ")
    #             source = pad_to_sequence_length(source, max_len)
    #             target = pad_to_sequence_length(target, max_len)
                source_clean.append(source.lower())
                target_clean.append(target.lower())
    return source_clean, target_clean

# def pad_source_sequence_length(sentence, max_len):
#     padding_length = target_length - len(sentence.split())
#     sentence = sentence + ' <PAD>' * padding_length
#     sentence = ' '.join(sentence.split()[::-1])
#     return sentence
    
# def pad_target_sequence_length(sentence, target_length):
#     padding_length = target_length - len(sentence.split())
#     sentence = sentence ' <PAD>' * padding_length
#     return sentence

eng_list, span_list = clean_sentence_lists(eng_list, span_list)

In [78]:
# Build vocabularies for english and spanish
def build_vocabulary(sentence_list, vocabulary_size=50000):
    tokens = [('<UNK>', None), ('<PAD>', None), ('<EOS>', None), ('<GO>', None)]
    vocabulary_size = vocabulary_size - len(tokens)
    word_list = [word for line in sentence_list for word in line.split()]
    vocabulary = tokens + Counter(word_list).most_common(vocabulary_size)
    vocabulary = np.array([word for word, _ in vocabulary])
    dictionary = {word: code for code, word in enumerate(vocabulary)}
    return dictionary, vocabulary     

eng_dictionary, eng_vocabulary = build_vocabulary(eng_list)
span_dictionary, span_vocabulary = build_vocabulary(span_list)

In [30]:
def create_bucket_dict(eng_sentences, span_sentences):
    # returns dict{10: indices, 20: indices ...}
    sample_bucket_sizes = []
    bucket_dict = {}
    for eng_sentence, span_sentence in zip(eng_sentences, span_sentences):
        max_len = max(len(eng_sentence.split()), len(span_sentence.split()))
        rounded_max_len = roundup(max_len)
        sample_bucket_sizes.append(rounded_max_len)
    for i in range(10,max(sample_bucket_sizes)+1, 10):
        bucket_dict[i] = create_buckets(sample_bucket_sizes, i)
        
    return bucket_dict

def roundup(x):
    return int(math.ceil((x+1) / 10.0)) * 10 # x+1 to push *0 into next bucket to account for tokens     

def create_buckets(buckets, bucket_len):
    return [index for index, value in enumerate(buckets) if value == bucket_len]
 
bucket_dict = create_bucket_dict(eng_list, span_list)

In [84]:
# Add padding and tokens
def add_tokens_to_text(source_list, target_list, bucket_dict):
    number_of_samples = len(source_list)
    source_final, target_input_final, target_output_final = [None] * number_of_samples, [None] * number_of_samples, [None] * number_of_samples
    inverse_bucket_dict = invert(bucket_dict)
    for index, bucket_size in inverse_bucket_dict.items():
        source_final[index] = pad_source_sentences(source_list[index], bucket_size)
        target_input_final[index] = pad_target_input_sentences(target_list[index], bucket_size)
        target_output_final[index] = pad_target_output_sentences(target_list[index], bucket_size)
    source_final_numerical = convert_words_to_numerical_id(source_final, eng_dictionary)
    target_input_final_numerical = convert_words_to_numerical_id(target_input_final, span_dictionary)
    target_output_final_numerical = convert_words_to_numerical_id(target_output_final, span_dictionary)
    
    return source_final_numerical, target_input_final_numerical, target_output_final_numerical

def pad_source_sentences(sentence, bucket_size):
    sentence_length = len(sentence.split())
    pad_length = bucket_size - sentence_length
    return sentence + ' <PAD>' * pad_length

def pad_target_input_sentences(sentence, bucket_size):
    sentence_length = len(sentence.split())
    pad_length = bucket_size - sentence_length - 1
    return '<GO> ' + sentence + ' <PAD>' * pad_length

def pad_target_output_sentences(sentence, bucket_size):
    sentence_length = len(sentence.split())
    pad_length = bucket_size - sentence_length - 1
    return sentence + ' <EOS> ' + ' <PAD>' * pad_length
    

def invert(d):
    return dict( (v,k) for k in d for v in d[k] )


def convert_words_to_numerical_id(sentence_list, dictionary):
    out = []
    for sentence in sentence_list:
        out.append([dictionary[word] if word in dictionary else dictionary['<UNK>'] for word in sentence.split()])
        
    return out


X_in, y_in, y_out = add_tokens_to_text(eng_list, span_list, bucket_dict)

In [6]:
# Batch Generator

import random
from collections import deque

def generate_batch(batch_size, num_skips, skip_window, data):
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1 # [ skip_window target skip_window ]
    buffer = deque(maxlen=span)
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    for i in range(batch_size // num_skips):
        target = skip_window  # target label at the center of the buffer
        targets_to_avoid = [ skip_window ]
        for j in range(num_skips):
            while target in targets_to_avoid:
                target = random.randint(0, span - 1)
            targets_to_avoid.append(target)
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[target]
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    return batch, labels

# Embedding model

In [7]:
batch_size = 128
embedding_size = 128  # Dimension of the embedding vector.
skip_window = 1       # How many words to consider left and right.
num_skips = 2         # How many times to reuse an input to generate a label.

# We pick a random validation set to sample nearest neighbors. Here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent.
valid_size = 16     # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)
num_sampled = 64    # Number of negative examples to sample.

learning_rate = 0.01

reset_graph()

# Input data.
train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

vocabulary_size = 50000
embedding_size = 150

# Look up embeddings for inputs.
init_embeds = tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0)
embeddings = tf.Variable(init_embeds)

train_inputs = tf.placeholder(tf.int32, shape=[None])
embed = tf.nn.embedding_lookup(embeddings, train_inputs)

# Construct the variables for the NCE loss
nce_weights = tf.Variable(
    tf.truncated_normal([vocabulary_size, embedding_size],
                        stddev=1.0 / np.sqrt(embedding_size)))
nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

# Compute the average NCE loss for the batch.
# tf.nce_loss automatically draws a new sample of the negative labels each
# time we evaluate the loss.
loss = tf.reduce_mean(
    tf.nn.nce_loss(nce_weights, nce_biases, train_labels, embed,
                   num_sampled, vocabulary_size))

# Construct the Adam optimizer
optimizer = tf.train.AdamOptimizer(learning_rate)
training_op = optimizer.minimize(loss)

# Compute the cosine similarity between minibatch examples and all embeddings.
norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), axis=1, keep_dims=True))
normalized_embeddings = embeddings / norm
valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)

# Add variable initializer.
init = tf.global_variables_initializer()

# Train

In [8]:
# num_steps = 50001
# data_index=0

# with tf.Session() as sess:
#     sess.run(init)

#     average_loss = 0
#     for step in range(num_steps):
#         print("\rIteration: {}".format(step), end="\t")
#         batch_inputs, batch_labels = generate_batch(batch_size, num_skips, skip_window, eng_numerical_id)
#         feed_dict = {train_inputs : batch_inputs, train_labels : batch_labels}

#         # We perform one update step by evaluating the training op (including it
#         # in the list of returned values for session.run()
#         _, loss_val = sess.run([training_op, loss], feed_dict=feed_dict)
#         average_loss += loss_val

#         if step % 2000 == 0:
#             if step > 0:
#                 average_loss /= 2000
#             # The average loss is an estimate of the loss over the last 2000 batches.
#             print("Average loss at step ", step, ": ", average_loss)
#             average_loss = 0

#         # Note that this is expensive (~20% slowdown if computed every 500 steps)
#         if step % 10000 == 0:
#             sim = similarity.eval()
#             for i in range(valid_size):
#                 valid_word = eng_vocabulary[valid_examples[i]]
#                 top_k = 8 # number of nearest neighbors
#                 nearest = (-sim[i, :]).argsort()[1:top_k+1]
#                 log_str = "Nearest to %s:" % valid_word
#                 for k in range(top_k):
#                     close_word = eng_vocabulary[nearest[k]]
#                     log_str = "%s %s," % (log_str, close_word)
#                 print(log_str)

#     eng_embeddings = normalized_embeddings.eval()

In [9]:
# num_steps = 50001
# data_index=0

# with tf.Session() as sess:
#     sess.run(init)

#     average_loss = 0
#     for step in range(num_steps):
#         print("\rIteration: {}".format(step), end="\t")
#         batch_inputs, batch_labels = generate_batch(batch_size, num_skips, skip_window, span_numerical_id)
#         feed_dict = {train_inputs : batch_inputs, train_labels : batch_labels}

#         # We perform one update step by evaluating the training op (including it
#         # in the list of returned values for session.run()
#         _, loss_val = sess.run([training_op, loss], feed_dict=feed_dict)
#         average_loss += loss_val

#         if step % 2000 == 0:
#             if step > 0:
#                 average_loss /= 2000
#             # The average loss is an estimate of the loss over the last 2000 batches.
#             print("Average loss at step ", step, ": ", average_loss)
#             average_loss = 0

#         # Note that this is expensive (~20% slowdown if computed every 500 steps)
#         if step % 10000 == 0:
#             sim = similarity.eval()
#             for i in range(valid_size):
#                 valid_word = span_vocabulary[valid_examples[i]]
#                 top_k = 8 # number of nearest neighbors
#                 nearest = (-sim[i, :]).argsort()[1:top_k+1]
#                 log_str = "Nearest to %s:" % valid_word
#                 for k in range(top_k):
#                     close_word = span_vocabulary[nearest[k]]
#                     log_str = "%s %s," % (log_str, close_word)
#                 print(log_str)

#     span_embeddings = normalized_embeddings.eval()

In [10]:
# np.save('eng_embeddings', eng_embeddings)
# np.save('span_embeddings', span_embeddings)
eng_embeddings = np.load('eng_embeddings.npy')
span_embeddings = np.load('eng_embeddings.npy')


In [11]:
# def plot_with_labels(low_dim_embs, labels):
#     assert low_dim_embs.shape[0] >= len(labels), "More labels than embeddings"
#     plt.figure(figsize=(18, 18))  #in inches
#     for i, label in enumerate(labels):
#         x, y = low_dim_embs[i,:]
#         plt.scatter(x, y)
#         plt.annotate(label,
#                      xy=(x, y),
#                      xytext=(5, 2),
#                      textcoords='offset points',
#                      ha='right',
#                      va='bottom')

In [12]:
# from sklearn.manifold import TSNE

# tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
# plot_only = 500
# low_dim_embs = tsne.fit_transform(span_embeddings[:plot_only,:])
# labels = [span_vocabulary[i] for i in range(plot_only)]
# plot_with_labels(low_dim_embs, labels)

# nmt batch generator goes here

In [14]:
def get_next_batch(indices, iteration, batch_size, x_emb=eng_embeddings,
                   y_emb=span_embeddings, eng_dict=eng_dictionary, span_dict=span_dictionary,
                  eng_sentence_list=eng_list, span_sentence_list=span_list):
    
    go = span_dict['<GO>']   # int index in vocab
    eos = span_dict['<EOS>']
    
    # indices into sentence lists
#     extract_indices = list(range(iteration*batch_size,(iteration+1)*batch_size))
    extract_indices = indices[iteration]
    
    # single english string
    eng_sent = eng_sentence_list[extract_indices]
    # single spanish string
    span_sent = span_sentence_list[extract_indices]
    
    # vocabulary vector of words in sentences
    eng_word_vector = get_indices_from_words(eng_sent, eng_dict)
    span_word_vector_start = [go] + get_indices_from_words(span_sent, span_dict)
    span_word_vector_end = get_indices_from_words(span_sent, span_dict) + [eos]
    
    # 
    X_batch = np.flip(np.array([x_emb[i,:] for i in eng_word_vector]), axis=0)  #good
    y_input_batch = np.asarray([y_emb[i,:] for i in span_word_vector_start])
    y_target_batch = np.asarray([y_emb[i,:] for i in span_word_vector_end])
    
    y_one_hot = np.zeros((50000, len(span_sent.split())+1), dtype=np.float32) 
    
    for i in range(len(span_sent.split())):
        y_one_hot[span_word_vector_end[i],i] = 1
        
    y_one_hot[go,-1] = 1
    y_one_hot = np.array((y_one_hot))
    
    sequence_length = len(eng_sent.split())
    
    return X_batch, y_input_batch, y_target_batch, y_one_hot, sequence_length

def get_indices_from_words(sentence, dictionary):
    indices = list()
    for word in sentence.split():
        try:
            indices.append(dictionary[word])
        except:
            indices.append(dictionary['<UNK>'])
    return indices
    

# NMT graph

In [21]:
reset_graph()

n_steps = 100   # sequence length
n_inputs = 150     # embedding vector length
n_neurons = 128    # whatever
vocab_size = 50000 # vocab size / length of one-hot vector

X = tf.placeholder(tf.float32, [None, None, n_inputs])              # 
y_input = tf.placeholder(tf.float32, [None, None, n_inputs])
y_target = tf.placeholder(tf.float32, [None, None, n_inputs])
y_one_hot = tf.placeholder(tf.float32, [None, vocab_size])
seq_length = tf.placeholder(tf.int32)
target_weights = tf.placeholder(tf.float32, [None])

lstm_cell = tf.contrib.rnn.BasicLSTMCell(num_units=n_neurons)

# Encoder
_, state = tf.nn.dynamic_rnn(lstm_cell, X, dtype=tf.float32, sequence_length=seq_length)

# Decoder
output_cell = tf.contrib.rnn.OutputProjectionWrapper(tf.contrib.rnn.LSTMCell(num_units=n_neurons),
                                                     output_size=vocab_size)

outputs, _ = tf.nn.dynamic_rnn(output_cell, y_input, dtype=tf.float32)

learning_rate = 0.001

loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=outputs, labels=y_one_hot))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
training_op = optimizer.minimize(loss)

init = tf.global_variables_initializer()

In [23]:
batch_size = 1
n_iterations = 10000 #len(eng_list) // batch_size
n_epochs = 1

with tf.Session() as sess:
    sess.run(init)
    for epoch in range(n_epochs):
        shuffled_indices = np.random.permutation(len(eng_list))
        for iteration in range(n_iterations):
            try:
                X_batch, y_input_batch, y_target_batch, y_target_one_hot, sequence_length = get_next_batch(shuffled_indices,
                                                                                     iteration, batch_size)
                feed_dict = {X: X_batch[np.newaxis, :, :], 
                             y_input: y_input_batch[np.newaxis, :, :], 
                             y_target: y_target_batch[np.newaxis, :, :],
                             y_one_hot: np.transpose(y_target_one_hot),
                             seq_length: sequence_length}
                sess.run(training_op, feed_dict=feed_dict)
            except:
                pass
            if iteration % 100 == 0:        
                mse = loss.eval(feed_dict=feed_dict)
                print('Iteration: ', iteration, 'Loss: ', mse)

Iteration:  0 Loss:  10.811845
Iteration:  100 Loss:  7.6193295
Iteration:  200 Loss:  6.7295575
Iteration:  300 Loss:  8.400658
Iteration:  400 Loss:  6.26856
Iteration:  500 Loss:  6.404021
Iteration:  600 Loss:  7.0464344
Iteration:  700 Loss:  7.4069743
Iteration:  800 Loss:  6.8112836
Iteration:  900 Loss:  7.7679157
Iteration:  1000 Loss:  7.232817
Iteration:  1100 Loss:  7.0093536
Iteration:  1200 Loss:  6.062546
Iteration:  1300 Loss:  6.7613754
Iteration:  1400 Loss:  5.801794
Iteration:  1500 Loss:  6.5890703
Iteration:  1600 Loss:  5.762847
Iteration:  1700 Loss:  8.57424
Iteration:  1800 Loss:  5.79988
Iteration:  1900 Loss:  7.008194
Iteration:  2000 Loss:  7.649724
Iteration:  2100 Loss:  6.621567
Iteration:  2200 Loss:  5.9604545
Iteration:  2300 Loss:  6.380586
Iteration:  2400 Loss:  6.788675
Iteration:  2500 Loss:  6.6475773
Iteration:  2600 Loss:  7.1200776
Iteration:  2700 Loss:  6.5716634
Iteration:  2800 Loss:  6.4589496
Iteration:  2900 Loss:  6.2228885
Iteration