First we must download the data!

In [1]:
import requests
from pathlib import Path

dataURL = "http://www.statmt.org/europarl/v7/pt-en.tgz"
fileName = "pt-en.tgz"
tarFile = Path(fileName)
if tarFile.is_file():
    print("Already downloaded")
else:
    r = requests.get(dataURL)
    with open(fileName, 'wb') as fd:
        for chunk in r.iter_content(chunk_size=128):
            fd.write(chunk)

Already downloaded


In [3]:
from nltk.tokenize import word_tokenize

def tokenize(sentance, language):
    tokens = word_tokenize(sentance.decode('utf-8'), language=language)
    return [word.lower() for word in tokens if word.isalpha()]

In [4]:
def getLineLengths(corpusFile, language, outfile):
    lengths = {}
    with open(corpusFile) as corpus:
      for line in corpus:
        tokens = word_tokenize(line, language=language)
        words = [word.lower() for word in tokens if word.isalpha()]
        length = len(words)
        if length in lengths:
          lengths[length] += 1
        else:
          lengths[length] = 1
    with open(outfile, "w") as outfile:
      for length,count in lengths.items():
        outfile.write(str(length) + "," + str(count) + "\n")
      
getLineLengths("europarl-v7.pt-en.en", "english", "linelen.en")
getLineLengths("europarl-v7.pt-en.pt", "portuguese", "linelen.pt")

KeyboardInterrupt: 

Next we must extract the data!

In [2]:
import tarfile

enFile = Path("europarl-v7.pt-en.en")
ptFile = Path("europarl-v7.pt-en.pt")
if enFile.is_file() and ptFile.is_file():
    print("Already extracted")
else:
    with tarfile.open(fileName) as tar:
        tar.extractall()

Already extracted


Next we want to create our vocabularies in both languages. Lets go with something small to begin with, how about 20,000 words?

In [6]:
from nltk.tokenize import word_tokenize
# Special vocabulary symbols - we always put them at the start.
_PAD = "_PAD"
_GO = "_GO"
_EOS = "_EOS"
_UNK = "_UNK"
startVocab = [_PAD, _GO, _EOS, _UNK]

PAD_ID = 0
GO_ID = 1
EOS_ID = 2
UNK_ID = 3

def createVocabulary(corpusFile, vocabFile, numWords, language):
    vocab = {}
    with open(corpusFile) as corpus:
      for line in corpus:
        tokens = word_tokenize(line, language=language)
        for word in tokens:
          if word in vocab:
            vocab[word] += 1
          else:
            vocab[word] = 1
      orderedWords = startVocab + sorted(vocab, key=vocab.get, reverse=True)
      if len(orderedWords) > numWords:
        orderedWords = orderedWords[:numWords]
      with open(vocabFile, 'w') as file:
        for word in orderedWords:
          file.write(word + "\n")


def createVocabularyCount(corpusFile, countFile, language):
    vocab = {}
    with open(corpusFile) as corpus:
      for line in corpus:
        words = word_tokenize(line, language=language)
        words = [word.lower() for word in tokens if word.isalpha()]
        for word in words:
          if word in vocab:
            vocab[word] += 1
          else:
            vocab[word] = 1
      with open(countFile, 'w') as file:
        for word, count in vocab.items():
          file.write(word + "\t" + str(count) + "\n")

# createVocabularyCount("europarl-v7.pt-en.en", "counts2.en", "english")
# createVocabularyCount("europarl-v7.pt-en.pt", "counts2.pt", "portuguese")

enVocab = Path("vocab.en")
ptVocab = Path("vocab.pt")

if enVocab.is_file():
    print("Already did English")
else:
    print("Starting English vocab")
    createVocabulary("europarl-v7.pt-en.en", "vocab.en", 20000, "english")
    print("Finished English vocab")
if ptVocab.is_file():
    print("Already did Portuguese")
else:
    print("Starting Portuguese vocab")
    createVocabulary("europarl-v7.pt-en.pt", "vocab.pt", 20000, "portuguese")
    print("Finished Portuguese vocab")

Already did English
Already did Portuguese


In [7]:
def mapCorpus(corpusFile, vocabFile, mappedFile, language):
    words = []
    with open(vocabFile) as v:
      words.extend(v.readlines())
    words = [line.strip() for line in words]
    vocab = dict([(x, y) for (y, x) in enumerate(words)])
    with open(corpusFile) as corpus:
        with open(mappedFile, "w") as output:
            for line in corpus:
                tokens = word_tokenize(line, language=language)
                ids = [str(vocab.get(token, UNK_ID)) for token in tokens]
                output.write(" ".join(ids) + "\n")

enVocab = Path("mapped.en")
ptVocab = Path("mapped.pt")

if enVocab.is_file():
    print("Already did English")
else:
    mapCorpus("europarl-v7.pt-en.en", "vocab.en", "mapped.en", "english")
    
if ptVocab.is_file():
    print("Already did Portuguese")
else:
    print("Starting Portuguese map")
    mapCorpus("europarl-v7.pt-en.pt", "vocab.pt", "mapped.pt", "portuguese")

Already did English
Already did Portuguese


In [8]:
def create_model(session, forward_only):
  """Create translation model and initialize or load parameters in session."""
  dtype = tf.float32
  model = seq2seq_model.Seq2SeqModel(
      FLAGS.en_vocab_size,
      FLAGS.pt_vocab_size,
      _buckets,
      FLAGS.size,
      FLAGS.num_layers,
      FLAGS.max_gradient_norm,
      FLAGS.batch_size,
      FLAGS.learning_rate,
      FLAGS.learning_rate_decay_factor,
      forward_only=forward_only,
      dtype=dtype)
  ckpt = tf.train.get_checkpoint_state("checkpoint")
  if ckpt:
    print("Reading model parameters from %s" % ckpt.model_checkpoint_path)
    model.saver.restore(session, ckpt.model_checkpoint_path)
  else:
    print("Created model with fresh parameters.")
    session.run(tf.initialize_all_variables())
  return model

In [6]:
import tensorflow as tf

tf.app.flags.DEFINE_float("learning_rate", 0.5, "Learning rate.")
tf.app.flags.DEFINE_float("learning_rate_decay_factor", 0.99,
                          "Learning rate decays by this much.")
tf.app.flags.DEFINE_float("max_gradient_norm", 5.0,
                          "Clip gradients to this norm.")
tf.app.flags.DEFINE_integer("batch_size", 64,
                            "Batch size to use during training.")
tf.app.flags.DEFINE_integer("size", 512, "Size of each model layer.")
tf.app.flags.DEFINE_integer("num_layers", 3, "Number of layers in the model.")
tf.app.flags.DEFINE_integer("en_vocab_size", 20000, "English vocabulary size.")
tf.app.flags.DEFINE_integer("pt_vocab_size", 20000, "Portuguese vocabulary size.")
tf.app.flags.DEFINE_integer("max_train_data_size", 0,
                            "Limit on the size of training data (0: no limit).")
tf.app.flags.DEFINE_integer("steps_per_checkpoint", 200,
                            "How many training steps to do per checkpoint.")
tf.app.flags.DEFINE_boolean("decode", False,
                            "Set to True for interactive decoding.")
tf.app.flags.DEFINE_boolean("self_test", False,
                            "Run a self-test if this is set to True.")
FLAGS = tf.app.flags.FLAGS

# We use a number of buckets and pad to the closest one for efficiency.
# See seq2seq_model.Seq2SeqModel for details of how they work.
_buckets = [(5, 10), (10, 15), (20, 25), (40, 50)]

In [7]:
from tensorflow.models.rnn.translate import seq2seq_model
import sys
import math
from six.moves import xrange
import numpy as np
import time

def read_data(source_path, target_path, max_size=None):
  data_set = [[] for _ in _buckets]
  with open(source_path) as source_file:
    with open(target_path) as target_file:
      source, target = source_file.readline(), target_file.readline()
      counter = 0
      while source and target and (not max_size or counter < max_size):
        counter += 1
        if counter % 100000 == 0:
          print("  reading data line %d" % counter)
          sys.stdout.flush()
        source_ids = [int(x) for x in source.split()]
        target_ids = [int(x) for x in target.split()]
        target_ids.append(EOS_ID)
        for bucket_id, (source_size, target_size) in enumerate(_buckets):
          if len(source_ids) < source_size and len(target_ids) < target_size:
            data_set[bucket_id].append([source_ids, target_ids])
            break
        source, target = source_file.readline(), target_file.readline()
  return data_set

def train():
  with tf.Session() as sess:
    # Create model.
    print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
    model = create_model(sess, False)

    # Read data into buckets and compute their sizes.
    print ("Reading development and training data (limit: %d)."
           % FLAGS.max_train_data_size)
    data = read_data("mapped.en", "mapped.pt", FLAGS.max_train_data_size)
    train_bucket_sizes = [len(data[b]) for b in xrange(len(_buckets))]
    train_total_size = float(sum(train_bucket_sizes))

    # A bucket scale is a list of increasing numbers from 0 to 1 that we'll use
    # to select a bucket. Length of [scale[i], scale[i+1]] is proportional to
    # the size if i-th training bucket, as used later.
    train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size
                           for i in xrange(len(train_bucket_sizes))]

    # This is the training loop.
    step_time, loss = 0.0, 0.0
    current_step = 0
    previous_losses = []
    while True:
      # Choose a bucket according to data distribution. We pick a random number
      # in [0, 1] and use the corresponding interval in train_buckets_scale.
      random_number_01 = np.random.random_sample()
      bucket_id = min([i for i in xrange(len(train_buckets_scale))
                       if train_buckets_scale[i] > random_number_01])

      # Get a batch and make a step.
      start_time = time.time()
      encoder_inputs, decoder_inputs, target_weights = model.get_batch(
          data, bucket_id)
      _, step_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
                                   target_weights, bucket_id, False)
      step_time += (time.time() - start_time) / FLAGS.steps_per_checkpoint
      loss += step_loss / FLAGS.steps_per_checkpoint
      current_step += 1

      # Once in a while, we save checkpoint, print statistics, and run evals.
      if current_step % FLAGS.steps_per_checkpoint == 0:
        # Print statistics for the previous epoch.
        perplexity = math.exp(float(loss)) if loss < 300 else float("inf")
        print ("global step %d learning rate %.4f step-time %.2f perplexity "
               "%.2f" % (model.global_step.eval(), model.learning_rate.eval(),
                         step_time, perplexity))
        # Decrease learning rate if no improvement was seen over last 3 times.
        if len(previous_losses) > 2 and loss > max(previous_losses[-3:]):
          sess.run(model.learning_rate_decay_op)
        previous_losses.append(loss)
        # Save checkpoint and zero timer and loss.
        checkpoint_path = "checkpoint/translate.ckpt"
        model.saver.save(sess, checkpoint_path, global_step=model.global_step)
        step_time, loss = 0.0, 0.0
        # Run evals on development set and print their perplexity.
        for bucket_id in xrange(len(_buckets)):
          if len(data[bucket_id]) == 0:
            print("  eval: empty bucket %d" % (bucket_id))
            continue
          encoder_inputs, decoder_inputs, target_weights = model.get_batch(
              data, bucket_id)
          _, eval_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
                                       target_weights, bucket_id, True)
          eval_ppx = math.exp(float(eval_loss)) if eval_loss < 300 else float(
              "inf")
          print("  eval: bucket %d perplexity %.2f" % (bucket_id, eval_ppx))
        sys.stdout.flush()

In [8]:
!mkdir checkpoint
train()

mkdir: checkpoint: File exists
Creating 3 layers of 512 units.
Created model with fresh parameters.
Reading development and training data (limit: 0).
  reading data line 100000
  reading data line 200000
  reading data line 300000
  reading data line 400000
  reading data line 500000
  reading data line 600000
  reading data line 700000
  reading data line 800000
  reading data line 900000
  reading data line 1000000
  reading data line 1100000
  reading data line 1200000
  reading data line 1300000
  reading data line 1400000
  reading data line 1500000
  reading data line 1600000
  reading data line 1700000
  reading data line 1800000
  reading data line 1900000
global step 200 learning rate 0.5000 step-time 5.10 perplexity 1916.38
  eval: bucket 0 perplexity 98.63
  eval: bucket 1 perplexity 243.48
  eval: bucket 2 perplexity 293.95
  eval: bucket 3 perplexity 312.79
global step 400 learning rate 0.5000 step-time 4.99 perplexity 329.47
  eval: bucket 0 perplexity 84.29
  eval: bucke

KeyboardInterrupt: 

In [8]:
def decode(sentence):
  with tf.Session() as sess:
    # Create model and load parameters.
    model = create_model(sess, True)
    model.batch_size = 1  # We decode one sentence at a time.

    # Load vocabularies.
    enWords = []
    with open("vocab.en") as enVocabFile:
      enWords.extend(enVocabFile.readlines())
    enWords = [line.strip() for line in enWords]
    enVocab = dict([(x, y) for (y, x) in enumerate(enWords)])
    
    ptWords = []
    with open("vocab.pt") as ptVocabFile:
      ptWords.extend(ptVocabFile.readlines())
    ptWords = [line.strip() for line in ptWords]
    ptVocab = dict(enumerate(ptWords))

    # Get token-ids for the input sentence.
    tokens = word_tokenize(sentence, language="English")
    tokenIds = [enVocab.get(t, UNK_ID) for t in tokens]

    # Which bucket does it belong to?
    bucket_id = len(_buckets) - 1
    for i, bucket in enumerate(_buckets):
      if bucket[0] >= len(tokenIds):
        bucket_id = i
        break
    else:
      logging.warning("Sentence truncated: %s", sentence)

    # Get a 1-element batch to feed the sentence to the model.
    encoder_inputs, decoder_inputs, target_weights = model.get_batch(
        {bucket_id: [(tokenIds, [])]}, bucket_id)
    # Get output logits for the sentence.
    _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                     target_weights, bucket_id, True)
    # This is a greedy decoder - outputs are just argmaxes of output_logits.
    outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
    # If there is an EOS symbol in outputs, cut them at that point.
    if EOS_ID in outputs:
      outputs = outputs[:outputs.index(EOS_ID)]
    # Print out Portuguese sentence corresponding to outputs.
    print(" ".join([tf.compat.as_str(ptVocab[output]) for output in outputs]))
    print("> ", end="")

In [1]:
import tensorflow as tf
tf.__version__

'0.11.0'

In [9]:
decode("My day is good, how about you?")

Reading model parameters from checkpoint/translate.ckpt-86200


ValueError: Restore called with invalid save path: 'checkpoint/translate.ckpt-86200'. File path is: 'checkpoint/translate.ckpt-86200'