First we must download the data!

In [2]:
import requests
from pathlib import Path

dataURL = "http://www.statmt.org/europarl/v7/pt-en.tgz"
fileName = "pt-en.tgz"
tarFile = Path(fileName)
if tarFile.is_file():
    print("Already downloaded")
else:
    r = requests.get(dataURL)
    with open(fileName, 'wb') as fd:
        for chunk in r.iter_content(chunk_size=128):
            fd.write(chunk)

Already downloaded


Next we must extract the data!

In [3]:
import tarfile

enFile = Path("europarl-v7.pt-en.en")
ptFile = Path("europarl-v7.pt-en.pt")
if enFile.is_file() and ptFile.is_file():
    print("Already extracted")
else:
    with tarfile.open(fileName) as tar:
        tar.extractall()

Already extracted


Next we want to create our vocabularies in both languages. Lets go with something small to begin with, how about 20,000 words?

In [None]:
import nltk
nltk.download() # d all (I really just want the tokenizer's .pickle for both languages, but I don't know where it is so just download everything)

In [4]:
from nltk.tokenize import word_tokenize

def tokenize(sentance, language):
    tokens = word_tokenize(sentance.decode('utf-8'), language=language)
    return [word.lower() for word in tokens if word.isalpha()]

In [5]:
from nltk.tokenize import word_tokenize
# Special vocabulary symbols - we always put them at the start.
_PAD = "_PAD"
_GO = "_GO"
_EOS = "_EOS"
_UNK = "_UNK"
startVocab = [_PAD, _GO, _EOS, _UNK]

PAD_ID = 0
GO_ID = 1
EOS_ID = 2
UNK_ID = 3

def createVocabulary(corpusFile, vocabFile, numWords, language):
    vocab = {}
    with open(corpusFile) as corpus:
      for line in corpus:
        words = tokenize(sentance, language)
        for word in words:
          if word in vocab:
            vocab[word] += 1
          else:
            vocab[word] = 1
      orderedWords = startVocab + sorted(vocab, key=vocab.get, reverse=True)
      if len(orderedWords) > numWords:
        orderedWords = orderedWords[:numWords]
      with open(vocabFile, 'wu') as file:
        for word in orderedWords:
          file.write(word.encode('utf-8') + "\n")
        
enVocab = Path("vocab3.en")
ptVocab = Path("vocab3.pt")

if enVocab.is_file():
    print("Already did English")
else:
    print("Starting English vocab")
    createVocabulary("europarl-v7.pt-en.en", "vocab3.en", 40000, "english")
    print("Finished English vocab")
if ptVocab.is_file():
    print("Already did Portuguese")
else:
    print("Starting Portuguese vocab")
    createVocabulary("europarl-v7.pt-en.pt", "vocab3.pt", 40000, "portuguese")
    print("Finished Portuguese vocab")

Already did English
Already did Portuguese


In [6]:
def mapCorpus(corpusFile, vocabFile, mappedFile, language):
    words = []
    with open(vocabFile) as v:
      words.extend(v.readlines())
    words = [line.strip() for line in words]
    vocab = dict([(x, y) for (y, x) in enumerate(words)])
    with open(corpusFile) as corpus:
        with open(mappedFile, "w") as output:
            for line in corpus:
                tokens = word_tokenize(line.decode('utf-8'), language=language)
                words = [word.lower() for word in tokens if word.isalpha()]
                ids = [str(vocab.get(word, UNK_ID)) for word in words]
                output.write(" ".join(ids) + "\n")

enVocab = Path("mapped3.en")
ptVocab = Path("mapped3.pt")

if enVocab.is_file():
    print("Already did English")
else:
    print("Starting English map")
    mapCorpus("europarl-v7.pt-en.en", "vocab3.en", "mapped3.en", "english")
    
if ptVocab.is_file():
    print("Already did Portuguese")
else:
    print("Starting Portuguese map")
    mapCorpus("europarl-v7.pt-en.pt", "vocab3.pt", "mapped3.pt", "portuguese")

Already did English
Already did Portuguese


In [7]:
def splitIntoDevAndTraining(mappedFile, numberOfDevExamples, devOut, trainOut):
    with open(mappedFile) as mapped:
        with open(devOut, "w") as dev:
            with open(trainOut, "w") as train:
                count = 0
                for line in mapped.readlines():
                    count += 1
                    if count < numberOfDevExamples:
                        dev.write(line)
                    else:
                        train.write(line)

enDev = Path("dev3.en")
ptDev = Path("dev3.pt")
enTrain = Path("train3.en")
ptTrain = Path("train3.pt")

if enDev.is_file() and enTrain.is_file():
    print("Already did English")
else:
    print("Starting English split")
    splitIntoDevAndTraining("mapped3.en", 2000, "dev3.en", "train3.en")
    
if ptDev.is_file() and ptTrain.is_file():
    print("Already did Portuguese")
else:
    print("Starting Portuguese split")
    splitIntoDevAndTraining("mapped3.pt", 2000, "dev3.pt", "train3.pt")

Already did English
Already did Portuguese


In [8]:
def create_model(session, forward_only):
  """Create translation model and initialize or load parameters in session."""
  dtype = tf.float32
  model = seq2seq_model.Seq2SeqModel(
      FLAGS.en_vocab_size,
      FLAGS.pt_vocab_size,
      _buckets,
      FLAGS.size,
      FLAGS.num_layers,
      FLAGS.max_gradient_norm,
      FLAGS.batch_size,
      FLAGS.learning_rate,
      FLAGS.learning_rate_decay_factor,
      forward_only=forward_only,
      dtype=dtype)
  ckpt = tf.train.get_checkpoint_state("checkpoint4")
  if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
    print("Reading model parameters from %s" % ckpt.model_checkpoint_path)
    model.saver.restore(session, ckpt.model_checkpoint_path)
  else:
    print("Created model with fresh parameters.")
    session.run(tf.global_variables_initializer())
  return model

In [9]:
import tensorflow as tf

tf.app.flags.DEFINE_float("learning_rate", 0.5, "Learning rate.")
tf.app.flags.DEFINE_float("learning_rate_decay_factor", 0.99,
                          "Learning rate decays by this much.")
tf.app.flags.DEFINE_float("max_gradient_norm", 5.0,
                          "Clip gradients to this norm.")
tf.app.flags.DEFINE_integer("batch_size", 64,
                            "Batch size to use during training.")
tf.app.flags.DEFINE_integer("size", 1024, "Size of each model layer.")
tf.app.flags.DEFINE_integer("num_layers", 3, "Number of layers in the model.")
tf.app.flags.DEFINE_integer("en_vocab_size", 40000, "English vocabulary size.")
tf.app.flags.DEFINE_integer("pt_vocab_size", 40000, "Portuguese vocabulary size.")
tf.app.flags.DEFINE_integer("max_train_data_size", 0,
                            "Limit on the size of training data (0: no limit).")
tf.app.flags.DEFINE_integer("steps_per_checkpoint", 200,
                            "How many training steps to do per checkpoint.")
tf.app.flags.DEFINE_boolean("decode", False,
                            "Set to True for interactive decoding.")
tf.app.flags.DEFINE_boolean("self_test", False,
                            "Run a self-test if this is set to True.")
FLAGS = tf.app.flags.FLAGS

# We use a number of buckets and pad to the closest one for efficiency.
# See seq2seq_model.Seq2SeqModel for details of how they work.
_buckets = [(5, 10), (10, 15), (20, 25), (40, 50)]

In [10]:
from tensorflow.models.rnn.translate import seq2seq_model
import sys
import math
from six.moves import xrange
import numpy as np
import time

def read_data(source_path, target_path, max_size=None):
  data_set = [[] for _ in _buckets]
  with open(source_path) as source_file:
    with open(target_path) as target_file:
      source, target = source_file.readline(), target_file.readline()
      counter = 0
      while source and target and (not max_size or counter < max_size):
        counter += 1
        if counter % 100000 == 0:
          print("  reading data line %d" % counter)
          sys.stdout.flush()
        source_ids = [int(x) for x in source.split()]
        target_ids = [int(x) for x in target.split()]
        target_ids.append(EOS_ID)
        for bucket_id, (source_size, target_size) in enumerate(_buckets):
          if len(source_ids) < source_size and len(target_ids) < target_size:
            data_set[bucket_id].append([source_ids, target_ids])
            break
        source, target = source_file.readline(), target_file.readline()
  return data_set

def train():
  with tf.Session() as sess:
    # Create model.
    print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
    model = create_model(sess, False)
    summary_writer = tf.train.FileWriter('checkpoint4', graph=sess.graph)


    # Read data into buckets and compute their sizes.
    print ("Reading development and training data (limit: %d)."
           % FLAGS.max_train_data_size)
    dev_data = read_data("dev3.en", "dev3.pt", 0)
    train_data = read_data("train3.en", "train3.pt", FLAGS.max_train_data_size)
    train_bucket_sizes = [len(train_data[b]) for b in xrange(len(_buckets))]
    train_total_size = float(sum(train_bucket_sizes))

    # A bucket scale is a list of increasing numbers from 0 to 1 that we'll use
    # to select a bucket. Length of [scale[i], scale[i+1]] is proportional to
    # the size if i-th training bucket, as used later.
    train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size
                           for i in xrange(len(train_bucket_sizes))]

    # This is the training loop.
    step_time, loss = 0.0, 0.0
    current_step = 0
    previous_losses = []
    while True:
      # Choose a bucket according to data distribution. We pick a random number
      # in [0, 1] and use the corresponding interval in train_buckets_scale.
      random_number_01 = np.random.random_sample()
      bucket_id = min([i for i in xrange(len(train_buckets_scale))
                       if train_buckets_scale[i] > random_number_01])

      # Get a batch and make a step.
      start_time = time.time()
      encoder_inputs, decoder_inputs, target_weights = model.get_batch(
          train_data, bucket_id)
      _, step_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
                                   target_weights, bucket_id, False)
      step_time += (time.time() - start_time) / FLAGS.steps_per_checkpoint
      loss += step_loss / FLAGS.steps_per_checkpoint
      current_step += 1

      # Once in a while, we save checkpoint, print statistics, and run evals.
      if current_step % FLAGS.steps_per_checkpoint == 0:
#         summary_writer.
        # Print statistics for the previous epoch.
        perplexity = math.exp(float(loss)) if loss < 300 else float("inf")

        print("global step %d learning rate %.4f step-time %.2f perplexity "
               "%.2f" % (model.global_step.eval(), model.learning_rate.eval(),
                         step_time, perplexity))
        stats = open("stats.txt", "w")
        stats.write("global step %d learning rate %.4f step-time %.2f perplexity "
               "%.2f" % (model.global_step.eval(), model.learning_rate.eval(),
                         step_time, perplexity))
        # Decrease learning rate if no improvement was seen over last 3 times.
        if len(previous_losses) > 2 and loss > max(previous_losses[-3:]):
          sess.run(model.learning_rate_decay_op)
        previous_losses.append(loss)
        # Save checkpoint and zero timer and loss.
        checkpoint_path = "checkpoint4/translate.ckpt"
        model.saver.save(sess, checkpoint_path, global_step=model.global_step)
        step_time, loss = 0.0, 0.0
        # Run evals on development set and print their perplexity.
        for bucket_id in xrange(len(_buckets)):
          if len(dev_data[bucket_id]) == 0:
            print("  eval: empty bucket %d" % (bucket_id))
            continue
          encoder_inputs, decoder_inputs, target_weights = model.get_batch(
              dev_data, bucket_id)
          _, eval_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
                                       target_weights, bucket_id, True)
          eval_ppx = math.exp(float(eval_loss)) if eval_loss < 300 else float(
              "inf")
          stats.write("  eval: bucket %d perplexity %.2f" % (bucket_id, eval_ppx))
          print("  eval: bucket %d perplexity %.2f" % (bucket_id, eval_ppx))
        stats.close()
      sys.stdout.flush()

In [None]:
train()

Creating 3 layers of 1024 units.
Instructions for updating:
Please use tf.global_variables instead.
Reading model parameters from checkpoint4/translate.ckpt-172200
Instructions for updating:
Please switch to tf.summary.FileWriter. The interface and behavior is the same; this is just a rename.
Reading development and training data (limit: 0).
  reading data line 100000
  reading data line 200000
  reading data line 300000
  reading data line 400000
  reading data line 500000
  reading data line 600000
  reading data line 700000
  reading data line 800000
  reading data line 900000
  reading data line 1000000
  reading data line 1100000
  reading data line 1200000
  reading data line 1300000
  reading data line 1400000
  reading data line 1500000
  reading data line 1600000
  reading data line 1700000
  reading data line 1800000
  reading data line 1900000
global step 172400 learning rate 0.1012 step-time 0.98 perplexity 3.17
  eval: bucket 0 perplexity 1.53
  eval: bucket 1 perplexity 3

In [13]:
def decode(sentances):
  with tf.Session() as sess:
    # Create model and load parameters.
    model = create_model(sess, True)
    model.batch_size = 1  # We decode one sentence at a time.

    # Load vocabularies.
    enWords = []
    with open("vocab3.en") as enVocabFile:
        enWords.extend(enVocabFile.readlines())
    enWords = [line.strip() for line in enWords]
    enVocab = dict([(x, y) for (y, x) in enumerate(enWords)])
    
    ptWords = []
    with open("vocab3.pt") as ptVocabFile:
        ptWords.extend(ptVocabFile.readlines())
    ptWords = [line.strip() for line in ptWords]
    ptVocab = dict(enumerate(ptWords))
    print("Starting translation")
    for sentance in sentances:
                # Get token-ids for the input sentence.
                tokens = tokenize(sentance, "english")
                tokenIds = [enVocab.get(t, UNK_ID) for t in tokens]
                if len(tokenIds) > _buckets[len(_buckets) - 1][0]:
                    continue
                # Which bucket does it belong to?
                bucket_id = len(_buckets) - 1
                for i, bucket in enumerate(_buckets):
                    if bucket[0] >= len(tokenIds):
                        bucket_id = i
                        break

                # Get a 1-element batch to feed the sentence to the model.
                encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                    {bucket_id: [(tokenIds, [])]}, bucket_id)
                # Get output logits for the sentence.
                _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                                 target_weights, bucket_id, True)
                # This is a greedy decoder - outputs are just argmaxes of output_logits.
                outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
                # If there is an EOS symbol in outputs, cut them at that point.
                if EOS_ID in outputs:
                    outputs = outputs[:outputs.index(EOS_ID)]
                # Print out Portuguese sentence corresponding to outputs.
                print(sentance)
                print(tokens)
                print(" ".join([tf.compat.as_str(ptVocab[output]) for output in outputs]))

In [14]:
decode(["Competition between the regions will certainly strengthen rather than weaken the European Union.",
        "It was this which inspired us to propose the same thing with regard to state aid.",
       "During the course of our work on this report it became clear that there are persistent problems in the spending areas under the control of our budget.",
       "If the establishment of such an area, in which the Union can also intervene in the basic rights of the citizens, is only decided by diplomats and bureaucrats, while the elected representatives of Europe are reduced to following developments like a rabbit watching a snake, then this area will not gain the acceptance of the citizens.",
        "All asylum-seekers must be entitled to a fair hearing and an appeal with suspensory effect.",
        "Nevertheless, to avoid the customary circus, which every six months has us indiscriminately attacking the Presidency of the Council, maybe Parliament should find the courage to undertake incisive political action so that the next Intergovernmental Conference will decide to increase codecision with immediate effect and not wait for another five years.",
        "But I want to add a word of caution.",
        "All of this must nevertheless be done whilst respecting a broad framework - that of the overall acceptability of any solutions we may devise to achieve these three aims.",
        "We are looking for solutions.",
        "Madam President, on a point of order.",
        "I declare the session of the European Parliament adjourned."])

Competition between the regions will certainly strengthen rather than weaken the European Union.
[u'competition', u'between', u'the', u'regions', u'will', u'certainly', u'strengthen', u'rather', u'than', u'weaken', u'the', u'european', u'union']
It was this which inspired us to propose the same thing with regard to state aid.
[u'it', u'was', u'this', u'which', u'inspired', u'us', u'to', u'propose', u'the', u'same', u'thing', u'with', u'regard', u'to', u'state', u'aid']
During the course of our work on this report it became clear that there are persistent problems in the spending areas under the control of our budget.
[u'during', u'the', u'course', u'of', u'our', u'work', u'on', u'this', u'report', u'it', u'became', u'clear', u'that', u'there', u'are', u'persistent', u'problems', u'in', u'the', u'spending', u'areas', u'under', u'the', u'control', u'of', u'our', u'budget']
If the establishment of such an area, in which the Union can also intervene in the basic rights of the citizens, is 