In [2]:
import os
import os.path as path
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
from engspa import prep

In [3]:
# Root directory of the Spanish-English corpus, resolved under the user's home.
DATAROOT = os.path.expanduser("~/mldata/spa-eng")

In [4]:
# Full path of the training split inside DATAROOT.
trainfile = os.path.join(DATAROOT, "train.txt")

In [5]:
# Load the parallel corpus: each line holds one "<english>\t<spanish>" pair.
# Both sides are passed through `prep` (imported from engspa) before being
# stored, so eng_text[i] and spa_text[i] always belong to the same pair.
eng_text = []
spa_text = []
with open(trainfile, "rt", encoding="utf-8") as f:
    for line in f:
        # A blank line (commonly the last line of a text file) contains no
        # tab and would abort the whole load with an unpacking ValueError.
        if not line.strip():
            continue
        eng, spa = line.split("\t")
        eng = prep(eng)
        spa = prep(spa)
        eng_text.append(eng)
        spa_text.append(spa)

In [6]:
# Sanity check: show the first preprocessed sentence pair side by side.
print(f"{eng_text[0]}  =>  {spa_text[0]}")

<start> that bicycle is mine . <end>  =>  <start> aquella bicicleta es mia . <end>


In [7]:
# Tokenizer shared by both languages.  "<start>" and "<end>" are registered as
# reserved tokens so the sentence markers come back as single tokens; the
# sample output below shows that punctuation such as "." is not emitted.
tokenizer = tfds.features.text.Tokenizer(
    reserved_tokens=["<start>", "<end>"],
)
tokenizer.tokenize(eng_text[0])

['<start>', 'that', 'bicycle', 'is', 'mine', '<end>']

In [8]:
# Tokenize the first Spanish sentence to inspect the tokenizer's output.
tokenizer.tokenize(spa_text[0])

['<start>', 'aquella', 'bicicleta', 'es', 'mia', '<end>']

In [10]:
# Collect the distinct tokens of each language by running every preprocessed
# sentence through the shared tokenizer.
eng_vocab = {
    token
    for sentence in eng_text
    for token in tokenizer.tokenize(sentence)
}

spa_vocab = {
    token
    for sentence in spa_text
    for token in tokenizer.tokenize(sentence)
}

In [11]:
# Vocabulary sizes: English first, then Spanish.
print(f"{len(eng_vocab)} {len(spa_vocab)}")

12172 23014


In [12]:
# Build one token <-> integer-id encoder per language.
# The vocabularies are sets of strings, and their iteration order changes
# between interpreter runs (string hash randomisation).  Sorting them first
# gives every token the same id after each kernel restart, so encoded data and
# anything trained on it stay compatible across runs.
eng_encoder = tfds.features.text.TokenTextEncoder(sorted(eng_vocab), tokenizer=tokenizer)
spa_encoder = tfds.features.text.TokenTextEncoder(sorted(spa_vocab), tokenizer=tokenizer)