<a href="https://colab.research.google.com/github/ccarpenterg/introNLP/blob/master/04a_NLP_and_sequence_to_sequence_RNNs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NLP and Sequence-to-Sequence RNNs

In [0]:
!wget https://www.manythings.org/anki/spa-eng.zip
!unzip spa-eng.zip

In [0]:
from io import open

import unicodedata
import string
import re
import random

In [0]:
def unicodeToAscii(s):
    return ''.join(
        char for char in unicodedata.normalize('NFD', s)
        if unicodedata.category(char) != 'Mn'
    )

def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

In [0]:
def readPairs(pathToFile, slang, tlang):
    print("Reading lines...")

    f = open(pathToFile, encoding='utf-8')
    lines = f.read().strip().split('\n')

    pairs = [[normalizeString(s) for s in line.split('\t')[:2]] for line in lines]

    return pairs

In [0]:
MAX_LENGTH = 10

eng_prefixes = (
    "i am", "i m ",
    "he is", "he s ",
    "she is", "she s ",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
)

def filterPair(p):
    return len(p[0].split(' ')) < MAX_LENGTH and \
        len(p[1].split(' ')) < MAX_LENGTH and \
        p[0].startswith(eng_prefixes)

def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

In [0]:
def prepareData(pathToFile, slang, tlang):
    pairs = readPairs(pathToFile, slang, tlang)
    print("Read {} sentence pairs".format(len(pairs)))
    pairs = filterPairs(pairs)
    print("Trimmed to {} sentence pairs".format(len(pairs)))
    return pairs

In [0]:
def datasetIndexes(pairs, train=0.8, val=0.1):
    num_examples = len(pairs)
    indexes = list(range(num_examples))
    last_train_idx = round(train*num_examples)
    last_valid_idx = last_train_idx + round(val*num_examples)
    random.shuffle(indexes)
    train_idxs = indexes[:last_train_idx]
    val_idxs = indexes[last_train_idx:last_valid_idx]
    test_idxs = indexes[last_valid_idx:]
    return train_idxs, val_idxs, test_idxs


In [0]:
pairs = prepareData('spa.txt', 'eng', 'spa')
train_idxs, val_idxs, test_idxs = datasetIndexes(pairs)
print("{} {} {}".format(len(train_idxs), len(val_idxs), len(test_idxs) ))
print(random.choice(pairs))

Reading lines...
Read 123335 sentence pairs
Trimmed to 7588 sentence pairs
6070 759 759
['i m looking for someone .', 'estoy buscando a alguien .']
