### Ingest the data

In [1]:
import helper

source_path = 'data/small_vocab_en'
target_path = 'data/small_vocab_fr'
source_text = helper.load_data(source_path)
target_text = helper.load_data(target_path)

In [2]:
import numpy as np

print('Dataset Stats')
print('Roughly the number of unique words: {}'.format(len({word: None for word in source_text.split()})))

view_sentence_range = (0,10)
sentences = source_text.split('\n')
word_counts = [len(sentence.split()) for sentence in sentences]
print('Number of sentences: {}'.format(len(sentences)))
print('Average number of words in a sentence: {}'.format(np.average(word_counts)))

print()
print('English sentences {} to {}:'.format(*view_sentence_range))
print('\n'.join(source_text.split('\n')[view_sentence_range[0]:view_sentence_range[1]]))
print()
print('French sentences {} to {}:'.format(*view_sentence_range))
print('\n'.join(target_text.split('\n')[view_sentence_range[0]:view_sentence_range[1]]))

Dataset Stats
Roughly the number of unique words: 227
Number of sentences: 137861
Average number of words in a sentence: 13.225277634719028

English sentences 0 to 10:
new jersey is sometimes quiet during autumn , and it is snowy in april .
the united states is usually chilly during july , and it is usually freezing in november .
california is usually quiet during march , and it is usually hot in june .
the united states is sometimes mild during june , and it is cold in september .
your least liked fruit is the grape , but my least liked is the apple .
his favorite fruit is the orange , but my favorite is the grape .
paris is relaxing during december , but it is usually chilly in july .
new jersey is busy during spring , and it is never hot in march .
our least liked fruit is the lemon , but my least liked is the grape .
the united states is sometimes busy during january , and it is sometimes warm in november .

French sentences 0 to 10:
new jersey est parfois calme pendant l' automne 

### Implement Preprocessing Function
Turn the text into a number (i.e. each word into a unique id) so that the computer can process it. Also, you need to add the `<EOS>` word id at the end of each sentence in the target text. This is needed so that the neural network can predict when the sentence(s) should end.

In [12]:
def text_to_ids(source, target, source_vocab_to_int, target_vocab_to_int):
    """
    Convert source and target text to proper word ids
    :param source_text: String that contains all the source text.
    :param target_text: String that contains all the target text.
    :param source_vocab_to_int: Dictionary to go from the source words to an id
    :param target_vocab_to_int: Dictionary to go from the target words to an id
    :return: A tuple of lists (source_id_text, target_id_text)
    """
    # TODO: Implement Function
    eos = target_vocab_to_int['<EOS>']
    
    source_id_text = []
    max_sentences = 5000

    counter = 0    
    for sentence1 in source_text.split('\n'):
        temp = []
        counter += 1
        if(counter >= max_sentences):
            break
        
        for word in sentence1.split(' '):
            if(word in source_vocab_to_int):
                temp.append(source_vocab_to_int[word])
            else:
                print("could not find in source: ",word)
        source_id_text.append(temp)

    counter = 0
    target_id_text = []
    for sentence2 in target_text.split('\n'):
        temp = []
        counter += 1
        if(counter >= max_sentences):
            break

        for word in sentence2.split(' '):
            if(word in target_vocab_to_int):
                temp.append(target_vocab_to_int[word])
            else:
                print("could not find in target: ",word)
        temp.append(eos)
        target_id_text.append(temp)
        
    return source_id_text, target_id_text

### Running the below will pre-process all the data and save it to a file.

In [13]:
helper.preprocess_and_save_data(source_path, target_path, text_to_ids)

could not find in source:  
could not find in target:  États-unis
could not find in target:  États-unis
could not find in target:  États-unis
