# Chapter 3

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from urllib.request import urlretrieve
import zipfile

url = 'http://www.manythings.org/anki/fra-eng.zip'
file = 'fra-eng.zip'
#urlretrieve(url, file)
# they don't like python user-agent - wants browser.

with zipfile.ZipFile(file, "r") as f:
    f.extractall('fra-eng')
    
!ls ./fra-eng/*

./fra-eng/_about.txt  ./fra-eng/fra.txt


In [2]:
import re

lines = []
with open('fra-eng/fra.txt', "r") as f:
    for line in f.readlines():
        g = re.search('(.*\t.*)\t.*', line)
        if g:
            lines.append(g.group(1))
            
max_samples = 50000
english_sentences = []
french_sentences = []

In [3]:
# Consider only the first 50 lines of the dataset
for i in range(max_samples):
	# Split each line into two at the tab character
    eng_fra_line = str(lines[i]).split('\t')
    
    # Separate out the English sentence 
    eng_line = eng_fra_line[0]
    
    # Append the start and end token to each French sentence
    fra_line = '\t' + eng_fra_line[1] + '\n'
    
    # Append the English and French sentence to the list of sentences
    english_sentences.append(eng_line)
    french_sentences.append(fra_line)

In [4]:
# Create an empty set to contain the English vocabulary 
english_vocab = set()

# Iterate over each English sentence
for eng_line in english_sentences:
  
    # Convert the English line to a set
    eng_line_set = set(eng_line)
    
    # Update English vocabulary with new characters from this line.
    english_vocab = english_vocab.union(eng_line_set)

# Sort the vocabulary
english_vocab = sorted(list(english_vocab))

In [5]:
# Create an empty set to contain the French vocabulary 
french_vocab = set()

# Iterate over each French sentence
for fra_line in french_sentences:
  
    # Convert the French line to a set
    fra_line_set = set(fra_line)
    
    # Update French vocabulary with new characters from this line.
    french_vocab = french_vocab.union(fra_line_set)

# Sort the vocabulary
french_vocab = sorted(list(french_vocab))

In [6]:
# Dictionary to contain the character to integer mapping for English
eng_char_to_idx = dict((char, idx) for idx, char in enumerate(english_vocab))

# Dictionary to contain the integer to character mapping for English
eng_idx_to_char = dict((idx, char) for idx, char in enumerate(english_vocab))

In [7]:
# Dictionary to contain the character to integer mapping for French
fra_char_to_idx = dict((char, idx) for idx, char in enumerate(french_vocab))

# Dictionary to contain the integer to character mapping for French
fra_idx_to_char = dict((idx, char) for idx, char in enumerate(french_vocab))

In [8]:
# Find the length of the longest English sentence
max_len_eng_sent = max([len(sentence) for sentence in english_sentences])

# Find the length of the longest French sentence
max_len_fra_sent = max([len(sentence) for sentence in french_sentences])

In [9]:
# Create a 3-D zero vector for the input English data
eng_input_data = np.zeros((len(english_sentences), max_len_eng_sent, len(english_vocab)), dtype='float32')

# Create a 3-D zero vector for the input French data
fra_input_data = np.zeros((len(french_sentences), max_len_fra_sent, len(french_vocab)), dtype='float32')

# Create the target vector
target_data = np.zeros((len(french_sentences), max_len_fra_sent, len(french_vocab)), dtype='float32')

In [10]:
# Iterate over the 50 sentences
for i in range(max_samples):
    # Iterate over each English character of each sentence
    for k, ch in enumerate(english_sentences[i]):
        # Convert the character to one-hot encoded vector
        eng_input_data[i, k, eng_char_to_idx[ch]] = 1
        
    # Iterate over each French character of each sentence
    for k, ch in enumerate(french_sentences[i]):
        # Convert the character to one-hot encoded vector
        fra_input_data[i, k, fra_char_to_idx[ch]] = 1.

        # Target data will be one timestep ahead and excludes start character
        if k > 0:
            target_data[i, k-1, fra_char_to_idx[ch]] = 1.

In [11]:
from keras.layers import Input
from keras.layers import Dense
from keras.layers import LSTM
from keras.models import Model

Using TensorFlow backend.


In [12]:
# Create input layer
encoder_input = Input(shape=(None, len(english_vocab)))

# Create LSTM Layer of size 256
encoder_LSTM = LSTM(256, return_state = True)

# Save encoder output, hidden and cell state
encoder_outputs, encoder_h, encoder_c = encoder_LSTM(encoder_input)

# Save encoder states
encoder_states = [encoder_h, encoder_c]

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [13]:
# Create decoder input layer
decoder_input = Input(shape=(None, len(french_vocab)))

# Create LSTM layer of size 256
decoder_LSTM = LSTM(256, return_sequences=True, return_state = True)

# Save decoder output
decoder_out, decoder_h , decoder_c = decoder_LSTM(decoder_input, initial_state=encoder_states)

# Create a Dense layer with softmax activation
decoder_dense = Dense(len(french_vocab), activation='softmax')

# Save the decoder output
decoder_out = decoder_dense(decoder_out)

In [14]:
# Build model
model = Model(inputs=[encoder_input, decoder_input], outputs=[decoder_out])

# Compile the model
model.compile(optimizer="rmsprop", loss="categorical_crossentropy")

# Print model summary
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None, 77)     0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, None, 103)    0                                            
__________________________________________________________________________________________________
lstm_1 (LSTM)                   [(None, 256), (None, 342016      input_1[0][0]                    
__________________________________________________________________________________________________
lstm_2 (LSTM)                   [(None, None, 256),  368640      input_2[0][0]                    
                                                                 lstm_1[0][1]               

In [15]:
# Fit the model
model.fit(x=[eng_input_data, fra_input_data], y=target_data,
          		batch_size=64, epochs=1, validation_split=0.2)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Train on 40000 samples, validate on 10000 samples
Epoch 1/1


<keras.callbacks.callbacks.History at 0x7f3bcba56128>

In [16]:
# Create encoder inference model
encoder_model_inf = Model(encoder_input, encoder_states)

# Create decoder input states for inference
decoder_state_input_h = Input(shape=(256,))
decoder_state_input_c = Input(shape=(256,))
decoder_input_states = [decoder_state_input_h, decoder_state_input_c]

In [18]:
# Create decoder output states for inference
decoder_out, decoder_h, decoder_c = decoder_LSTM(decoder_input, initial_state=decoder_input_states)
decoder_states = [decoder_h , decoder_c]

# Create decoder dense layer
decoder_out = decoder_dense(decoder_out)
decoder_model_inf = Model(inputs=[decoder_input] + decoder_input_states, outputs=[decoder_out] + decoder_states )

In [19]:
# Get encoder internal state by passing a sentence as input
inp_seq = eng_input_data[0:1]
states_val = encoder_model_inf.predict(inp_seq)

# Seed the first character and get output from the decoder 
target_seq = np.zeros((1, 1, len(french_vocab)))
target_seq[0, 0, fra_char_to_idx['\t']] = 1  
decoder_out, decoder_h, decoder_c = decoder_model_inf.predict(x=[target_seq] + states_val)

# Find out the next character from the Decoder output
max_val_index = np.argmax(decoder_out[0,-1,:])
sampled_fra_char = fra_idx_to_char[max_val_index]

# Print the first character predicted by the decoder
print(sampled_fra_char)

S


In [22]:
# Fill up target seq with the new char generated 
target_seq = np.zeros((1, 1, len(french_vocab)))
target_seq[0, 0, max_val_index] = 1

# Get decoder final states from last time
states_val = [decoder_h, decoder_c]

# Generate the next character
decoder_out, decoder_h, decoder_c = decoder_model_inf.predict(x=[target_seq] + states_val)

# Map the prediction to char and print it
max_val_index = np.argmax(decoder_out[0,-1,:])
sampled_fra_char = fra_idx_to_char[max_val_index]

print(sampled_fra_char)

o


In [24]:
def translated_eng_sentence(sent, french_vocab, decoder_h, decoder_c, decoder_model_inf, states_val, fra_idx_to_char):
    """ Given a sentence, translate to French """
    pass

In [25]:
# Generate 10 French sentences from inp_seq
for seq_index in range(10):
  
    # Get next encoded english sentence
    inp_seq = eng_input_data[seq_index:seq_index+1]
    
    # Get the translated sentence
    translated_sent = translated_eng_sentence(inp_seq)
    
    # Print the original English sentence
    print('English sentence:', english_sentences[seq_index])
    
    # Print the translated French sentence
    print('French sentence:', translated_sent)

English sentence: Go.
French sentence: None
English sentence: Go.
French sentence: None
English sentence: Go.
French sentence: None
English sentence: Hi.
French sentence: None
English sentence: Hi.
French sentence: None
English sentence: Run!
French sentence: None
English sentence: Run!
French sentence: None
English sentence: Run!
French sentence: None
English sentence: Run!
French sentence: None
English sentence: Run!
French sentence: None
