# Home 4-Bonus: Build a seq2seq multi-model for machine translation.

### Name: [Bowen Li]

### Task: Translate English to [French] & [Spanish]

## Data preparation



In [0]:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [2]:
link = "https://drive.google.com/open?id=1IoPXyl5agtVz_usngPB62cxZ3mbiF0by"
fluff, id = link.split('=')
print (id)
downloaded = drive.CreateFile({'id':id}) 
downloaded.GetContentFile('fra.txt') 

1IoPXyl5agtVz_usngPB62cxZ3mbiF0by


In [3]:
link = "https://drive.google.com/open?id=1EwNpFp-CUtku4Y1EjQTGfEYwvj4G1fOF"
fluff, id = link.split('=')
print (id)
downloaded = drive.CreateFile({'id':id}) 
downloaded.GetContentFile('spa.txt')  

1EwNpFp-CUtku4Y1EjQTGfEYwvj4G1fOF


In [0]:
import re
import string
from unicodedata import normalize
import numpy
def load_doc(filename):
    file = open(filename, mode='rt', encoding='utf-8')
    text = file.read()
    file.close()
    return text
def to_pairs(doc):
    lines = doc.strip().split('\n')
    pairs = [line.split('\t') for line in  lines]
    return pairs
def clean_data(lines):
    cleaned = list()
    re_print = re.compile('[^%s]' % re.escape(string.printable))
    table = str.maketrans('', '', string.punctuation)
    for pair in lines:
        clean_pair = list()
        for line in pair:
            line = normalize('NFD', line).encode('ascii', 'ignore')
            line = line.decode('UTF-8')
            line = line.split()
            line = [word.lower() for word in line]
            line = [word.translate(table) for word in line]
            line = [re_print.sub('', w) for w in line]
            line = [word for word in line if word.isalpha()]
            clean_pair.append(' '.join(line))
        cleaned.append(clean_pair)
    return numpy.array(cleaned)

In [0]:
filename = "fra.txt"
filename_spa = "spa.txt"
n_train = 20000

doc = load_doc(filename)
doc_spa = load_doc(filename_spa)
pairs = to_pairs(doc)
pairs_spa = to_pairs(doc_spa)
clean_pairs = clean_data(pairs)[0:n_train, :]
clean_pairs_spa = clean_data(pairs_spa)[0:n_train, :]

In [6]:
input_texts = clean_pairs[:, 0]
target_texts = ['\t' + text + '\n' for text in clean_pairs[:, 1]]
target_texts_spa = ['\t' + text + '\n' for text in clean_pairs_spa[:, 1]]

print('Length of input_texts:  ' + str(input_texts.shape))

Length of input_texts:  (20000,)


In [7]:
max_encoder_seq_length = max(len(line) for line in input_texts)
max_decoder_seq_length = max(len(line) for line in target_texts)
max_decoder_spa_seq_length = max(len(line) for line in target_texts_spa)

print('max length of input  sentences: %d' % (max_encoder_seq_length))
print('max length of target sentences: %d' % (max_decoder_seq_length))
print('max length of target sentences: %d' % (max_decoder_spa_seq_length))

max length of input  sentences: 17
max length of target sentences: 56
max length of target sentences: 68


## Text processing

In [9]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# encode and pad sequences
def text2sequences(max_len, lines):
    tokenizer = Tokenizer(char_level=True, filters='')
    tokenizer.fit_on_texts(lines)
    seqs = tokenizer.texts_to_sequences(lines)
    seqs_pad = pad_sequences(seqs, maxlen=max_len, padding='post')
    return seqs_pad, tokenizer.word_index


encoder_input_seq, input_token_index = text2sequences(max_encoder_seq_length, 
                                                      input_texts)
decoder_input_seq, target_token_index = text2sequences(max_decoder_seq_length, 
                                                       target_texts)
decoder_input_seq_spa, target_token_index_spa = text2sequences(max_decoder_spa_seq_length, 
                                                       target_texts_spa)

print('shape of encoder_input_seq: ' + str(encoder_input_seq.shape))
print('shape of input_token_index: ' + str(len(input_token_index)))
print('shape of decoder_input_seq: ' + str(decoder_input_seq.shape))
print('shape of target_token_index: ' + str(len(target_token_index)))

shape of encoder_input_seq: (20000, 17)
shape of input_token_index: 27
shape of decoder_input_seq: (20000, 56)
shape of target_token_index: 29


In [10]:
num_encoder_tokens = len(input_token_index) + 1
num_decoder_tokens = len(target_token_index) + 1
num_decoder_tokens_spa = len(target_token_index_spa) + 1

print('num_encoder_tokens: ' + str(num_encoder_tokens))
print('num_decoder_tokens: ' + str(num_decoder_tokens))

num_encoder_tokens: 28
num_decoder_tokens: 30


## One-Hot Ecoding

In [11]:
from keras.utils import to_categorical

# one hot encode target sequence
def onehot_encode(sequences, max_len, vocab_size):
    n = len(sequences)
    data = numpy.zeros((n, max_len, vocab_size))
    for i in range(n):
        data[i, :, :] = to_categorical(sequences[i], num_classes=vocab_size)
    return data

encoder_input_data = onehot_encode(encoder_input_seq, max_encoder_seq_length, num_encoder_tokens)
decoder_input_data = onehot_encode(decoder_input_seq, max_decoder_seq_length, num_decoder_tokens)
decoder_input_data_spa = onehot_encode(decoder_input_seq_spa, max_decoder_spa_seq_length, num_decoder_tokens_spa)

decoder_target_seq = numpy.zeros(decoder_input_seq.shape)
decoder_target_seq[:, 0:-1] = decoder_input_seq[:, 1:]
decoder_target_data = onehot_encode(decoder_target_seq, 
                                    max_decoder_seq_length, 
                                    num_decoder_tokens)

decoder_target_seq_spa = numpy.zeros(decoder_input_seq_spa.shape)
decoder_target_seq_spa[:, 0:-1] = decoder_input_seq_spa[:, 1:]
decoder_target_data_spa = onehot_encode(decoder_target_seq_spa, 
                                    max_decoder_spa_seq_length, 
                                    num_decoder_tokens_spa)

print(encoder_input_data.shape)
print(decoder_input_data.shape)
print(decoder_target_data.shape)

(20000, 17, 28)
(20000, 56, 30)
(20000, 56, 30)


In [12]:
import numpy as np
np.random.seed(1000)
rand_indices = numpy.random.permutation(20000)
train_idxs = rand_indices[0:16000]
test_idxs = rand_indices[16000:20000]

testing_encoder = encoder_input_data[test_idxs, :]

training_encoder = encoder_input_data[train_idxs, :]
training_input_decoder = decoder_input_data[train_idxs, :]
training_target_decoder = decoder_target_data[train_idxs, :]

training_input_decoder_spa = decoder_input_data_spa[train_idxs, :]
training_target_decoder_spa = decoder_target_data_spa[train_idxs, :]


len(training_target_decoder)

16000

## Buid the networks

In [0]:
from keras.models import Model
from keras.layers import Input, LSTM, Dense
from keras.layers import Bidirectional, Concatenate

latent_dim = 256

encoder_inputs = Input(shape=(None, num_encoder_tokens))
# encoder = LSTM(latent_dim, return_state=True)
# encoder_outputs, state_h, state_c = encoder(encoder_inputs)

# encoder_states = [state_h, state_c]

encoder_bilstm = Bidirectional(LSTM(latent_dim, return_state=True, 
                                  dropout=0.5, name='encoder_lstm'))
_, forward_h, forward_c, backward_h, backward_c = encoder_bilstm(encoder_inputs)

state_h = Concatenate()([forward_h, backward_h])
state_c = Concatenate()([forward_c, backward_c])
encoder_states = [state_h, state_c]

decoder_inputs = Input(shape=(None, num_decoder_tokens))
decoder_lstm = LSTM(2*latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

decoder_inputs_spa = Input(shape=(None, num_decoder_tokens_spa))
decoder_lstm_spa = LSTM(2*latent_dim, return_sequences=True, return_state=True)
decoder_outputs_spa, _, _ = decoder_lstm(decoder_inputs_spa, initial_state=encoder_states)
decoder_dense_spa = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs_spa = decoder_dense(decoder_outputs_spa)

model = Model([encoder_inputs, decoder_inputs, decoder_inputs_spa], [decoder_outputs, decoder_outputs_spa])

In [16]:
batch_size = 128
epochs = 30

model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
hs = model.fit([encoder_input_data, decoder_input_data, decoder_input_data_spa], [decoder_target_data, decoder_target_data_spa],
                  batch_size=batch_size,
                  epochs=epochs,
                  validation_split=0.2)

Train on 16000 samples, validate on 4000 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


## Make predictions

In [0]:
encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_input_h = Input(shape=(2*latent_dim,))
decoder_state_input_c = Input(shape=(2*latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model([decoder_inputs] + decoder_states_inputs,[decoder_outputs] + decoder_states)

In [0]:
reverse_input_char_index = dict((i, char) for char, i in input_token_index.items())
reverse_target_char_index = dict((i, char) for char, i in target_token_index.items())

In [0]:
def decode_sequence(input_seq):
    
    states_value = encoder_model.predict(input_seq)
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    target_seq[0, 0, target_token_index['\t']] = 1.

    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)

        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char

        if (sampled_char == '\n' or
           len(decoded_sentence) > max_decoder_seq_length):
            stop_condition = True

        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.

        states_value = [h, c]

    return decoded_sentence

In [23]:
from nltk.translate.bleu_score import sentence_bleu
sc = []
for idx in test_idxs[:200]:
    input_seq = encoder_input_data[idx:idx+1]
    decoded_sentence = decode_sequence([input_seq])
    sc.append(sentence_bleu(target_texts[idx][1:-1], decoded_sentence[0:-1]))

sum(sc)/(len(sc)*1.0)

Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


0.8061089539185988

In [25]:
print(target_texts)
print(target_texts_spa)

['\tva\n', '\tsalut\n', '\tcours\n', '\tcourez\n', '\tqui\n', '\tca alors\n', '\tau feu\n', '\ta laide\n', '\tsaute\n', '\tca suffit\n', '\tstop\n', '\tarretetoi\n', '\tattends\n', '\tattendez\n', '\tpoursuis\n', '\tcontinuez\n', '\tpoursuivez\n', '\tbonjour\n', '\tsalut\n', '\tje comprends\n', '\tjessaye\n', '\tjai gagne\n', '\tje lai emporte\n', '\tjai gagne\n', '\toh non\n', '\tattaque\n', '\tattaquez\n', '\tsante\n', '\ta votre sante\n', '\tmerci\n', '\ttchintchin\n', '\tlevetoi\n', '\tva maintenant\n', '\tallezy maintenant\n', '\tvasy maintenant\n', '\tjai pige\n', '\tcompris\n', '\tpige\n', '\tcompris\n', '\ttas capte\n', '\tmonte\n', '\tmontez\n', '\tserremoi dans tes bras\n', '\tserrezmoi dans vos bras\n', '\tje suis tombee\n', '\tje suis tombe\n', '\tje sais\n', '\tje suis parti\n', '\tje suis partie\n', '\tjai perdu\n', '\tjai paye\n', '\tjai ans\n', '\tje vais bien\n', '\tca va\n', '\tecoutez\n', '\tcest pas possible\n', '\timpossible\n', '\ten aucun cas\n', '\tsans facons\n