Author: François Mercier

Goals: 
- Convert preprocessed into TF dataloader

# Imports

In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
import pandas as pd
import numpy as np
import json
import pickle
from functools import partial

from fastprogress import progress_bar

In [3]:
from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow as tf
tf.__version__

'2.0.0'

In [4]:
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

In [5]:
import sys
sys.path.append("..") # Require to have the utilities packages in path
from tools import tokenizer

In [6]:
pd.set_option('display.max_columns', 999)
pd.set_option('display.max_colwidth', 999)
pd.set_option('display.max_rows', 999)

In [7]:
data_path = Path(r"/project/cq-training-1/project2/teams/team03/data/preprocessed_15032020")
files = list(data_path.glob("*"))
files

[PosixPath('/project/cq-training-1/project2/teams/team03/data/preprocessed_15032020/token_to_word_en.pickle'),
 PosixPath('/project/cq-training-1/project2/teams/team03/data/preprocessed_15032020/train_lang1_en_numericalized.pickle'),
 PosixPath('/project/cq-training-1/project2/teams/team03/data/preprocessed_15032020/word_to_token_fr.pickle'),
 PosixPath('/project/cq-training-1/project2/teams/team03/data/preprocessed_15032020/unaligned_fr_numericalized.pickle'),
 PosixPath('/project/cq-training-1/project2/teams/team03/data/preprocessed_15032020/word_to_token_en.pickle'),
 PosixPath('/project/cq-training-1/project2/teams/team03/data/preprocessed_15032020/train_lang2_fr_numericalized.pickle'),
 PosixPath('/project/cq-training-1/project2/teams/team03/data/preprocessed_15032020/token_to_word_fr.pickle'),
 PosixPath('/project/cq-training-1/project2/teams/team03/data/preprocessed_15032020/unaligned_en_numericalized.pickle')]

# Bilingual dataloader

In [8]:
with open(data_path/"train_lang1_en_numericalized.pickle", 'rb') as handle:
    train_lang1_en_numericalized = pickle.load(handle)
    
with open(data_path/"train_lang2_fr_numericalized.pickle", 'rb') as handle:
    train_lang2_fr_numericalized = pickle.load(handle)
    
with open(data_path/"word_to_token_en.pickle", 'rb') as handle:
    word_to_token_en = pickle.load(handle)
    
with open(data_path/"word_to_token_fr.pickle", 'rb') as handle:
    word_to_token_fr = pickle.load(handle)

with open(data_path/"token_to_word_fr.pickle", 'rb') as handle:
    token_to_word_fr = pickle.load(handle)

with open(data_path/"token_to_word_en.pickle", 'rb') as handle:
    token_to_word_en = pickle.load(handle)
    

In [9]:
gen_ds = zip(train_lang1_en_numericalized, train_lang2_fr_numericalized)

In [10]:
def my_generator(train_lang1_en_numericalized=train_lang1_en_numericalized, 
                 train_lang2_fr_numericalized=train_lang2_fr_numericalized,
                ):
    bos, eos = -2, -1
    for i in range(len(train_lang1_en_numericalized)):
        en = np.array([bos] + train_lang1_en_numericalized[i] + [eos]) + 3
        fr = np.array([bos] + train_lang2_fr_numericalized[i] + [eos]) + 3
        inputs = (en, 
                  fr)
        output = fr[1:]
        yield (inputs, output)

In [11]:
batch_size = 16 # For 2 K80
valid_size = 1000

ds = tf.data.Dataset.from_generator(my_generator, 
                                    output_types=((tf.int32, tf.int32), tf.int32), 
                                    output_shapes=((tf.TensorShape([None]), tf.TensorShape([None])), 
                                                   tf.TensorShape([None])))
ds = ds.prefetch(tf.data.experimental.AUTOTUNE)
ds = ds.shuffle(seed=42, buffer_size=256)
#ds = ds.map(lambda x, y: ((tf.minimum(x[0], 10000 - 1), tf.minimum([1], 10000 - 1)), tf.minimum(y, 10000 - 1))) # Only to test performance with lower vocab size (and GPU mem)
ds = ds.padded_batch(batch_size=batch_size, padded_shapes=(([128], [128]), 128))

# 5000 like XNLI https://www.nyu.edu/projects/bowman/xnli/
test_dataset = ds.take(int(valid_size / batch_size))
train_dataset = ds.skip(int(valid_size / batch_size))


In [12]:
%%time
for element in test_dataset.take(1): 
    print(element[0][0].shape, element[0][1].shape, element[1].shape)

(16, 128) (16, 128) (16, 128)
CPU times: user 67.4 ms, sys: 16.9 ms, total: 84.2 ms
Wall time: 166 ms


In [13]:
len(word_to_token_fr)

91269

# Monolingual dataloader

In [14]:
with open(data_path/"unaligned_fr_numericalized.pickle", 'rb') as handle:
    unaligned_fr_numericalized = pickle.load(handle)
    
with open(data_path/"unaligned_en_numericalized.pickle", 'rb') as handle:
    unaligned_en_numericalized = pickle.load(handle)

In [15]:
def my_generator_monolingual(monolingual_numericalized):
    bos, eos = -2, -1
    for i in range(len(train_lang1_en_numericalized)):
        inputs = np.array([bos] + monolingual_numericalized[i] + [eos]) + 3
        output = inputs[1:]
        yield (inputs, output)

In [16]:
batch_size_monolingual = 16 # For 2 K80
valid_size_monolingual = 1000

ds_monolingual_fr = tf.data.Dataset.from_generator(partial(my_generator_monolingual, monolingual_numericalized=unaligned_fr_numericalized), 
                                    output_types=(tf.int32, tf.int32), 
                                    output_shapes=(tf.TensorShape([None]), 
                                                   tf.TensorShape([None])))
ds_monolingual_fr = ds_monolingual_fr.prefetch(tf.data.experimental.AUTOTUNE)
ds_monolingual_fr = ds_monolingual_fr.shuffle(seed=42, buffer_size=256)
#ds = ds_monolingual_fr.map(lambda x, y: ((tf.minimum(x[0], 10000 - 1), tf.minimum([1], 10000 - 1)), tf.minimum(y, 10000 - 1))) # Only to test performance with lower vocab size (and GPU mem)
ds_monolingual_fr = ds_monolingual_fr.padded_batch(batch_size=batch_size_monolingual, padded_shapes=([128], 128))

# 5000 like XNLI https://www.nyu.edu/projects/bowman/xnli/
test_dataset_monolingual_fr = ds_monolingual_fr.take(int(valid_size_monolingual / batch_size))#.cache()
train_dataset_monolingual_fr = ds_monolingual_fr.skip(int(valid_size_monolingual / batch_size))#.cache()


In [17]:
%%time
for element in train_dataset_monolingual_fr.take(1): 
    print(element[0].shape, element[1].shape)

(16, 128) (16, 128)
CPU times: user 183 ms, sys: 13 ms, total: 196 ms
Wall time: 197 ms


# Seq2Seq at word level

In [18]:
# hparams
latent_dim = 256
embedding_dim = 200

max_len = 128

vocab_size_en = len(word_to_token_en) + 3
vocab_size_fr = len(word_to_token_fr) + 3
#vocab_size_en = 10000
#vocab_size_fr = 10000



# Encoder
encoder_inputs = tf.keras.layers.Input(shape=(max_len), name="encoder_input")
encoder_embeddings = tf.keras.layers.Embedding(vocab_size_en, embedding_dim, mask_zero=True, name="encoder_embedding")
encoder_lstm = tf.keras.layers.LSTM(latent_dim, return_sequences=True, return_state=True, name="encoder_lstm")
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embeddings(encoder_inputs))
encoder_states = [state_h, state_c]

encoder_dense = tf.keras.layers.Dense(vocab_size_en, activation='softmax', name="encoder_dense")
encoder_time_distributed = tf.keras.layers.TimeDistributed(encoder_dense, name="encoder_time_distributed")
encoder_outputs = encoder_time_distributed(encoder_outputs)

# Decoder
decoder_inputs = tf.keras.layers.Input(shape=(max_len), name="decoder_input")

decoder_embeddings = tf.keras.layers.Embedding(vocab_size_fr, embedding_dim, mask_zero=True, name="decoder_embedding")
decoder_lstm = tf.keras.layers.LSTM(latent_dim, return_sequences=True, return_state=True, name="decoder_lstm")
decoder_outputs, _, _ = decoder_lstm(decoder_embeddings(decoder_inputs),
                                     initial_state=encoder_states)
decoder_dense = tf.keras.layers.Dense(vocab_size_fr, activation='softmax', name="decoder_dense")
decoder_time_distributed = tf.keras.layers.TimeDistributed(decoder_dense, name="decoder_time_distributed")
decoder_outputs = decoder_time_distributed(decoder_outputs)


decoder_state_input_h = tf.keras.layers.Input(shape=(latent_dim,), name="decoder_input_h")
decoder_state_input_c = tf.keras.layers.Input(shape=(latent_dim,), name="decoder_input_c")

# Decoder for inference (no states from encoder)
decoder_outputs_inference, _, _ = decoder_lstm(decoder_embeddings(decoder_inputs))
decoder_outputs_inference = decoder_time_distributed(decoder_outputs_inference)

# Multi GPU settings
nb_gpus = len(tf.config.experimental.list_physical_devices('GPU'))
mirrored_strategy = tf.distribute.MirroredStrategy(["/gpu:" + str(i) for i in range(min(2, nb_gpus))])

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
if mirrored_strategy is not None and mirrored_strategy.num_replicas_in_sync > 1:
    with mirrored_strategy.scope():
        model_monolingual_fr = tf.keras.Model(encoder_inputs, encoder_outputs, name="Encoder")
        model_monolingual_en = tf.keras.Model(decoder_inputs, decoder_outputs_inference, name="Decoder")
        model = tf.keras.Model([encoder_inputs, decoder_inputs], decoder_outputs, name="Full_Model")
else:
    model_monolingual_fr = tf.keras.Model(encoder_inputs, encoder_outputs, name="Encoder")
    model_monolingual_en = tf.keras.Model(decoder_inputs, decoder_outputs_inference, name="Decoder")
    model = tf.keras.Model([encoder_inputs, decoder_inputs], decoder_outputs, name="Full_Model")

model_monolingual_fr.summary()
model_monolingual_en.summary()
model.summary()

Model: "Encoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
encoder_input (InputLayer)   [(None, 128)]             0         
_________________________________________________________________
encoder_embedding (Embedding (None, 128, 200)          12091400  
_________________________________________________________________
encoder_lstm (LSTM)          [(None, 128, 256), (None, 467968    
_________________________________________________________________
encoder_time_distributed (Ti (None, 128, 60457)        15537449  
Total params: 28,096,817
Trainable params: 28,096,817
Non-trainable params: 0
_________________________________________________________________
Model: "Decoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
decoder_input (InputLayer)   [(None, 128)]             0         
______________________________

In [19]:
model_not_pretrained = tf.keras.models.clone_model(model)

# Training

In [20]:
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2)

In [21]:
# Run training
model_not_pretrained.compile(optimizer=tf.keras.optimizers.RMSprop(learning_rate=1e-2), loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])


model_not_pretrained.fit(train_dataset.take(30), 
                         validation_data=test_dataset,
                         validation_steps=int(valid_size / batch_size), 
                         callbacks=[callback], 
                         epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x2ab15466e450>

In [22]:
# Run languague model training for encoder
model_monolingual_fr.compile(optimizer=tf.keras.optimizers.RMSprop(learning_rate=1e-2), loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])


model_monolingual_fr.fit(train_dataset_monolingual_fr.take(300), 
                         validation_data=test_dataset_monolingual_fr, 
                         validation_steps=int(valid_size_monolingual / batch_size_monolingual), 
                         callbacks=[callback], 
                         epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10


<tensorflow.python.keras.callbacks.History at 0x2ab16f5fd950>

In [23]:
# Run training
model.compile(optimizer=tf.keras.optimizers.RMSprop(learning_rate=1e-2), loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])


model.fit(train_dataset.take(30), 
          validation_data=test_dataset, 
          validation_steps=int(valid_size / batch_size), 
          callbacks=[callback], 
          epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x2ab17ad23950>

# BLEU score

In [60]:
# For an unknown reason, for the predict, inputs must be float (unlike train)
x, y = next(test_dataset.take(1).__iter__())
preds = model.predict(x)

token_to_word_en_with_special_tokens = {(k+3): v for k, v in token_to_word_en.items()}
token_to_word_en_with_special_tokens[0] = "<MASK>"
token_to_word_en_with_special_tokens[1] = "<BOS>"
token_to_word_en_with_special_tokens[2] = "<EOS>"


english = []
for sen in x[0]:
    trunc_sen = []
    for i in sen.numpy():
        if i in (0, 1): # MASKING or BOS
            continue
        if i == 2: # EOS
            break
        trunc_sen += [token_to_word_en_with_special_tokens[i]]
    english += [" ".join(trunc_sen)]
    

token_to_word_fr_with_special_tokens = {(k+3): v for k, v in token_to_word_fr.items()}
token_to_word_fr_with_special_tokens[0] = "<MASK>"
token_to_word_fr_with_special_tokens[1] = "<BOS>"
token_to_word_fr_with_special_tokens[2] = "<EOS>"


refs = []
for sen in y:
    trunc_sen = []
    for i in sen.numpy():
        if i in (0, 1): # MASKING or BOS
            continue
        if i == 2: # EOS
            break
        trunc_sen += [token_to_word_fr_with_special_tokens[i]]
    refs += [" ".join(trunc_sen)]


sys = []
for sen in preds:
    trunc_sen = []
    for t in sen:
        i = t.argmax()
        if i in (0, 1): # MASKING or BOS
            continue
        if i == 2: # EOS
            break
        trunc_sen += [token_to_word_fr_with_special_tokens[i]]
    sys += [" ".join(trunc_sen)]
    

In [61]:
i = np.random.randint(low=0, high=len(preds), size=1)[0]
english[i], refs[i], sys[i]

('that is why we ask the house to support our amendment earmarking eur 500 million for iraq',
 'C ’ est pourquoi nous demandons à l ’ Assemblée de soutenir notre amendement prévoyant de réserver 500 millions d ’ euros au profit de l ’ Irak .',
 "Le ' est pourquoi je avons que la ' amendement , la le politique et , la , ou de ' une par niveau de la ' Union .")

In [62]:
import sacrebleu

bleu_scores = []
for i in range(len(sys)):
    bleu_scores += [sacrebleu.corpus_bleu(sys[i], refs[i]).score]
    
np.mean(bleu_scores)

6.853537378315519