Author: François Mercier

Goals: 
- Convert preprocessed into TF dataloader

# Imports

Additional requirements for this notebook (not part of main requirements)
```
pip install --no-index matplotlib 
pip install --no-index scikit-learn
pip install --no-index seaborn
pip install fastprogress
````

In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [2]:
import sklearn
import matplotlib.pyplot as plt 
import seaborn as sns
from pathlib import Path
import pandas as pd
import numpy as np
import json
import pickle

from fastprogress import progress_bar

In [3]:
from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow as tf
tf.__version__

'2.0.0'

In [4]:
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

In [5]:
import sys
sys.path.append("..") # Require to have the utilities packages in path
from tools import tokenizer

In [6]:
pd.set_option('display.max_columns', 999)
pd.set_option('display.max_colwidth', 999)
pd.set_option('display.max_rows', 999)

In [7]:
data_path = Path(r"/project/cq-training-1/project2/teams/team03/data/preprocessed_15032020")
files = list(data_path.glob("*"))
files

[PosixPath('/project/cq-training-1/project2/teams/team03/data/preprocessed_15032020/token_to_word_en.pickle'),
 PosixPath('/project/cq-training-1/project2/teams/team03/data/preprocessed_15032020/train_lang1_en_numericalized.pickle'),
 PosixPath('/project/cq-training-1/project2/teams/team03/data/preprocessed_15032020/word_to_token_fr.pickle'),
 PosixPath('/project/cq-training-1/project2/teams/team03/data/preprocessed_15032020/unaligned_fr_numericalized.pickle'),
 PosixPath('/project/cq-training-1/project2/teams/team03/data/preprocessed_15032020/word_to_token_en.pickle'),
 PosixPath('/project/cq-training-1/project2/teams/team03/data/preprocessed_15032020/train_lang2_fr_numericalized.pickle'),
 PosixPath('/project/cq-training-1/project2/teams/team03/data/preprocessed_15032020/token_to_word_fr.pickle'),
 PosixPath('/project/cq-training-1/project2/teams/team03/data/preprocessed_15032020/unaligned_en_numericalized.pickle')]

# Bilingual dataloader

In [8]:
with open(data_path/"train_lang1_en_numericalized.pickle", 'rb') as handle:
    train_lang1_en_numericalized = pickle.load(handle)
    
with open(data_path/"train_lang2_fr_numericalized.pickle", 'rb') as handle:
    train_lang2_fr_numericalized = pickle.load(handle)
    
with open(data_path/"word_to_token_en.pickle", 'rb') as handle:
    word_to_token_en = pickle.load(handle)
    
with open(data_path/"word_to_token_fr.pickle", 'rb') as handle:
    word_to_token_fr = pickle.load(handle)

with open(data_path/"token_to_word_fr.pickle", 'rb') as handle:
    token_to_word_fr = pickle.load(handle)

In [9]:
gen_ds = zip(train_lang1_en_numericalized, train_lang2_fr_numericalized)

In [10]:
def my_generator(train_lang1_en_numericalized=train_lang1_en_numericalized, 
                 train_lang2_fr_numericalized=train_lang2_fr_numericalized,
                ):
    bos, eos = -2, -1
    for i in range(len(train_lang1_en_numericalized)):
        en = np.array([bos] + train_lang1_en_numericalized[i] + [eos]) + 3
        fr = np.array([bos] + train_lang2_fr_numericalized[i] + [eos]) + 3
        inputs = (en, 
                  fr)
        output = fr[1:]
        yield (inputs, output)

In [12]:
batch_size = 16

ds = tf.data.Dataset.from_generator(my_generator, 
                                    output_types=((tf.int32, tf.int32), tf.int32), 
                                    output_shapes=((tf.TensorShape([None]), tf.TensorShape([None])), 
                                                   tf.TensorShape([None])))
ds = ds.prefetch(tf.data.experimental.AUTOTUNE)
ds = ds.shuffle(seed=42, buffer_size=256)
#ds = ds.map(lambda x, y: ((tf.minimum(x[0], 10000 - 1), tf.minimum([1], 10000 - 1)), tf.minimum(y, 10000 - 1))) # Only to test performance with lower vocab size (and GPU mem)
ds = ds.padded_batch(batch_size=batch_size, padded_shapes=(([128], [128]), 128)) # Batch size for K20

# 5000 like XNLI https://www.nyu.edu/projects/bowman/xnli/
test_dataset = ds.take(int(5000 / batch_size))#.cache()
train_dataset = ds.skip(int(5000 / batch_size)).cache()


In [13]:
%%time
for element in test_dataset.take(1): 
    print(element[0][0].shape, element[0][1].shape, element[1].shape)

(16, 128) (16, 128) (16, 128)
CPU times: user 76.6 ms, sys: 10.6 ms, total: 87.2 ms
Wall time: 141 ms


In [14]:
len(word_to_token_fr)

91269

# Seq2Seq at word level

In [15]:
# hparams
latent_dim = 256

max_len = 128

vocab_size_en = len(word_to_token_en) + 3
vocab_size_fr = len(word_to_token_fr) + 3
#vocab_size_en = 10000
#vocab_size_fr = 10000



# Define an input sequence and process it.
encoder_inputs = tf.keras.layers.Input(shape=(max_len))
encoder_masked_inputs = tf.keras.layers.Masking()(encoder_inputs) # Assuming PAD is zeros
encoder_embeddings = tf.keras.layers.Embedding(vocab_size_en, 300)
encoder = tf.keras.layers.LSTM(latent_dim, return_state=True, name="encoder")
encoder_outputs, state_h, state_c = encoder(encoder_embeddings(encoder_masked_inputs))
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = tf.keras.layers.Input(shape=(max_len))
decoder_masked_inputs = tf.keras.layers.Masking()(decoder_inputs) # Assuming PAD is zeros
decoder_embeddings = tf.keras.layers.Embedding(vocab_size_fr, 300)
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = tf.keras.layers.LSTM(latent_dim, return_sequences=True, return_state=True, name="decoder")
decoder_outputs, _, _ = decoder_lstm(decoder_embeddings(decoder_masked_inputs),
                                     initial_state=encoder_states)
decoder_dense = tf.keras.layers.Dense(vocab_size_fr, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = tf.keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 128)]        0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 128)]        0                                            
__________________________________________________________________________________________________
masking (Masking)               (None, 128)          0           input_1[0][0]                    
__________________________________________________________________________________________________
masking_1 (Masking)             (None, 128)          0           input_2[0][0]                    
______________________________________________________________________________________________

In [16]:
# Run training
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
model.fit(train_dataset, validation_data=test_dataset, validation_steps=int(5000 / batch_size), epochs=10)

Epoch 1/10
Epoch 2/10
 24/376 [>.............................] - ETA: 7:14 - loss: 1.1154 - accuracy: 0.8417

KeyboardInterrupt: 

# BLEU score

In [17]:
refs = []
for sen in train_lang2_fr_numericalized:
    refs += [" ".join([token_to_word_fr[t] for t in sen])]

In [19]:
# For an unknown reason, for the predict, inputs must be float (unlike train)
preds = model.predict(test_dataset.take(1).map(lambda x, y: ((tf.cast(x[0], tf.float32), tf.cast(x[1], tf.float32)), tf.cast(y, tf.float32))))


token_to_word_fr_with_special_tokens = {(k+3): v for k, v in token_to_word_fr.items()}
token_to_word_fr_with_special_tokens[0] = "<MASK>"
token_to_word_fr_with_special_tokens[1] = "<BOS>"
token_to_word_fr_with_special_tokens[2] = "<EOS>"

sys = []
for sen in preds:
    sys += [" ".join([token_to_word_fr_with_special_tokens[t.argmax()] for t in sen if t.argmax() != 0])]

In [27]:
np.random.choice(refs), np.random.choice(sys)

("Vraiment , j ' étais le plus jeune membre de n ' importe quelle délégation dans la convention de 1980 qui a élu Ronald Reagan pour être le candidat Républicain pour la présidentielle .",
 "Il , ' est , , , le la la de la . . la la . . la ' est de . la la . . la la . . la . la ' est de . <EOS>")

In [21]:
len(refs), len(sys)

(11000, 16)

In [28]:
import sacrebleu

bleu_scores = []
for i in range(len(sys)):
    bleu_scores += [sacrebleu.corpus_bleu(sys[i], refs[i]).score]
    
np.mean(bleu_scores)

1.810378014096578