In [1]:
!pip install spacy
!python -m spacy download fr_core_news_sm
import numpy as np
import pandas as pd
import fr_core_news_sm
import en_core_web_sm
import tensorflow as tf
tf.__version__

Collecting fr-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.7.0/fr_core_news_sm-3.7.0-py3-none-any.whl (16.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.3/16.3 MB[0m [31m32.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: fr-core-news-sm
Successfully installed fr-core-news-sm-3.7.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('fr_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


'2.17.0'

In [4]:
# Loading data
doc = pd.read_csv("https://go.aws/38ECHUB", delimiter="\t", header=None)
doc.head()

Unnamed: 0,0,1
0,Go.,Va !
1,Hi.,Salut !
2,Run!,Cours !
3,Run!,Courez !
4,Wow!,Ça alors !


In [5]:
doc.shape

(160538, 2)

In [6]:
# Let's just take a sample of 5000 sentences to avoid slowness
doc = doc.sample(5000)

# Loading of the entire corpus of French and English sentences
fr_corpus = " ".join(doc.iloc[:, 1].to_list())
en_corpus = " ".join(doc.iloc[:, 0].to_list())


In [7]:
# Loading both corpora into spacy
nlp_fr = fr_core_news_sm.load()
nlp_fr.max_length = len(fr_corpus)

nlp_en = en_core_web_sm.load()
nlp_en.max_length = len(en_corpus)

fr_doc = nlp_fr(fr_corpus)
en_doc = nlp_en(en_corpus)

In [8]:
# Tokenization of each sentence via spacy
doc["fr_tokens"] = doc.iloc[:, 1].apply(nlp_fr.tokenizer)
doc["en_tokens"] = doc.iloc[:, 0].apply(nlp_en.tokenizer)

In [9]:
doc.tail()

Unnamed: 0,0,1,fr_tokens,en_tokens
98037,Connect the two cables together.,Connecte ensemble les deux câbles.,"(Connecte, ensemble, les, deux, câbles, .)","(Connect, the, two, cables, together, .)"
14143,I visited Boston.,J'ai visité Boston.,"(J', ai, visité, Boston, .)","(I, visited, Boston, .)"
55163,Could I get my ring back?,Pourrais-je récupérer ma bague ?,"(Pourrais, -je, récupérer, ma, bague, , ?)","(Could, I, get, my, ring, back, ?)"
100809,My family is not all that large.,Ma famille n'est pas aussi grande que ça.,"(Ma, famille, n', est, pas, aussi, grande, que...","(My, family, is, not, all, that, large, .)"
83553,Let me bring you up to speed.,Laisse-moi te mettre au courant.,"(Laisse, -moi, te, mettre, au, courant, .)","(Let, me, bring, you, up, to, speed, .)"


In [10]:

# Creation of a set() that will take all the unique tokens from our text corpus
en_tokens = [token.text for token in en_doc]
en_vocabulary_set= set(en_tokens)
en_vocab_size = len(en_vocabulary_set)
print(en_vocab_size)

3527


In [11]:
# Same thing for French
fr_tokens = [token.text for token in fr_doc]
fr_vocabulary_set= set(fr_tokens)
fr_vocab_size = len(fr_vocabulary_set)
print(fr_vocab_size)

4979


In [12]:
# Creation of an id for each token
all_en_tokens = {en_token: i + 1 for i, en_token in enumerate(en_vocabulary_set)}
all_fr_tokens = {fr_token: i + 1 for i, fr_token in enumerate(fr_vocabulary_set)}
# RQ: We take at i+1 to leave the value 0 for the creation of the padded_sequences

In [13]:
# Creation of functions that will create a vector of indices for each of the token sequences
def en_tokens_to_index(tokens):
    return [all_en_tokens[token.text] for token in tokens]

def fr_tokens_to_index(tokens):
    return [all_fr_tokens[token.text] for token in tokens]

In [14]:
# Transformation of tokens into indices
doc["fr_indices"] = doc["fr_tokens"].apply(fr_tokens_to_index)
doc["en_indices"] = doc["en_tokens"].apply(en_tokens_to_index)

In [15]:
doc.tail()

Unnamed: 0,0,1,fr_tokens,en_tokens,fr_indices,en_indices
98037,Connect the two cables together.,Connecte ensemble les deux câbles.,"(Connecte, ensemble, les, deux, câbles, .)","(Connect, the, two, cables, together, .)","[3783, 2700, 2171, 2377, 4455, 2973]","[2993, 2793, 3333, 186, 2342, 1352]"
14143,I visited Boston.,J'ai visité Boston.,"(J', ai, visité, Boston, .)","(I, visited, Boston, .)","[2186, 1647, 4583, 4041, 2973]","[1404, 850, 839, 1352]"
55163,Could I get my ring back?,Pourrais-je récupérer ma bague ?,"(Pourrais, -je, récupérer, ma, bague, , ?)","(Could, I, get, my, ring, back, ?)","[2662, 3215, 3849, 2537, 1998, 4118, 1467]","[1929, 1404, 1805, 3040, 3251, 39, 605]"
100809,My family is not all that large.,Ma famille n'est pas aussi grande que ça.,"(Ma, famille, n', est, pas, aussi, grande, que...","(My, family, is, not, all, that, large, .)","[115, 3161, 3681, 4621, 2411, 3970, 2358, 444,...","[372, 3349, 2243, 1219, 1460, 3047, 2281, 1352]"
83553,Let me bring you up to speed.,Laisse-moi te mettre au courant.,"(Laisse, -moi, te, mettre, au, courant, .)","(Let, me, bring, you, up, to, speed, .)","[626, 926, 660, 2115, 3736, 2176, 2973]","[1405, 530, 1905, 2398, 2175, 675, 231, 1352]"


In [16]:
# Use of Keras to create token sequences of the same length
padded_fr_indices = tf.keras.preprocessing.sequence.pad_sequences(doc["fr_indices"], padding="post")
padded_en_indices = tf.keras.preprocessing.sequence.pad_sequences(doc["en_indices"], padding="post")

In [17]:
# Visualization of the shape of one of the tensors
padded_fr_indices.shape

(5000, 42)

In [18]:
padded_en_indices.shape

(5000, 37)

In [19]:
doc["fr_indices"].apply(len).max()
doc["en_indices"].apply(len).max()

37

In [20]:
# Application of the categorization of the target variable
binarized_en_indices = tf.keras.utils.to_categorical(padded_en_indices, num_classes=en_vocab_size+1)
binarized_en_indices.shape



(5000, 37, 3528)

In [None]:
# Creation of tf.data.Dataset for each of the French and English tensors
fr_ds = tf.data.Dataset.from_tensor_slices(padded_fr_indices)
en_ds = tf.data.Dataset.from_tensor_slices(binarized_en_indices)

In [1]:
# Create a complete tensorflow dataset
tf_ds = tf.data.Dataset.zip((fr_ds, en_ds))

NameError: name 'tf' is not defined

In [1]:
next(iter(tf_ds))

NameError: name 'tf_ds' is not defined

In [None]:
# Shuffle & Batch
BATCH_SIZE = 32

tf_ds = tf_ds.shuffle(len(doc)).batch(BATCH_SIZE)

In [None]:
# Train Test Split
TAKE_SIZE = int(0.7 * len(doc) / BATCH_SIZE)

train_data = tf_ds.take(TAKE_SIZE)
test_data = tf_ds.skip(TAKE_SIZE)

In [None]:
# Create the model
model = tf.keras.Sequential([
                  # Input Word Embedding layer
                  tf.keras.layers.Embedding(fr_vocab_size + 1, 64, mask_zero=True),

                  # LSTM Bidirectional layer
                  tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),

                  # LSTM Bidirectionnal new layer
                  tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=False)),

                  # Repeat Vector
                  tf.keras.layers.RepeatVector(binarized_en_indices.shape[1]),

                  # LSTM new layer
                  tf.keras.layers.LSTM(32, return_sequences=True),

                  # Output layer with number of output neurons equal to class number with softmax function
                  tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(en_vocab_size+1, activation="softmax"))

])

In [None]:
model.summary()

In [None]:
# "Random" prediction to test our model
input_text, output_text = next(iter(train_data))
print(input_text.numpy().shape)
print(model.predict(input_text).shape)
print(output_text.numpy().shape)

In [None]:
# Let's create a learning rate schedule to decrease the learning rate as we train the model
initial_learning_rate = 0.001

lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate,
    decay_steps=1090,
    decay_rate=0.96,
    staircase=True)

# Using a simple compiler with an Adam optimizer to compute our gradients
optimizer= tf.keras.optimizers.Adam(
    learning_rate = lr_schedule
)

model.compile(optimizer=optimizer,
              loss=tf.keras.losses.CategoricalCrossentropy(),
              metrics=[tf.keras.metrics.CategoricalAccuracy()])

In [None]:
# Application of the model on 200 epochs
history = model.fit(train_data,
                    validation_data=test_data,
                    epochs=200)

In [None]:
# Testing a translation
for input_text, translation in test_data.take(1):
    pred = np.argmax(model.predict(input_text), axis=-1)

In [None]:
# CAUTION: it's possible because there is only one key for each value
indice_to_fr_token = dict(zip(all_fr_tokens.values(), all_fr_tokens.keys()))
indice_to_en_token = dict(zip(all_fr_tokens.values(), all_en_tokens.keys()))

In [None]:
# French Sentence
for indice in input_text[0]:
    if indice == 0:
        break
    print(indice_to_fr_token[indice.numpy()])

In [None]:
# Real English Sentence
for indice in np.argmax(translation, axis=-1)[0]:
    if indice == 0:
        break
    print(indice_to_en_token[indice])

In [None]:
# Sentence translated into English by the model
for indice in pred[0]:
    if indice == 0:
        break
    print(indice_to_en_token[indice])

In [None]:
# Training on 500 more epochs
history_2 = model.fit(train_data,
                      validation_data=test_data,
                      epochs=500)

In [None]:
# Testing on new translations

for input_text, translation in test_data.take(1):
    pred = np.argmax(model.predict(input_text), axis=-1)

# French
print("Input Sentence:", end=" ")
for indice in input_text[2]:
    if indice == 0:
        break
    print(indice_to_fr_token[indice.numpy()], end=" ")

# True
print("\nTrue Translation:", end=" ")
for indice in np.argmax(translation, axis=-1)[2]:
    if indice == 0:
        break
    print(indice_to_en_token[indice], end=" ")

# Pred
print("\nModel Translation:", end=" ")
for indice in pred[2]:
    if indice == 0:
        break
    print(indice_to_en_token[indice], end=" ")