In [1]:
# basic libs
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# cleaning data
import re
import os
import nltk
nltk.download("stopwords")
nltk.download('punkt')

# save vocabulary in files
import pickle

# tokenization
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Model
from tensorflow.keras.layers import LSTM,Embedding,Input,Dense,SpatialDropout1D,Activation , Conv1D , GlobalMaxPooling1D
from keras.layers import GRU, Input, Dense, TimeDistributed, Activation, RepeatVector, Concatenate , Flatten ,Reshape
from keras.optimizers import Adam
from tensorflow.keras.models import Model,Sequential

# training model dependanices
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical



[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
df=pd.read_csv("/kaggle/input/language-translation-englishfrench/eng_-french.csv")
df.columns=["english","frensh"]
df.head()

Unnamed: 0,english,frensh
0,Hi.,Salut!
1,Run!,Cours !
2,Run!,Courez !
3,Who?,Qui ?
4,Wow!,Ça alors !


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175621 entries, 0 to 175620
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   english  175621 non-null  object
 1   frensh   175621 non-null  object
dtypes: object(2)
memory usage: 2.7+ MB


In [4]:
data=df[:]
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175621 entries, 0 to 175620
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   english  175621 non-null  object
 1   frensh   175621 non-null  object
dtypes: object(2)
memory usage: 2.7+ MB


In [5]:
# clean english column
def clean_english(text):
  text=text.lower() # lower case

  # remove any characters not a-z and ?!,'
  text=re.sub(u"[^a-z!?',]"," ",text)

  # word tokenization
  text=nltk.word_tokenize(text)

  # join text
  text=" ".join([i.strip() for i in text])

  return text
clean_english(data.iloc[0,0])

'hi'

In [6]:
data.iloc[1,0],clean_english(data.iloc[1,0])

('Run!', 'run !')

In [7]:
# clean frensh language
def clean_frensh(text):
  text=text.lower() # lower case

  # remove any characters not a-z and ?!,'
  # characters a-z and (éâàçêêëôîû) chars of frensh lang which contain accent
  text=re.sub(u"[^a-zéâàçêêëôîû!?',]"," ",text)

  return text
clean_frensh(data.iloc[0,1])

'salut!'

In [8]:
data.iloc[4,1],clean_frensh(data.iloc[4,1])

('Ça alors\u202f!', 'ça alors !')

In [9]:
# i show this two functions are ready to apply in dataframe
data["english"]=data["english"].apply(lambda txt:clean_english(txt))
data["frensh"]=data["frensh"].apply(lambda txt:clean_frensh(txt))

In [10]:
# add <start> <end> token to decoder sentence (Frensh)
data["frensh"]=data["frensh"].apply(lambda txt:f"<start> {txt} <end>")

In [11]:
data.sample(10)

Unnamed: 0,english,frensh
93756,what 've you been doing today ?,<start> qu'avez vous fait aujourd'hui ? <end>
60488,his dog is barking at me,<start> son chien m'aboie dessus <end>
74752,he has been gaining weight,<start> il a pris du poids <end>
79840,what happened at the beach ?,<start> que s'est il passé à la plage ? <end>
24671,i 'm letting you go,<start> je te lib re <end>
27474,you 'll bounce back,<start> vous vous en remettrez <end>
99340,this problem is not avoidable,<start> ce probl me est inévitable <end>
163562,she raised an important objection to his argument,<start> elle souleva une objection importante ...
170601,you should get yourself examined by the doctor...,<start> tu devrais immédiatement aller te fair...
119757,i do n't know if i should tell you,<start> c'est moi qui ignore si je devrais te ...


In [12]:
# english tokenizer
english_tokenize=Tokenizer(filters='#$%&()*+,-./:;<=>@[\\]^_`{|}~\t\n')
english_tokenize.fit_on_texts(data["english"])

In [13]:
num_encoder_tokens=len(english_tokenize.word_index)
num_encoder_tokens

13904

In [14]:
encoder=english_tokenize.texts_to_sequences(data["english"])
encoder[:5]

[[2745], [408, 124], [408, 124], [77, 5], [3483, 124]]

In [15]:
max_encoder_sequence_len=np.max([len(enc) for enc in encoder])
max_encoder_sequence_len

47

In [16]:
# frensh tokenizer
french_tokenize=Tokenizer(filters="#$%&()*+,-./:;<=>@[\\]^_`{|}~\t\n")
french_tokenize.fit_on_texts(data["frensh"])

In [17]:
num_decoder_tokens=len(french_tokenize.word_index)
num_decoder_tokens

26942

In [18]:
decoder=french_tokenize.texts_to_sequences(data["frensh"])
decoder[:5]

[[2, 16889, 1],
 [2, 572, 33, 1],
 [2, 5116, 33, 1],
 [2, 39, 6, 1],
 [2, 32, 393, 33, 1]]

In [19]:
max_decoder_sequence_len=np.max([len(dec) for dec in decoder])
max_decoder_sequence_len

57

In [20]:
idx_2_txt_decoder={k:i for i,k in french_tokenize.word_index.items()}
idx_2_txt_decoder[1]

'end'

In [21]:
idx_2_txt_encoder={k:i for i,k in english_tokenize.word_index.items()}
idx_2_txt_encoder[2]

'you'

In [22]:
idx_2_txt_decoder[0]="<pad>"
idx_2_txt_encoder[0]="<pad>"

In [23]:
encoder_seq=pad_sequences(encoder,maxlen=max_encoder_sequence_len,padding="post")
encoder_seq.shape

(175621, 47)

In [24]:
decoder_inp=pad_sequences([arr[:-1] for arr in decoder],maxlen=max_decoder_sequence_len,padding="post")
decoder_inp.shape

(175621, 57)

In [25]:
decoder_output=pad_sequences([arr[1:] for arr in decoder],maxlen=max_decoder_sequence_len,padding="post")
decoder_output.shape

(175621, 57)

In [26]:
print([idx_2_txt_decoder[i] for i in decoder_output[0]])

['salut!', 'end', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']


In [27]:
print([idx_2_txt_encoder[i] for i in encoder_seq[0]])

['hi', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']


 # **LSTM**

In [None]:
# encoder model
encoder_input=Input(shape=(None,),name="encoder_input_layer")
encoder_embedding=Embedding(num_encoder_tokens,300,input_length=max_encoder_sequence_len,name="encoder_embedding_layer")(encoder_input)
encoder_lstm=LSTM(256,activation="tanh",return_sequences=True,return_state=True,name="encoder_lstm_1_layer")(encoder_embedding)
encoder_lstm2=LSTM(256,activation="tanh",return_state=True,name="encoder_lstm_2_layer")(encoder_lstm)
_,state_h,state_c=encoder_lstm2
encoder_states=[state_h,state_c]

In [None]:
# decoder model
decoder_input=Input(shape=(None,),name="decoder_input_layer")
decoder_embedding=Embedding(num_decoder_tokens,300,input_length=max_decoder_sequence_len,name="decoder_embedding_layer")(decoder_input)
decoder_lstm=LSTM(256,activation="tanh",return_state=True,return_sequences=True,name="decoder_lstm_layer")
decoder_outputs,_,_=decoder_lstm(decoder_embedding,initial_state=encoder_states)
decoder_dense=Dense(num_decoder_tokens+1,activation="softmax",name="deocer_final_layer")
outputs=decoder_dense(decoder_outputs)

In [None]:
model=Model([encoder_input,decoder_input],outputs)
model.summary()

In [None]:
encoder_seq.shape,decoder_inp.shape,decoder_output.shape

In [None]:

model.compile(optimizer='rmsprop', loss=tf.losses.SparseCategoricalCrossentropy(), metrics=['accuracy'])
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3)
history=model.fit(
    [encoder_seq,decoder_inp],
    decoder_output,
    epochs=10,
    batch_size=450,
    # callbacks=[callback]
)

In [None]:
model.save("/kaggle/working/Translate_Eng_FR.h5")

In [None]:
model.save_weights("/kaggle/working/model_NMT")

# **GRU**

In [28]:
# Encoder model
encoder_input = Input(shape=(None,), name="encoder_input_layer")
encoder_embedding = Embedding(num_encoder_tokens, 300, input_length=max_encoder_sequence_len, name="encoder_embedding_layer")(encoder_input)
encoder_gru = GRU(256, activation="tanh", return_sequences=True, return_state=True, name="encoder_gru_1_layer")(encoder_embedding)
_, state_h = encoder_gru
encoder_states = [state_h]  # Only one state for GRU


In [29]:
# Decoder model
decoder_input = Input(shape=(None,), name="decoder_in\
put_layer")
decoder_embedding = Embedding(num_decoder_tokens, 300, input_length=max_decoder_sequence_len, name="decoder_embedding_layer")(decoder_input)
decoder_gru = GRU(256, activation="tanh", return_state=True, return_sequences=True, name="decoder_gru_layer")
decoder_outputs, _ = decoder_gru(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens + 1, activation="softmax", name="decoder_final_layer")
outputs = decoder_dense(decoder_outputs)


In [30]:
# Build and compile the model
model = Model([encoder_input, decoder_input], outputs)
model.summary()


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 encoder_input_layer (Input  [(None, None)]               0         []                            
 Layer)                                                                                           
                                                                                                  
 decoder_input_layer (Input  [(None, None)]               0         []                            
 Layer)                                                                                           
                                                                                                  
 encoder_embedding_layer (E  (None, None, 300)            4171200   ['encoder_input_layer[0][0]'] 
 mbedding)                                                                                    

In [None]:
loss = tf.losses.SparseCategoricalCrossentropy()
model.compile(optimizer='rmsprop', loss=loss, metrics=['accuracy'])
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3)

# Assuming you have data (encoder_seq, decoder_inp, decoder_output)
history = model.fit(
    [encoder_seq, decoder_inp],
    decoder_output,
    epochs=10,
    batch_size=400,
    # callbacks=[callback]
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10