In [1]:
import numpy as np 
import pandas as pd 

In [2]:
train_t = pd.read_csv(r'C:\Users\Vishwarath Patil\Downloads\archive (13)\eng_-french.csv')

In [3]:
train_t.columns


Index(['English words/sentences', 'French words/sentences'], dtype='object')

In [4]:
df = train_t[0:100000]
df

Unnamed: 0,English words/sentences,French words/sentences
0,Hi.,Salut!
1,Run!,Cours !
2,Run!,Courez !
3,Who?,Qui ?
4,Wow!,Ça alors !
...,...,...
99995,We need it now more than ever.,Nous en avons maintenant besoin plus que jamais.
99996,We need more doctors like you.,Nous avons besoin de plus de médecins tels que...
99997,We need our demands to be met.,Il faut que nos exigences soient respectées.
99998,We need some more information.,Il nous faut plus d'informations.


In [5]:
english_text = df['English words/sentences']
french_text = df['French words/sentences']

In [6]:
import re
english = []
french = []
for i in range(len(english_text)):
    text = english_text[i].lower()
    text = re.sub('[^a-zA-Z]',' ',text)
    english.append(text)
    

for i in range(len(french_text)):
    ftext = french_text[i].lower()
    ftext = (re.sub("[^a-zA-Z' àâäèéêëîïôœùûüÿçÀÂÄÈÉÊËÎÏÔŒÙÛÜŸÇ]",' ',ftext))
    french.append("START_ " + ftext + " _END")


In [7]:
french

['START_ salut  _END',
 'START_ cours   _END',
 'START_ courez   _END',
 'START_ qui   _END',
 'START_ ça alors   _END',
 'START_ au feu   _END',
 "START_ à l'aide   _END",
 'START_ saute  _END',
 'START_ ça suffit   _END',
 'START_ stop   _END',
 'START_ arrête toi   _END',
 'START_ attends   _END',
 'START_ attendez   _END',
 'START_ poursuis  _END',
 'START_ continuez  _END',
 'START_ poursuivez  _END',
 'START_ bonjour   _END',
 'START_ salut   _END',
 'START_ je comprends  _END',
 "START_ j'essaye  _END",
 "START_ j'ai gagné   _END",
 "START_ je l'ai emporté   _END",
 'START_ j ai gagné  _END',
 'START_ oh non   _END',
 'START_ attaque   _END',
 'START_ attaquez   _END',
 'START_ santé   _END',
 'START_ à votre santé   _END',
 'START_ merci   _END',
 'START_ tchin tchin   _END',
 'START_ lève toi  _END',
 'START_ va  maintenant  _END',
 'START_ allez y maintenant  _END',
 'START_ vas y maintenant  _END',
 "START_ j'ai pigé   _END",
 'START_ compris   _END',
 'START_ pigé   _END',


In [8]:
#Vocabulary of English
all_eng_words = set()
for i in english:
    for j in i.split():
        all_eng_words.add(j)

#vocabulary of french
all_fre_words = set()
for i in french:
    for j in i.split():
        all_fre_words.add(j)

#maxlen of the source sequence
max_length_src = 0
for i in english:
    a = len(i.split())
    if a>max_length_src:
        max_length_src = a
        
#maxlen of the target sequence
max_length_tar = 0
for j in french:
    b = len(j.split())
    if b>max_length_tar:
        max_length_tar = b
        

input_words = sorted(list(all_eng_words))
target_words = sorted(list(all_fre_words))

# Calculate Vocab size for both source and targe
num_encoder_tokens = len(all_eng_words)
num_decoder_tokens = len(all_fre_words)


#indexs for input and target sequences
input_index = dict([(words,i) for i,words in enumerate(input_words)])
target_index = dict([(word, i) for i, word in enumerate(target_words)])

reverse_input_index = dict((i, word) for word, i in input_index.items())
reverse_target_index = dict((i, word) for word, i in target_index.items())

In [9]:
print(max_length_src)
print(max_length_tar)
print(num_encoder_tokens)
print(num_decoder_tokens)

9
17
8616
17160


In [10]:
encoder_input_data = np.zeros((100000, max_length_src, num_encoder_tokens),dtype='float32')
decoder_input_data = np.zeros((100000, max_length_tar, num_decoder_tokens),dtype='float32')
decoder_target_data = np.zeros((100000, max_length_tar, num_decoder_tokens),dtype='float32')

In [11]:
for j in range(100000):
    for i,text in enumerate(english[j].split()):
        encoder_input_data[j,i,input_index[text]] = 1.

for j in range(100000):
    for i,text in enumerate(french[j].split()):
        decoder_input_data[j,i,target_index[text]] = 1.
        if i>0:
            decoder_target_data[j,i-1,target_index[text]] = 1.

In [12]:
import keras, tensorflow
from keras.models import Model
from keras.layers import Input, LSTM, Dense, Bidirectional

Using TensorFlow backend.


In [13]:
batch_size = 64
epochs = 100
latent_dim = 256 #size of the lstms hidden state

In [14]:


#inputs for the encoder
encoder_inputs = Input(shape=(None,num_encoder_tokens))
#encoder lstm
encod_lstm = (LSTM(latent_dim,return_state = True))
encoder_output,state_h,state_c = encod_lstm(encoder_inputs)

#hidden from encoder to pass to the decoder as initial hidden state
encoder_states = [state_h,state_c]

#inputs for the decoder
decoder_inputs = Input(shape=(None,num_decoder_tokens))
#decoder lstm 
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_output,_,_= decoder_lstm(decoder_inputs,initial_state = encoder_states)

dense_layer = Dense(num_decoder_tokens, activation='softmax')
decoder_output = dense_layer(decoder_output)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_output)

model.compile(optimizer='rmsprop', loss='categorical_crossentropy',metrics=['accuracy'])
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None, 8616)   0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, None, 17160)  0                                            
__________________________________________________________________________________________________
lstm_1 (LSTM)                   [(None, 256), (None, 9085952     input_1[0][0]                    
__________________________________________________________________________________________________
lstm_2 (LSTM)                   [(None, None, 256),  17835008    input_2[0][0]                    
                                                                 lstm_1[0][1]               

In [16]:
model.fit([encoder_input_data,decoder_input_data],decoder_target_data,batch_size= 64,epochs= 50,validation_split=0.2)

Train on 80000 samples, validate on 20000 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50

KeyboardInterrupt: 

In [17]:
encoder_model = Model(encoder_inputs,encoder_states)

decoder_state_h = Input(shape=(latent_dim,))
decoder_state_c = Input(shape=(latent_dim,))
decode_state = [decoder_state_h,decoder_state_c]

decoder_outputs,state_h,state_c = decoder_lstm(decoder_inputs,initial_state = decode_state)
decoder_states = [state_h, state_c]
decoder_outputs = dense_layer(decoder_outputs)

decoder_model = Model([decoder_inputs] + decode_state,[decoder_outputs] + decoder_states)

In [19]:
def decode_sequence(input_seq):
    # encode the input sequence to get the internal state vectors.
    states_value = encoder_model.predict(input_seq)
  
    # generate empty target sequence of length 1 with only the start character
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    target_seq[0, 0, target_index['START_']] = 1.
  
    # output sequence loop
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
    
        # sample a token and add the corresponding character to the 
        # decoded sequence
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_index[sampled_token_index]
        
        if (sampled_char == "_END" or len(decoded_sentence) > max_length_tar):
            stop_condition = True
            break
            
        decoded_sentence += sampled_char
        decoded_sentence +=' '
      
        # update the target sequence (length 1).
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.
    
        # update states
        states_value = [h, c]
    
    return decoded_sentence
            

In [20]:
toks = ['i love you','run fast','she is the client','my name is tom']
for t in toks:
    input_sentence = t
    test_sentence_tokenized = np.zeros((1, max_length_src, num_encoder_tokens), dtype='float32')
    for t, char in enumerate(input_sentence.split()):
        test_sentence_tokenized[0, t, input_index[char]] = 1.
    print(input_sentence)
    print(decode_sequence(test_sentence_tokenized))
    print(' ')

i love you
je vous aime 
 
run fast
un fait vite 
 
she is the client
elle est dans le tu 
 
my name is tom
mon nom est tom 
 
