In [18]:
import numpy as np
from keras.layers import Input, LSTM, Dense
from keras.models import Model
import os
from pathlib import Path


In [2]:
!wget http://www.manythings.org/anki/fra-eng.zip
!unzip fra-eng.zip -d fra-eng


--2024-07-13 06:13:49--  http://www.manythings.org/anki/fra-eng.zip
Resolving www.manythings.org (www.manythings.org)... 173.254.30.110
Connecting to www.manythings.org (www.manythings.org)|173.254.30.110|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7943074 (7.6M) [application/zip]
Saving to: ‘fra-eng.zip’


2024-07-13 06:13:49 (14.6 MB/s) - ‘fra-eng.zip’ saved [7943074/7943074]

Archive:  fra-eng.zip
  inflating: fra-eng/_about.txt      
  inflating: fra-eng/fra.txt         


In [3]:
# Training hyperparameters and corpus location.
batch_size = 64     # samples per gradient update passed to model.fit
epochs = 100        # number of training epochs
latent_dim = 256    # number of units in the encoder and decoder LSTMs
num_samples = 1000  # how many corpus lines are parsed
data_path = r"/content/fra-eng/fra.txt"  # tab-separated sentence-pair file

In [4]:
# Containers that the parsing loop fills in.
input_texts, target_texts = [], []
# Sets keep only the distinct characters seen on each side.
input_character = set()   # characters occurring in the input (English) texts
target_character = set()  # characters occurring in the target (French) texts


In [5]:
# Read the whole corpus as UTF-8 text and break it into one entry per newline.
with open(data_path, mode='r', encoding='utf-8') as corpus_file:
    lines = corpus_file.read().split('\n')


In [6]:
# Sanity check: number of newline-separated entries read from the corpus file.
len(lines)


232737

In [7]:
# Parse up to num_samples corpus lines. Each line holds three tab-separated
# fields: input text, target text, and a third field that is ignored.
for line in lines[:num_samples]:
  if not line:
    # split('\n') leaves an empty final entry when the file ends with a
    # newline; unpacking it into three fields would raise ValueError.
    continue
  input_text, target_text, _ = line.split("\t")
  # "\t" is the start-of-sequence marker and "\n" the end-of-sequence marker.
  target_text = "\t" + target_text + "\n"

  input_texts.append(input_text)
  target_texts.append(target_text)

  # set.update adds every character and ignores ones already present.
  input_character.update(input_text)
  target_character.update(target_text)

In [8]:
# Inspect the distinct characters collected from the input texts.
input_character

{' ',
 '!',
 "'",
 ',',
 '.',
 '1',
 '9',
 '?',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'R',
 'S',
 'T',
 'U',
 'W',
 'Y',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z'}

In [9]:
# Preview the first 15 pairs. Each target still carries its leading "\t" and
# trailing "\n" markers, which is why a tab and a blank line show up in the output.
for i in range(15):
  print(input_texts[i],target_texts[i])


Go. 	Va !

Go. 	Marche.

Go. 	En route !

Go. 	Bouge !

Hi. 	Salut !

Hi. 	Salut.

Run! 	Cours !

Run! 	Courez !

Run! 	Prenez vos jambes à vos cous !

Run! 	File !

Run! 	Filez !

Run! 	Cours !

Run! 	Fuyez !

Run! 	Fuyons !

Run. 	Cours !



In [10]:
# Turn each character set into a sorted list so every character has a stable position.
input_character = sorted(input_character)
target_character = sorted(target_character)

# Vocabulary sizes: number of distinct characters on each side.
num_encoder_tokens = len(input_character)
num_decoder_tokens = len(target_character)

In [11]:
# Display the (input, target) vocabulary sizes.
num_encoder_tokens,num_decoder_tokens

(56, 76)

In [12]:
# Longest text, in characters, on each side; these become the time dimension
# of the one-hot arrays.
max_encoder_seq_length = max(len(txt) for txt in input_texts)
max_decoder_seq_length = max(len(txt) for txt in target_texts)

In [13]:
# Length, in characters, of the longest input text.
max(len(txt) for txt in input_texts)

10

In [14]:
# Length, in characters, of the longest target text (start/end markers included).
max(len(txt) for txt in target_texts)

33

In [15]:
# Character -> integer index lookups, following the sorted character order.
input_token_index = {char: i for i, char in enumerate(input_character)}
target_token_index = {char: i for i, char in enumerate(target_character)}

In [16]:
# Inspect the character -> index mapping for the input side.
input_token_index

{' ': 0,
 '!': 1,
 "'": 2,
 ',': 3,
 '.': 4,
 '1': 5,
 '9': 6,
 '?': 7,
 'A': 8,
 'B': 9,
 'C': 10,
 'D': 11,
 'E': 12,
 'F': 13,
 'G': 14,
 'H': 15,
 'I': 16,
 'J': 17,
 'K': 18,
 'L': 19,
 'M': 20,
 'N': 21,
 'O': 22,
 'P': 23,
 'R': 24,
 'S': 25,
 'T': 26,
 'U': 27,
 'W': 28,
 'Y': 29,
 'a': 30,
 'b': 31,
 'c': 32,
 'd': 33,
 'e': 34,
 'f': 35,
 'g': 36,
 'h': 37,
 'i': 38,
 'j': 39,
 'k': 40,
 'l': 41,
 'm': 42,
 'n': 43,
 'o': 44,
 'p': 45,
 'q': 46,
 'r': 47,
 's': 48,
 't': 49,
 'u': 50,
 'v': 51,
 'w': 52,
 'x': 53,
 'y': 54,
 'z': 55}

In [17]:
# Zero-initialised one-hot tensors indexed as (sample, time step, character index).

# Encoder input: the input (English) texts.
encoder_input_data = np.zeros(
    shape=(len(input_texts), max_encoder_seq_length, num_encoder_tokens),
    dtype='float32',
)
# Decoder input: the target (French) texts, start marker included.
decoder_input_data = np.zeros(
    shape=(len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32',
)
# Decoder target: the same target texts, filled one time step ahead of the decoder input.
decoder_target_data = np.zeros(
    shape=(len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32',
)

In [24]:
# Fill the one-hot tensors for every (input, target) pair.
for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
  for t, char in enumerate(input_text):
    encoder_input_data[i, t, input_token_index[char]] = 1.0  # set this character's slot to 1
  # Pad the remaining time steps with the space character. The offset comes from
  # len() rather than the leaked loop variable `t`, which would be stale (or
  # undefined for the first pair) if a text were empty.
  encoder_input_data[i, len(input_text):, input_token_index[" "]] = 1.0

  for t, char in enumerate(target_text):
    # The decoder input carries the full target text, start marker included.
    decoder_input_data[i, t, target_token_index[char]] = 1.0
    if t > 0:
      # The decoder target is one step ahead and therefore omits the start marker.
      decoder_target_data[i, t - 1, target_token_index[char]] = 1.0
  decoder_input_data[i, len(target_text):, target_token_index[" "]] = 1.0
  # The target sequence is one step shorter, so its padding starts one step earlier.
  decoder_target_data[i, max(len(target_text) - 1, 0):, target_token_index[" "]] = 1.0

In [25]:
#encoder layer
# Accepts one-hot sequences of any length (time dimension left as None).
encoder_inputs = Input(shape=(None,num_encoder_tokens))
# return_state=True makes the LSTM also return its final hidden and cell states.
encoder=LSTM(latent_dim,return_state=True)

encoder_output , state_h , state_c =encoder(encoder_inputs)
# Only the final states are kept; they are used as the decoder's initial state.
encoder_states=[state_h,state_c]

In [26]:
#decoder layer
# Accepts one-hot target sequences of any length (time dimension left as None).
decoder_inputs = Input(shape=(None,num_decoder_tokens))
# return_sequences=True yields an output per time step; return_state=True also
# returns the states, which the inference decoder model relies on.
decoder_lstm=LSTM(latent_dim,return_sequences=True,return_state=True)

# During training the decoder starts from the encoder's final states; the
# decoder's own returned states are discarded here.
decoder_output,_,_=decoder_lstm(decoder_inputs,initial_state=encoder_states)
# Softmax over the target characters at every time step.
decoder_dense=Dense(num_decoder_tokens,activation='softmax')
decoder_outputs=decoder_dense(decoder_output)

In [28]:
# Training model: maps [encoder input, decoder input] to the per-step softmax output.
model=Model([encoder_inputs,decoder_inputs],decoder_outputs)

In [29]:
# Categorical cross-entropy matches the one-hot targets and softmax output.
model.compile(optimizer='rmsprop',loss='categorical_crossentropy',metrics=['accuracy'])


In [31]:
# Train on the one-hot tensors: the decoder is fed the target text and learns to
# predict the same text one step ahead. 20% of the samples are held out for validation.
model.fit([encoder_input_data,decoder_input_data],
          decoder_target_data,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.2)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.src.callbacks.History at 0x7971aeec82b0>

In [32]:
# Persist the trained weights to an HDF5 file.
# NOTE(review): the filename reads 'translatio' — probably a typo for 'translation';
# left unchanged because other code may load the file by this exact name.
model.save_weights('seq2seq_translatio_model.h5')

# Inference Sampling

In [33]:
# Here is the drill:
# 1) encode the input and retrieve the initial decoder state
# 2) run one step of the decoder with this initial state
#    and a "start sequence" token as target;
#    the output will be the next target token
# 3) repeat with the current target token and current states

# Define sampling models
# encoder_model maps an input sequence to the encoder's final [state_h, state_c].
encoder_model=Model(encoder_inputs,encoder_states)

In [34]:
# Inference decoder: the LSTM states become explicit inputs and outputs so the
# decoder can be stepped one character at a time.
decoder_state_input_h=Input(shape=(latent_dim,))
decoder_state_input_c=Input(shape=(latent_dim,))
decoder_states_inputs=[decoder_state_input_h,decoder_state_input_c]
# Reuses the trained decoder_lstm and decoder_dense layers defined for training.
# NOTE(review): this rebinds `decoder_outputs`, which previously named the training
# model's output tensor; re-running the training Model(...) cell after this one
# would pick up the inference tensor instead.
decoder_outputs,state_h,state_c=decoder_lstm(decoder_inputs,initial_state=decoder_states_inputs)
decoder_states=[state_h,state_c]
decoder_outputs=decoder_dense(decoder_outputs)

# Inputs: [one-hot decoder input, state_h, state_c]; outputs: [softmax output, state_h, state_c].
decoder_model=Model([decoder_inputs]+decoder_states_inputs,
 [decoder_outputs]+decoder_states)

In [35]:
# Reverse-lookup token index to decode sequences back to something readable (index-->char)
reverse_input_char_index = dict(
    (i, char) for char, i in input_token_index.items())
reverse_target_char_index = dict(
    (i, char) for char, i in target_token_index.items())

In [36]:
def decode_sequence(input_seq):
  """Greedily decode one encoded input sequence into target text.

  The encoder states seed the decoder, which is then run one character at a
  time starting from the '\\t' start marker. At each step the most probable
  character is appended and fed back as the next decoder input.

  Args:
    input_seq: one-hot encoder input for a single sample (batch of size 1),
      passed straight to ``encoder_model.predict``.

  Returns:
    The decoded string. Decoding stops right after a '\\n' is produced (the
    '\\n' is included) or once the string is longer than
    ``max_decoder_seq_length`` characters.
  """
  # Encode the input as state vectors. verbose=0 silences the per-call progress
  # bar, which would otherwise be printed once per generated character.
  states_value = encoder_model.predict(input_seq, verbose=0)

  # Length-1 target sequence holding only the start marker.
  target_seq = np.zeros((1, 1, num_decoder_tokens))
  target_seq[0, 0, target_token_index['\t']] = 1.

  # Sampling loop (assumes a batch of size 1).
  decoded_sentence = ''
  while True:
    output_tokens, h, c = decoder_model.predict(
        [target_seq] + states_value, verbose=0)

    # Greedy choice: take the most probable character of the last time step.
    sampled_token_index = np.argmax(output_tokens[0, -1, :])
    sampled_char = reverse_target_char_index[sampled_token_index]
    decoded_sentence += sampled_char

    # Stop on the end marker or when the length limit is exceeded.
    if (sampled_char == '\n' or
        len(decoded_sentence) > max_decoder_seq_length):
      break

    # Feed the sampled character back in as the next length-1 decoder input.
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    target_seq[0, 0, sampled_token_index] = 1.

    # Carry the decoder states over to the next step.
    states_value = [h, c]

  return decoded_sentence


In [38]:
# Spot-check the model on the first 50 training samples.
for seq_index in range(50):
  # Slice (rather than index) so the sample keeps its leading batch axis of size 1.
  input_seq = encoder_input_data[seq_index:seq_index + 1]
  decoded_sentence = decode_sequence(input_seq)
  print('-')
  print('Input sentence:', input_texts[seq_index])
  print('Decoded sentence:', decoded_sentence)




-
Input sentence: Go.
Decoded sentence: Va !

-
Input sentence: Go.
Decoded sentence: Va !

-
Input sentence: Go.
Decoded sentence: Va !

-
Input sentence: Go.
Decoded sentence: Va !

-
Input sentence: Hi.
Decoded sentence: Sontren !

-
Input sentence: Hi.
Decoded sentence: Sontren !

-
Input sentence: Run!
Decoded sentence: Fuis-j                            
-
Input sentence: Run!
Decoded sentence: Fuis-j                            
-
Input sentence: Run!
Decoded sentence: Fuis-j                            
-
Input sentence: Run!
Decoded sentence: Fuis-j                            
-
Input sentence: Run!
Decoded sentence: Fuis-j                            
-
Input sentence: Run!
Decoded sentence: Fuis-j                            
-
Input sentence: Run!
Decoded sentence: Fuis-j                            
-
Input sentence: Run!
Decoded sentence: Fuis-j                            
-
Input sentence: Run.
Decoded sentence: Fuis-j !

-
Input sentence: Run.
Decoded sentence: Fuis-j !

-
In