In [1]:
from tensorflow.keras.layers import Dense, Input, LSTM
from tensorflow.keras.models import Sequential
import numpy as np


In [2]:
batch_size = 64
epochs = 100
latent_dim = 256
num_samples = 10000
data_path = '/content/drive/MyDrive/AI/DL/Language_Translation/fra.txt'

In [3]:
with open(data_path, 'r', encoding='utf-8') as f:
  lines = f.read().split('\n')
  

In [4]:
lines

['Go.\tVa !\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #1158250 (Wittydev)',
 'Go.\tMarche.\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #8090732 (Micsmithel)',
 'Go.\tBouge !\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #9022935 (Micsmithel)',
 'Hi.\tSalut !\tCC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #509819 (Aiji)',
 'Hi.\tSalut.\tCC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #4320462 (gillux)',
 'Run!\tCours\u202f!\tCC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #906331 (sacredceltic)',
 'Run!\tCourez\u202f!\tCC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #906332 (sacredceltic)',
 'Run!\tPrenez vos jambes à vos cous !\tCC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #2077449 (sacredceltic)',
 'Run!\tFile !\tCC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #2077454 (sacredceltic)',
 'Run!\tFilez !\tCC-BY 2.0 (France) Attribution: tatoeba

In [5]:
input_texts, target_texts = [], []
input_chars, target_chars = set(), set()

for line in lines[:min(len(lines),num_samples)-1]:
  input_text, target_text, _ = line.split('\t')
  target_text = '\t' + target_text + '\n'
  input_texts.append(input_text)
  target_texts.append(target_text)
  for char in input_text:
    if char not in input_chars:
      input_chars.add(char)
  for char in target_text:
    if char not in target_chars:
      target_chars.add(char)

  # print(input_text + '<=>' + target_text + '<=>' + _)

In [6]:
# input_texts
# target_texts
# input_chars
# target_chars

In [7]:
input_chars = sorted(list(input_chars))

In [8]:
target_chars = sorted(list(target_chars))

In [9]:
num_encoder_tokens = len(input_chars)
num_encoder_tokens

71

In [10]:
num_decoder_tokens = len(target_chars)
num_decoder_tokens

93

In [11]:
max_encoder_seq_len = max([len(txt) for txt in input_texts])
max_decoder_seq_len = max([len(txt) for txt in target_texts])
max_encoder_seq_len, max_decoder_seq_len

(15, 59)

In [12]:
input_token_index = dict([(char,i) for i,char in enumerate(input_chars)])
input_token_index

{' ': 0,
 '!': 1,
 '"': 2,
 '$': 3,
 '%': 4,
 '&': 5,
 "'": 6,
 ',': 7,
 '-': 8,
 '.': 9,
 '0': 10,
 '1': 11,
 '2': 12,
 '3': 13,
 '5': 14,
 '7': 15,
 '8': 16,
 '9': 17,
 ':': 18,
 '?': 19,
 'A': 20,
 'B': 21,
 'C': 22,
 'D': 23,
 'E': 24,
 'F': 25,
 'G': 26,
 'H': 27,
 'I': 28,
 'J': 29,
 'K': 30,
 'L': 31,
 'M': 32,
 'N': 33,
 'O': 34,
 'P': 35,
 'Q': 36,
 'R': 37,
 'S': 38,
 'T': 39,
 'U': 40,
 'V': 41,
 'W': 42,
 'Y': 43,
 'a': 44,
 'b': 45,
 'c': 46,
 'd': 47,
 'e': 48,
 'f': 49,
 'g': 50,
 'h': 51,
 'i': 52,
 'j': 53,
 'k': 54,
 'l': 55,
 'm': 56,
 'n': 57,
 'o': 58,
 'p': 59,
 'q': 60,
 'r': 61,
 's': 62,
 't': 63,
 'u': 64,
 'v': 65,
 'w': 66,
 'x': 67,
 'y': 68,
 'z': 69,
 'é': 70}

In [13]:
target_token_index = dict([(char,i) for i,char in enumerate(target_chars)])
target_token_index

{'\t': 0,
 '\n': 1,
 ' ': 2,
 '!': 3,
 '$': 4,
 '%': 5,
 '&': 6,
 "'": 7,
 '(': 8,
 ')': 9,
 ',': 10,
 '-': 11,
 '.': 12,
 '0': 13,
 '1': 14,
 '2': 15,
 '3': 16,
 '5': 17,
 '8': 18,
 '9': 19,
 ':': 20,
 '?': 21,
 'A': 22,
 'B': 23,
 'C': 24,
 'D': 25,
 'E': 26,
 'F': 27,
 'G': 28,
 'H': 29,
 'I': 30,
 'J': 31,
 'K': 32,
 'L': 33,
 'M': 34,
 'N': 35,
 'O': 36,
 'P': 37,
 'Q': 38,
 'R': 39,
 'S': 40,
 'T': 41,
 'U': 42,
 'V': 43,
 'Y': 44,
 'a': 45,
 'b': 46,
 'c': 47,
 'd': 48,
 'e': 49,
 'f': 50,
 'g': 51,
 'h': 52,
 'i': 53,
 'j': 54,
 'k': 55,
 'l': 56,
 'm': 57,
 'n': 58,
 'o': 59,
 'p': 60,
 'q': 61,
 'r': 62,
 's': 63,
 't': 64,
 'u': 65,
 'v': 66,
 'w': 67,
 'x': 68,
 'y': 69,
 'z': 70,
 '\xa0': 71,
 '«': 72,
 '»': 73,
 'À': 74,
 'Ç': 75,
 'É': 76,
 'Ê': 77,
 'à': 78,
 'â': 79,
 'ç': 80,
 'è': 81,
 'é': 82,
 'ê': 83,
 'î': 84,
 'ï': 85,
 'ô': 86,
 'ù': 87,
 'û': 88,
 'œ': 89,
 '\u2009': 90,
 '’': 91,
 '\u202f': 92}

In [14]:
encoder_input_data = np.zeros((len(input_texts),max_encoder_seq_len,num_encoder_tokens), dtype=np.float64)
encoder_input_data.shape

(9999, 15, 71)

In [15]:
decoder_input_data = np.zeros((len(input_texts),max_decoder_seq_len,num_decoder_tokens), dtype=np.float64)
decoder_input_data.shape

(9999, 59, 93)

In [16]:
decoder_target_data = np.zeros((len(input_texts),max_decoder_seq_len,num_decoder_tokens), dtype=np.float64)
decoder_target_data.shape

(9999, 59, 93)

In [17]:
for i, (input_text,target_text) in enumerate(zip(input_texts, target_texts)):
  for t, char in enumerate(input_text):
    encoder_input_data[i,t,input_token_index[char]] = 1
  encoder_input_data[i,t+1:,input_token_index[' ']] = 1
  for t,char in enumerate(target_text):
    decoder_input_data[i,t,target_token_index[char]] = 1
    if t>0:
      decoder_target_data[i,t-1,target_token_index[char]] = 1
  decoder_input_data[i,t+1:,target_token_index[' ']] = 1
  decoder_target_data[i,t:,target_token_index[' ']] = 1


In [18]:
encoder_input_data.shape

(9999, 15, 71)

In [19]:
encoder_inputs = Input(shape=(None, num_encoder_tokens))

In [20]:
encoder = LSTM(latent_dim,return_state=True)

In [21]:
encoder_outputs, state_h, state_c = encoder(encoder_inputs)

In [22]:
encoder_states = [state_h, state_c]
encoder_states

[<KerasTensor: shape=(None, 256) dtype=float32 (created by layer 'lstm')>,
 <KerasTensor: shape=(None, 256) dtype=float32 (created by layer 'lstm')>]

In [23]:
decoder_inputs = Input(shape=(None, num_decoder_tokens))

In [24]:
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)

In [25]:
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)

In [26]:
decoder_dense = Dense(num_decoder_tokens, activation='softmax')

In [27]:
decoder_outputs = decoder_dense(decoder_outputs)

In [28]:
from tensorflow.keras.models import Model

In [29]:
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [30]:
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

In [31]:
model.fit([encoder_input_data, decoder_input_data], decoder_target_data, epochs=epochs, batch_size=batch_size, validation_split=0.2)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x7fd023c93590>