In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, GRU, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

##Load Data

In [None]:

mark_start = 'sos '
mark_end = ' eos'

def load_data(file_path):
    data_src = []
    data_dest = []
    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            parts = line.strip().split('\t')
            if len(parts) >= 2:
                data_src.append(parts[0].strip())
                data_dest.append(mark_start + parts[1].strip() + mark_end)
    return data_src, data_dest


#Tokenizate

In [None]:

def tokenize_sentences(sentences, num_words=None):
    tokenizer = Tokenizer(num_words=num_words, filters='', lower=True, oov_token='<unk>')
    tokenizer.fit_on_texts(sentences)
    sequences = tokenizer.texts_to_sequences(sentences)
    return tokenizer, sequences

# Load data
file_path = "/content/ara.txt"
data_src, data_dest = load_data(file_path)

num_words = 10000
english_tokenizer, english_sequences = tokenize_sentences(data_src, num_words=num_words)
arabic_tokenizer, arabic_sequences = tokenize_sentences(data_dest, num_words=num_words)


english_vocab_size = len(english_tokenizer.word_index) + 1
arabic_vocab_size = len(arabic_tokenizer.word_index) + 1

#Pad sequences

In [None]:

max_english_len = max(len(seq) for seq in english_sequences)
max_arabic_len = max(len(seq) for seq in arabic_sequences)

english_sequences = pad_sequences(english_sequences, maxlen=max_english_len, padding='post')
arabic_sequences = pad_sequences(arabic_sequences, maxlen=max_arabic_len, padding='post')

#Prepare inputs and outputs and some other parameters

In [None]:

decoder_input_data = arabic_sequences[:, :-1]
decoder_output_data = arabic_sequences[:, 1:]

embedding_size = 128
state_size = 256


#Build the NN

In [None]:

# Encoder
encoder_input = Input(shape=(None,))
encoder_embedding = Embedding(input_dim=english_vocab_size, output_dim=embedding_size)(encoder_input)
encoder_gru = GRU(state_size, return_state=True)
encoder_output, state_h = encoder_gru(encoder_embedding)

# Decoder
decoder_input = Input(shape=(None,))
decoder_embedding = Embedding(input_dim=arabic_vocab_size, output_dim=embedding_size)(decoder_input)
decoder_gru = GRU(state_size, return_sequences=True, return_state=True)
decoder_output, _ = decoder_gru(decoder_embedding, initial_state=state_h)
decoder_dense = Dense(arabic_vocab_size, activation='softmax')
decoder_output = decoder_dense(decoder_output)


model = Model([encoder_input, decoder_input], decoder_output)
model.compile(optimizer=optimizer=tf.keras.optimizers.RMSprop(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])


callbacks = [
    EarlyStopping(monitor='val_loss', patience=3),
    ModelCheckpoint('best_model.keras', save_best_only=True)
]


#Train then Translate

In [8]:

model.fit(
    [english_sequences, decoder_input_data],
    np.expand_dims(decoder_output_data, -1),
    batch_size=64,
    epochs=10,
    validation_split=0.2,
    callbacks=callbacks
)

encoder_model = Model(encoder_input, state_h)

decoder_state_input = Input(shape=(state_size,))
decoder_output, state_h = decoder_gru(decoder_embedding, initial_state=decoder_state_input)
decoder_output = decoder_dense(decoder_output)
decoder_model = Model([decoder_input, decoder_state_input], [decoder_output, state_h])
def translate_to_arabic(input_sentence, encoder_model, decoder_model, english_tokenizer, arabic_tokenizer, max_english_len, max_arabic_len):
    input_sequence = english_tokenizer.texts_to_sequences([input_sentence])
    input_sequence = pad_sequences(input_sequence, maxlen=max_english_len, padding='post')

    thought_vector = encoder_model.predict(input_sequence)

    start_token = arabic_tokenizer.word_index['sos']
    end_token = arabic_tokenizer.word_index['eos']

    decoder_input = np.array([[start_token]])
    translated_sentence = []

    for _ in range(max_arabic_len - 1):
        predictions, thought_vector = decoder_model.predict([decoder_input, thought_vector])
        word_index = np.argmax(predictions[0, -1, :])

        if word_index == end_token:
            break

        word = arabic_tokenizer.index_word.get(word_index, '')
        translated_sentence.append(word)

        decoder_input = np.array([[word_index]])

    return ' '.join(translated_sentence)

# Test translation
input_sentence = "Hello, how are you?"
translated_sentence = translate_to_arabic(input_sentence, encoder_model, decoder_model, english_tokenizer, arabic_tokenizer, max_english_len, max_arabic_len)
print("Translated Sentence:", translated_sentence)

Epoch 1/10
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m594s[0m 4s/step - accuracy: 0.8444 - loss: 3.1510 - val_accuracy: 0.8109 - val_loss: 1.3673
Epoch 2/10
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m624s[0m 4s/step - accuracy: 0.9004 - loss: 0.8002 - val_accuracy: 0.8225 - val_loss: 1.3356
Epoch 3/10
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m589s[0m 4s/step - accuracy: 0.9028 - loss: 0.7513 - val_accuracy: 0.8304 - val_loss: 1.3022
Epoch 4/10
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m590s[0m 4s/step - accuracy: 0.9050 - loss: 0.7136 - val_accuracy: 0.8367 - val_loss: 1.2626
Epoch 5/10
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m620s[0m 4s/step - accuracy: 0.9059 - loss: 0.6829 - val_accuracy: 0.8367 - val_loss: 1.2933
Epoch 6/10
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m621s[0m 4s/step - accuracy: 0.9067 - loss: 0.6557 - val_accuracy: 0.8403 - val_loss: 1.2806
Epoch 7/10
[1m157/157



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 238ms/step




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 237ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
Translated Sentence: هل أنت بخير؟


In [10]:
input_sentence = "Man is a social animal – Aristotle, the legendary Greek philosopher"
translated_sentence = translate_to_arabic(input_sentence, encoder_model, decoder_model, english_tokenizer, arabic_tokenizer, max_english_len, max_arabic_len)
print("Translated Sentence:", translated_sentence)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
Translated Sentence: <unk> <unk> <unk> <unk>
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
Translated Sentence: <unk> <unk> <unk> <unk>


In [11]:
input_sentence = "Hell is other people"
translated_sentence = translate_to_arabic(input_sentence, encoder_model, decoder_model, english_tokenizer, arabic_tokenizer, max_english_len, max_arabic_len)
print("Translated Sentence:", translated_sentence)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
Translated Sentence: أنا أنت في غاية ؟
