In [106]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/arabic-to-english-translation-sentences/ara_eng.txt


In [107]:
# load text
filename = '/kaggle/input/arabic-to-english-translation-sentences/ara_eng.txt'
file = open(filename, 'rt')
text = file.read()
file.close()

In [108]:
from pickle import dump

# load doc into memory
def load_doc(filename):
    with open(filename, mode='rt', encoding='utf-8') as file:
        text = file.read()
    return text

# split a loaded document into sentences
def to_pairs(doc):
    lines = doc.strip().split('\n')
    pairs = [line.split('\t') for line in lines]
    return pairs

# load dataset
filename = '/kaggle/input/arabic-to-english-translation-sentences/ara_eng.txt'
doc = load_doc(filename)
# split into english-arab pairs
pairs = to_pairs(doc)

# save pairs to file
output_file = '/kaggle/working/english-arab.pkl'
with open(output_file, 'wb') as f:
    dump(pairs, f)
    print('Saved: %s' % output_file)


Saved: /kaggle/working/english-arab.pkl


In [109]:
import numpy as np
from pickle import load, dump

# load a clean dataset
def load_clean_sentences(filename):
    with open(filename, 'rb') as file:
        sentences = load(file)
    return sentences

# save a list of clean sentences to file
def save_clean_data(sentences, filename):
    with open(filename, 'wb') as file:
        dump(sentences, file)
    print('Saved: %s' % filename)

# load dataset
raw_dataset = load_clean_sentences('/kaggle/working/english-arab.pkl')

# reduce dataset size
n_sentences = 10000
dataset = raw_dataset[:n_sentences]
# random shuffle
np.random.shuffle(dataset)
# split into train/test
train, test = dataset[:9000], dataset[9000:]

# save
save_clean_data(dataset, '/kaggle/working/english-arab-both.pkl')
save_clean_data(train, '/kaggle/working/english-arab-train.pkl')
save_clean_data(test, '/kaggle/working/english-arab-test.pkl')

Saved: /kaggle/working/english-arab-both.pkl
Saved: /kaggle/working/english-arab-train.pkl
Saved: /kaggle/working/english-arab-test.pkl


> Train Neural Translation Model

In [110]:
# load a clean dataset
def load_clean_sentences(filename):
 return load(open(filename, 'rb'))
 
# load datasets
dataset = load_clean_sentences('english-arab-both.pkl')
train = load_clean_sentences('english-arab-train.pkl')
test = load_clean_sentences('english-arab-test.pkl')

In [111]:
# fit a tokenizer
def create_tokenizer(lines):
 tokenizer = Tokenizer()
 tokenizer.fit_on_texts(lines)
 return tokenizer

In [112]:
# max sentence length
def max_length(lines):
 return max(len(line.split()) for line in lines)

In [113]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer

# Define create_tokenizer and max_length functions
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

def max_length(lines):
    return max(len(line.split()) for line in lines)

# Convert dataset to numpy array
dataset = np.array(dataset)

# Prepare English tokenizer
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % eng_length)

# Prepare Arab tokenizer
ar_tokenizer = create_tokenizer(dataset[:, 1])
ar_vocab_size = len(ar_tokenizer.word_index) + 1
ar_length = max_length(dataset[:, 1])
print('Arab Vocabulary Size: %d' % ar_vocab_size)
print('Arab Max Length: %d' % ar_length)

English Vocabulary Size: 3631
English Max Length: 11
Arab Vocabulary Size: 10520
Arab Max Length: 14


In [114]:
# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
 # integer encode sequences
 X = tokenizer.texts_to_sequences(lines)
 # pad sequences with 0 values
 X = pad_sequences(X, maxlen=length, padding='post')
 return X

In [115]:
from numpy import array
from tensorflow.keras.utils import to_categorical

# one hot encode target sequence
def encode_output(sequences, vocab_size):
    ylist = []
    for sequence in sequences:
        encoded = to_categorical(sequence, num_classes=vocab_size)
        ylist.append(encoded)
    y = array(ylist)
    y = y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)
    return y

In [116]:
from numpy import array
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Convert train and test to numpy arrays
train = array(train)
test = array(test)

# Define encode_sequences and encode_output functions if not defined already
def encode_sequences(tokenizer, length, lines):
    # Integer encode sequences
    X = tokenizer.texts_to_sequences(lines)
    # Pad sequences with 0 values
    X = pad_sequences(X, maxlen=length, padding='post')
    return X

def encode_output(sequences, vocab_size):
    ylist = []
    for sequence in sequences:
        encoded = to_categorical(sequence, num_classes=vocab_size)
        ylist.append(encoded)
    y = array(ylist)
    return y

# Prepare training data
trainX = encode_sequences(ar_tokenizer, ar_length, train[:, 1])
trainY = encode_sequences(eng_tokenizer, eng_length, train[:, 0])
trainY = encode_output(trainY, eng_vocab_size)

# Prepare validation data
testX = encode_sequences(ar_tokenizer, ar_length, test[:, 1])
testY = encode_sequences(eng_tokenizer, eng_length, test[:, 0])
testY = encode_output(testY, eng_vocab_size)

In [117]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, RepeatVector, TimeDistributed, Dense
from tensorflow.keras.utils import plot_model

# Define NMT model
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    model = Sequential()
    model.add(Embedding(src_vocab, n_units, mask_zero=True))
    model.add(LSTM(n_units))
    model.add(RepeatVector(tar_timesteps))
    model.add(LSTM(n_units, return_sequences=True))
    model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
    return model

# Define model
model = define_model(ar_vocab_size, eng_vocab_size, ar_length, eng_length, 256)

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy')

# Build the model
model.build((None, ar_length))  # Assume the input shape is (None, ger_length)

# Summarize defined model
print(model.summary())

# Plot model architecture
plot_model(model, to_file='model.png', show_shapes=True)

None
You must install pydot (`pip install pydot`) for `plot_model` to work.


In [98]:
!pip install pydot

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


> Evaluate Neural Translation Model

In [118]:
import numpy as np

# Load datasets
dataset = load_clean_sentences('/kaggle/working/english-arab-both.pkl')
train = load_clean_sentences('/kaggle/working/english-arab-train.pkl')
test = load_clean_sentences('/kaggle/working/english-arab-test.pkl')

# Convert train and test to NumPy arrays
train = np.array(train)
test = np.array(test)

# Convert dataset to a NumPy array
dataset = np.array(dataset)

# Prepare English tokenizer
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])

# Prepare Arabic tokenizer
ar_tokenizer = create_tokenizer(dataset[:, 1])
ar_vocab_size = len(ar_tokenizer.word_index) + 1
ar_length = max_length(dataset[:, 1])

# Prepare data
trainX = encode_sequences(eng_tokenizer, eng_length, train[:, 0])  # Use eng_tokenizer here
testX = encode_sequences(eng_tokenizer, eng_length, test[:, 0])  # Use eng_tokenizer here

In [119]:
from numpy import argmax

# Define function to map an integer to a word in the tokenizer's vocabulary
def word_for_id(integer, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

# Generate target sequence given source sequence
def predict_sequence(model, tokenizer, source):
    prediction = model.predict(source, verbose=0)[0]
    integers = [argmax(vector) for vector in prediction]
    target = []
    for i in integers:
        word = word_for_id(i, tokenizer)
        if word is None:
            break
        target.append(word)
    return ' '.join(target)


In [101]:
!pip install nltk

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [120]:
import nltk
nltk.download('punkt')
from nltk.translate.bleu_score import corpus_bleu

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [122]:
from nltk.translate.bleu_score import corpus_bleu

# Read predicted translations from a text file
with open('/kaggle/input/arabic-to-english-translation-sentences/ara_eng.txt', 'r', encoding='utf-8') as file:
    predicted_lines = file.readlines()

# Read reference translations from a text file
with open('/kaggle/input/arabic-to-english-translation-sentences/ara_eng.txt', 'r', encoding='utf-8') as file:
    reference_lines = file.readlines()

# Tokenize the lines
predicted_tokenized = [line.strip().split() for line in predicted_lines]
reference_tokenized = [[reference.strip().split()] for reference in reference_lines]

# Calculate BLEU score
bleu_score = corpus_bleu(reference_tokenized, predicted_tokenized)

print("BLEU Score:", bleu_score)

BLEU Score: 0.9998417472386191


In [129]:
from nltk.translate.bleu_score import corpus_bleu
import pickle

# Read the translations from the pickle file
with open('/kaggle/working/english-arab-both.pkl', 'rb') as file:
    data = pickle.load(file)

references = []
hypotheses = []

# Check if data is a list of dictionaries or just a list
if isinstance(data, list):
    # Assume each entry is a list of strings (target, predicted)
    for entry in data:
        if isinstance(entry, list) and len(entry) >= 2:
            target, predicted = entry[:2]  # Take the first two elements
            references.append(target.split())
            hypotheses.append(predicted.split())
else:
    # Assume data is a dictionary
    target = data.get('target', '')
    predicted = data.get('predicted', '')
    references.append(target.split())
    hypotheses.append(predicted.split())

# Print translations for debugging (comment out if not needed)
for ref, hyp in zip(references, hypotheses):
    print(f"target={ref}, predicted={hyp}")

# Calculate BLEU scores
bleu1 = corpus_bleu(references, hypotheses, weights=(1, 0, 0, 0))
bleu2 = corpus_bleu(references, hypotheses, weights=(0.5, 0.5, 0, 0))
bleu3 = corpus_bleu(references, hypotheses, weights=(0.33, 0.33, 0.33, 0))
bleu4 = corpus_bleu(references, hypotheses, weights=(0.25, 0.25, 0.25, 0.25))

print("\nBLEU scores:")
print(f"BLEU-1: {bleu1}")
print(f"BLEU-2: {bleu2}")
print(f"BLEU-3: {bleu3}")
print(f"BLEU-4: {bleu4}")

target=['There', 'are', 'no', 'problems.'], predicted=['ليس', 'هناك', 'أي', 'مشاكل.']
target=['I', 'know', 'Tom', 'loves', 'Mary.'], predicted=['أنا', 'أعلم', 'أن', 'توم', 'يحب', 'ماري']
target=['He', 'struck', 'a', 'match.'], predicted=['أشعل', 'عود', 'ثقاب.']
target=['They', 'went', 'on', 'talking', 'for', 'hours.'], predicted=['استمروا', 'في', 'الحديث', 'لساعات.']
target=['If', 'I', 'were', 'you,', "I'd", 'follow', 'his', 'advice.'], predicted=['لو', 'كنت', 'مكانك', 'لأخذت', 'بنصيحته.']
target=['What', 'he', 'said', 'would', 'happen', 'has', 'happened.'], predicted=['حصل', 'ما', 'قال', 'أنه', 'سيحصل.']
target=['Tom', 'went', 'inside', 'the', 'apartment.'], predicted=['دخل', 'توم', 'الشقة.']
target=['Keep', 'your', 'eyes', 'open.'], predicted=['أبقِ', 'عينيك', 'مفتوحتين.']
target=['I', "don't", 'like', 'it,', 'either.'], predicted=['لا', 'يعجبني', 'ذلك', 'أيضاً.']
target=['Where', 'is', 'Tom?'], predicted=['أين', 'توم؟']
target=['I', 'live', 'in', 'Japan.'], predicted=['أنا', 'أعيش',

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()



BLEU scores:
BLEU-1: 0.0022207251041744644
BLEU-2: 7.029422007696954e-156
BLEU-3: 1.1846515020366343e-204
BLEU-4: 3.9548682847525076e-232
