In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/englishrussian-dictionary-for-machine-translate/rus.txt


> Clean Text

In [3]:
# load text 

filename = '/kaggle/input/englishrussian-dictionary-for-machine-translate/rus.txt'
file = open(filename, 'rt')
text = file.read()
file.close()

In [4]:
from pickle import dump

# load doc into memory
def load_doc(filename):
    with open(filename, mode='rt', encoding='utf-8') as file:
        text = file.read()
    return text

In [5]:
# split a loaded document into sentences
def to_pairs(doc):
    lines = doc.strip().split('\n')
    pairs = [line.split('\t') for line in lines]
    return pairs

> Split Text

In [6]:
# load dataset
filename = '/kaggle/input/englishrussian-dictionary-for-machine-translate/rus.txt'
doc = load_doc(filename)
# split into english-arab pairs
pairs = to_pairs(doc)

In [7]:
# save pairs to file
output_file = '/kaggle/working/english-russian.pkl'
with open(output_file, 'wb') as f:
    dump(pairs, f)
    print('Saved: %s' % output_file)

Saved: /kaggle/working/english-russian.pkl


In [8]:
import numpy as np
from pickle import load, dump

In [9]:
# load a clean dataset
def load_clean_sentences(filename):
    with open(filename, 'rb') as file:
        sentences = load(file)
    return sentences

In [10]:
# save a list of clean sentences to file
def save_clean_data(sentences, filename):
    with open(filename, 'wb') as file:
        dump(sentences, file)
    print('Saved: %s' % filename)

In [11]:
# load dataset
raw_dataset = load_clean_sentences('/kaggle/working/english-russian.pkl')

In [12]:
# reduce dataset size
n_sentences = 10000
dataset = raw_dataset[:n_sentences]

In [13]:
# random shuffle
np.random.shuffle(dataset)

In [14]:
# split into train/test
train, test = dataset[:9000], dataset[9000:]

In [15]:
# save
save_clean_data(dataset, '/kaggle/working/english-russian-both.pkl')
save_clean_data(train, '/kaggle/working/english-russian-train.pkl')
save_clean_data(test, '/kaggle/working/english-russian-test.pkl')

Saved: /kaggle/working/english-russian-both.pkl
Saved: /kaggle/working/english-russian-train.pkl
Saved: /kaggle/working/english-russian-test.pkl


> Train Neural Translation Model

In [16]:
# load a clean dataset
def load_clean_sentences(filename):
 return load(open(filename, 'rb'))

In [17]:
# load datasets
dataset = load_clean_sentences('english-russian-both.pkl')
train = load_clean_sentences('english-russian-train.pkl')
test = load_clean_sentences('english-russian-test.pkl')

In [18]:
# fit a tokenizer
def create_tokenizer(lines):
 tokenizer = Tokenizer()
 tokenizer.fit_on_texts(lines)
 return tokenizer

In [19]:
# max sentence length
def max_length(lines):
 return max(len(line.split()) for line in lines)

In [20]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer

In [21]:
# Define create_tokenizer and max_length functions
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

def max_length(lines):
    return max(len(line.split()) for line in lines)

In [22]:
# Convert dataset to numpy array
dataset = np.array(dataset)

In [23]:
# Prepare English tokenizer
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % eng_length)

English Vocabulary Size: 1766
English Max Length: 5


In [25]:
# Prepare Russian tokenizer
rus_tokenizer = create_tokenizer(dataset[:, 1])
rus_vocab_size = len(rus_tokenizer.word_index) + 1
rus_length = max_length(dataset[:, 1])
print('Russian Vocabulary Size: %d' % rus_vocab_size)
print('Russian Max Length: %d' % rus_length)

Russian Vocabulary Size: 4640
Russian Max Length: 10


In [28]:
# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
 # integer encode sequences
 X = tokenizer.texts_to_sequences(lines)
 # pad sequences with 0 values
 X = pad_sequences(X, maxlen=length, padding='post')
 return X

In [29]:
from numpy import array
from tensorflow.keras.utils import to_categorical

In [30]:
# one hot encode target sequence
def encode_output(sequences, vocab_size):
    ylist = []
    for sequence in sequences:
        encoded = to_categorical(sequence, num_classes=vocab_size)
        ylist.append(encoded)
    y = array(ylist)
    y = y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)
    return y

In [31]:
from numpy import array
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [32]:
# Convert train and test to numpy arrays
train = array(train)
test = array(test)

In [33]:
# Define encode_sequences and encode_output functions if not defined already
def encode_sequences(tokenizer, length, lines):
    # Integer encode sequences
    X = tokenizer.texts_to_sequences(lines)
    # Pad sequences with 0 values
    X = pad_sequences(X, maxlen=length, padding='post')
    return X

def encode_output(sequences, vocab_size):
    ylist = []
    for sequence in sequences:
        encoded = to_categorical(sequence, num_classes=vocab_size)
        ylist.append(encoded)
    y = array(ylist)
    return y

In [34]:
# Prepare training data
trainX = encode_sequences(rus_tokenizer, rus_length, train[:, 1])
trainY = encode_sequences(eng_tokenizer, eng_length, train[:, 0])
trainY = encode_output(trainY, eng_vocab_size)

In [35]:
# Prepare validation data
testX = encode_sequences(rus_tokenizer, rus_length, test[:, 1])
testY = encode_sequences(eng_tokenizer, eng_length, test[:, 0])
testY = encode_output(testY, eng_vocab_size)

In [36]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, RepeatVector, TimeDistributed, Dense
from tensorflow.keras.utils import plot_model

In [37]:
# Define NMT model
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    model = Sequential()
    model.add(Embedding(src_vocab, n_units, mask_zero=True))
    model.add(LSTM(n_units))
    model.add(RepeatVector(tar_timesteps))
    model.add(LSTM(n_units, return_sequences=True))
    model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
    return model

In [38]:
# Define model
model = define_model(rus_vocab_size, eng_vocab_size, rus_length, eng_length, 256)

2024-04-04 22:54:40.713797: E external/local_xla/xla/stream_executor/stream_executor_internal.h:177] SetPriority unimplemented for this stream.
2024-04-04 22:54:40.713918: E external/local_xla/xla/stream_executor/stream_executor_internal.h:177] SetPriority unimplemented for this stream.
2024-04-04 22:54:40.713991: E external/local_xla/xla/stream_executor/stream_executor_internal.h:177] SetPriority unimplemented for this stream.
2024-04-04 22:54:40.714062: E external/local_xla/xla/stream_executor/stream_executor_internal.h:177] SetPriority unimplemented for this stream.
2024-04-04 22:54:40.714135: E external/local_xla/xla/stream_executor/stream_executor_internal.h:177] SetPriority unimplemented for this stream.
2024-04-04 22:54:40.714371: E external/local_xla/xla/stream_executor/stream_executor_internal.h:177] SetPriority unimplemented for this stream.
2024-04-04 22:54:40.714482: E external/local_xla/xla/stream_executor/stream_executor_internal.h:177] SetPriority unimplemented for this 

In [39]:
# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [40]:
# Build the model
model.build((None, ar_length))  # Assume the input shape is (None, ger_length)

In [41]:
# Summarize defined model
print(model.summary())

None


In [42]:
# Plot model architecture
plot_model(model, to_file='model.png', show_shapes=True)

You must install pydot (`pip install pydot`) for `plot_model` to work.


> Evaluate Neural Translation Model

In [53]:
import numpy as np

In [54]:
# Load datasets
dataset = load_clean_sentences('/kaggle/working/english-russian-both.pkl')
train = load_clean_sentences('/kaggle/working/english-russian-train.pkl')
test = load_clean_sentences('/kaggle/working/english-russian-test.pkl')

In [55]:
# Convert train and test to NumPy arrays
train = np.array(train)
test = np.array(test)

In [56]:
# Convert dataset to a NumPy array
dataset = np.array(dataset)

In [57]:
# Prepare English tokenizer
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])

In [58]:
# Prepare Arabic tokenizer
rus_tokenizer = create_tokenizer(dataset[:, 1])
rus_vocab_size = len(rus_tokenizer.word_index) + 1
rus_length = max_length(dataset[:, 1])

In [59]:
# Prepare data
trainX = encode_sequences(eng_tokenizer, eng_length, train[:, 0])  # Use eng_tokenizer here
testX = encode_sequences(eng_tokenizer, eng_length, test[:, 0])  # Use eng_tokenizer here

In [60]:
from numpy import argmax

# Define function to map an integer to a word in the tokenizer's vocabulary
def word_for_id(integer, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

# Generate target sequence given source sequence
def predict_sequence(model, tokenizer, source):
    prediction = model.predict(source, verbose=0)[0]
    integers = [argmax(vector) for vector in prediction]
    target = []
    for i in integers:
        word = word_for_id(i, tokenizer)
        if word is None:
            break
        target.append(word)
    return ' '.join(target)

In [62]:
!pip install nltk

Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
Installing collected packages: nltk
Successfully installed nltk-3.8.1
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [63]:
import nltk
nltk.download('punkt')
from nltk.translate.bleu_score import corpus_bleu

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [64]:
from nltk.translate.bleu_score import corpus_bleu
import pickle

# Read the translations from the pickle file
with open('/kaggle/working/english-russian-both.pkl', 'rb') as file:
    data = pickle.load(file)

references = []
hypotheses = []

# Check if data is a list of dictionaries or just a list
if isinstance(data, list):
    # Assume each entry is a list of strings (target, predicted)
    for entry in data:
        if isinstance(entry, list) and len(entry) >= 2:
            target, predicted = entry[:2]  # Take the first two elements
            references.append(target.split())
            hypotheses.append(predicted.split())
else:
    # Assume data is a dictionary
    target = data.get('target', '')
    predicted = data.get('predicted', '')
    references.append(target.split())
    hypotheses.append(predicted.split())

# Print translations for debugging (comment out if not needed)
for ref, hyp in zip(references, hypotheses):
    print(f"target={ref}, predicted={hyp}")

# Calculate BLEU scores
bleu1 = corpus_bleu(references, hypotheses, weights=(1, 0, 0, 0))
bleu2 = corpus_bleu(references, hypotheses, weights=(0.5, 0.5, 0, 0))
bleu3 = corpus_bleu(references, hypotheses, weights=(0.33, 0.33, 0.33, 0))
bleu4 = corpus_bleu(references, hypotheses, weights=(0.25, 0.25, 0.25, 0.25))

print("\nBLEU scores:")
print(f"BLEU-1: {bleu1}")
print(f"BLEU-2: {bleu2}")
print(f"BLEU-3: {bleu3}")
print(f"BLEU-4: {bleu4}")

target=['Tom', 'is', 'wet.'], predicted=['Том', 'мокрый.']
target=['I', 'like', 'bread.'], predicted=['Я', 'люблю', 'хлеб.']
target=['Aim.', 'Fire!'], predicted=['Целься.', 'Огонь!']
target=['I', 'coughed.'], predicted=['Я', 'кашлянул.']
target=['I', 'got', 'carded.'], predicted=['У', 'меня', 'попросили', 'документы.']
target=['I', 'promised.'], predicted=['Я', 'пообещал.']
target=['Come', 'anytime.'], predicted=['Приходи', 'в', 'любое', 'время.']
target=['Go', 'to', 'bed.'], predicted=['Марш', 'в', 'постель!']
target=['Go', 'get', 'it.'], predicted=['Сходите', 'за', 'ним.']
target=['I', 'was', 'bored.'], predicted=['Я', 'скучал.']
target=['I', 'like', 'spring.'], predicted=['Люблю', 'весну.']
target=['I', "didn't", 'stop.'], predicted=['Я', 'не', 'остановился.']
target=['I', "couldn't", 'go.'], predicted=['Я', 'не', 'смог', 'пойти.']
target=["I'm", 'escaping.'], predicted=['Я', 'убегаю.']
target=['Tom', 'just', 'won.'], predicted=['Том', 'только', 'что', 'выиграл.']
target=['I', 'had'

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()



BLEU scores:
BLEU-1: 3.08717472698574e-05
BLEU-2: 7.268381654782837e-157
BLEU-3: 2.4232727228595486e-205
BLEU-4: 1.1152596255413178e-232


What can i summarized that result bleu scores show these dataset have poor quality translations