In [1]:
import pandas as pd
import re
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, RepeatVector
import matplotlib.pyplot as plt
from keras import optimizers
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
import random
from keras.models import Sequential
from numpy.core.fromnumeric import size
import nltk
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate import meteor_score
nltk.download('wordnet')

In [2]:
# Mount Google Drive into the Colab filesystem at /content/drive (Colab-only:
# relies on the google.colab package). The model and CSV paths used in the
# following cells all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Location of the saved model file on the mounted Google Drive.
model_path = "/content/drive/MyDrive/modelFinal/trained_model.h5"
# Load the saved Keras model; later cells call model.predict on it.
model = tf.keras.models.load_model(model_path)

In [4]:
def clean_data(text):
    """Normalise a raw sentence before it is tokenised.

    Steps, in order:
      1. every run of whitespace (newlines, tabs, ...) and/or forward slashes
         becomes a single blank, so the words around them are not glued
         together by the next step;
      2. every character that is not an ASCII letter, a digit, a German
         umlaut / sharp s, or a blank is removed;
      3. blanks left next to each other by step 2 are collapsed and leading /
         trailing blanks are trimmed;
      4. the result is lower-cased.

    Args:
        text: the raw sentence (str).

    Returns:
        The cleaned, lower-cased sentence with words separated by exactly one
        blank.
    """
    # Step 1: unify all separators into single blanks.
    text = re.sub(r'[\s/]+', ' ', text)
    # Step 2: remove non-alphanumeric characters.
    text = re.sub(r'[^a-zA-Z0-9äöüÄÖÜß ]', '', text)
    # Step 3: dropping e.g. the dash in "a - b" leaves "a  b"; squeeze those
    # blanks again so the "single space" guarantee really holds.
    text = re.sub(r' {2,}', ' ', text).strip()

    return text.lower()

In [5]:
# Read the train / test / validation splits from the mounted Google Drive.
df_train = pd.read_csv("/content/drive/MyDrive/df_train.csv")
df_test = pd.read_csv("/content/drive/MyDrive/df_test.csv")
df_val = pd.read_csv("/content/drive/MyDrive/df_val.csv")

# Plain nested lists, one inner list per CSV row; the cells below read
# element 0 of a row as the English sentence and element 1 as the German one.
train, test, val = (
    frame.to_numpy().tolist() for frame in (df_train, df_test, df_val)
)


In [6]:
def tokenization(lines):
    """Build a Keras ``Tokenizer`` and fit it on ``lines``.

    Args:
        lines: the texts handed to ``Tokenizer.fit_on_texts``.

    Returns:
        The fitted ``Tokenizer`` instance.
    """
    fitted_tokenizer = Tokenizer()
    fitted_tokenizer.fit_on_texts(lines)
    return fitted_tokenizer

In [7]:
# English side: element 0 of every training row.
en_sentences = [pair[0] for pair in train]
en_tokenizer = tokenization(en_sentences)

# Number of distinct words known to the tokenizer, plus one
# (presumably to leave room for index 0, which padding uses -- TODO confirm).
en_vocab_size = 1 + len(en_tokenizer.word_index)
# Fixed sequence length used when padding English sentences
# (per the original note, apparently chosen from a sentence-length histogram).
en_length = 15
print('English Vocab: %d' % en_vocab_size)

English Vocab: 14341


In [8]:
# German side: element 1 of every training row.
de_sentences = [pair[1] for pair in train]
de_tokenizer = tokenization(de_sentences)

# Number of distinct words known to the tokenizer, plus one
# (presumably to leave room for index 0, which padding uses -- TODO confirm).
de_vocab_size = 1 + len(de_tokenizer.word_index)
# Fixed sequence length used when padding German sentences
# (per the original note, apparently chosen from a sentence-length histogram).
de_length = 15
print('German Vocab: %d' % de_vocab_size)

German Vocab: 28493


In [9]:
def encode_sequences(tokenizer, length, lines):
    """Turn texts into a padded matrix of word indices.

    Args:
        tokenizer: fitted tokenizer providing ``texts_to_sequences``.
        length: ``maxlen`` passed to ``pad_sequences``.
        lines: the texts to encode.

    Returns:
        The result of ``pad_sequences`` on the encoded texts, padded at the
        end of each sequence (``padding='post'``).
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(lines),
        maxlen=length,
        padding='post',
    )

In [22]:
def _encode_pairs(pairs):
    """Encode rows of (english, german) text into padded index matrices.

    Returns a ``(source, target)`` tuple: element 0 of every row encoded with
    the English tokenizer, element 1 encoded with the German tokenizer.
    """
    english = [pair[0] for pair in pairs]
    german = [pair[1] for pair in pairs]
    return (
        encode_sequences(en_tokenizer, en_length, english),
        encode_sequences(de_tokenizer, de_length, german),
    )


# Same encoding for all three splits; X is the English input, Y the German target.
trainX, trainY = _encode_pairs(train)
testX, testY = _encode_pairs(test)
valX, valY = _encode_pairs(val)

In [11]:
def get_word(n, tokenizer):
    """Return the word that ``tokenizer`` maps to index ``n``, or ``None``.

    The function is called once per predicted token, so a linear scan of the
    whole vocabulary for every call is very slow. When the tokenizer exposes
    a non-empty ``index_word`` reverse mapping (index -> word), it is used for
    a constant-time lookup; otherwise the function falls back to scanning
    ``tokenizer.word_index`` (word -> index).

    Args:
        n: the word index to look up.
        tokenizer: object with a ``word_index`` dict and, optionally, an
            ``index_word`` dict.

    Returns:
        The matching word, or ``None`` when no word has index ``n``.
    """
    index_word = getattr(tokenizer, 'index_word', None)
    if index_word:
        try:
            return index_word.get(n)
        except TypeError:
            # n is not hashable; the equality-based scan below still works.
            pass

    for word, index in tokenizer.word_index.items():
        if index == n:
            return word
    return None

In [23]:
# Sanity check that the reloaded model still produces translations.
# Only the first 1000 test samples are used to avoid memory issues.
testX = testX[0:1000]
testY = testY[0:1000]

# Most probable vocabulary index at every output position of every sample.
preds = np.argmax(model.predict(testX.reshape((testX.shape[0],testX.shape[1]))), axis=-1)

# Turn every row of indices back into a German sentence.
preds_text = []
for row in preds:
    words = []
    previous_index = None
    for index in row:
        word = get_word(index, de_tokenizer)
        # Keep a word only if the index maps to one and it does not simply
        # repeat the previous position. Comparing indices is equivalent to
        # comparing the looked-up words and saves a second vocabulary lookup.
        is_repeat = previous_index is not None and index == previous_index
        if word is not None and not is_repeat:
            words.append(word)
        previous_index = index

    # Skipped positions are left out entirely instead of being joined as
    # empty strings, so the sentence contains no runs of blanks.
    preds_text.append(' '.join(words))




In [24]:
# Source and reference sentences for the same 1000-sample subset that was predicted.
initial_sentences = [pair[0] for pair in test[:1000]]
actual_german = [pair[1] for pair in test[:1000]]

# Side-by-side preview of the first 21 samples.
pred_df = pd.DataFrame(
    {
        'english': initial_sentences[:21],
        'predicted german': preds_text[:21],
        'actual german': actual_german[:21],
    }
)
pred_df

Unnamed: 0,english,predicted german,actual german
0,where is the bus,wo ist der bus,wo ist der bus
1,tom thinks it impossible for mary to break the...,tom glaubt es maria das zu,tom glaubt dass es maria unmöglich sei den rek...
2,wed better go to another room so they cant hea...,wir sollten wir noch ein nicht was können wir...,wir gehen besser in ein anderes zimmer damit n...
3,hide in the closet,froh sie den,versteck dich im schrank
4,tom asked mary to tell him about the house she...,sie ihm einem sie ein,tom bat mary ihm von dem haus zu erzählen in d...
5,tom is my flesh and blood,tom ist mein und,tom ist mein fleisch und blut
6,you are what you eat,du bist was du isst,ihr seid was ihr esst
7,shes a single mother of two,sie ist eine ihrer frau,sie ist eine alleinerziehende mutter zweier ki...
8,i am losing my patience with you,ich verliere die geduld mit geduld,ich verliere die geduld mit ihnen
9,this meeting room is small,dieses diesem ist in klein,das sitzungszimmer ist klein


In [25]:
from numpy.core.fromnumeric import size
from nltk.translate.bleu_score import corpus_bleu

# Whitespace-tokenise both sides. Every prediction gets a list of references;
# here that list holds exactly one reference, hence the extra nesting.
actual_german_tokens = [[reference.split()] for reference in actual_german]
preds_text_tokens = list(map(str.split, preds_text))

# Corpus-level BLEU over all predicted sentences.
bleu_score = corpus_bleu(actual_german_tokens, preds_text_tokens)

print("The BLEU score is: ", bleu_score)

The BLEU score is:  0.21025976732050963


In [37]:
# Sentence-level METEOR of every prediction against its single reference.
scores = [
    meteor_score.single_meteor_score(references[0], hypothesis)
    for references, hypothesis in zip(actual_german_tokens, preds_text_tokens)
]

# Unweighted mean over all sentences.
average_meteor_score = sum(scores) / len(scores)

print(average_meteor_score)

0.46703515590863304


In [26]:
def predict_german_text(english_text):
    """Translate a single English sentence into German with the loaded model.

    The text is cleaned with ``clean_data``, encoded with the English
    tokenizer, run through ``model`` and the most probable index at every
    output position is mapped back to a German word.

    Positions whose index has no word and positions that merely repeat the
    previous index are dropped, so the returned sentence contains no runs of
    blanks; it is the empty string when no position yields a word.

    Args:
        english_text: the raw English sentence (str).

    Returns:
        The predicted German sentence (str).
    """
    # Preprocess the English text and obtain its padded token sequence.
    english_text = clean_data(english_text)
    input_sequence = encode_sequences(en_tokenizer, en_length, [english_text])

    # Multi-class prediction: most probable vocabulary index per position.
    preds = np.argmax(model.predict(input_sequence.reshape((input_sequence.shape[0], input_sequence.shape[1]))), axis=-1)

    words = []
    previous_index = None
    # Exactly one sentence was encoded, so only the first row is decoded.
    for index in preds[0]:
        word = get_word(index, de_tokenizer)
        # Comparing indices is equivalent to comparing the looked-up words
        # and saves a second vocabulary lookup per position.
        is_repeat = previous_index is not None and index == previous_index
        if word is not None and not is_repeat:
            words.append(word)
        previous_index = index

    return ' '.join(words)


# Interactive demo: read one English sentence from the user and print its translation.
english_input = input("Enter English Text Here: ")
predicted_german = predict_german_text(english_input)
print("Predicted German Translation: ", predicted_german)


Enter English Text Here: testing
Predicted German Translation:                
