In [1]:
import pandas as pd
import re
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, RepeatVector
import matplotlib.pyplot as plt
from keras import optimizers
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
import random
from keras.models import Sequential
from numpy.core.fromnumeric import size
import nltk
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate import meteor_score
nltk.download('wordnet')
from numpy.core.fromnumeric import size
from nltk.translate.bleu_score import corpus_bleu

[nltk_data] Downloading package wordnet to /root/nltk_data...


Mount Google Drive to get access to the data files and the trained model.

In [2]:
# Colab-only step: mount Google Drive at /content/drive. The model file and the
# CSV datasets read by the following cells live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Load in the pre-trained model

In [3]:
# Path of the saved Keras model (.h5 file) on the mounted Drive.
model_path = "/content/drive/MyDrive/modelFinal/trained_model.h5"
# Restore the trained model; `model` is used below for the test-set predictions
# and for the live translation function.
model = tf.keras.models.load_model(model_path)

Clean the data

In [4]:
def clean_data(text):
    """Normalise a raw sentence before it is tokenised.

    Processing steps, in order:
      1. forward slashes become spaces (they act as word separators),
      2. every run of whitespace (newlines, tabs, blanks) becomes one space,
      3. every character that is not an ASCII letter, a digit, a German
         umlaut / eszett or a space is deleted,
      4. spaces left adjacent by step 3 are collapsed and the ends trimmed,
      5. the text is lower-cased.

    Args:
        text (str): Raw input sentence.

    Returns:
        str: Lower-cased text whose words are separated by exactly one space.
    """
    text = text.replace('/', ' ')  # slashes separate words, so keep a gap
    text = re.sub(r'\s+', ' ', text)  # newlines, tabs and repeated blanks -> one space
    text = re.sub(r'[^a-zA-Z0-9äöüÄÖÜß ]', '', text)  # remove non-alphanumeric characters
    # Deleting characters can leave neighbouring spaces behind ("a - b" -> "a  b"),
    # so collapse once more and drop leading/trailing blanks.
    text = re.sub(r' +', ' ', text).strip()
    return text.lower()

Load the datasets and convert them to lists. We evaluate on a sample of the test set only, because running the whole test set uses too much RAM and crashes the notebook.





In [5]:
# Read the three splits from the mounted Drive.
df_train = pd.read_csv("/content/drive/MyDrive/df_train.csv")
df_test = pd.read_csv("/content/drive/MyDrive/df_test.csv")
df_val = pd.read_csv("/content/drive/MyDrive/df_val.csv")

# Evaluate on a sample of the test set only. The sample is seeded so the
# metrics computed later are the same on every run, and the sample size is
# capped so a test set smaller than TEST_SAMPLE_SIZE does not raise.
TEST_SAMPLE_SIZE = 1000
SAMPLE_SEED = 42
df_test = df_test.sample(n=min(TEST_SAMPLE_SIZE, len(df_test)), random_state=SAMPLE_SEED)

# Plain Python lists of rows; later cells read element 0 of a row as the
# English sentence and element 1 as the German sentence.
train = df_train.values.tolist()
test = df_test.values.tolist()
val = df_val.values.tolist()


Function to build a tokenizer

In [6]:
def tokenization(lines):
    """Create a Keras ``Tokenizer`` and fit it on the given texts.

    Args:
        lines: The sentences used to build the vocabulary.

    Returns:
        The fitted ``Tokenizer``.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

Tokenizing the English sentences

In [7]:
# Element 0 of every training row is the English sentence.
en_sentences = [row[0] for row in train]
en_tokenizer = tokenization(en_sentences)
# +1 because Keras word indices start at 1; 0 is used for padding.
en_vocab_size = 1 + len(en_tokenizer.word_index)
en_length = 15  # fixed sequence length, taken from the sentence-length histogram
print('English Vocab: %d' % en_vocab_size)

English Vocab: 14341


Tokenizing the German sentences

In [8]:
# Element 1 of every training row is the German sentence.
de_sentences = [row[1] for row in train]
de_tokenizer = tokenization(de_sentences)
# +1 because Keras word indices start at 1; 0 is used for padding.
de_vocab_size = 1 + len(de_tokenizer.word_index)
de_length = 15  # fixed sequence length, taken from the sentence-length histogram
print('German Vocab: %d' % de_vocab_size)

German Vocab: 28493


Function that converts sentences to integer sequences with the given tokenizer and zero-pads them to a fixed length, as was done for training

In [9]:
def encode_sequences(tokenizer, length, lines):
    """Turn sentences into fixed-length integer sequences.

    Args:
        tokenizer: Fitted tokenizer used to map words to integer ids.
        length: Number of positions in every output row.
        lines: The sentences to encode.

    Returns:
        The array produced by ``pad_sequences``: one row per sentence, zeros
        appended after the last token of short sentences.
    """
    token_ids = tokenizer.texts_to_sequences(lines)
    # NOTE: only `padding` is set here; sentences longer than `length` are cut
    # according to pad_sequences' default `truncating` setting.
    return pad_sequences(token_ids, maxlen=length, padding='post')

The source sequences (our English sentences) are encoded as X, while the target sequences (the German sentences) are encoded as Y.

In [10]:
# For every split: X = English side (row element 0) encoded with the English
# tokenizer, Y = German side (row element 1) encoded with the German tokenizer.
trainX, trainY = (
    encode_sequences(en_tokenizer, en_length, [pair[0] for pair in train]),
    encode_sequences(de_tokenizer, de_length, [pair[1] for pair in train]),
)

testX, testY = (
    encode_sequences(en_tokenizer, en_length, [pair[0] for pair in test]),
    encode_sequences(de_tokenizer, de_length, [pair[1] for pair in test]),
)

valX, valY = (
    encode_sequences(en_tokenizer, en_length, [pair[0] for pair in val]),
    encode_sequences(de_tokenizer, de_length, [pair[1] for pair in val]),
)

Function that returns the word assigned to an index in the tokenizer

In [11]:
def get_word(n, tokenizer):
    """Return the word that `tokenizer` assigned to index `n`.

    Args:
        n: Integer token index (a Python int or a NumPy integer).
        tokenizer: Object exposing a ``word_index`` dict (word -> index) and,
            for a Keras ``Tokenizer``, the reverse ``index_word`` dict.

    Returns:
        The matching word, or ``None`` when no word has that index
        (for example the padding index 0).
    """
    # Prefer the reverse mapping: one dictionary lookup instead of scanning
    # the whole vocabulary on every call.
    index_word = getattr(tokenizer, 'index_word', None)
    if index_word:
        return index_word.get(n)

    # Fallback for tokenizer-like objects that only provide `word_index`.
    for word, index in tokenizer.word_index.items():
        if index == n:
            return word
    return None

Running the model to get the predicted translations

In [12]:
# Predict in chunks and reduce each chunk to token ids straight away, so the
# model's per-token output over the whole test set is never held in memory at
# once (presumably the cause of the RAM crash on the full test set — verify).
PREDICT_CHUNK_SIZE = 128
preds = np.concatenate([
    np.argmax(model.predict(testX[start:start + PREDICT_CHUNK_SIZE], verbose=0), axis=-1)
    for start in range(0, len(testX), PREDICT_CHUNK_SIZE)
])

# Turn every row of token ids back into a sentence.
preds_text = []
for token_ids in preds:
    words = []
    previous_word = None
    for token_id in token_ids:
        word = get_word(token_id, de_tokenizer)
        # Skip ids without a word (e.g. padding) and immediate repetitions of
        # the previous prediction. Skipping, rather than appending '', keeps
        # stray spaces out of the joined sentence.
        if word is not None and word != previous_word:
            words.append(word)
        previous_word = word

    preds_text.append(' '.join(words))




Print the English sentences, our predicted German translation, and the actual German translation

In [13]:
# Row element 0 is the English source, element 1 the German reference.
initial_sentences = [pair[0] for pair in test]
actual_german = [pair[1] for pair in test]

# Side-by-side preview of the first 20 test examples.
preview_columns = {
    'english': initial_sentences[:20],
    'predicted german': preds_text[:20],
    'actual german': actual_german[:20],
}
pred_df = pd.DataFrame(preview_columns)
pred_df

Unnamed: 0,english,predicted german,actual german
0,preventive measures are much more effective th...,die sind die,gegenmaßnahmen sind um einiges effektiver als ...
1,they are crazy about jazz,sie sind verrückt nach verrückt,sie sind verrückt nach jazz
2,they sat on a bench in the park,sie saßen auf einer bank,sie saßen auf einer bank im park
3,it wasnt that serious,das war das nicht ernst,so ernst war das nicht gemeint
4,there are a lot of eggs in the box,es der eier in der schachtel,in der schachtel sind viele eier
5,i met a dog on my way home,ich habe meinen hund,ich habe auf meinem heimweg einen hund getroffen
6,who is their homeroom teacher,wer ist ihre,wer ist ihr klassenlehrer
7,there may be some truth to this,es kann etwas zu,es könnte etwas wahres daran sein
8,she is a wellknown singer,sie ist eine sängerin,sie ist eine sehr bekannte sängerin
9,theyre crazy about each other,sie sind verrückt über anderen,sie sind verrückt nacheinander


Split the sentences into tokens and calculate the BLEU score

In [14]:
# corpus_bleu takes a list of reference translations per hypothesis; each test
# sentence has exactly one reference, hence the extra level of nesting.
actual_german_tokens = [[reference.split()] for reference in actual_german]
preds_text_tokens = [hypothesis.split() for hypothesis in preds_text]

bleu_score = corpus_bleu(
    list_of_references=actual_german_tokens,
    hypotheses=preds_text_tokens,
)

print("The BLEU score is: ", bleu_score)

The BLEU score is:  0.212969642634469


Calculate the METEOR score

In [15]:
# Sentence-level METEOR for every (reference, prediction) pair. Each entry of
# actual_german_tokens wraps a single tokenised reference, hence the [0].
scores = [
    meteor_score.single_meteor_score(references[0], hypothesis)
    for references, hypothesis in zip(actual_german_tokens, preds_text_tokens)
]

# Report the mean over all evaluated sentences.
average_meteor_score = sum(scores) / len(scores)

print(average_meteor_score)

0.456145166784418


Function that lets you use the model for live translation

In [16]:
def predict_german_text(english_text):
    """Translate one English sentence to German with the loaded model.

    The text is cleaned with ``clean_data``, encoded with the English
    tokenizer, passed through ``model``, and the most likely German token at
    every output position is mapped back to a word.

    Args:
        english_text (str): Raw English input.

    Returns:
        str: The predicted German words separated by single spaces. Positions
        without a word (e.g. padding) and immediate repetitions of the
        previous prediction are left out, so the result has no stray blanks.
    """
    # Preprocess the English text and obtain the token sequence.
    english_text = clean_data(english_text)
    input_sequence = encode_sequences(en_tokenizer, en_length, [english_text])

    # The argmax over the last axis picks one token id per output position;
    # [0] selects the only sentence in the batch.
    token_ids = np.argmax(model.predict(input_sequence), axis=-1)[0]

    words = []
    previous_word = None
    for token_id in token_ids:
        word = get_word(token_id, de_tokenizer)
        # Skipping, rather than appending '', avoids extra spaces after the join.
        if word is not None and word != previous_word:
            words.append(word)
        previous_word = word

    return ' '.join(words)


# Interactive demo: read one sentence from the user, translate it and show the result.
english_input = input("Enter English Text Here: ")
predicted_german = predict_german_text(english_input)
print("Predicted German Translation: ", predicted_german)


Enter English Text Here: I like people
Predicted German Translation:  ich mag die leute           
