In [79]:
import string
import re
from numpy import array, argmax, random, take
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, Bidirectional, RepeatVector, TimeDistributed
from keras.preprocessing.text import Tokenizer
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from keras import optimizers
import matplotlib.pyplot as plt
%matplotlib inline

In [80]:
def read_text(filename):
    """Read an entire UTF-8 text file and return its contents as one string.

    Args:
        filename: Path of the file to read.

    Returns:
        The full contents of the file as a ``str``.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the contents are not valid UTF-8.
    """
    # The context manager closes the handle even when read() raises,
    # which a manual open()/close() pair does not guarantee.
    with open(filename, mode='rt', encoding='utf-8') as file:
        return file.read()

In [81]:
def split_text(text):
    """Break raw text into rows of tab-separated fields.

    Surrounding whitespace is stripped from the whole text, which is then
    cut on newline characters; every resulting line is cut on tabs.

    Args:
        text: The raw text to split.

    Returns:
        A list with one entry per line, each entry being the list of
        tab-separated fields of that line.
    """
    rows = []
    for line in text.strip().split('\n'):
        rows.append(line.split('\t'))
    return rows

In [82]:
# Load the raw corpus text, then split it into per-line fields and wrap the
# rows in a NumPy array so whole columns can be sliced later on.
data = read_text("data/non-processed/dataset7/slv.txt")
slo_eng = array(split_text(data))

In [83]:
# Keep only the first 299 rows of the corpus (all columns retained).
slo_eng = slo_eng[0:299, :]

In [84]:
# Lowercase columns 0 and 1 of every row, writing the result back in place.
for row, (first_text, second_text) in enumerate(slo_eng[:, :2]):
    slo_eng[row, 0] = first_text.lower()
    slo_eng[row, 1] = second_text.lower()

In [85]:
# Number of whitespace-separated tokens in each sentence:
# eng_l is built from column 0, slo_l from column 1.
eng_l = [len(sentence.split()) for sentence in slo_eng[:, 0]]
slo_l = [len(sentence.split()) for sentence in slo_eng[:, 1]]

In [86]:
# build a tokenizer
def tokenization(lines):
    """Create a Keras ``Tokenizer`` and fit it on the given texts.

    Args:
        lines: Iterable of text strings used to build the vocabulary.

    Returns:
        The fitted ``Tokenizer`` instance.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

In [87]:
# english tokenizer
# Fixed length that English sequences are padded to when encoded.
eng_length = 17

# Fit on column 0, which this notebook treats as the English side.
eng_tokenizer = tokenization(slo_eng[:, 0])
# One extra slot on top of the known words, leaving room for index 0,
# the value the padding step fills with.
eng_vocab_size = 1 + len(eng_tokenizer.word_index)
print('English Vocabulary Size: %d' % (eng_vocab_size,))

English Vocabulary Size: 570


In [88]:
# slovenian tokenizer
# Fixed length that Slovenian sequences are padded to when encoded.
slo_length = 15

# Fit on column 1, which this notebook treats as the Slovenian side.
slo_tokenizer = tokenization(slo_eng[:, 1])
# One extra slot on top of the known words, leaving room for index 0,
# the value the padding step fills with.
slo_vocab_size = 1 + len(slo_tokenizer.word_index)
print('Slo Vocabulary Size: %d' % (slo_vocab_size,))

Slo Vocabulary Size: 689


In [89]:
# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
    """Convert text lines into integer sequences of a fixed length.

    Args:
        tokenizer: Fitted tokenizer providing ``texts_to_sequences``.
        length: Target sequence length, passed to ``pad_sequences`` as ``maxlen``.
        lines: Iterable of text strings to encode.

    Returns:
        The result of ``pad_sequences`` applied to the integer-encoded
        lines, with padding added at the end (``padding='post'``).
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(lines),  # integer-encode each line
        maxlen=length,
        padding='post',
    )

In [90]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split the same from run to run.
train, test = train_test_split(
    slo_eng,
    test_size=0.2,
    random_state=12,
)

In [91]:
# prepare training data
# Inputs come from column 1 (Slovenian tokenizer), targets from column 0
# (English tokenizer); each side is padded to its own fixed length.
trainX, trainY = (
    encode_sequences(slo_tokenizer, slo_length, train[:, 1]),
    encode_sequences(eng_tokenizer, eng_length, train[:, 0]),
)

In [92]:
# prepare test data
# Same encoding as the training split: column 1 -> inputs, column 0 -> targets.
testX, testY = (
    encode_sequences(slo_tokenizer, slo_length, test[:, 1]),
    encode_sequences(eng_tokenizer, eng_length, test[:, 0]),
)

In [93]:
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.initializers import Constant
from keras.optimizers import RMSprop
# build neural machine translation model
def build_model(in_vocab, out_vocab, in_timesteps, out_timesteps, units):
    """Assemble the (uncompiled) sequence-to-sequence Keras model.

    The layers are stacked in this order:
    Embedding -> LSTM -> RepeatVector -> LSTM (returning sequences) ->
    Dense with softmax.

    Args:
        in_vocab: Input vocabulary size for the embedding layer.
        out_vocab: Number of output classes of the final softmax layer.
        in_timesteps: Input sequence length given to the embedding layer.
        out_timesteps: Number of times the encoded vector is repeated,
            i.e. the number of output time steps.
        units: Embedding dimension and hidden size of both LSTM layers.

    Returns:
        A ``tf.keras.Sequential`` model that has not been compiled.
    """
    layers = tf.keras.layers
    return tf.keras.Sequential([
        layers.Embedding(in_vocab, units, input_length=in_timesteps),
        # First LSTM returns only its final output vector ...
        layers.LSTM(units),
        # ... which is repeated once per output time step.
        layers.RepeatVector(out_timesteps),
        layers.LSTM(units, return_sequences=True),
        layers.Dense(out_vocab, activation='softmax'),
    ])

In [94]:
# Slovenian is the input side and English the output side; 512 is used as
# both the embedding size and the LSTM width.
model = build_model(
    in_vocab=slo_vocab_size,
    out_vocab=eng_vocab_size,
    in_timesteps=slo_length,
    out_timesteps=eng_length,
    units=512,
)

In [95]:
# `learning_rate` is the supported keyword; the `lr` alias is deprecated and
# is rejected by newer Keras releases.
rms = optimizers.RMSprop(learning_rate=0.001)
# The sparse variant of categorical cross-entropy takes integer class ids as
# targets, so the encoded target sequences need no one-hot conversion.
model.compile(optimizer=rms, loss='sparse_categorical_crossentropy')

In [96]:
filename = 'model.trans_test'

# Write the model to `filename` only when the validation loss reaches a new minimum.
checkpoint = ModelCheckpoint(
    filename,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)

history = model.fit(
    trainX,
    # Targets get a trailing axis of size 1: (samples, timesteps, 1).
    trainY.reshape((trainY.shape[0], trainY.shape[1], 1)),
    epochs=20,
    batch_size=512,
    # 20% of the training data is set aside for validation.
    validation_split=0.2,
    callbacks=[checkpoint],
    verbose=1,
)

Epoch 1/20

Epoch 00001: val_loss improved from inf to 4.99732, saving model to model.trans_test




INFO:tensorflow:Assets written to: model.trans_test\assets


INFO:tensorflow:Assets written to: model.trans_test\assets


Epoch 2/20

Epoch 00002: val_loss improved from 4.99732 to 4.18519, saving model to model.trans_test




INFO:tensorflow:Assets written to: model.trans_test\assets


INFO:tensorflow:Assets written to: model.trans_test\assets


Epoch 3/20

Epoch 00003: val_loss improved from 4.18519 to 2.70890, saving model to model.trans_test




INFO:tensorflow:Assets written to: model.trans_test\assets


INFO:tensorflow:Assets written to: model.trans_test\assets


Epoch 4/20

Epoch 00004: val_loss improved from 2.70890 to 2.65548, saving model to model.trans_test




INFO:tensorflow:Assets written to: model.trans_test\assets


INFO:tensorflow:Assets written to: model.trans_test\assets


Epoch 5/20

Epoch 00005: val_loss improved from 2.65548 to 2.61265, saving model to model.trans_test




INFO:tensorflow:Assets written to: model.trans_test\assets


INFO:tensorflow:Assets written to: model.trans_test\assets


Epoch 6/20

Epoch 00006: val_loss did not improve from 2.61265
Epoch 7/20

Epoch 00007: val_loss did not improve from 2.61265
Epoch 8/20

Epoch 00008: val_loss did not improve from 2.61265
Epoch 9/20

Epoch 00009: val_loss improved from 2.61265 to 2.55997, saving model to model.trans_test




INFO:tensorflow:Assets written to: model.trans_test\assets


INFO:tensorflow:Assets written to: model.trans_test\assets


Epoch 10/20

Epoch 00010: val_loss improved from 2.55997 to 2.55172, saving model to model.trans_test




INFO:tensorflow:Assets written to: model.trans_test\assets


INFO:tensorflow:Assets written to: model.trans_test\assets


Epoch 11/20

Epoch 00011: val_loss improved from 2.55172 to 2.53125, saving model to model.trans_test




INFO:tensorflow:Assets written to: model.trans_test\assets


INFO:tensorflow:Assets written to: model.trans_test\assets


Epoch 12/20

Epoch 00012: val_loss improved from 2.53125 to 2.52802, saving model to model.trans_test




INFO:tensorflow:Assets written to: model.trans_test\assets


INFO:tensorflow:Assets written to: model.trans_test\assets


Epoch 13/20

Epoch 00013: val_loss improved from 2.52802 to 2.50515, saving model to model.trans_test




INFO:tensorflow:Assets written to: model.trans_test\assets


INFO:tensorflow:Assets written to: model.trans_test\assets


Epoch 14/20

Epoch 00014: val_loss did not improve from 2.50515
Epoch 15/20

Epoch 00015: val_loss improved from 2.50515 to 2.48625, saving model to model.trans_test




INFO:tensorflow:Assets written to: model.trans_test\assets


INFO:tensorflow:Assets written to: model.trans_test\assets


Epoch 16/20

Epoch 00016: val_loss did not improve from 2.48625
Epoch 17/20

Epoch 00017: val_loss did not improve from 2.48625
Epoch 18/20

Epoch 00018: val_loss did not improve from 2.48625
Epoch 19/20

Epoch 00019: val_loss did not improve from 2.48625
Epoch 20/20

Epoch 00020: val_loss did not improve from 2.48625
