In [17]:
import string
import re
from numpy import array, argmax, random, take
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, RepeatVector
from keras.preprocessing.text import Tokenizer
from keras.callbacks import ModelCheckpoint
from keras_preprocessing.sequence import pad_sequences
from keras.models import load_model
from keras import optimizers

In [18]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [19]:
def read_text(filename):
        file = open(filename, mode='rt', encoding='utf-8')
        
        text = file.read().strip().split('\n')
        text = [i.split('\t') for i in text]
        file.close()
        return text

In [20]:
translations = read_text("/content/drive/MyDrive/training/rus.txt")
translations = array(translations)

translations = translations[:50000,:] # translations

translations[:,0] = [s.translate(str.maketrans('', '', string.punctuation)) for s in translations[:,0]]
translations[:,1] = [s.translate(str.maketrans('', '', string.punctuation)) for s in translations[:,1]]

translations

array([['Go', 'Марш',
        'CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #1159202 (shanghainese)'],
       ['Go', 'Иди',
        'CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #5898247 (marafon)'],
       ['Go', 'Идите',
        'CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #5898250 (marafon)'],
       ...,
       ['Can I open my eyes', 'Можно открывать глаза',
        'CC-BY 2.0 (France) Attribution: tatoeba.org #4570971 (Hybrid) & #5539729 (marafon)'],
       ['Can I open my eyes', 'Я могу открыть глаза',
        'CC-BY 2.0 (France) Attribution: tatoeba.org #4570971 (Hybrid) & #5539730 (marafon)'],
       ['Can I say it aloud', 'Можно я скажу это вслух',
        'CC-BY 2.0 (France) Attribution: tatoeba.org #3730370 (CM) & #4132180 (marafon)']],
      dtype='<U537')

In [21]:
for i in range(len(translations)):
    translations[i,0] = translations[i,0].lower()
    translations[i,1] = translations[i,1].lower()

In [22]:
def tokenization(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

en_tokenizer = tokenization(translations[:, 0])
en_vocab_size = len(en_tokenizer.word_index) + 1

en_len = 8

ru_tokenizer = tokenization(translations[:, 1])
ru_vocab_size = len(ru_tokenizer.word_index) + 1

ru_len = 8

In [10]:
def encode_sequences(tokenizer, length, lines):
    seq = tokenizer.texts_to_sequences(lines)
    seq = pad_sequences(seq, maxlen=length, padding='post')
    return seq

In [24]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(translations, test_size=0.2, random_state = 12)

trainX = encode_sequences(ru_tokenizer, ru_len, train[:, 1])
trainY = encode_sequences(en_tokenizer, en_len, train[:, 0])

testX = encode_sequences(ru_tokenizer, ru_len, test[:, 1])
testY = encode_sequences(en_tokenizer, en_len, test[:, 0])

In [63]:
def define_model(in_vocab,out_vocab, in_timesteps,out_timesteps,units):
    model = Sequential()
    model.add(Embedding(in_vocab, units, input_length=in_timesteps, mask_zero=True))
    model.add(LSTM(units))
    model.add(RepeatVector(out_timesteps))
    model.add(LSTM(units, return_sequences=True))
    model.add(Dense(out_vocab, activation='softmax'))
    return model

model = define_model(ru_vocab_size, en_vocab_size, ru_len, en_len, 512)

rms = optimizers.RMSprop(learning_rate=0.001)
model.compile(optimizer=rms, loss='sparse_categorical_crossentropy')

In [64]:
filename = 'model'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

history = model.fit(trainX, trainY.reshape(trainY.shape[0], trainY.shape[1], 1),
                    epochs=16, batch_size=512, validation_split = 0.2,callbacks=[checkpoint], 
                    verbose=1)

Epoch 1/16
Epoch 1: val_loss improved from inf to 2.56352, saving model to model




Epoch 2/16
Epoch 2: val_loss improved from 2.56352 to 2.40105, saving model to model




Epoch 3/16
Epoch 3: val_loss improved from 2.40105 to 2.23683, saving model to model




Epoch 4/16
Epoch 4: val_loss improved from 2.23683 to 2.08561, saving model to model




Epoch 5/16
Epoch 5: val_loss improved from 2.08561 to 1.95703, saving model to model




Epoch 6/16
Epoch 6: val_loss improved from 1.95703 to 1.86727, saving model to model




Epoch 7/16
Epoch 7: val_loss improved from 1.86727 to 1.78641, saving model to model




Epoch 8/16
Epoch 8: val_loss improved from 1.78641 to 1.72139, saving model to model




Epoch 9/16
Epoch 9: val_loss improved from 1.72139 to 1.62271, saving model to model




Epoch 10/16
Epoch 10: val_loss improved from 1.62271 to 1.57468, saving model to model




Epoch 11/16
Epoch 11: val_loss improved from 1.57468 to 1.52592, saving model to model




Epoch 12/16
Epoch 12: val_loss improved from 1.52592 to 1.46832, saving model to model




Epoch 13/16
Epoch 13: val_loss improved from 1.46832 to 1.42751, saving model to model




Epoch 14/16
Epoch 14: val_loss improved from 1.42751 to 1.38395, saving model to model




Epoch 15/16
Epoch 15: val_loss improved from 1.38395 to 1.35550, saving model to model




Epoch 16/16
Epoch 16: val_loss improved from 1.35550 to 1.30591, saving model to model






In [12]:
model = load_model('model')
preds = argmax(model.predict(testX.reshape((testX.shape[0],testX.shape[1]))), axis=-1)



In [13]:
def get_word(n, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == n:
            return word
    return None

In [25]:
predicts = []
for i in preds:
    temp = []
    for j in range(len(i)):
        t = get_word(i[j], en_tokenizer)
        if j > 0:
            if (t == get_word(i[j-1], en_tokenizer)) or (t == None):
                temp.append('')
            else:
                temp.append(t)
        else:
            if(t == None):
                temp.append('')
            else:
                temp.append(t) 

    predicts.append(' '.join(temp))

In [26]:
pred_df = pd.DataFrame({'actual' : test[:,0], 'predicted' : predicts})
pred_df.head(15)

Unnamed: 0,actual,predicted
0,tom hated us all,tom miss us
1,i groaned,i swore
2,mary is mine,thats is weird
3,im not hurt,i didnt go
4,please stop there,please stop there
5,i loved the music,i need an
6,tom gave it to us,tom gave to us
7,im following you,ill go with you
8,thats not tom,its isnt tom
9,it has to be done,it must be
