In [1]:
import os 
from tensorflow import keras

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical

import numpy as np
import pandas as pd

In [2]:
def generate_target(seq: list, stop_val=0):
    """Build decoder targets by shifting every sequence one step to the left.

    For each sequence the first element is dropped and ``stop_val`` is
    appended, so the result has the same length as the input sequence
    (an empty sequence yields ``[stop_val]``).

    Args:
        seq: Iterable of sliceable sequences (e.g. a list of lists of token ids).
        stop_val: Value appended at the end of every shifted sequence.

    Returns:
        A new list of lists; the input is not modified.
    """
    # Slicing copies the tail, so the caller's sequences stay untouched.
    return [list(item[1:]) + [stop_val] for item in seq]

In [3]:
def find_max_list(list):
    """Return the length of the longest element of ``list``.

    Args:
        list: Iterable of sized objects (e.g. a list of token-id lists).
            The parameter name shadows the builtin; it is kept so existing
            keyword callers continue to work.

    Returns:
        The maximum ``len()`` over the elements, or 0 when there are none
        (instead of raising ``ValueError`` from ``max()`` on empty input).
    """
    return max((len(item) for item in list), default=0)

In [4]:
# Load the irregular-verb table and wrap every entry in a start marker ('\t')
# and a stop marker ('\n'); the wrapping is applied column by column.
dataset = pd.read_csv('irr_verb_list.csv').apply(
    lambda column: '\t' + column + '\n'
)
dataset.head()

Unnamed: 0,infinitive,past simple,past participle
0,\tabide\n,\tabode\n,\tabode\n
1,\tarise\n,\tarose\n,\tarisen\n
2,\tawake\n,\tawoke\n,\tawoken\n
3,\tbe\n,\twas\n,\tbeen\n
4,\tbear\n,\tbore\n,\tborne\n


In [5]:
# Fit a single character-level vocabulary on all three verb forms.
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts([
    word
    for column in ('infinitive', 'past simple', 'past participle')
    for word in dataset[column].to_list()
])

# One class per known character, plus one extra slot.
num_classes = 1 + len(tokenizer.word_index)

In [6]:
# Encode the infinitive (encoder input) and past simple (decoder input)
# columns as sequences of character ids.
inf_seq, sp_seq = (
    tokenizer.texts_to_sequences(dataset[column].to_list())
    for column in ('infinitive', 'past simple')
)

# Decoder targets: the past-simple sequences shifted one step to the left.
t_sp_seq = generate_target(sp_seq)

In [7]:
# Number of rows (verbs) in the dataset.
num_data = dataset.shape[0]

In [8]:
# Longest encoded infinitive and longest encoded past-simple form.
length_inf = find_max_list(inf_seq)
length_sp = find_max_list(sp_seq)

# Common padded length: the larger of the two.
max_pad = max(length_inf, length_sp)

In [9]:
# Stack encoder inputs, decoder inputs and decoder targets (in that order)
# so they can be padded and one-hot encoded in a single pass.
all_data = [*inf_seq, *sp_seq, *t_sp_seq]

In [10]:
# Pad every sequence at the end up to max_pad, then one-hot encode the ids.
padded = pad_sequences(
    all_data,
    padding='post',
    maxlen=max_pad,
)
categorized = to_categorical(padded, num_classes=num_classes)

In [11]:
# Undo the stacking: first num_data rows are encoder inputs, the next
# num_data rows are decoder inputs, and the remainder are decoder targets.
train, y_sp, t_y_sp = np.split(categorized, [num_data, 2 * num_data])

In [95]:
# Sizes along the first two axes of the one-hot encoded arrays.
num_samples, input_dim = train.shape[:2]
output_dim = y_sp.shape[1]

# Size of the last (one-hot) axis; both names hold the same value.
num_token = num_classes = train.shape[-1]

# Token ids of the start ('\t') and stop ('\n') markers.
start_index, stop_index = (
    ids[0] for ids in tokenizer.texts_to_sequences(['\t', '\n'])
)

# Number of units in each SimpleRNN layer.
rnn_unit = 32

In [105]:
# Encoder-decoder (seq2seq) training model built from two SimpleRNN layers.
# NOTE: a later cell retrieves these layers by their position in
# model.layers, so keep the construction order below unchanged.

# Encoder input: variable-length sequences of one-hot vectors (num_token wide).
encoder_inputs = keras.Input(shape=(None, num_token), name='input')

# return_state=True makes the layer also return its final hidden state.
encoder = keras.layers.SimpleRNN(units=rnn_unit, return_state=True, name='encoder')

# Only the final state is kept; the encoder's output itself is discarded.
_, encoder_states = encoder(encoder_inputs)

# Decoder input: same one-hot format as the encoder input.
decoder_inputs = keras.Input(shape=(None, num_token), name='dec_input')

# The decoder returns an output for every time step and starts from the
# encoder's final state.
decoder = keras.layers.SimpleRNN(units=rnn_unit, return_state=True, return_sequences=True, name='decoder')
decoder_outputs, _ = decoder(decoder_inputs, initial_state=encoder_states)

# Softmax over the num_token classes at every decoder time step.
decoder_dense = keras.layers.Dense(num_token, activation='softmax', name='dec_dense')
decoder_outputs = decoder_dense(decoder_outputs)

# Training model: (encoder input, decoder input) -> per-step class probabilities.
model = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [106]:
# Print the layer-by-layer overview of the freshly built training model.
model.summary()

Model: "model_10"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input (InputLayer)             [(None, None, 27)]   0           []                               
                                                                                                  
 dec_input (InputLayer)         [(None, None, 27)]   0           []                               
                                                                                                  
 encoder (SimpleRNN)            [(None, 32),         1920        ['input[0][0]']                  
                                 (None, 32)]                                                      
                                                                                                  
 decoder (SimpleRNN)            [(None, None, 32),   1920        ['dec_input[0][0]',       

In [107]:
# Training hyperparameters: samples per gradient step, passes over the data.
batch_size, epochs = 128, 400

In [108]:
# Configure training: RMSprop optimizer, categorical cross-entropy loss,
# accuracy reported as a metric.
model.compile(
    loss="categorical_crossentropy",
    optimizer="rmsprop",
    metrics=["accuracy"],
)

# The encoder receives the infinitives and the decoder the past-simple forms;
# the targets are the past-simple forms shifted one step to the left.
model.fit(
    x=[train, y_sp],
    y=t_y_sp,
    epochs=epochs,
    batch_size=batch_size,
    verbose=2,
)

# Persist the trained model under the path 's2s'.
model.save('s2s')

Epoch 1/400
2/2 - 1s - loss: 3.2632 - accuracy: 0.0524 - 1s/epoch - 603ms/step
Epoch 2/400
2/2 - 0s - loss: 3.0435 - accuracy: 0.1980 - 114ms/epoch - 57ms/step
Epoch 3/400
2/2 - 0s - loss: 2.8737 - accuracy: 0.3710 - 117ms/epoch - 58ms/step
Epoch 4/400
2/2 - 0s - loss: 2.7186 - accuracy: 0.5063 - 113ms/epoch - 56ms/step
Epoch 5/400
2/2 - 0s - loss: 2.5751 - accuracy: 0.5685 - 112ms/epoch - 56ms/step
Epoch 6/400
2/2 - 0s - loss: 2.4411 - accuracy: 0.5793 - 108ms/epoch - 54ms/step
Epoch 7/400
2/2 - 0s - loss: 2.3158 - accuracy: 0.5775 - 104ms/epoch - 52ms/step
Epoch 8/400
2/2 - 0s - loss: 2.1996 - accuracy: 0.5739 - 101ms/epoch - 51ms/step
Epoch 9/400
2/2 - 0s - loss: 2.0942 - accuracy: 0.5717 - 116ms/epoch - 58ms/step
Epoch 10/400
2/2 - 0s - loss: 1.9994 - accuracy: 0.5730 - 107ms/epoch - 54ms/step
Epoch 11/400
2/2 - 0s - loss: 1.9152 - accuracy: 0.5748 - 115ms/epoch - 57ms/step
Epoch 12/400
2/2 - 0s - loss: 1.8403 - accuracy: 0.5744 - 135ms/epoch - 68ms/step
Epoch 13/400
2/2 - 0s - los

In [109]:
# Reload the trained model from the 's2s' path, replacing the in-memory `model`.
model = keras.models.load_model('s2s')

In [110]:
# Print the overview of the reloaded model.
model.summary()

Model: "model_10"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input (InputLayer)             [(None, None, 27)]   0           []                               
                                                                                                  
 dec_input (InputLayer)         [(None, None, 27)]   0           []                               
                                                                                                  
 encoder (SimpleRNN)            [(None, 32),         1920        ['input[0][0]']                  
                                 (None, 32)]                                                      
                                                                                                  
 decoder (SimpleRNN)            [(None, None, 32),   1920        ['dec_input[0][0]',       

In [111]:
# Split the trained model into separate encoder and decoder models so the
# decoder can be run one step at a time during inference.
#
# Layers are fetched by the names they were given at construction time
# ('encoder', 'decoder', 'dec_dense') rather than by position in
# model.layers, so the lookup does not depend on Keras' layer ordering.

# Encoder model: one-hot input sequence -> final encoder state.
encoder_inputs = model.input[0]
encoder_outputs, encoder_states = model.get_layer('encoder').output
encoder_model = keras.Model(encoder_inputs, encoder_states)

# Decoder model inputs: the decoder tokens plus an explicit initial state,
# which replaces the direct connection to the encoder used during training.
decoder_inputs = model.input[1]
decoder_states_inputs = keras.Input(shape=(rnn_unit,))

# Reuse the trained decoder RNN, now driven by the externally supplied state.
decoder = model.get_layer('decoder')
decoder_outputs, decoder_states = decoder(
    decoder_inputs, initial_state=decoder_states_inputs
)

# Reuse the trained softmax layer on top of the decoder outputs.
decoder_dense = model.get_layer('dec_dense')
decoder_outputs = decoder_dense(decoder_outputs)

# Decoder model: (tokens, state) -> (class probabilities, new state).
decoder_model = keras.Model(
    [decoder_inputs, decoder_states_inputs], [decoder_outputs, decoder_states]
)

In [227]:
# Verb (infinitive) to run through the trained model.
input_seq = 'drip'

In [228]:
# Wrap the word in the start ('\t') and stop ('\n') markers.
# Existing markers are stripped first so that re-running this cell does not
# stack additional markers onto an already wrapped string.
input_seq = '\t' + input_seq.strip('\t\n') + '\n'

# Encode the word as character ids, pad at the end to max_pad and one-hot
# encode the result.
sample_seq = tokenizer.texts_to_sequences([input_seq])
sample_seq = pad_sequences(sample_seq, maxlen=max_pad, padding='post')
sample_seq = to_categorical(sample_seq, num_classes=num_classes)

In [229]:
# Check whether the encoded sample is identical to the first training example.
np.all(sample_seq == train[0])

False

In [230]:
# Run the encoder on the one-hot sample; its output is the final encoder
# state, used as the decoder's initial state.
states_value = encoder_model.predict(sample_seq)



In [231]:
# Greedy decoding: feed the decoder one token at a time, always continuing
# with its most probable prediction.

# First decoder input: the one-hot encoded start token.
target_seq = np.zeros((1, 1, num_token))
target_seq[0, 0, start_index] = 1.0

# Work on a separate name so `states_value` (the encoder state) is left
# untouched; re-running this cell then always starts from the encoder state
# instead of the decoder state left over from a previous run.
decoder_state = states_value

is_finished = False
generated_idx = []

while not is_finished:
    dec_out, decoder_state = decoder_model.predict([target_seq, decoder_state])

    # dec_out holds one time step for one sample; take its most probable
    # class and store it as a plain Python int.
    out_idx = int(np.argmax(dec_out[0, -1, :]))
    generated_idx.append(out_idx)

    # Stop on the stop token, or once more than max_pad tokens were produced
    # (guards against a model that never emits the stop token).
    is_finished = out_idx == stop_index or len(generated_idx) > max_pad

    # Next decoder input: the one-hot encoding of the token just predicted.
    target_seq = np.zeros((1, 1, num_token))
    target_seq[0, 0, out_idx] = 1.0
    



In [232]:
# Map the generated token ids back to text.
tokenizer.sequences_to_texts([generated_idx])

['d r o v e \n']

In [167]:
# Show the raw generated token ids.
# NOTE(review): the saved output of this cell carries execution count 167,
# lower than the decoding cells above, so it likely stems from an earlier run
# and may not match the text shown in the previous cell - re-run to confirm.
generated_idx

[11, 13, 9, 2]