In [1]:
import os 

import numpy as np
import pandas as pd

from tensorflow import keras

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical


In [2]:
def generate_target(seq: list, stop_val=0):
    """Build teacher-forcing targets by shifting every sequence one step left.

    Each target drops the first token of its source sequence and appends
    ``stop_val``, so a target has the same length as its source.

    Args:
        seq: Collection of token-id sequences, e.g. the output of
            ``Tokenizer.texts_to_sequences``.
        stop_val: Token id appended at the end of every target.

    Returns:
        A new list of lists; the input sequences are not modified.
    """
    # Slicing replaces the manual index loops; list() keeps every target a
    # list even when an inner sequence is a tuple or an array.
    return [list(tokens[1:]) + [stop_val] for tokens in seq]

In [3]:
def find_max_list(list):
    """Return the length of the longest item in ``list``.

    Args:
        list: Collection of sized items. The parameter name shadows the
            builtin; it is kept so existing keyword callers keep working.

    Returns:
        The largest ``len`` over the items, or 0 when there are no items.
    """
    # A generator avoids the intermediate list; default=0 prevents max() from
    # raising ValueError on an empty input.
    return max((len(item) for item in list), default=0)

In [4]:
# Load the irregular-verb table, then wrap every cell in the start ('\t') and
# stop ('\n') markers.
raw_verbs = pd.read_csv('../data/irr_verb_list.csv')


def _add_markers(column):
    """Prefix each value of a column with '\t' and suffix it with '\n'."""
    return '\t' + column + '\n'


dataset = raw_verbs.apply(_add_markers)
dataset.head()

Unnamed: 0,infinitive,past simple,past participle
0,\tabide\n,\tabode\n,\tabode\n
1,\tarise\n,\tarose\n,\tarisen\n
2,\tawake\n,\tawoke\n,\tawoken\n
3,\tbe\n,\twas\n,\tbeen\n
4,\tbear\n,\tbore\n,\tborne\n


In [5]:
# Fit one character-level tokenizer on all three verb forms so they share a
# single vocabulary. The column order is kept fixed on purpose.
verb_columns = ('infinitive', 'past simple', 'past participle')
corpus = [word for column in verb_columns for word in dataset[column].to_list()]

tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(corpus)

# One extra class for index 0, which generate_target uses as its default
# stop value.
num_classes = 1 + len(tokenizer.word_index)

In [6]:
# Encode the infinitives and the past-simple forms as integer sequences.
inf_seq, sp_seq = (
    tokenizer.texts_to_sequences(dataset[column].to_list())
    for column in ('infinitive', 'past simple')
)

# Targets: the past-simple sequences shifted one step to the left.
t_sp_seq = generate_target(sp_seq)

In [9]:
# Inspect the first encoded past-simple sequence.
sp_seq[0]

[1, 10, 18, 6, 9, 3, 2]

In [10]:
# Inspect the matching target: the same sequence shifted left by one, with the
# stop value appended.
t_sp_seq[0]

[10, 18, 6, 9, 3, 2, 0]

In [11]:
# Number of rows in the dataset; used later to slice the stacked array back
# into its three groups.
num_data = len(dataset)

In [12]:
# Display the row count.
num_data

186

In [13]:
# Pad to the longer of the two maximum sequence lengths.
length_inf, length_sp = find_max_list(inf_seq), find_max_list(sp_seq)
max_pad = max(length_inf, length_sp)

In [14]:
# Stack the three groups in a fixed order so they can be padded in one call
# and sliced apart again by position.
all_data = [*inf_seq, *sp_seq, *t_sp_seq]

In [15]:
# Pad every sequence at the end ('post') up to max_pad tokens.
padded = pad_sequences(all_data, maxlen=max_pad, padding='post')
# One-hot encode each token id over num_classes classes.
categorized = to_categorical(padded, num_classes=num_classes)

In [16]:
# Undo the stacking: rows [0, n) are the infinitives, [n, 2n) the past-simple
# inputs and [2n, ...) the shifted targets, where n = num_data.
encoder_end, decoder_end = num_data, 2 * num_data
train = categorized[:encoder_end]
y_sp = categorized[encoder_end:decoder_end]
t_y_sp = categorized[decoder_end:]

In [17]:
# Dimensions of the one-hot tensors; the last axis is the vocabulary size.
num_samples, input_dim, num_token = train.shape
output_dim = y_sp.shape[1]
num_classes = num_token

# Token ids of the start ('\t') and stop ('\n') markers.
start_index, stop_index = (
    tokenizer.texts_to_sequences([marker])[0][0] for marker in ('\t', '\n')
)

# Number of units in both recurrent layers.
rnn_unit = 32

In [None]:
# Training model: an encoder RNN reads the one-hot infinitive and its final
# state initialises a decoder RNN that predicts the next character at every
# step of the past-simple form.
encoder_inputs = keras.Input(shape=(None, num_token), name='input')

# return_state=True makes the layer return its final state next to its output.
encoder = keras.layers.SimpleRNN(units=rnn_unit, return_state=True, name='encoder')

# Only the final state is kept; the encoder output itself is discarded.
_, encoder_states = encoder(encoder_inputs)

decoder_inputs = keras.Input(shape=(None, num_token), name='dec_input')

# return_sequences=True yields one output per time step. The decoder state is
# ignored here but is needed later by the inference cell, which re-uses this
# layer.
decoder = keras.layers.SimpleRNN(units=rnn_unit, return_state=True, return_sequences=True, name='decoder')
decoder_outputs, _ = decoder(decoder_inputs, initial_state=encoder_states)

# Softmax over the vocabulary for every decoder time step.
decoder_dense = keras.layers.Dense(num_token, activation='softmax', name='dec_dense')
decoder_outputs = decoder_dense(decoder_outputs)

# Inputs are ordered [encoder, decoder]; later cells rely on that order via
# model.input[0] / model.input[1].
model = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [None]:
# Print the layer table of the freshly built training model.
model.summary()

In [None]:
# Training hyperparameters passed to model.fit below.
batch_size = 128
epochs = 400

In [None]:
# Configure training: RMSprop optimiser, per-step categorical cross-entropy.
model.compile(
    optimizer="rmsprop",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

# Teacher forcing: the decoder is fed the past-simple sequence (y_sp) and is
# trained to output the same sequence shifted by one step (t_y_sp).
model.fit(
    x=[train, y_sp],
    y=t_y_sp,
    batch_size=batch_size,
    epochs=epochs,
    verbose=2,
)

# Persist the trained model under the path that the next cell loads from.
model.save('s2s')

In [None]:
# Reload the model from the path written by the training cell.
model = keras.models.load_model('s2s')

In [None]:
# Print the layer table of the reloaded model.
model.summary()

In [None]:
# Split the trained model into a stand-alone encoder and a single-step
# decoder for inference.
#
# Layers are looked up by the names assigned when the model was built
# ('encoder', 'decoder', 'dec_dense') rather than by position in
# model.layers, so the wiring does not depend on how Keras orders layers.

# Encoder: one-hot infinitive -> final recurrent state.
encoder_inputs = model.input[0]
encoder_outputs, encoder_states = model.get_layer('encoder').output
encoder_model = keras.Model(encoder_inputs, encoder_states)

# Decoder: (previous token, previous state) -> (token probabilities, new state).
decoder_inputs = model.input[1]
# Explicit state input so the state can be fed back in at every decoding step.
decoder_states_inputs = keras.Input(shape=(rnn_unit,))

decoder = model.get_layer('decoder')
decoder_outputs, decoder_states = decoder(
    decoder_inputs, initial_state=decoder_states_inputs
)

decoder_dense = model.get_layer('dec_dense')
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = keras.Model(
    [decoder_inputs, decoder_states_inputs], [decoder_outputs, decoder_states]
)

In [25]:
# Infinitive to run through the model, without start/stop markers.
input_seq = 'abide'

In [26]:
# Wrap the verb in the start ('\t') and stop ('\n') markers. Any markers that
# are already present are stripped first, so re-running this cell does not
# stack additional markers onto the string.
input_seq = '\t' + input_seq.strip('\t\n') + '\n'

In [33]:
# Encode the sample as a batch containing one integer sequence.
sample_seq = tokenizer.texts_to_sequences([input_seq])

In [34]:
# Inspect the encoded sample.
sample_seq

[[1, 10, 18, 13, 9, 3, 2]]

In [29]:

# Pad the sample with the same settings as the training data.
# NOTE(review): sample_seq is rebound in place, so this cell must run exactly
# once after the tokenising cell.
sample_seq = pad_sequences(sample_seq, maxlen=max_pad, padding='post')

In [30]:

# One-hot encode the padded sample with the same class count as the training
# data.
# NOTE(review): sample_seq is rebound in place, so this cell must run exactly
# once after the padding cell.
sample_seq = to_categorical(sample_seq, num_classes=num_classes)

In [32]:
# Display the number of one-hot classes.
num_classes

27

In [31]:
# Sanity check: the hand-encoded sample should equal the first training row,
# because 'abide' is the first infinitive in the dataset.
(sample_seq == train[0]).all()

True

In [22]:
# Shape of the encoded sample, including its leading batch axis.
sample_seq.shape

(1, 12, 27)

In [24]:
# Shape of a single training row, for comparison with the sample.
train[0].shape

(12, 27)

In [None]:
# Run the encoder on the sample; its output is the initial decoder state.
states_value = encoder_model.predict(sample_seq)

In [None]:
# Greedy decoding: feed the start marker, then keep feeding the model's own
# most likely token back in until it emits the stop marker.
target_seq = np.zeros((1, 1, num_token))
target_seq[0, 0, start_index] = 1.0

# Work on a separate name so the encoder state in `states_value` is not
# overwritten; the cell can then be re-run without re-running the encoder.
decoder_state = states_value

is_finished = False
generated_idx = []

while not is_finished:
    # verbose=0 silences the progress bar that predict() would otherwise
    # print on every decoding step.
    dec_out, decoder_state = decoder_model.predict(
        [target_seq, decoder_state], verbose=0
    )
    # One time step is decoded per call, so take the arg-max over its
    # vocabulary axis; int() stores plain Python ints instead of numpy scalars.
    out_idx = int(np.argmax(dec_out[0, -1, :]))
    generated_idx.append(out_idx)

    # Stop on the stop marker, or once the output is longer than the padded
    # training length (guards against a model that never emits the marker).
    is_finished = out_idx == stop_index or len(generated_idx) > max_pad

    # One-hot encode the chosen token as the next decoder input.
    target_seq = np.zeros((1, 1, num_token))
    target_seq[0, 0, out_idx] = 1.0
    

In [None]:
# Map the generated token ids back to characters.
tokenizer.sequences_to_texts([generated_idx])

In [None]:
# Display the raw generated token ids.
generated_idx