In [None]:
import json

def load_data(file_path):
    """Load (input, target) utterance pairs from a JSON dialogue file.

    The file must contain a JSON list of items, each with an ``'utterances'``
    list whose first two entries have the form ``"<speaker>: <text>"``. The
    speaker prefix is removed; the first utterance becomes the input text and
    the second the target text.

    Args:
        file_path: Path to the JSON file (read as UTF-8).

    Returns:
        A tuple ``(input_texts, target_texts)`` of two parallel lists of str.

    Raises:
        KeyError: If an item has no ``'utterances'`` key.
        IndexError: If an item has fewer than two utterances, or an utterance
            contains no ``": "`` separator.
    """
    # JSON is UTF-8 by specification; do not rely on the platform default
    # encoding, which differs between operating systems.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    input_texts, target_texts = [], []
    for item in data:
        utterances = item['utterances']
        # maxsplit=1: only the FIRST ": " separates speaker from text. Without
        # it, a message such as "User: note: call me" was truncated to "note".
        input_text = utterances[0].split(": ", 1)[1]
        target_text = utterances[1].split(": ", 1)[1]
        # Append only after both sides parsed so the two lists stay aligned.
        input_texts.append(input_text)
        target_texts.append(target_text)

    return input_texts, target_texts

# Load each dataset split. load_data returns two parallel lists:
# the input utterances and the corresponding target utterances.
train_inputs, train_targets = load_data('english-train.json')
dev_inputs, dev_targets = load_data('english-dev.json')
test_inputs, test_targets = load_data('english-test.json')


In [None]:
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences

def tokenize_data(input_texts, target_texts):
    """Fit one shared tokenizer and turn both text lists into padded id arrays.

    The tokenizer's vocabulary is built from the inputs and targets together.
    Each list is then encoded to integer sequences and post-padded to the
    length of its own longest sequence.

    Args:
        input_texts: List of source strings.
        target_texts: List of target strings.

    Returns:
        A 5-tuple ``(tokenizer, input_sequences, target_sequences,
        max_input_length, max_target_length)``.
    """
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(input_texts + target_texts)

    def encode_and_pad(texts):
        # Encode, measure the longest sequence, then pad everything to it.
        encoded = tokenizer.texts_to_sequences(texts)
        longest = max(map(len, encoded))
        return pad_sequences(encoded, maxlen=longest, padding='post'), longest

    padded_inputs, max_input_length = encode_and_pad(input_texts)
    padded_targets, max_target_length = encode_and_pad(target_texts)

    return tokenizer, padded_inputs, padded_targets, max_input_length, max_target_length

# Tokenize the training split only: the tokenizer is fitted on the training
# inputs and targets, and the returned maximum lengths are the padded widths
# of the two training arrays.
tokenizer, train_input_sequences, train_target_sequences, max_input_length, max_target_length = tokenize_data(train_inputs, train_targets)

In [None]:
from keras.models import Model
from keras.layers import Input, LSTM, Dense, Embedding

def build_model(input_length, target_length, vocab_size, embedding_dim=256, latent_dim=256):
    """Assemble an (uncompiled) LSTM encoder-decoder model.

    The encoder embeds the source ids and runs an LSTM, keeping only its final
    hidden and cell states. The decoder embeds the target-side ids, runs a
    second LSTM initialised with those states, and projects every time step
    through a softmax layer of width ``vocab_size``.

    Args:
        input_length: Length of each encoder input sequence.
        target_length: Length of each decoder input sequence.
        vocab_size: Size of both embedding tables and of the output softmax.
        embedding_dim: Width of the embedding vectors.
        latent_dim: Number of units in each LSTM.

    Returns:
        A Keras ``Model`` taking ``[encoder_inputs, decoder_inputs]`` and
        producing the per-step softmax output.
    """
    # Encoder: only the final (h, c) states are kept; the output is discarded.
    source_ids = Input(shape=(input_length,))
    source_embedded = Embedding(vocab_size, embedding_dim)(source_ids)
    encoder = LSTM(latent_dim, return_state=True)
    _, final_h, final_c = encoder(source_embedded)

    # Decoder: starts from the encoder's final states and emits every step.
    target_ids = Input(shape=(target_length,))
    target_embedded = Embedding(vocab_size, embedding_dim)(target_ids)
    decoder = LSTM(latent_dim, return_sequences=True, return_state=True)
    hidden_steps, _, _ = decoder(target_embedded, initial_state=[final_h, final_c])
    projection = Dense(vocab_size, activation='softmax')
    token_probabilities = projection(hidden_steps)

    return Model([source_ids, target_ids], token_probabilities)

# The vocabulary size passed to the model is the number of tokenizer words plus one.
# NOTE(review): the +1 presumably reserves index 0 (used for padding), which
# does not appear in word_index — confirm against the Keras Tokenizer docs.
model = build_model(max_input_length, max_target_length, len(tokenizer.word_index) + 1)
# categorical_crossentropy expects one-hot targets (see the training cell, which
# one-hot encodes the target sequences before calling fit).
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')


In [None]:
import numpy as np
from keras.utils import to_categorical

def to_one_hot(sequences, num_classes):
    """One-hot encode a 2-D batch of integer class ids.

    Args:
        sequences: Array-like of shape ``(num_samples, seq_len)`` holding
            integer ids smaller than ``num_classes``.
        num_classes: Size of the one-hot (last) axis.

    Returns:
        A float32 ndarray of shape ``(num_samples, seq_len, num_classes)``
        with a single 1.0 per ``(sample, step)`` position.

    Raises:
        ValueError: If ``sequences`` is not 2-dimensional.
        IndexError: If an id is ``>= num_classes``.
    """
    ids = np.asarray(sequences, dtype=np.int64)
    if ids.ndim != 2:
        raise ValueError(
            f"sequences must be 2-D (num_samples, seq_len), got shape {ids.shape}"
        )

    num_samples, seq_len = ids.shape
    # float32 instead of NumPy's default float64: this array has
    # num_samples * seq_len * num_classes entries, so the dtype halves its
    # memory footprint.
    one_hot_targets = np.zeros((num_samples, seq_len, num_classes), dtype=np.float32)

    # Scatter all ones in a single vectorised assignment: the row and column
    # index grids broadcast against `ids` to address one cell per position.
    sample_idx = np.arange(num_samples)[:, None]
    step_idx = np.arange(seq_len)[None, :]
    one_hot_targets[sample_idx, step_idx, ids] = 1.0
    return one_hot_targets

# One-hot targets for categorical cross-entropy; the class count matches the
# vocab_size the model was built with.
train_target_one_hot = to_one_hot(train_target_sequences, len(tokenizer.word_index) + 1)

# Teacher forcing: the decoder input must lag the target by one time step.
# Feeding the target sequence itself as decoder input shows the decoder, at
# every position, the very token it is asked to predict, so it can minimise the
# loss by copying its input and never learns to use the encoder state.
# Position 0 is filled with id 0, which acts as an implicit start-of-sequence
# token (assuming pad_sequences' default fill value, it is also the padding
# id); at inference time decoding must likewise start from id 0.
train_decoder_inputs = np.zeros_like(train_target_sequences)
train_decoder_inputs[:, 1:] = train_target_sequences[:, :-1]

model.fit([train_input_sequences, train_decoder_inputs], train_target_one_hot, epochs=20, batch_size=64)
