In [64]:
import tensorflow as tf
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

queries = [
    'SELECT * FROM users WHERE age > 18;',
    'SELECT name, email FROM customers WHERE country = "USA";',
    'SELECT * FROM products WHERE price > 100;',
    'SELECT city, population FROM cities WHERE country = "Canada";',
    'SELECT * FROM orders WHERE date >= "2022-01-01";',
]

optimized_queries = [
    'SELECT * FROM users WHERE age > 18 ORDER BY name ASC;',
    'SELECT name, email FROM customers WHERE country = "USA" ORDER BY name;',
    'SELECT * FROM products WHERE price > 100 ORDER BY price DESC;',
    'SELECT city, population FROM cities WHERE country = "Canada" ORDER BY population DESC;',
    'SELECT * FROM orders WHERE date >= "2022-01-01" ORDER BY date ASC;',
]

all_texts = queries + optimized_queries

# Tokenization
tokenizer = Tokenizer(filters='')
tokenizer.fit_on_texts(all_texts)

# Vocabulary size
vocab_size = len(tokenizer.word_index) + 1

# Tokenize queries and optimized_queries
tokenized_queries = tokenizer.texts_to_sequences(queries)
tokenized_optimized_queries = tokenizer.texts_to_sequences(optimized_queries)

max_len = max(max(len(seq) for seq in tokenized_queries), max(len(seq) for seq in tokenized_optimized_queries))
padded_queries = pad_sequences(tokenized_queries, padding='post', maxlen=max_len)
padded_optimized_queries = pad_sequences(tokenized_optimized_queries, padding='post', maxlen=max_len)

input_data = tf.convert_to_tensor(padded_queries)
output_data = tf.convert_to_tensor(padded_optimized_queries)

print("Tokenized Queries:\n", tokenized_queries)
print("\nTokenized Optimized Queries:\n", tokenized_optimized_queries)
print("\nPadded Queries:\n", padded_queries)
print("\nPadded Optimized Queries:\n", padded_optimized_queries)
print("\nVocabulary Size:", vocab_size)
print("\nMax Sequence Length:", max_len)


Tokenized Queries:
 [[1, 4, 2, 13, 3, 14, 7, 25], [1, 15, 16, 2, 17, 3, 8, 9, 26], [1, 4, 2, 18, 3, 10, 7, 27], [1, 19, 11, 2, 20, 3, 8, 9, 28], [1, 4, 2, 21, 3, 12, 22, 29]]

Tokenized Optimized Queries:
 [[1, 4, 2, 13, 3, 14, 7, 30, 5, 6, 31, 23], [1, 15, 16, 2, 17, 3, 8, 9, 32, 5, 6, 33], [1, 4, 2, 18, 3, 10, 7, 34, 5, 6, 10, 24], [1, 19, 11, 2, 20, 3, 8, 9, 35, 5, 6, 11, 24], [1, 4, 2, 21, 3, 12, 22, 36, 5, 6, 12, 23]]

Padded Queries:
 [[ 1  4  2 13  3 14  7 25  0  0  0  0  0]
 [ 1 15 16  2 17  3  8  9 26  0  0  0  0]
 [ 1  4  2 18  3 10  7 27  0  0  0  0  0]
 [ 1 19 11  2 20  3  8  9 28  0  0  0  0]
 [ 1  4  2 21  3 12 22 29  0  0  0  0  0]]

Padded Optimized Queries:
 [[ 1  4  2 13  3 14  7 30  5  6 31 23  0]
 [ 1 15 16  2 17  3  8  9 32  5  6 33  0]
 [ 1  4  2 18  3 10  7 34  5  6 10 24  0]
 [ 1 19 11  2 20  3  8  9 35  5  6 11 24]
 [ 1  4  2 21  3 12 22 36  5  6 12 23  0]]

Vocabulary Size: 37

Max Sequence Length: 13


In [68]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense
from sklearn.model_selection import train_test_split

# Define the model architecture
def build_sequence_to_sequence_model(vocab_size, max_len):
    # Encoder
    encoder_inputs = Input(shape=(max_len,))
    encoder_embedding = Embedding(vocab_size, 64, input_length=max_len, mask_zero=True)(encoder_inputs)
    encoder_lstm, state_h, state_c = LSTM(64, return_state=True)(encoder_embedding)
    encoder_states = [state_h, state_c]

    # Decoder
    decoder_inputs = Input(shape=(max_len,))
    decoder_embedding = Embedding(vocab_size, 64, input_length=max_len, mask_zero=True)(decoder_inputs)
    decoder_lstm = LSTM(64, return_sequences=True, return_state=True)
    decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
    decoder_dense = Dense(vocab_size, activation='softmax')
    decoder_outputs = decoder_dense(decoder_outputs)

    # Model
    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
    return model

model = build_sequence_to_sequence_model(vocab_size, max_len)

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.summary()


input_train_np, input_test_np, output_train_np, output_test_np = train_test_split(
    input_data.numpy(), output_data.numpy(), test_size=0.2, random_state=42
)

# Convert NumPy arrays back to TensorFlow tensors
input_train = tf.convert_to_tensor(input_train_np)
input_test = tf.convert_to_tensor(input_test_np)
output_train = tf.convert_to_tensor(output_train_np)
output_test = tf.convert_to_tensor(output_test_np)

# Train the model
model.fit([input_train, input_train], output_train, epochs=50, batch_size=1, validation_data=([input_test, input_test], output_test))

evaluation_result = model.evaluate([input_test, input_test], output_test, batch_size=1)

print("Evaluation Loss:", evaluation_result[0])
print("Evaluation Accuracy:", evaluation_result[1])

predictions = model.predict([input_test, input_test], batch_size=1)

for i in range(min(3, len(predictions))):
    print("\nExample", i + 1)
    print("Predicted Sequence:", predictions[i])
    print("Actual Output:", output_test[i])


Model: "model_28"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_57 (InputLayer)       [(None, 13)]                 0         []                            
                                                                                                  
 input_58 (InputLayer)       [(None, 13)]                 0         []                            
                                                                                                  
 embedding_56 (Embedding)    (None, 13, 64)               2368      ['input_57[0][0]']            
                                                                                                  
 embedding_57 (Embedding)    (None, 13, 64)               2368      ['input_58[0][0]']            
                                                                                           

In [72]:
import numpy as np

def predict_optimized_query(new_query, tokenizer, model, max_len):
    tokenized_new_query = tokenizer.texts_to_sequences([new_query])
    padded_new_query = pad_sequences(tokenized_new_query, padding='post', maxlen=max_len)

    prediction = model.predict([padded_new_query, padded_new_query], batch_size=1)

    predicted_sequence = np.argmax(prediction, axis=-1)[0]
    predicted_query = tokenizer.sequences_to_texts([predicted_sequence])[0]

    return predicted_query

new_query = 'SELECT age, name FROM users WHERE country = "France";'
predicted_optimized_query = predict_optimized_query(new_query, tokenizer, model, max_len)

print("Input Query:", new_query)
print("Predicted Optimized Query:", predicted_optimized_query)


Input Query: SELECT age, name FROM users WHERE country = "France";
Predicted Optimized Query: select * from where where country "canada" "canada" "canada" "canada" "canada" "canada" "canada"
