In [4]:
import json
import os
import sqlite3
from pathlib import Path
# --- Configuration ---------------------------------------------------------
# JSON dataset read by create_sql_queries; each item carries 'db_id',
# 'question_id', 'Original SQL' and 'Generated SQL'.
DATASET_PATH = "../data/processed/generated_sql.json"
# Backward-compatible alias: the misspelled name is still used by existing cells.
DATSET_PATH = DATASET_PATH

# Directory where your database subfolders are stored
# (one sub-folder per db_id, holding "<db_id>.sqlite").
DATABASE_DIR = "../dataset/dev_databases/"

# Files the flattened query results are written to.
ORIGINAL_OUTPUT_PATH = "dataset_query_results.json"
GENERATED_OUTPUT_PATH = "test_query_results.json"

# Number of dataset items to process: None processes every item, an integer N
# keeps only the first N items.
TEST_LIMIT = None
def _open_readonly(db_path):
    """Open the SQLite file at ``db_path`` in read-only mode and return the connection."""
    # mode=ro is only available through the URI form; as_uri() percent-encodes
    # characters such as '?' or '#' that would otherwise be parsed as URI syntax.
    return sqlite3.connect(Path(db_path).resolve().as_uri() + "?mode=ro", uri=True)


def _run_flat(cursor, query):
    """Execute ``query`` on ``cursor`` and return every cell as one flat, row-major list."""
    cursor.execute(query)
    return [cell for row in cursor.fetchall() for cell in row]


def _json_fallback(value):
    """Convert a value ``json`` cannot encode natively (e.g. a BLOB returned as bytes)."""
    if isinstance(value, (bytes, bytearray)):
        return bytes(value).hex()
    return str(value)


def create_sql_queries(dataset_path: str = None, original_output_file_path: str = None,
                       generated_output_file_path: str = None,
                       database_dir: str = None, test_limit: int = None):
    """Execute the original and generated SQL of every dataset item and save the results.

    For each item the database ``<database_dir>/<db_id>/<db_id>.sqlite`` is opened
    read-only, so a generated statement cannot modify the database file, and both
    queries are executed on it. Two parallel JSON arrays are written, one entry
    per dataset item:

    * ``[[v1, v2, ...]]`` -- the query ran; all result cells flattened row by row
      and wrapped in a one-element list (``[[]]`` when it returned no rows);
    * ``[]`` -- the database file was missing or the query raised an error.

    Args:
        dataset_path: JSON file holding a list of items with the keys ``db_id``,
            ``question_id``, ``Original SQL`` and ``Generated SQL``.
        original_output_file_path: Destination for the results of ``Original SQL``.
        generated_output_file_path: Destination for the results of ``Generated SQL``.
        database_dir: Root directory of the database sub-folders. ``None`` falls
            back to the module-level ``DATABASE_DIR``.
        test_limit: Process only the first ``test_limit`` items. ``None`` falls
            back to the module-level ``TEST_LIMIT`` (where ``None`` means no limit).

    Returns:
        None. Progress and per-query errors are printed; results go to the two files.

    Raises:
        KeyError: If a dataset item lacks one of the required keys.
        OSError, json.JSONDecodeError: If the dataset cannot be read or parsed.
    """
    if database_dir is None:
        database_dir = DATABASE_DIR
    if test_limit is None:
        test_limit = TEST_LIMIT

    # Read the JSON file containing the SQL queries
    with open(dataset_path, 'r', encoding='utf-8') as json_file:
        data = json.load(json_file)

    # Limit the number of items if a limit is set
    if test_limit is not None:
        data = data[:test_limit]

    original_query_results = []
    generated_query_results = []

    for item in data:
        db_id = item['db_id']
        question_id = item['question_id']
        # (label used in messages, SQL text, list receiving the outcome)
        jobs = (
            ("Original", item['Original SQL'], original_query_results),
            ("Generated", item['Generated SQL'], generated_query_results),
        )

        db_path = os.path.join(database_dir, db_id, f"{db_id}.sqlite")
        if not os.path.exists(db_path):
            print(f"Database file '{db_path}' not found for question {question_id}")
            for _, _, sink in jobs:
                sink.append([])
            continue

        conn = _open_readonly(db_path)
        try:
            cursor = conn.cursor()
            for label, query, sink in jobs:
                try:
                    flat_result = _run_flat(cursor, query)
                except Exception as e:
                    # Best effort: a failing query must not abort the whole run.
                    print(f"Error executing {label} query for question {question_id}: {e}")
                    sink.append([])
                else:
                    print(f"{label} Query executed successfully for question {question_id}.")
                    # Wrapped in a list so a successful empty result ([[]]) stays
                    # distinguishable from a failure ([]).
                    sink.append([flat_result])
        finally:
            # Always release the connection, whichever query failed.
            conn.close()

    # Save all query results (array of arrays) to JSON files
    with open(original_output_file_path, 'w', encoding='utf-8') as outfile:
        json.dump(original_query_results, outfile, indent=4, default=_json_fallback)
    with open(generated_output_file_path, 'w', encoding='utf-8') as outfile:
        json.dump(generated_query_results, outfile, indent=4, default=_json_fallback)

    print(f"All results saved to {original_output_file_path} and {generated_output_file_path}!")

In [5]:
# Execute both query sets for every dataset item and write the two result files.
create_sql_queries(
    dataset_path=DATSET_PATH,
    original_output_file_path=ORIGINAL_OUTPUT_PATH,
    generated_output_file_path=GENERATED_OUTPUT_PATH,
)

Original Query executed successfully for question 205.
Generated Query executed successfully for question 205.
Original Query executed successfully for question 52.
Generated Query executed successfully for question 52.
Original Query executed successfully for question 1374.
Error executing Generated query for question 1374: no such column: T2.phone
Original Query executed successfully for question 1194.
Generated Query executed successfully for question 1194.
Original Query executed successfully for question 421.
Generated Query executed successfully for question 421.
Original Query executed successfully for question 699.
Error executing Generated query for question 699: no such table: accounts
Original Query executed successfully for question 939.
Error executing Generated query for question 939: no such column: T1.driverStandingsId
Original Query executed successfully for question 919.
Generated Query executed successfully for question 919.
Original Query executed successfully for q

In [6]:
def compare_arrays(array_path1=None, array_path2=None, count_failed_as_match=False):
    """Compare two JSON result arrays entry by entry and count the matching positions.

    Both files must contain JSON arrays of equal length. Entries are compared
    with ``==``, so order inside an entry matters. An empty entry (``[]``) is
    what the query-execution step stores when a database was missing or a query
    raised an error; a position where both sides are ``[]`` is therefore a
    double failure, not an agreement, and is not counted unless
    ``count_failed_as_match`` is true. A successful query without rows is
    stored as ``[[]]`` and still counts as a match.

    Args:
        array_path1: Path to the first JSON array file.
        array_path2: Path to the second JSON array file.
        count_failed_as_match: When true, positions where both entries are
            ``[]`` are counted as matches as well.

    Returns:
        Tuple ``(matching_entries, total_entries)``.

    Raises:
        ValueError: If the two arrays have different lengths.
    """
    with open(array_path1, encoding="utf-8") as f:
        array1 = json.load(f)

    with open(array_path2, encoding="utf-8") as f:
        array2 = json.load(f)

    if len(array1) != len(array2):
        raise ValueError("Both arrays should have the same number of entries")

    total_entries = len(array1)
    matching_entries = 0

    for entry1, entry2 in zip(array1, array2):
        if entry1 != entry2:
            continue
        # Equal and both empty means neither side produced a result.
        if entry1 == [] and not count_failed_as_match:
            continue
        matching_entries += 1

    return matching_entries, total_entries

# Use the configured output paths so the comparison reads exactly the files
# that create_sql_queries was asked to write.
path1 = ORIGINAL_OUTPUT_PATH
path2 = GENERATED_OUTPUT_PATH
# Compare the two arrays and print the result
matching, total = compare_arrays(path1, path2)

print(f"Matching entries: {matching}/{total}")

Matching entries: 79/307
