## join_predicates

### join_predicates without table alias

In [None]:
import json
import os
import re
from typing import Dict, List
from collections import defaultdict

def load_table_dict(json_file: str = '0_table_dict.json') -> Dict:
    """Decode the table-dictionary JSON file and return its contents.

    Args:
        json_file: Path of the JSON file to read.

    Returns:
        The object decoded from the file. The callers in this cell use it as
        a lookup from a column-name prefix to a table name.
    """
    with open(json_file, 'r') as handle:
        table_lookup = json.load(handle)
    return table_lookup

def process_predicates(query_id: str):
    """Write the per-table predicate files for one query's training and testing parameters.

    Reads the table dictionary, the query's predicate template, and the
    ``params`` lists stored under ``query_id`` in the training and testing
    JSON files, then runs ``process_mode`` once per list.

    Args:
        query_id: Query identifier used in every input and output path.
    """
    def _params_from(path):
        # Both workload files keep their parameter sets under <query_id>["params"].
        with open(path, 'r') as handle:
            return json.load(handle)[query_id]["params"]

    table_dict = load_table_dict()

    with open(f"raw_predicate_tpl/query_condition_{query_id}.tpl", 'r') as template:
        predicate_content = template.read()

    # Load both parameter lists before writing anything, so a missing input
    # file aborts the query without producing partial output.
    training_params = _params_from(
        f"dsb_{query_id}_original/cardinality/inputs/training/{query_id}_training_original_50.json"
    )
    testing_params = _params_from(
        f"dsb_{query_id}_original/cardinality/inputs/testing/{query_id}_testing_original.json"
    )

    for mode_params, mode in ((training_params, "50_training"), (testing_params, "200_testing")):
        process_mode(query_id, predicate_content, mode_params, table_dict, mode)


def process_mode(query_id: str, predicate_content: str, params: List[List], table_dict: Dict, mode: str):
    """Instantiate the predicate template per parameter set and write one file per table.

    For every parameter set, each ``@param<i>`` placeholder in
    ``predicate_content`` is replaced by the stripped string form of the i-th
    value. Every non-empty line is then assigned to a table by looking up the
    text before its first underscore in ``table_dict``; lines whose prefix is
    not found are dropped. The lines of one table are rendered as a bracketed
    list of double-quoted strings, and the renderings of all parameter sets
    are joined with a comma and a newline.

    Args:
        query_id: Query identifier used in the output directory and file names.
        predicate_content: Template text, one predicate per line.
        params: Parameter sets; position ``i`` in a set fills ``@param<i>``.
        table_dict: Lookup from line prefix to table name.
        mode: Label embedded in the output file names.
    """
    # Compiled once: any numbered placeholder that ends on a word boundary,
    # so "@param1" never matches inside "@param10".
    placeholder = re.compile(r"@param([0-9]+)\b")
    table_conditions = defaultdict(list)

    for param_set in params:
        values = {str(i): str(param).strip() for i, param in enumerate(param_set)}

        # Substitute through a callable in a single pass: the value is inserted
        # literally (a plain replacement string is parsed as a regex template,
        # so a backslash in a value raises or gets rewritten), and text inserted
        # for one placeholder is never re-scanned for another. Placeholders
        # without a matching value are left untouched.
        current_content = placeholder.sub(
            lambda match: values.get(match.group(1), match.group(0)),
            predicate_content,
        )

        # Group this instance's predicates by table.
        current_instance_conditions = defaultdict(list)
        for line in current_content.strip().split('\n'):
            condition = line.strip()
            if not condition:
                continue
            table_name = table_dict.get(condition.split('_')[0])
            if table_name:
                current_instance_conditions[table_name].append(condition)

        # NOTE: conditions are quoted verbatim; embedded double quotes are not escaped.
        for table_name, conditions in current_instance_conditions.items():
            instance_str = '["' + '", "'.join(conditions) + '"]'
            table_conditions[table_name].append(instance_str)

    output_dir = f"dsb_{query_id}_original/cardinality/inputs/PQO/join_predicates"
    os.makedirs(output_dir, exist_ok=True)

    for table_name, instance_arrays in table_conditions.items():
        output_file = f"{output_dir}/{query_id}_{mode}_{table_name}.txt"
        with open(output_file, 'w') as f:
            f.write(',\n'.join(instance_arrays))

def main():
    """Run ``process_predicates`` for each listed query and print one status line per query."""
    for query_id in ('084', '091', '099'):
        try:
            process_predicates(query_id)
            status = f"Successfully processed query {query_id}"
        except Exception as e:
            # Keep going: one failing query must not stop the remaining ones.
            status = f"Error processing query {query_id}: {str(e)}"
        print(status)

if __name__ == "__main__":
    # The run is disabled on purpose; uncomment the call to regenerate the files.
    # main()
    pass  # an `if` body made only of comments is a syntax error

Successfully processed query 084
Successfully processed query 091
Successfully processed query 099


In [None]:
# for mixture
import json
import os
import re
from typing import Dict, List
from collections import defaultdict

def load_table_dict(json_file: str = '0_table_dict.json') -> Dict:
    """Return the decoded contents of the table-dictionary JSON file.

    Args:
        json_file: Path of the JSON file to read.

    Returns:
        The decoded object; ``process_mode`` in this cell looks up line
        prefixes in it to obtain table names.
    """
    with open(json_file, 'r') as source:
        decoded = json.load(source)
    return decoded

def process_predicates(query_id: str):
    """Write the per-table predicate files for one query's mixture test workload.

    Only a testing file exists for the mixture workload, so ``process_mode``
    runs once, with the ``"200_testing"`` label.

    Args:
        query_id: Query identifier used in every input and output path.
    """
    table_dict = load_table_dict()

    with open(f"raw_predicate_tpl/query_condition_{query_id}.tpl", 'r') as template:
        predicate_content = template.read()

    # The parameter sets live under <query_id>["params"] in the mixture file.
    with open(f"0_mixture_test/{query_id}/{query_id}_mixture_test.json", 'r') as workload:
        testing_params = json.load(workload)[query_id]["params"]

    process_mode(query_id, predicate_content, testing_params, table_dict, "200_testing")


def process_mode(query_id: str, predicate_content: str, params: List[List], table_dict: Dict, mode: str):
    """Instantiate the predicate template per parameter set and write one file per table.

    Each ``@param<i>`` placeholder in ``predicate_content`` is replaced by the
    stripped string form of the i-th value of the parameter set. Non-empty
    lines are grouped by the table that ``table_dict`` returns for the text
    before the line's first underscore; lines with an unknown prefix are
    dropped. Output goes to ``0_mixture_test/<query_id>/PQO``.

    Args:
        query_id: Query identifier used in the output directory and file names.
        predicate_content: Template text, one predicate per line.
        params: Parameter sets; position ``i`` in a set fills ``@param<i>``.
        table_dict: Lookup from line prefix to table name.
        mode: Label embedded in the output file names.
    """
    # Compiled once: any numbered placeholder that ends on a word boundary,
    # so "@param1" never matches inside "@param10".
    placeholder = re.compile(r"@param([0-9]+)\b")
    table_conditions = defaultdict(list)

    for param_set in params:
        values = {str(i): str(param).strip() for i, param in enumerate(param_set)}

        # A callable replacement inserts the value literally. A plain string
        # would be parsed as a regex replacement template, so a backslash in a
        # value would raise or be rewritten. The single pass also means inserted
        # text is never re-scanned for further placeholders.
        current_content = placeholder.sub(
            lambda match: values.get(match.group(1), match.group(0)),
            predicate_content,
        )

        # Group this instance's predicates by table.
        current_instance_conditions = defaultdict(list)
        for line in current_content.strip().split('\n'):
            condition = line.strip()
            if not condition:
                continue
            table_name = table_dict.get(condition.split('_')[0])
            if table_name:
                current_instance_conditions[table_name].append(condition)

        # NOTE: conditions are quoted verbatim; embedded double quotes are not escaped.
        for table_name, conditions in current_instance_conditions.items():
            instance_str = '["' + '", "'.join(conditions) + '"]'
            table_conditions[table_name].append(instance_str)

    output_dir = f"0_mixture_test/{query_id}/PQO"
    os.makedirs(output_dir, exist_ok=True)

    for table_name, instance_arrays in table_conditions.items():
        output_file = f"{output_dir}/{query_id}_{mode}_{table_name}.txt"
        with open(output_file, 'w') as f:
            f.write(',\n'.join(instance_arrays))

def main():
    """Process the mixture workload of every listed query, printing one status line each."""
    for query_id in ('013', '018', '019', '027', '040', '084', '091', '099'):
        try:
            process_predicates(query_id)
            status = f"Successfully processed query {query_id}"
        except Exception as e:
            # Keep going: one failing query must not stop the remaining ones.
            status = f"Error processing query {query_id}: {str(e)}"
        print(status)

if __name__ == "__main__":
    # The run is disabled on purpose; uncomment the call to regenerate the files.
    # main()
    pass  # an `if` body made only of comments is a syntax error

Successfully processed query 013
Successfully processed query 018
Successfully processed query 019
Successfully processed query 027
Successfully processed query 040
Successfully processed query 084
Successfully processed query 091
Successfully processed query 099


### join_predicates with table alias: 025, 050, 072, 085, 100

In [None]:
import json
import os
import re
from typing import Dict, List
from collections import defaultdict

def load_table_dict(json_file: str = '0_table_dict.json') -> Dict:
    """Load the table dictionary stored as JSON.

    Args:
        json_file: Path of the JSON file to read.

    Returns:
        The decoded object; ``process_mode`` in this cell queries it with the
        table key taken from each predicate line.
    """
    with open(json_file, 'r') as stream:
        loaded = json.load(stream)
    return loaded

def process_predicates(query_id: str):
    """Write alias-aware predicate files for one query's training and testing parameters.

    Reads the table dictionary, the query's predicate template, and the
    ``params`` lists stored under ``query_id`` in the training and testing
    JSON files, then runs ``process_mode`` once per list.

    Args:
        query_id: Query identifier used in every input and output path.
    """
    def _params_from(path):
        # Both workload files keep their parameter sets under <query_id>["params"].
        with open(path, 'r') as handle:
            return json.load(handle)[query_id]["params"]

    table_dict = load_table_dict()

    with open(f"raw_predicate_tpl/query_condition_{query_id}.tpl", 'r') as template:
        predicate_content = template.read()

    # Load both parameter lists before writing anything, so a missing input
    # file aborts the query without producing partial output.
    training_params = _params_from(
        f"dsb_{query_id}_original/cardinality/inputs/training/{query_id}_training_original_50.json"
    )
    testing_params = _params_from(
        f"dsb_{query_id}_original/cardinality/inputs/testing/{query_id}_testing_original.json"
    )

    for mode_params, mode in ((training_params, "50_training"), (testing_params, "200_testing")):
        process_mode(query_id, predicate_content, mode_params, table_dict, mode)


def process_mode(query_id: str, predicate_content: str, params: List[List], table_dict: Dict, mode: str):
    """Instantiate the predicate template per parameter set and write one file per table alias.

    Each ``@param<i>`` placeholder in ``predicate_content`` is replaced by the
    stripped string form of the i-th value of the parameter set. For every
    non-empty line the text before the first underscore is inspected:

    * ``d1.d`` (alias form): the table key is the part after the dot and the
      group is ``<table>_<suffix>``, where the suffix is the alias's trailing
      digits, or its last character when it has none.
    * ``d`` (plain form): the group is the table name itself.

    Lines whose table key is not in ``table_dict`` are dropped.

    Args:
        query_id: Query identifier used in the output directory and file names.
        predicate_content: Template text, one predicate per line.
        params: Parameter sets; position ``i`` in a set fills ``@param<i>``.
        table_dict: Lookup from table key to table name.
        mode: Label embedded in the output file names.
    """
    # Compiled once: numbered placeholders ending on a word boundary (so
    # "@param1" never matches inside "@param10"), and an alias's numeric tail.
    placeholder = re.compile(r"@param([0-9]+)\b")
    trailing_digits = re.compile(r"[0-9]+$")
    table_conditions = defaultdict(list)

    for param_set in params:
        values = {str(i): str(param).strip() for i, param in enumerate(param_set)}

        # A callable replacement inserts the value literally. A plain string
        # would be parsed as a regex replacement template, so a backslash in a
        # value would raise or be rewritten. The single pass also means inserted
        # text is never re-scanned for further placeholders.
        current_content = placeholder.sub(
            lambda match: values.get(match.group(1), match.group(0)),
            predicate_content,
        )

        current_instance_conditions = defaultdict(list)
        for line in current_content.strip().split('\n'):
            condition = line.strip()
            if not condition:
                continue

            prefix = condition.split('_')[0]
            pieces = prefix.split('.')
            if len(pieces) > 1:
                # e.g. "d1.d_moy" -> alias "d1", table key "d"
                alias, table_key = pieces[0], pieces[1]
                digits = trailing_digits.search(alias)
                # Keep the whole numeric tail so "d1" and "d11" stay distinct.
                # An empty alias yields no suffix instead of an IndexError.
                alias_suffix = digits.group(0) if digits else alias[-1:]
            else:
                table_key, alias_suffix = prefix, ''

            table_name = table_dict.get(table_key)
            if not table_name:
                continue
            group = f"{table_name}_{alias_suffix}" if alias_suffix else table_name
            current_instance_conditions[group].append(condition)

        # NOTE: conditions are quoted verbatim; embedded double quotes are not escaped.
        for table_name, conditions in current_instance_conditions.items():
            instance_str = '["' + '", "'.join(conditions) + '"]'
            table_conditions[table_name].append(instance_str)

    output_dir = f"dsb_{query_id}_original/cardinality/inputs/PQO/join_predicates"
    os.makedirs(output_dir, exist_ok=True)

    for table_name, instance_arrays in table_conditions.items():
        output_file = f"{output_dir}/{query_id}_{mode}_{table_name}.txt"
        with open(output_file, 'w') as f:
            f.write(',\n'.join(instance_arrays))

def main():
    """Run ``process_predicates`` for each listed query and print one status line per query."""
    for query_id in ('085',):
        try:
            process_predicates(query_id)
            status = f"Successfully processed query {query_id}"
        except Exception as e:
            # Keep going: one failing query must not stop the remaining ones.
            status = f"Error processing query {query_id}: {str(e)}"
        print(status)

if __name__ == "__main__":
    # The run is disabled on purpose; uncomment the call to regenerate the files.
    # main()
    pass  # an `if` body made only of comments is a syntax error

Successfully processed query 085


In [None]:
# for mixture
import json
import os
import re
from typing import Dict, List
from collections import defaultdict

def load_table_dict(json_file: str = '0_table_dict.json') -> Dict:
    """Read the table dictionary from a JSON file.

    Args:
        json_file: Path of the JSON file to read.

    Returns:
        The decoded object; ``process_mode`` in this cell queries it with the
        table key taken from each predicate line.
    """
    with open(json_file, 'r') as reader:
        contents = json.load(reader)
    return contents

def process_predicates(query_id: str):
    """Write alias-aware predicate files for one query's mixture test workload.

    Only a testing file exists for the mixture workload, so ``process_mode``
    runs once, with the ``"200_testing"`` label.

    Args:
        query_id: Query identifier used in every input and output path.
    """
    table_dict = load_table_dict()

    with open(f"raw_predicate_tpl/query_condition_{query_id}.tpl", 'r') as template:
        predicate_content = template.read()

    # The parameter sets live under <query_id>["params"] in the mixture file.
    with open(f"0_mixture_test/{query_id}/{query_id}_mixture_test.json", 'r') as workload:
        testing_params = json.load(workload)[query_id]["params"]

    process_mode(query_id, predicate_content, testing_params, table_dict, "200_testing")


def process_mode(query_id: str, predicate_content: str, params: List[List], table_dict: Dict, mode: str):
    """Instantiate the predicate template per parameter set and write one file per table alias.

    Each ``@param<i>`` placeholder in ``predicate_content`` is replaced by the
    stripped string form of the i-th value of the parameter set. For every
    non-empty line the text before the first underscore is inspected:

    * ``d1.d`` (alias form): the table key is the part after the dot and the
      group is ``<table>_<suffix>``, where the suffix is the alias's trailing
      digits, or its last character when it has none.
    * ``d`` (plain form): the group is the table name itself.

    Lines whose table key is not in ``table_dict`` are dropped. Output goes to
    ``0_mixture_test/<query_id>/PQO``.

    Args:
        query_id: Query identifier used in the output directory and file names.
        predicate_content: Template text, one predicate per line.
        params: Parameter sets; position ``i`` in a set fills ``@param<i>``.
        table_dict: Lookup from table key to table name.
        mode: Label embedded in the output file names.
    """
    # Compiled once: numbered placeholders ending on a word boundary (so
    # "@param1" never matches inside "@param10"), and an alias's numeric tail.
    placeholder = re.compile(r"@param([0-9]+)\b")
    trailing_digits = re.compile(r"[0-9]+$")
    table_conditions = defaultdict(list)

    for param_set in params:
        values = {str(i): str(param).strip() for i, param in enumerate(param_set)}

        # A callable replacement inserts the value literally. A plain string
        # would be parsed as a regex replacement template, so a backslash in a
        # value would raise or be rewritten. The single pass also means inserted
        # text is never re-scanned for further placeholders.
        current_content = placeholder.sub(
            lambda match: values.get(match.group(1), match.group(0)),
            predicate_content,
        )

        current_instance_conditions = defaultdict(list)
        for line in current_content.strip().split('\n'):
            condition = line.strip()
            if not condition:
                continue

            prefix = condition.split('_')[0]
            pieces = prefix.split('.')
            if len(pieces) > 1:
                # e.g. "d1.d_moy" -> alias "d1", table key "d"
                alias, table_key = pieces[0], pieces[1]
                digits = trailing_digits.search(alias)
                # Keep the whole numeric tail so "d1" and "d11" stay distinct.
                # An empty alias yields no suffix instead of an IndexError.
                alias_suffix = digits.group(0) if digits else alias[-1:]
            else:
                table_key, alias_suffix = prefix, ''

            table_name = table_dict.get(table_key)
            if not table_name:
                continue
            group = f"{table_name}_{alias_suffix}" if alias_suffix else table_name
            current_instance_conditions[group].append(condition)

        # NOTE: conditions are quoted verbatim; embedded double quotes are not escaped.
        for table_name, conditions in current_instance_conditions.items():
            instance_str = '["' + '", "'.join(conditions) + '"]'
            table_conditions[table_name].append(instance_str)

    output_dir = f"0_mixture_test/{query_id}/PQO"
    os.makedirs(output_dir, exist_ok=True)

    for table_name, instance_arrays in table_conditions.items():
        output_file = f"{output_dir}/{query_id}_{mode}_{table_name}.txt"
        with open(output_file, 'w') as f:
            f.write(',\n'.join(instance_arrays))


def main():
    """Process the mixture workload of every alias-using query, printing one status line each."""
    for query_id in ('025', '050', '072', '085', '100'):
        try:
            process_predicates(query_id)
            status = f"Successfully processed query {query_id}"
        except Exception as e:
            # Keep going: one failing query must not stop the remaining ones.
            status = f"Error processing query {query_id}: {str(e)}"
        print(status)

if __name__ == "__main__":
    # The run is disabled on purpose; uncomment the call to regenerate the files.
    # main()
    pass  # an `if` body made only of comments is a syntax error

Successfully processed query 025
Successfully processed query 050
Successfully processed query 072
Successfully processed query 085
Successfully processed query 100


## template

### generate original template

In [7]:
import json
import os

def read_file_content(file_path):
    """Return the whitespace-stripped UTF-8 text of a file, or ``None`` on failure.

    Args:
        file_path: Path of the file to read.

    Returns:
        The file's text with leading and trailing whitespace removed. When the
        file is missing or cannot be read, a message is printed and ``None``
        is returned.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            raw_text = f.read()
    except FileNotFoundError:
        print(f"Error: File {file_path} not found.")
        return None
    except Exception as e:
        print(f"Error reading file {file_path}: {e}")
        return None
    return raw_text.strip()

def parse_predicate_line(line):
    """Return a blank predicate record for one predicate line.

    The content of ``line`` is not inspected: every call yields the same
    placeholder structure, with freshly created lists, to be filled in later.

    Args:
        line: The predicate line (currently unused).

    Returns:
        A dict with the predicate fields set to their placeholder values.
    """
    # Listed in the order the keys must appear in the generated JSON.
    blank_fields = [
        ("alias", ""),
        ("column", ""),
        ("operator", "="),
        ("data_type", "text"),
        ("table", ""),
        ("join_tables", [""]),
        ("join_tables_alias", [""]),
        ("join_tables_column", [[""]]),
        ("join_conditions", [""]),
        ("left_or_right", [""]),
    ]
    return dict(blank_fields)

def generate_json_template(query_id):
    """Write ``original_template/<query_id>.json`` from the query and predicate templates.

    The output holds the query text and one blank predicate record per
    non-empty line of the predicate template, both stored under ``query_id``.

    Args:
        query_id: Query identifier used in the input and output file names.

    Returns:
        ``True`` when the JSON file was written, ``False`` when an input could
        not be read or the output could not be written (a message is printed).
    """
    query_file_path = f"original_tpl/query{query_id}_spj.tpl"
    predicate_file_path = f"original_tpl/query_condition_{query_id}.tpl"
    output_path = f"original_template/{query_id}.json"

    query_content = read_file_content(query_file_path)
    if query_content is None:
        # read_file_content has already reported the problem.
        return False

    # One predicate record per non-blank line of the predicate template.
    predicates = []
    try:
        with open(predicate_file_path, 'r', encoding='utf-8') as f:
            for raw_line in f.readlines():
                stripped = raw_line.strip()
                if stripped:
                    predicates.append(parse_predicate_line(stripped))
    except FileNotFoundError:
        print(f"Error: Predicate file {predicate_file_path} not found.")
        return False
    except Exception as e:
        print(f"Error reading predicate file: {e}")
        return False

    os.makedirs("original_template", exist_ok=True)

    template = {query_id: {"query": query_content, "predicates": predicates}}

    try:
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(template, f, indent=4, ensure_ascii=False)
        print(f"Successfully generated {output_path}")
        return True
    except Exception as e:
        print(f"Error writing JSON file: {e}")
        return False

def main():
    """Generate the JSON template for each query ID listed below."""
    # Edit this tuple to choose which queries are (re)generated.
    for query_id in ('072', '084', '085', '091', '099', '100'):
        print(f"\nProcessing query ID: {query_id}")
        generate_json_template(query_id)

if __name__ == "__main__":
    # The run is disabled on purpose; uncomment the call to regenerate the templates.
    # main()
    pass  # an `if` body made only of comments is a syntax error


Processing query ID: 072
Successfully generated original_template/072.json

Processing query ID: 084
Successfully generated original_template/084.json

Processing query ID: 085
Successfully generated original_template/085.json

Processing query ID: 091
Successfully generated original_template/091.json

Processing query ID: 099
Successfully generated original_template/099.json

Processing query ID: 100
Successfully generated original_template/100.json


### generate mixture template

In [None]:
import json
import glob
import random
from pathlib import Path

def process_files(query_id):
    """Build the mixed test workload for one query from its two testing files.

    Loads ``<query_id>_testing_original.json`` from the ``cardinality`` and
    ``gaussian`` directories of ``dsb_<query_id>_original``, checks that both
    describe the same query and predicates, and combines up to 100 shuffled
    parameter sets from each into one shuffled list.

    Args:
        query_id: Query identifier used in the directory and file names.

    Returns:
        A dict keyed by ``query_id`` holding the query text, the predicates
        (both taken from the cardinality file) and the sampled parameter sets.

    Raises:
        ValueError: If ``verify_consistency`` rejects the two files.
    """
    query_root = Path(f"dsb_{query_id}_original")
    # The testing file sits at the same relative location in each component directory.
    relative_test_file = f"inputs/testing/{query_id}_testing_original.json"

    def _load(component):
        with open(query_root / component / relative_test_file) as handle:
            return json.load(handle)

    card_test_data = _load("cardinality")
    gauss_test_data = _load("gaussian")

    if not verify_consistency([card_test_data, gauss_test_data], query_id):
        raise ValueError(f"Inconsistent query or predicates for {query_id}")

    card_entry = card_test_data[query_id]
    card_params = card_entry['params']
    gauss_params = gauss_test_data[query_id]['params']

    # Fixed seed so the same sample is drawn on every run. Each source is
    # shuffled in place, the first 100 sets of each are kept (200 at most),
    # and the combined list is shuffled once more to interleave the sources.
    random.seed(2024)
    for pool in (card_params, gauss_params):
        random.shuffle(pool)
    sampled_params = card_params[:100] + gauss_params[:100]
    random.shuffle(sampled_params)

    return {
        query_id: {
            "query": card_entry['query'],
            "predicates": card_entry['predicates'],
            "params": sampled_params,
        }
    }

def verify_consistency(data_list, query_id):
    """Check that the first two datasets agree on the query text and predicates.

    Args:
        data_list: Sequence whose first two items are the cardinality and the
            gaussian test data; any further items are ignored.
        query_id: The query ID being processed.

    Returns:
        ``True`` when both datasets hold the same query string and
        field-by-field identical predicate lists, ``False`` otherwise
        (including when fewer than two datasets are given).
    """
    if len(data_list) < 2:
        return False

    card_data, gauss_data = data_list[0], data_list[1]

    query_consistent = card_data[query_id]['query'] == gauss_data[query_id]['query']

    card_predicates = card_data[query_id]['predicates']
    gauss_predicates = gauss_data[query_id]['predicates']

    if len(card_predicates) != len(gauss_predicates):
        predicates_consistent = False
    else:
        # Fields are compared in this order; the first mismatch ends the check.
        compared_fields = (
            'alias',
            'column',
            'operator',
            'data_type',
            'table',
            'join_tables',
            'join_tables_alias',
            'join_tables_column',
            'join_conditions',
            'left_or_right',
        )
        predicates_consistent = all(
            card_predicate[field] == gauss_predicate[field]
            for card_predicate, gauss_predicate in zip(card_predicates, gauss_predicates)
            for field in compared_fields
        )

    return query_consistent and predicates_consistent


def process_all_queries():
    """Build and save the mixture test file for every ``dsb_*_original`` directory.

    Query IDs are taken from the second underscore-separated field of each
    matching directory name. Each result is written to
    ``0_mixture_test/<qid>/<qid>_mixture_test.json``.
    """
    query_ids = []
    for directory in glob.glob("dsb_*_original"):
        query_ids.append(directory.split('_')[1])
    print(query_ids)

    for qid in query_ids:
        mixture = process_files(qid)

        # Every query gets its own directory and output file.
        target_dir = Path(f"0_mixture_test/{qid}")
        target_dir.mkdir(parents=True, exist_ok=True)

        with open(target_dir / f"{qid}_mixture_test.json", 'w') as f:
            json.dump(mixture, f, indent=2)

# Execute the processing
# process_all_queries()

['040', '027', '018', '084', '025', '091', '072', '050', '085', '013', '099', '019', '100']


## kepler pipeline

### check missing files

In [1]:
import os
from typing import List


def check_paths(query_ids: List[str]) -> dict:
    """Report which expected files and folders exist for each query ID and method.

    For both the ``cardinality`` and ``gaussian`` methods, four locations are
    checked: the query's ``dsb_<id>_original`` folder (relative to the current
    directory), the method's PQO input folder, and the ``training_50``
    candidate and results metadata files.

    Args:
        query_ids: List of query IDs to check.

    Returns:
        ``{query_id: {method: result}}`` where each result holds one
        ``*_exists`` flag per location, ``all_paths_exist``, and
        ``missing_paths`` listing every path that was not found.
    """
    current_dir = os.getcwd()
    results = {}

    for query_id in query_ids:
        original_folder = f"dsb_{query_id}_original"
        per_method = {}
        results[query_id] = per_method

        for method in ('cardinality', 'gaussian'):
            method_root = os.path.join(current_dir, original_folder, method)

            # Only the cardinality method keeps its PQO inputs in a
            # "join_predicates" sub-folder.
            if method == 'cardinality':
                pqo_path = os.path.join(method_root, "inputs", "PQO", "join_predicates")
            else:
                pqo_path = os.path.join(method_root, "inputs", "PQO")

            candidate_metadata_path = os.path.join(
                method_root, "outputs", "hints", str(query_id),
                "training_50", "candidate_metadata.json",
            )
            results_metadata_path = os.path.join(
                method_root, "outputs", "results", str(query_id),
                "training_50", "metadata.json",
            )

            # (result key, path) pairs, in the order they are checked and reported.
            checks = (
                ("original_folder_exists", original_folder),
                ("pqo_exists", pqo_path),
                ("candidate_metadata_exists", candidate_metadata_path),
                ("results_metadata_exists", results_metadata_path),
            )

            result = {key: os.path.exists(path) for key, path in checks}
            everything_found = all(result.values())
            missing = [path for key, path in checks if not result[key]]
            result["all_paths_exist"] = everything_found
            result["missing_paths"] = missing

            per_method[method] = result

    return results


def print_missing_paths(results: dict):
    """Print the missing paths grouped by query ID and method.

    Queries and methods with nothing missing are skipped; when nothing is
    missing anywhere, a single confirmation line is printed instead.

    Args:
        results: Dictionary containing check results for each query ID and
            method, as returned by ``check_paths``.
    """
    has_missing_paths = False

    for query_id, methods in results.items():
        incomplete = [
            (method, result["missing_paths"])
            for method, result in methods.items()
            if not result["all_paths_exist"]
        ]
        if not incomplete:
            continue

        has_missing_paths = True
        print(f"\nMissing paths for Query ID {query_id}:")
        for method, missing_paths in incomplete:
            print(f"\n  Method: {method}")
            for path in missing_paths:
                print(f"  ✗ {path}")

    if not has_missing_paths:
        print("\nAll paths exist for all queries and methods.")


def main():
    """Check the expected paths for every workload query and print the missing ones."""
    try:
        query_ids = [
            '013', '018', '019', '025', '027', '040', '050',
            '072', '084', '085', '091', '099', '100',
        ]
        print_missing_paths(check_paths(query_ids))
    except ValueError:
        print("Error: Please enter valid query IDs")
    except Exception as e:
        print(f"An error occurred: {str(e)}")


# Run the path check only when this code is executed as the main module.
if __name__ == "__main__":
    main()


Missing paths for Query ID 100:

  Method: gaussian
  ✗ /home/lsh/test_kepler/kepler/dsb_cardrange_workload/dsb_100_original/gaussian/outputs/results/100/training_50/metadata.json


### generate kepler format's gaussian workload files

In [10]:
import json

def analyze_json_files(query_id):
    """Print size statistics for one query's entry in ``PQO_gaussian_sql.json``.

    Reports how many items the ``train`` and ``test`` lists hold and whether
    the items inside each list all have the same length.

    Args:
        query_id: Key of the query to inspect in the JSON file.
    """
    with open('PQO_gaussian_sql.json', 'r') as f:
        query_data = json.load(f)[query_id]

    train_items, test_items = query_data['train'], query_data['test']

    # Number of items per split.
    train_length, test_length = len(train_items), len(test_items)
    print(f"\nAnalyzing query_id: {query_id}")
    print(f"Train list length: {train_length} - Test list length: {test_length}")

    # Length of every item, to reveal ragged entries.
    train_inner_lengths = list(map(len, train_items))
    test_inner_lengths = list(map(len, test_items))
    print(f"Train inner list lengths: - All equal: {len(set(train_inner_lengths)) == 1}")
    print(f"Length(s): {list(set(train_inner_lengths))}")
    print(f"Test inner list lengths: - All equal: {len(set(test_inner_lengths)) == 1}")
    print(f"Length(s): {list(set(test_inner_lengths))}")

# Print the train/test size summary for every query ID of the workload.
for query_id in ['013', '018', '019', '025', '027', '040',
                 '050', '072', '084', '085', '091', '099', '100']:
    analyze_json_files(query_id)


Analyzing query_id: 013
Train list length: 50 - Test list length: 200
Train inner list lengths: - All equal: True
Length(s): [6]
Test inner list lengths: - All equal: True
Length(s): [6]

Analyzing query_id: 018
Train list length: 50 - Test list length: 200
Train inner list lengths: - All equal: True
Length(s): [6]
Test inner list lengths: - All equal: True
Length(s): [6]

Analyzing query_id: 019
Train list length: 50 - Test list length: 200
Train inner list lengths: - All equal: True
Length(s): [5]
Test inner list lengths: - All equal: True
Length(s): [5]

Analyzing query_id: 025
Train list length: 50 - Test list length: 200
Train inner list lengths: - All equal: True
Length(s): [3]
Test inner list lengths: - All equal: True
Length(s): [3]

Analyzing query_id: 027
Train list length: 50 - Test list length: 200
Train inner list lengths: - All equal: True
Length(s): [4]
Test inner list lengths: - All equal: True
Length(s): [4]

Analyzing query_id: 040
Train list length: 50 - Test list l

In [None]:
import json

def process_element(element, param_info):
    """Cut one parameter value out of a SQL statement using delimiter strings.

    ``param_info`` is ``[idx, str1]``, ``[idx, str1, str2]`` or
    ``[idx, str1, str2, "TODO: pick last"]``. The statement ``element[idx]``
    is split on ``str1`` and the piece at index 1 is kept; with ``str2`` that
    piece is split again and the piece at index 0 is returned, or the piece
    at index 1 when the marker is present.

    Args:
        element: Indexable collection of SQL statements.
        param_info: Statement index followed by one to three split strings.

    Returns:
        The extracted substring.

    Raises:
        ValueError: If the number of split strings is not 1, 2 or 3, or the
            third one is not the supported marker.
    """
    idx, *split_params = param_info

    # Fetch the statement first so a bad index surfaces before any validation.
    target_sql = element[idx]

    count = len(split_params)
    if count not in (1, 2, 3):
        raise ValueError(f"Unexpected number of split parameters: {len(split_params)}")

    if count == 3:
        str3 = split_params[2]
        if not (str3 == "TODO: pick last"):
            raise ValueError(f"Unsupported str3 value: {str3}")

    after_first = target_sql.split(split_params[0])[1]
    if count == 1:
        return after_first

    pieces = after_first.split(split_params[1])
    # Two strings: text before str2. Three strings: the piece right after str2.
    return pieces[0] if count == 2 else pieces[1]

def process_data(split_data_dict, raw_data):
    """Extract the parameter values of every train and test element of each query.

    Args:
        split_data_dict: Per query ID, a dict mapping ``"@param<i>"`` to the
            split description passed to ``process_element``.
        raw_data: Per query ID, a dict with ``"train"`` and ``"test"`` lists
            of elements.

    Returns:
        Per query ID present in both inputs, a dict with ``"train"`` and
        ``"test"`` lists holding one list of extracted values per element.
    """
    result = {}

    for query_id, param_dict in split_data_dict.items():
        # Queries without raw statements are skipped silently.
        if query_id not in raw_data:
            continue

        extracted = {"train": [], "test": []}
        result[query_id] = extracted

        num_params = len(param_dict)
        print(query_id, num_params)

        # Both splits are handled identically, train first.
        for split in ("train", "test"):
            for element in raw_data[query_id][split]:
                extracted[split].append([
                    process_element(element, param_dict[f"@param{i}"])
                    for i in range(num_params)
                ])

    return result

def main():
    """Extract the gaussian workload parameters and save them as JSON.

    Reads ``PQO_gaussian_split_dict.json`` and ``PQO_gaussian_sql.json``,
    passes both to ``process_data`` and writes the outcome to
    ``PQO_gaussian_params.json``.
    """
    def _read_json(path):
        with open(path, 'r') as f:
            return json.load(f)

    split_data_dict = _read_json('PQO_gaussian_split_dict.json')
    raw_data = _read_json('PQO_gaussian_sql.json')

    extracted = process_data(split_data_dict, raw_data)

    with open('PQO_gaussian_params.json', 'w') as f:
        json.dump(extracted, f, indent=2)

if __name__ == "__main__":
    # The run is disabled on purpose; uncomment the call to regenerate the parameter file.
    # main()
    pass  # an `if` body made only of comments is a syntax error

013 9
018 8
019 7
025 6
027 6
040 7
050 3
072 8
084 2
085 10
091 4
099 6
100 10


In [5]:
import json
import os
from pathlib import Path

def main():
    """Print the predicate count of every JSON template in ``original_template``.

    Files are visited in sorted name order; the query ID is the file name
    without its ``.json`` suffix, and is also the top-level key in the file.
    """
    template_dir = Path('original_template')

    json_files = sorted(name for name in os.listdir(template_dir) if name.endswith('.json'))

    for filename in json_files:
        # Drop the 5-character ".json" suffix.
        query_id = filename[:-5]

        with open(template_dir / filename, 'r') as f:
            predicates = json.load(f)[query_id]["predicates"]

        predicates_length = len(predicates)
        print(f"{query_id} {predicates_length}")

# Print the predicate counts only when this code is executed as the main module.
if __name__ == "__main__":
    main()

013 9
018 8
019 7
025 6
027 6
040 7
050 3
072 8
084 2
085 10
091 4
099 6
100 10


In [None]:
import csv
import collections

# metadata
def save_metadata_to_csv(output_dir, metadata, query_id):
    """Save the metadata as a two-column CSV file named after the query.

    The file ``<output_dir>/<query_id>.csv`` gets a ``Key,Value`` header
    followed by one row per metadata entry.

    Args:
        output_dir: The directory where the CSV file will be saved.
        metadata: A dictionary containing the metadata information to save.
        query_id: Query identifier used as the file's base name.
    """
    metadata_file = os.path.join(output_dir, f"{query_id}.csv")

    # newline="" leaves line endings to the csv module.
    with open(metadata_file, mode="w", newline="") as file:
        writer = csv.writer(file)
        writer.writerow(["Key", "Value"])
        writer.writerows([key, value] for key, value in metadata.items())

    print(f"Metadata saved to {metadata_file}")


# frequency
def get_literal_frequencies(literals):
    """
    Count how many times each literal occurs.

    Every literal is keyed by its json.dumps() text, so equal lists map to
    the same key. Returns a defaultdict(int) mapping that text to its count.
    """
    counts = collections.defaultdict(int)

    for serialized in map(json.dumps, literals):
        counts[serialized] = counts[serialized] + 1

    return counts


def split_literals_and_store_with_frequency(query_id, train_literals, test_literals, output_dir, train_size=50, test_size=200):
    """
    Count how often each parameter binding occurs and store the counts as JSON.

    Two training frequency dictionaries are produced: one over all of
    train_literals (keyed by len(train_literals)) and one over the first
    train_size entries (keyed by train_size). When both sizes coincide the
    two entries are identical and share a single key. Each dictionary maps
    the json.dumps() text of a literal to its number of occurrences, in
    first-seen order, so repeated runs write byte-identical files.

    Files written below '<output_dir>/frequency':
    - '<query_id>_train_<size>_freq.json' for every training size
    - '<query_id>_test_freq.json'

    Args:
    - query_id: Query identifier used as the file-name prefix.
    - train_literals: List of training parameter lists.
    - test_literals: List of testing parameter lists.
    - output_dir: Base directory; created if missing.
    - train_size: Number of leading training literals in the subset entry.
    - test_size: Accepted for interface compatibility; not used.

    Returns:
    - (train_dict_dict, test_dict): train_dict_dict maps each training size
      to its frequency dictionary, test_dict is the testing frequency
      dictionary.
    """
    os.makedirs(output_dir, exist_ok=True)

    def _frequency_dict(literals):
        # Counter preserves first-seen order, unlike the set() based
        # de-duplication this replaces, whose order changed between runs.
        return dict(collections.Counter(json.dumps(literal) for literal in literals))

    test_dict = _frequency_dict(test_literals)

    # Full training set first, then the train_size prefix. The slice is what
    # makes the '<query_id>_train_<train_size>_freq.json' file name truthful.
    train_dict_dict = {len(train_literals): _frequency_dict(train_literals)}
    train_dict_dict[train_size] = _frequency_dict(train_literals[:train_size])

    output_dir_path = os.path.join(output_dir, "frequency")
    os.makedirs(output_dir_path, exist_ok=True)

    # One file per training size (distinct loop names avoid shadowing the
    # train_size parameter).
    for size, freq_dict in train_dict_dict.items():
        train_output_file = os.path.join(output_dir_path, f"{query_id}_train_{size}_freq.json")
        with open(train_output_file, 'w') as train_file:
            json.dump(freq_dict, train_file, indent=4)

    test_output_file = os.path.join(output_dir_path, f"{query_id}_test_freq.json")
    with open(test_output_file, 'w') as test_file:
        json.dump(test_dict, test_file, indent=4)

    return train_dict_dict, test_dict


###################
# Output directories
def prepare_directories(output_dir):
    """
    Create the 'training', 'testing' and 'metadata' sub-directories.

    Args:
    output_dir (str): The base directory under which the three
        sub-directories are created (existing ones are left untouched).

    Returns:
    dict: Maps 'training', 'testing' and 'metadata' to their paths.
    """
    dirs = {}

    for name in ('training', 'testing', 'metadata'):
        dir_path = os.path.join(output_dir, name)
        os.makedirs(dir_path, exist_ok=True)
        dirs[name] = dir_path

    return dirs
   
    
def load_template(template_file, query_id):
    """
    Read one query template from a JSON template file.

    Args:
    template_file (str): Path to the JSON file containing the query templates.
    query_id (str): Key of the template to extract from that file.

    Returns:
    tuple: (query, predicates, params) of the selected template; params is
        None when the template has no 'params' entry.
    """
    with open(template_file, 'r') as file:
        templates = json.load(file)

    entry = templates[query_id]

    # 'params' is optional, the other two keys are required.
    return entry['query'], entry['predicates'], entry.get('params')


def save_templates_and_pqo(query_id, query, predicates, training_params, testing_params, test_literals, dirs, train_size, mode="original"):
    """
    Write the training and testing template files for one query.

    The training file '<query_id>_training_<mode>_<train_size>.json' is always
    (re)written. The testing file '<query_id>_testing_<mode>.json' is written
    only if it does not exist yet; otherwise a message is printed.

    Args:
    query_id (str): The ID of the query; also the top-level key in both files.
    query (str): The query string.
    predicates (list): List of predicates.
    training_params (list): Parameters stored in the training file.
    testing_params (list): Parameters stored in the testing file when mode
        is not "original".
    test_literals (list): Parameters stored in the testing file when mode is
        "original" (keeps the order of the PQO parameters).
    dirs (dict): Must provide the 'training' and 'testing' directory paths.
    train_size: Value embedded in the training file name.
    mode (str): Value embedded in both file names; selects the testing params.
    """
    def build_template(params):
        return {
            query_id: {
                "query": query,
                "predicates": predicates,
                "params": params
            }
        }

    if mode == "original":
        held_out_params = test_literals  # from PQO - same order of the params
    else:
        held_out_params = testing_params

    # Training template: always written.
    training_output_file = os.path.join(dirs['training'], f"{query_id}_training_{mode}_{train_size}.json")
    with open(training_output_file, 'w') as out_file:
        json.dump(build_template(training_params), out_file, indent=4)

    # Testing template: never overwritten once present.
    testing_output_file = os.path.join(dirs['testing'], f"{query_id}_testing_{mode}.json")
    if os.path.exists(testing_output_file):
        print(f"{query_id}_testing_{mode}.json already exists, skipping save.")
        return

    with open(testing_output_file, 'w') as out_file:
        json.dump(build_template(held_out_params), out_file, indent=4)

    
# generate training & testing set
def generate_param_list_from_dict(data_dict, mode):
    """
    Turn a frequency dictionary back into a list of parameter lists.

    Args:
    data_dict (dict): Maps the serialized text of a parameter list (as
        produced by json.dumps) to its number of occurrences.
    mode (str): 'distinct' returns every key once; 'original' repeats each
        key as many times as its frequency. Any other value yields [].

    Returns:
    list: The decoded parameter lists, in the dictionary's iteration order.
        Every entry is an independent object, so editing one repetition
        does not affect the others.
    """
    import ast  # local import: only needed for the legacy-key fallback below

    def _decode(key):
        # Keys are JSON text, so parse them as JSON: eval() breaks on
        # null/true/false and would execute arbitrary code. Keys that are
        # Python literals rather than JSON (e.g. single-quoted strings)
        # are still accepted through the safe literal parser.
        try:
            return json.loads(key)
        except ValueError:
            return ast.literal_eval(key)

    if mode == 'distinct':
        # In 'distinct' mode, use keys (unique literals) only once
        return [_decode(key) for key in data_dict]

    if mode == 'original':
        # In 'original' mode, replicate keys by their frequencies
        return [_decode(key) for key, freq in data_dict.items() for _ in range(freq)]

    return []


# generation based on selection
def process_train_and_test_data(output_dir, query_id, query, predicates, train_dict_dict, test_dict, test_literals):
    """
    Write the template files for every mode / training-size combination.

    For each mode ('original', then 'distinct') and each training size in
    ascending order, the parameter lists are rebuilt from the frequency
    dictionaries, their sizes are printed and recorded, and
    save_templates_and_pqo is called. The recorded sizes are finally saved
    with save_metadata_to_csv.

    Args:
    output_dir (dict): Despite its name, the directory mapping handed to
        save_templates_and_pqo; its 'metadata' entry receives the CSV file.
    query_id, query, predicates: Forwarded to save_templates_and_pqo.
    train_dict_dict (dict): Training size -> frequency dictionary.
    test_dict (dict): Frequency dictionary of the testing literals.
    test_literals (list): Forwarded to save_templates_and_pqo.
    """
    metadata = {}
    ordered_sizes = sorted(train_dict_dict)  # train_size_list + full size

    for mode in ("original", "distinct"):
        for train_size in ordered_sizes:
            training_params = generate_param_list_from_dict(train_dict_dict[train_size], mode)
            testing_params = generate_param_list_from_dict(test_dict, mode)
            n_training, n_testing = len(training_params), len(testing_params)
            print(f"{mode} (train size: {train_size}): training {n_training}, testing {n_testing}")

            # Record the sizes for this mode / training size.
            metadata.update({
                f"{mode}_testing_params": n_testing,
                f"{mode}_{train_size}_training_params": n_training,
            })

            save_templates_and_pqo(query_id, query, predicates, training_params, testing_params, test_literals, output_dir, train_size, mode)

    save_metadata_to_csv(output_dir['metadata'], metadata, query_id)



def main():
    """
    Generate the gaussian input files for every configured query id.

    For each query id the output directories are prepared below
    'dsb_<query_id>_original/gaussian/inputs', the template is loaded from
    'original_template/<query_id>.json', the train/test literals are read
    from 'PQO_gaussian_params.json', and the frequency files, template files
    and metadata are written.
    """
    # Only '013' is enabled here. The other query ids used in this notebook:
    # '018', '019', '025', '027', '040', '050', '072', '084', '085', '091', '099', '100'
    for query_id in ['013']:
        full_output_dir = os.path.join(f"dsb_{query_id}_original", "gaussian/inputs")

        # preparation
        dirs = prepare_directories(full_output_dir)
        query, predicates, _ = load_template(f"original_template/{query_id}.json", query_id)

        # params
        with open('PQO_gaussian_params.json', 'r') as f:
            param_file = json.load(f)
        literals = param_file[query_id]

        train_literals = literals["train"]
        print(train_literals[:2])
        test_literals = literals["test"]
        print(test_literals[:2])

        train_dict_dict, test_dict = split_literals_and_store_with_frequency(query_id, train_literals, test_literals, full_output_dir)
        process_train_and_test_data(dirs, query_id, query, predicates, train_dict_dict, test_dict, test_literals)

if __name__ == "__main__":
    # main()

[['D', '4 yr Degree', 'D', '4 yr Degree', 'D', '4 yr Degree', "MN', 'NC', 'TX", "GA', 'NE', 'SC", "CA', 'CT', 'NY"], ['D', 'Advanced Degree', 'S', 'Advanced Degree', 'D', 'College', "IA', 'ND', 'TX", "MT', 'TN', 'VA", "AR', 'IN', 'TX"]]
[['M', '4 yr Degree', 'D', '4 yr Degree', 'D', '4 yr Degree', "MI', 'SD', 'TN", "MI', 'ND', 'TX", "MI', 'MN', 'WY"], ['W', 'Advanced Degree', 'S', 'Advanced Degree', 'W', '4 yr Degree', "MS', 'TN', 'VA", "CA', 'FL', 'ND", "KY', 'MN', 'VT"]]
original (train size: 50): training 50, testing 200
distinct (train size: 50): training 50, testing 200
Metadata saved to dsb_013_original/gaussian/inputs/metadata/013.csv


In [7]:
import json
import os

def load_json_file(file_path):
    """
    Load a JSON file, returning None instead of raising on any failure.

    Any exception raised while opening or parsing the file is reported with
    a printed message and turned into a None result.
    """
    try:
        with open(file_path, 'r') as handle:
            content = json.load(handle)
    except Exception as e:
        print(f"Error loading file {file_path}: {str(e)}")
        return None
    return content

def convert_list_to_tuple(lst):
    """Return lst as a tuple, converting nested lists to tuples recursively."""
    converted = []
    for item in lst:
        if isinstance(item, list):
            item = convert_list_to_tuple(item)
        converted.append(item)
    return tuple(converted)

def compare_lists_exactly(list1, list2):
    """
    Tell whether two lists of sequences hold the same rows, ignoring order.

    Rows are converted to tuples and sorted, so duplicates count: both lists
    must contain every row the same number of times.
    """
    rows_a = sorted(map(tuple, list1))
    rows_b = sorted(map(tuple, list2))
    return rows_a == rows_b

def validate_files(query_id, mode='testing'):
    """
    Check that the generated 'original' params match PQO_gaussian_params.json.

    Loads the distinct and original files of the given mode from
    'dsb_<query_id>_original/gaussian/inputs/<mode>' (training files carry a
    '_50' suffix) plus 'PQO_gaussian_params.json', then compares the original
    file's params with the PQO entry, ignoring order.

    Args:
    query_id (str): Query identifier.
    mode (str): 'testing' or 'training'; the PQO key is the mode without
        its trailing 'ing' ('test' / 'train').

    Returns:
    bool: True when the params match; False when a file is missing or
        empty, a required key is absent, or the params differ.
    """
    # initialize
    suffix = "_50" if mode == "training" else ""
    base_path = f'dsb_{query_id}_original/gaussian/inputs/{mode}'

    # load files
    distinct_data = load_json_file(f'{base_path}/{query_id}_{mode}_distinct{suffix}.json')
    original_data = load_json_file(f'{base_path}/{query_id}_{mode}_original{suffix}.json')
    pqo_data = load_json_file('PQO_gaussian_params.json')

    if not (distinct_data and original_data and pqo_data):
        print("Error: Some files could not be loaded")
        return False

    try:
        pqo_params = pqo_data[query_id][mode[:-3]]
        original_params = original_data[query_id]["params"]
        # sorted, check the same
        params_match = compare_lists_exactly(original_params, pqo_params)
    except KeyError as e:
        print(f"Error: Required key not found in JSON structure: {str(e)}")
        return False

    if not params_match:
        print(f"Error: {mode} original params and PQO params don't match exactly")
        return False

    print(f"Validation successful for {mode} files!")
    return True

def main():
    """Validate the testing files and then the training files of every query id."""
    query_ids = ['013', '018', '019', '025', '027', '040',
                 '050', '072', '084', '085', '091', '099', '100']
    steps = (
        ("\nValidating testing files...", 'testing'),
        ("\nValidating training files...", 'training'),
    )

    for query_id in query_ids:
        for banner, mode in steps:
            print(banner)
            validate_files(query_id, mode)

if __name__ == "__main__":
    main()


Validating testing files...
Validation successful for testing files!

Validating training files...
Validation successful for training files!

Validating testing files...
Validation successful for testing files!

Validating training files...
Validation successful for training files!

Validating testing files...
Validation successful for testing files!

Validating training files...
Validation successful for training files!

Validating testing files...
Validation successful for testing files!

Validating training files...
Validation successful for training files!

Validating testing files...
Validation successful for testing files!

Validating training files...
Validation successful for training files!

Validating testing files...
Validation successful for testing files!

Validating training files...
Validation successful for training files!

Validating testing files...
Validation successful for testing files!

Validating training files...
Validation successful for training files!

Valid

## generate plans

### generate original plans

In [1]:
import pandas as pd
import json
import re
import os
from itertools import product

def process_data(query_id, training_size, confidence_threshold):
    """
    Build the per-method workload CSV files of instantiated queries and plans.

    For each method ('cardinality', 'gaussian') the first prediction file
    (in sorted name order) matching '<query_id>_*_batch_0.csv' is read from
    the method's evaluation directory, every row's parameters are substituted
    into the query template of the method's testing JSON, and the result is
    written to
    '0_dsb_original_plans_<confidence_threshold>/<query_id>_workload_<method>_training_size_<training_size>.csv'
    with the columns 'query', 'plan_id' and 'plan_content'. A method whose
    prediction directory, prediction file or testing JSON is missing is
    reported and skipped.

    Args:
        query_id (str): The ID of the query to process
        training_size (int): The training size to process
        confidence_threshold (str): Confidence threshold used in the
            'confidence_<...>' input directory and the output directory name
    """
    import ast  # local import: safe parser for the stringified parameter lists

    methods = ['cardinality', 'gaussian']

    for method in methods:
        predictions_path = f'dsb_{query_id}_original/{method}/outputs/evaluation/{query_id}/training_{training_size}/confidence_{confidence_threshold}/predictions/'

        # Pick the first matching file; sorting makes the choice reproducible
        # because os.listdir returns entries in arbitrary order.
        predictions_file = None
        try:
            for filename in sorted(os.listdir(predictions_path)):
                if filename.startswith(f'{query_id}_') and filename.endswith('_batch_0.csv'):
                    predictions_file = os.path.join(predictions_path, filename)
                    break
        except FileNotFoundError:
            print(f"Directory not found for {method}, query_id={query_id}, training_size={training_size}")
            continue

        if predictions_file is None:
            print(f"No prediction file found for {method}, query_id={query_id}, training_size={training_size}")
            continue

        predictions_df = pd.read_csv(predictions_file)

        testing_json_path = f'dsb_{query_id}_original/{method}/inputs/testing/{query_id}_testing_original.json'
        try:
            with open(testing_json_path, 'r') as f:
                query_data = json.load(f)
        except FileNotFoundError:
            print(f"Testing JSON not found for query_id={query_id}")
            continue

        results = []
        rows = zip(predictions_df['params'], predictions_df['plan_id'], predictions_df['plan_content'])
        for params, plan_id, plan_content in rows:
            # The CSV stores the parameter list as its Python literal text.
            # literal_eval parses it without executing code, unlike eval().
            params_list = ast.literal_eval(params)

            query = query_data[query_id]['query']

            for i, param in enumerate(params_list):
                replacement = str(param).strip()
                pattern = re.compile(rf"@param{i}\b")
                # A callable replacement inserts the value verbatim; passing
                # the string itself would make re interpret its backslashes
                # as escapes / group references.
                query = pattern.sub(lambda _match: replacement, query)

            results.append({
                'query': query,
                'plan_id': plan_id,
                'plan_content': plan_content
            })

        output_df = pd.DataFrame(results)
        os.makedirs(f'0_dsb_original_plans_{confidence_threshold}', exist_ok=True)
        output_filename = f'0_dsb_original_plans_{confidence_threshold}/{query_id}_workload_{method}_training_size_{training_size}.csv'
        output_df.to_csv(output_filename, index=False)
        print(f"Generated: {output_filename}")

# Run configuration: which query ids, training sizes and confidence thresholds to process.
query_ids = ['091', '099', '100']
# Full list of query ids, for reference:
# '013', '018', '019', '025', '027', '040', '050', '072', '084', '085', '091', '099', '100'
training_sizes = [50]
# Thresholds are strings because process_data embeds them verbatim in directory names.
confidence_thresholds = ["0"]

# Every (query_id, training_size, confidence_threshold) triple, via itertools.product
combinations = list(product(query_ids, training_sizes, confidence_thresholds))

# Display the total number of combinations to be processed
print(f"Processing {len(combinations)} combinations...")

# Process each combination; process_data writes one CSV per method it can find inputs for
for query_id, training_size, confidence_threshold in combinations:
    print(f"\nProcessing query_id={query_id}, training_size={training_size}, confidence_threshold={confidence_threshold}")
    process_data(query_id, training_size, confidence_threshold)

Processing 3 combinations...

Processing query_id=091, training_size=50, confidence_threshold=0
Generated: 0_dsb_original_plans_0/091_workload_cardinality_training_size_50.csv
Generated: 0_dsb_original_plans_0/091_workload_gaussian_training_size_50.csv

Processing query_id=099, training_size=50, confidence_threshold=0
Generated: 0_dsb_original_plans_0/099_workload_cardinality_training_size_50.csv
Generated: 0_dsb_original_plans_0/099_workload_gaussian_training_size_50.csv

Processing query_id=100, training_size=50, confidence_threshold=0
Generated: 0_dsb_original_plans_0/100_workload_cardinality_training_size_50.csv
Generated: 0_dsb_original_plans_0/100_workload_gaussian_training_size_50.csv


### generate mixture plans

In [2]:
import pandas as pd
import json
import re
import os
from itertools import product

def process_data(query_id, training_size):
    """
    Build the per-method mixture workload CSV files of queries and plans.

    For each method ('cardinality', 'gaussian') the first prediction file
    (in sorted name order) matching '<query_id>_*_batch_0.csv' is read from
    '0_mixture_test/<query_id>/<method>/training_<training_size>/confidence_0/predictions/',
    every row's parameters are substituted into the query template of
    '0_mixture_test/<query_id>/<query_id>_mixture_test.json', and the result
    is written to
    '0_mixture_plans/<query_id>_workload_<method>_training_size_<training_size>.csv'
    with the columns 'query', 'plan_id' and 'plan_content'. A method whose
    prediction directory, prediction file or testing JSON is missing is
    reported and skipped.

    Args:
        query_id (str): The ID of the query to process
        training_size (int): The training size to process
    """
    import ast  # local import: safe parser for the stringified parameter lists

    methods = ['cardinality', 'gaussian']

    for method in methods:
        predictions_path = f'0_mixture_test/{query_id}/{method}/training_{training_size}/confidence_0/predictions/'

        # Pick the first matching file; sorting makes the choice reproducible
        # because os.listdir returns entries in arbitrary order.
        predictions_file = None
        try:
            for filename in sorted(os.listdir(predictions_path)):
                if filename.startswith(f'{query_id}_') and filename.endswith('_batch_0.csv'):
                    predictions_file = os.path.join(predictions_path, filename)
                    break
        except FileNotFoundError:
            print(f"Directory not found for {method}, query_id={query_id}, training_size={training_size}")
            continue

        if predictions_file is None:
            print(f"No prediction file found for {method}, query_id={query_id}, training_size={training_size}")
            continue

        predictions_df = pd.read_csv(predictions_file)

        testing_json_path = f'0_mixture_test/{query_id}/{query_id}_mixture_test.json'
        try:
            with open(testing_json_path, 'r') as f:
                query_data = json.load(f)
        except FileNotFoundError:
            print(f"Testing JSON not found for query_id={query_id}")
            continue

        results = []
        rows = zip(predictions_df['params'], predictions_df['plan_id'], predictions_df['plan_content'])
        for params, plan_id, plan_content in rows:
            # The CSV stores the parameter list as its Python literal text.
            # literal_eval parses it without executing code, unlike eval().
            params_list = ast.literal_eval(params)

            query = query_data[query_id]['query']

            for i, param in enumerate(params_list):
                replacement = str(param).strip()
                pattern = re.compile(rf"@param{i}\b")
                # A callable replacement inserts the value verbatim; passing
                # the string itself would make re interpret its backslashes
                # as escapes / group references.
                query = pattern.sub(lambda _match: replacement, query)

            results.append({
                'query': query,
                'plan_id': plan_id,
                'plan_content': plan_content
            })

        output_df = pd.DataFrame(results)
        os.makedirs('0_mixture_plans', exist_ok=True)
        output_filename = f'0_mixture_plans/{query_id}_workload_{method}_training_size_{training_size}.csv'
        output_df.to_csv(output_filename, index=False)
        print(f"Generated: {output_filename}")

# Run configuration: which query ids and training sizes to process.
query_ids = ['091', '099', '100']
# Full list of query ids, for reference:
# '013', '018', '019', '025', '027', '040', '050', '072', '084', '085', '091', '099', '100'
training_sizes = [50]

# Every (query_id, training_size) pair, via itertools.product
combinations = list(product(query_ids, training_sizes))

# Display the total number of combinations to be processed
print(f"Processing {len(combinations)} combinations...")

# Process each combination; process_data writes one CSV per method it can find inputs for
for query_id, training_size in combinations:
    print(f"\nProcessing query_id={query_id}, training_size={training_size}")
    process_data(query_id, training_size)

Processing 3 combinations...

Processing query_id=091, training_size=50
Generated: 0_mixture_plans/091_workload_cardinality_training_size_50.csv
Generated: 0_mixture_plans/091_workload_gaussian_training_size_50.csv

Processing query_id=099, training_size=50
Generated: 0_mixture_plans/099_workload_cardinality_training_size_50.csv
Generated: 0_mixture_plans/099_workload_gaussian_training_size_50.csv

Processing query_id=100, training_size=50
Generated: 0_mixture_plans/100_workload_cardinality_training_size_50.csv
Generated: 0_mixture_plans/100_workload_gaussian_training_size_50.csv


## Others

### original: check the min-max confidence for each query_id

In [3]:
import os
import re
import pandas as pd
from pathlib import Path

def analyze_confidence_ranges(query_id):
    """
    Collect the minimum and maximum prediction confidence per method.

    For each method ('cardinality', 'gaussian') and training size ('50'),
    every file named '<query_id>_*_batch_0.csv' in the method's
    'confidence_0/predictions' directory is read and the values of its
    'confidence' column are pooled. Missing (NaN) confidences are ignored,
    because min()/max() over a list containing NaN depend on element order.

    Args:
        query_id (str): The ID of the query to analyze

    Returns:
        list: One dict per method / training size that had at least one
        confidence value, with the keys 'query_id', 'method',
        'training_size', 'min_confidence' and 'max_confidence'. Missing
        directories, unmatched patterns and unreadable files are reported
        with a printed message and skipped.
    """
    methods = ['cardinality', 'gaussian']
    training_sizes = ['50']
    results = []

    # Raw string so that '\.' is a regex escape rather than an invalid string
    # escape; the query id is escaped so it is always matched literally.
    pattern = re.compile(rf"{re.escape(str(query_id))}_.*_batch_0\.csv$")

    for method in methods:
        for training_size in training_sizes:
            base_path = f'dsb_{query_id}_original/{method}/outputs/evaluation/{query_id}/training_{training_size}/confidence_0/predictions/'

            if not os.path.exists(base_path):
                print(f"Warning: Directory not found: {base_path}")
                continue

            matching_files = [
                os.path.join(base_path, file)
                for file in sorted(os.listdir(base_path))
                if pattern.match(file)
            ]

            if not matching_files:
                print(f"Warning: No matching files found for {method} - training_{training_size}")
                continue

            all_confidences = []
            for file in matching_files:
                # Best effort: an unreadable file is reported and skipped so
                # that the remaining files still contribute.
                try:
                    df = pd.read_csv(file)
                except Exception as e:
                    print(f"Error reading file {file}: {e}")
                    continue

                if 'confidence' in df.columns:
                    all_confidences.extend(df['confidence'].dropna().tolist())
                else:
                    print(f"Warning: No confidence column in {file}")

            if all_confidences:
                results.append({
                    'query_id': query_id,
                    'method': method,
                    'training_size': training_size,
                    'min_confidence': min(all_confidences),
                    'max_confidence': max(all_confidences)
                })

    return results

# Collect the confidence ranges of all query ids and store them in one CSV file.
if __name__ == "__main__":
    query_ids = ['013', '018', '019', '025', '027', '040', '050', '072', '084', '085', '091', '099', '100']
    # Subset kept for reference: '027', '040', '050', '072', '084', '085', '091', '099', '100'
    all_results = []

    # analyze_confidence_ranges returns a list of row dicts per query id
    for query_id in query_ids:
        all_results.extend(analyze_confidence_ranges(query_id))

    results_df = pd.DataFrame(all_results)
    output_dir = '0_original_analysis'
    os.makedirs(output_dir, exist_ok=True)
    output_file = os.path.join(output_dir, 'confidence_range.csv')
    # mode='w' replaces any confidence_range.csv from an earlier run
    results_df.to_csv(output_file, index=False, mode='w')
    print(f"Results saved to {output_file}")

Results saved to 0_original_analysis/confidence_range.csv


### original: training time

In [6]:
import os
import json
import csv
from pathlib import Path

def process_training_times():
    """
    Gather the timing metadata of every 'dsb_*' directory into one CSV file.

    For each entry of the current directory whose name starts with 'dsb_'
    (the query id is the second '_'-separated part of that name) and each
    method ('cardinality', 'gaussian'), four metadata JSON files below
    '<entry>/<method>/outputs' are read. The two '*_seconds' values are
    divided by 60; all values are rounded to two decimals. If any file is
    missing, is not valid JSON, or lacks a key, the error is printed and that
    query/method pair is left out. The rows are written, sorted by query id,
    to '0_original_analysis/training_time.csv'.
    """
    base_dir = "."
    output_dir = "0_original_analysis"
    os.makedirs(output_dir, exist_ok=True)

    def read_metadata(path):
        with open(path) as handle:
            return json.load(handle)

    results = []
    for dir_name in os.listdir(base_dir):
        if not dir_name.startswith("dsb_"):
            continue

        query_id = dir_name.split("_")[1]

        for method in ['cardinality', 'gaussian']:
            outputs = Path(base_dir) / dir_name / method / "outputs"

            try:
                # get candidate plan generation time
                candidate_data = read_metadata(outputs / "hints" / query_id / "training_50" / "candidate_metadata.json")
                candidate_time = candidate_data["candidate_plan_generation_time_seconds"] / 60

                # get training data generation time
                training_data = read_metadata(outputs / "results" / query_id / "training_50" / "metadata.json")
                training_time = training_data["training_data_all_time_seconds"] / 60

                # get model prediction time
                eval_data = read_metadata(outputs / "evaluation" / query_id / "training_50" / "confidence_0" / "metadata.json")
                prediction_time = eval_data["model_prediction_time"]

                # get training & testing time
                model_data = read_metadata(outputs / "evaluation_single" / query_id / "training_50" / "confidence_0" / "metadata.json")
                model_train_time = model_data["model_train_time"]
                model_test_time = model_data["model_predict_time (already * 200)"]

                results.append({
                    "query_id": query_id,
                    "method": method,
                    "candidate_time": round(candidate_time, 2),
                    "training_time": round(training_time, 2),
                    "prediction_time": round(prediction_time, 2),
                    "model_train_time": round(model_train_time, 2),
                    "model_predict_time": round(model_test_time, 2)
                })
            except (FileNotFoundError, json.JSONDecodeError, KeyError) as e:
                print(f"ERROR: {e}")

    # CSV column header -> key of the collected row
    columns = [
        ("query_id", "query_id"),
        ("method", "method"),
        ("candidate plan generation time (mins)", "candidate_time"),
        ("robust plan set generation (mins)", "training_time"),
        ("model prediction time (ms)", "prediction_time"),
        ("model train time (ms)", "model_train_time"),
        ("model predict time (ms)", "model_predict_time"),
    ]

    # training_time.csv
    output_file = Path(output_dir) / "training_time.csv"
    with open(output_file, 'w', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=[header for header, _ in columns])
        writer.writeheader()
        for row in sorted(results, key=lambda x: (x["query_id"])):
            writer.writerow({header: row[key] for header, key in columns})

if __name__ == "__main__":
    process_training_times()

### original: robust plan count

In [4]:
import json
import os
import pandas as pd
from pathlib import Path

def process_json_files(query_ids, methods, training_sizes):
    """
    Record the length of 'plan_cover' for every query / method / training size.

    Each combination is looked up in
    'dsb_<query_id>_original/<method>/outputs/results/<query_id>/training_<training_size>/execution_output/dsb_<query_id>_metadata.json'.
    Missing files and files that cannot be processed are reported with a
    printed message and skipped. If at least one file was processed, the rows
    are written to '0_original_analysis/plan_cover_lengths.csv'; otherwise
    only a message is printed.
    """
    results = []

    for query_id in query_ids:
        for method in methods:
            for training_size in training_sizes:
                file_path = f"dsb_{query_id}_original/{method}/outputs/results/{query_id}/training_{training_size}/execution_output/dsb_{query_id}_metadata.json"

                if not os.path.exists(file_path):
                    print(f"File not found: {file_path}")
                    continue

                try:
                    with open(file_path, 'r') as f:
                        data = json.load(f)

                    results.append({
                        'query-id': query_id,
                        'method': method,
                        'training_size': training_size,
                        'plan_cover_length': len(data[str(query_id)]["plan_cover"])
                    })
                except (json.JSONDecodeError, KeyError, TypeError) as e:
                    print(f"Error processing file {file_path}: {str(e)}")

    if not results:
        print("No valid files were found to process")
        return

    # Create DataFrame and save to CSV
    output_dir = "0_original_analysis"
    os.makedirs(output_dir, exist_ok=True)

    output_file = f"{output_dir}/plan_cover_lengths.csv"
    pd.DataFrame(results).to_csv(output_file, index=False)
    print(f"Results saved to {output_file}")

# Configuration: the query ids, methods and training sizes whose metadata files are inspected
query_ids = ['013', '018', '019', '025', '027', '040', '050', '072', '084', '085', '091', '099', '100']
# Subset kept for reference: '027', '040', '050', '072', '084', '085', '091', '099', '100'
methods = ['cardinality', 'gaussian']
training_sizes = [50]

# Writes 0_original_analysis/plan_cover_lengths.csv when at least one metadata file is processed
process_json_files(query_ids, methods, training_sizes)

Results saved to 0_original_analysis/plan_cover_lengths.csv


### original & mixture unique plan count

In [5]:
import os
import pandas as pd
import json
from collections import Counter, defaultdict

def analyze_workload_files(directory):
    """
    Count how often each plan id occurs in every workload CSV of a directory.

    Only files whose name contains 'workload' and ends with
    'training_size_50.csv' are read. The query id and the method are the
    first and third '_'-separated parts of the file name.

    Returns:
        dict: query id -> method -> {plan_id: number of rows}.
    """
    results = defaultdict(lambda: defaultdict(dict))

    for filename in os.listdir(directory):
        is_workload = 'workload' in filename and filename.endswith('training_size_50.csv')
        if not is_workload:
            continue

        name_parts = filename.split('_')
        query_id, method = name_parts[0], name_parts[2]

        plan_ids = pd.read_csv(os.path.join(directory, filename))['plan_id']
        results[query_id][method] = dict(Counter(plan_ids))

    return dict(results)

def main():
    """Write plan-count statistics for the mixture and original workloads.

    For each input directory, ``analyze_workload_files`` is run and its result
    is dumped as indented JSON into ``0_original_analysis``.
    """
    output_dir = '0_original_analysis'
    # Opening a file for writing fails if its folder is missing, so create the
    # output folder up front (no-op when it already exists).
    os.makedirs(output_dir, exist_ok=True)

    jobs = [
        # (input directory, output file name)
        ('0_mixture_plans', 'mixture_plan_statistics.json'),          # mixture plans
        ('0_dsb_original_plans_0', 'original_plan_statistics.json'),  # original plans
    ]

    for input_directory, output_name in jobs:
        result_dict = analyze_workload_files(input_directory)
        output_file = f'{output_dir}/{output_name}'

        with open(output_file, 'w') as f:
            json.dump(result_dict, f, indent=2)


if __name__ == "__main__":
    main()

### original: generate a single-param testing file

In [6]:
import json
import os
import shutil

def process_json_file(query_id, method, base_path="."):
    """Write a copy of a testing-params JSON that keeps only the first param set.

    Reads ``<base_path>/dsb_<query_id>_original/<method>/inputs/testing/
    <query_id>_testing_original.json`` and writes
    ``<query_id>_testing_original_single.json`` next to it. In the copy,
    ``data[query_id]["params"]`` is reduced to a one-element list holding the
    first entry; an empty ``params`` list is left as is. Everything else in
    the JSON document is preserved.

    Args:
        query_id: Query identifier; used in the paths and as the top-level key
            of the JSON document.
        method: Sub-directory name under the query folder.
        base_path: Root directory containing the ``dsb_*_original`` folders.
            Defaults to the current working directory.

    Returns:
        True if the output file was written, False if the input was missing,
        unreadable, or did not have the expected structure (the reason is
        printed).
    """
    testing_dir = os.path.join(
        base_path,
        f"dsb_{query_id}_original",
        method,
        "inputs",
        "testing",
    )
    input_path = os.path.join(testing_dir, f"{query_id}_testing_original.json")
    output_path = os.path.join(testing_dir, f"{query_id}_testing_original_single.json")

    try:
        if not os.path.exists(input_path):
            raise FileNotFoundError(f"Input file not found: {input_path}")

        # read original file
        with open(input_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        if query_id not in data:
            raise KeyError(f"Query ID {query_id} not found in the JSON data")

        if "params" not in data[query_id]:
            raise KeyError(f"'params' not found in data[{query_id}]")

        # only keep the first one
        params = data[query_id]["params"]
        if len(params) > 0:
            data[query_id]["params"] = [params[0]]

        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=4, ensure_ascii=False)

        print(f"Successfully created: {output_path}")
        return True

    # OSError: file-system problems; ValueError: malformed JSON or bad
    # encoding; KeyError/TypeError: document not shaped as expected.
    except (OSError, ValueError, KeyError, TypeError) as e:
        # Name the file so a failure inside a batch run can be traced.
        print(f"Error processing file {input_path}: {str(e)}")
        return False

def main():
    """Build the single-param testing file for every query/method pair.

    Calls ``process_json_file`` once per pair and prints a status line that
    reflects the boolean it returned.
    """
    all_query_ids = ('013', '018', '019', '025', '027', '040', '050',
                     '072', '084', '085', '091', '099', '100')
    all_methods = ('cardinality', 'gaussian')

    for qid in all_query_ids:
        for approach in all_methods:
            succeeded = process_json_file(qid, approach)
            status = "Processing completed successfully" if succeeded else "Processing failed"
            print(status)


if __name__ == "__main__":
    main()

Successfully created: ./dsb_013_original/cardinality/inputs/testing/013_testing_original_single.json
Processing completed successfully
Successfully created: ./dsb_013_original/gaussian/inputs/testing/013_testing_original_single.json
Processing completed successfully
Successfully created: ./dsb_018_original/cardinality/inputs/testing/018_testing_original_single.json
Processing completed successfully
Successfully created: ./dsb_018_original/gaussian/inputs/testing/018_testing_original_single.json
Processing completed successfully
Successfully created: ./dsb_019_original/cardinality/inputs/testing/019_testing_original_single.json
Processing completed successfully
Successfully created: ./dsb_019_original/gaussian/inputs/testing/019_testing_original_single.json
Processing completed successfully
Successfully created: ./dsb_025_original/cardinality/inputs/testing/025_testing_original_single.json
Processing completed successfully
Successfully created: ./dsb_025_original/gaussian/inputs/testing

### Gaussian template modification: find the (min, max) value of each integer param

In [1]:
import json
def _integer_param_ranges(param_combinations):
    """Return min/max per position for the positions that hold integers.

    A position counts as integer-valued when ``int()`` accepts the value found
    there in the first combination. Values at that position in the other
    combinations that cannot be converted (or are missing) are skipped.

    Returns:
        A list of ``{'position', 'min_value', 'max_value'}`` dicts in
        position order; empty when ``param_combinations`` is empty.
    """
    if not param_combinations:
        return []

    ranges = []
    for idx, element in enumerate(param_combinations[0]):
        try:
            int(element)
        except (ValueError, TypeError):
            # ValueError: non-numeric string; TypeError: None, list, dict, ...
            continue

        values = []
        for combination in param_combinations:
            try:
                values.append(int(combination[idx]))
            except (ValueError, TypeError, IndexError):
                continue

        if values:
            ranges.append({
                'position': idx,
                'min_value': min(values),
                'max_value': max(values)
            })
    return ranges


def analyze_param_combinations(PQO_gaussian_params):
    """Print the min and max of every integer parameter position per query.

    Args:
        PQO_gaussian_params: Mapping of query id to a dict whose ``'train'``
            and ``'test'`` entries are lists of parameter combinations. A
            missing or empty entry is treated as having no combinations.

    Returns:
        None. The ranges are printed, grouped by query id and by train/test.
    """
    results = {}

    # all_query_id
    for query_id, data in PQO_gaussian_params.items():
        results[query_id] = {
            data_type: _integer_param_ranges(data.get(data_type) or [])
            for data_type in ('train', 'test')
        }

    # print result
    for query_id, data in results.items():
        print(f"\nQuery ID: {query_id}")

        for data_type in ('train', 'test'):
            if data[data_type]:
                print(f"  {data_type.capitalize()}:")
                for result in data[data_type]:
                    print(f"    Position {result['position']}: "
                          f"min = {result['min_value']}, "
                          f"max = {result['max_value']}")


# Load the parameter sets from PQO_gaussian_params.json (looked up relative to
# the current working directory) and print the per-position integer ranges.
with open('PQO_gaussian_params.json', 'r') as f:
    PQO_gaussian_params = json.load(f)
analyze_param_combinations(PQO_gaussian_params)


Query ID: 013

Query ID: 018
  Train:
    Position 2: min = 1998, max = 2002
    Position 3: min = 1, max = 12
    Position 5: min = 4, max = 95
    Position 6: min = 9, max = 100
  Test:
    Position 2: min = 1998, max = 2002
    Position 3: min = 1, max = 12
    Position 5: min = 0, max = 100
    Position 6: min = 5, max = 105

Query ID: 019
  Train:
    Position 1: min = 1998, max = 2002
    Position 2: min = 1, max = 12
    Position 4: min = 1, max = 12
    Position 5: min = 2, max = 80
    Position 6: min = 22, max = 100
  Test:
    Position 1: min = 1998, max = 2002
    Position 2: min = 1, max = 12
    Position 4: min = 1, max = 12
    Position 5: min = 1, max = 80
    Position 6: min = 21, max = 100

Query ID: 025
  Train:
    Position 0: min = 1, max = 10
    Position 1: min = 1998, max = 2002
    Position 2: min = 1, max = 10
    Position 3: min = 1998, max = 2002
    Position 4: min = 1, max = 10
    Position 5: min = 1998, max = 2002
  Test:
    Position 0: min = 1, max = 

In [17]:
# mixture
import json
import os

def _integer_param_ranges(param_combinations):
    """Return min/max per position for the positions that hold integers.

    A position counts as integer-valued when ``int()`` accepts the value found
    there in the first combination. Values at that position in the other
    combinations that cannot be converted (or are missing) are skipped.

    Returns:
        A list of ``{'position', 'min_value', 'max_value'}`` dicts in
        position order; empty when ``param_combinations`` is empty.
    """
    if not param_combinations:
        return []

    ranges = []
    for idx, element in enumerate(param_combinations[0]):
        try:
            int(element)
        except (ValueError, TypeError):
            # ValueError: non-numeric string; TypeError: None, list, dict, ...
            continue

        values = []
        for combination in param_combinations:
            try:
                values.append(int(combination[idx]))
            except (ValueError, TypeError, IndexError):
                continue

        if values:
            ranges.append({
                'position': idx,
                'min_value': min(values),
                'max_value': max(values)
            })
    return ranges


def analyze_param_combinations(base_dir):
    """Print the min and max of every integer parameter position per query.

    Every entry ``<query_id>`` of ``base_dir`` that contains a file
    ``<query_id>/<query_id>_mixture_test.json`` is analysed; other entries
    are ignored. The parameter combinations are read from
    ``data[query_id]["params"]`` (a missing ``"params"`` is treated as empty).

    Args:
        base_dir: Directory whose sub-directories are named after query ids.

    Returns:
        None. The ranges are printed, ordered by query id.

    Raises:
        KeyError: If a JSON file has no top-level entry for its query id.
    """
    results = {}

    # Walk through all query_id directories
    for query_id in os.listdir(base_dir):
        file_path = os.path.join(base_dir, query_id, f"{query_id}_mixture_test.json")

        if not os.path.exists(file_path):
            continue

        with open(file_path, 'r') as f:
            data = json.load(f)

        # Get params directly
        param_combinations = data[query_id].get("params", [])
        results[query_id] = _integer_param_ranges(param_combinations)

    # print result
    for query_id in sorted(results):
        print(f"\nQuery ID: {query_id}")
        for result in results[query_id]:
            print(f"    Position {result['position']}: "
                  f"min = {result['min_value']}, "
                  f"max = {result['max_value']}")

# Usage: print the integer parameter ranges for every query folder found
# under 0_mixture_test.
analyze_param_combinations("0_mixture_test")


Query ID: 013

Query ID: 018
    Position 2: min = 1998, max = 2003
    Position 3: min = 1, max = 12
    Position 5: min = 1, max = 100
    Position 6: min = 6, max = 100

Query ID: 019
    Position 1: min = 1998, max = 2003
    Position 2: min = 1, max = 12
    Position 4: min = 1, max = 12
    Position 5: min = 1, max = 94
    Position 6: min = 16, max = 100

Query ID: 025
    Position 0: min = 1, max = 12
    Position 1: min = 1998, max = 2003
    Position 2: min = 1, max = 10
    Position 3: min = 1998, max = 2004
    Position 4: min = 1, max = 12
    Position 5: min = 1998, max = 2003

Query ID: 027
    Position 3: min = 1998, max = 2003

Query ID: 040
    Position 2: min = 1, max = 77
    Position 3: min = 23, max = 100
    Position 4: min = 1, max = 100
    Position 5: min = 11, max = 119
    Position 6: min = 1, max = 36

Query ID: 050
    Position 0: min = 1, max = 12
    Position 1: min = 0, max = 6

Query ID: 072
    Position 1: min = 1998, max = 2002
    Position 3: min =