### Check that workload queries return non-zero rows (validity check)

#### Check that the original workload is valid

In [1]:
import os
import pandas as pd
from glob import glob
import csv

def check_zero_row_results(base_path):
    """
    Scan the original-workload result CSVs for executions that returned zero rows.

    Every sub-directory of ``<base_path>/0_original_testing_time_check`` is
    treated as a query ID. For each query the files
    ``<method>_<phase>_result.csv`` (method in cardinality/csv/kepler, phase in
    train/test) are read, and the rows whose ``Row Count`` column equals 0 are
    counted. Files with at least one such row are listed in ``zero_return.csv``
    inside the check directory and the affected query IDs are printed.

    Args:
        base_path: Directory that contains ``0_original_testing_time_check``.

    Returns:
        None. Findings are reported on stdout and, when there are any, in
        ``zero_return.csv``.
    """
    methods = ['cardinality', 'csv', 'kepler']
    phases = ['train', 'test']
    columns_to_check = ['Row Count']
    zero_rows = []

    query_dirs = glob(os.path.join(base_path, "0_original_testing_time_check", "*"))

    for query_dir in sorted(query_dirs):
        # The glob also returns plain files (for example the zero_return.csv
        # written by an earlier run); only directories are query folders.
        if not os.path.isdir(query_dir):
            continue
        query_id = os.path.basename(query_dir)

        for method in methods:
            for phase in phases:
                file_path = os.path.join(
                    base_path,
                    "0_original_testing_time_check",
                    query_id,
                    f"{method}_{phase}_result.csv"
                )
                if not os.path.exists(file_path):
                    continue

                try:
                    df = pd.read_csv(file_path)
                    zero_counts = {}
                    for col in columns_to_check:
                        if col in df.columns:
                            zero_counts[col] = int((df[col] == 0).sum())
                        else:
                            # Without the column the file cannot be validated;
                            # say so instead of silently treating it as valid.
                            print(f"Warning: column '{col}' not found in {file_path}; file cannot be validated")

                    if any(count > 0 for count in zero_counts.values()):
                        zero_rows.append({
                            'query_id': query_id,
                            'method': method,
                            'phase': phase,
                            'file_path': file_path,
                            'Row_Count_Zeros': zero_counts.get('Row Count', 0)
                        })
                except Exception as e:
                    # Best effort: one unreadable file must not abort the scan.
                    print(f"Error reading file {file_path}: {str(e)}")

    if zero_rows:
        output_file = os.path.join(base_path, "0_original_testing_time_check", "zero_return.csv")
        with open(output_file, 'w', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=[
                'query_id', 'method',
                'phase', 'file_path', 'Row_Count_Zeros'
            ])
            writer.writeheader()
            writer.writerows(zero_rows)
        print(f"Found {len(zero_rows)} files with zero rows. Results saved to {output_file}")

        unique_query_ids = sorted(set(row['query_id'] for row in zero_rows))
        print("ERROR query IDs:", unique_query_ids)
    else:
        print("No files with zero rows found.")

def main():
    """Run the original-workload zero-row check relative to the current directory."""
    check_zero_row_results(".")


if __name__ == "__main__":
    main()

No files with zero rows found.


#### Check that the sample workload is valid

In [2]:
import os
import pandas as pd
from glob import glob
import csv

def check_zero_row_results(base_path):
    """
    Scan the sample-workload result CSVs for executions that returned zero rows.

    Every sub-directory of ``<base_path>/0_sample_testing_time_check`` is
    treated as a query ID. For each query the files
    ``<robustness>_<method>_db<i>_<phase>_result.csv`` are read (robustness in
    category/random/sliding, method in cardinality/csv/kepler, i in 1/4, phase
    in train/test), and the rows whose ``Row Count`` column equals 0 are
    counted. Files with at least one such row are listed in ``zero_return.csv``
    inside the check directory and the affected query IDs are printed.

    Note: this notebook defines a function with the same name for the original
    workload in an earlier cell; whichever cell ran last wins in the kernel.

    Args:
        base_path: Directory that contains ``0_sample_testing_time_check``.

    Returns:
        None. Findings are reported on stdout and, when there are any, in
        ``zero_return.csv``.
    """
    robustness_methods = ['category', 'random', 'sliding']
    methods = ['cardinality', 'csv', 'kepler']
    instances = [1, 4]
    phases = ['train', 'test']
    columns_to_check = ['Row Count']
    zero_rows = []

    query_dirs = glob(os.path.join(base_path, "0_sample_testing_time_check", "*"))

    for query_dir in sorted(query_dirs):
        # The glob also returns plain files (for example the zero_return.csv
        # written by an earlier run); only directories are query folders.
        if not os.path.isdir(query_dir):
            continue
        query_id = os.path.basename(query_dir)

        for rob in robustness_methods:
            for method in methods:
                for i in instances:
                    for phase in phases:
                        file_path = os.path.join(
                            base_path,
                            "0_sample_testing_time_check",
                            query_id,
                            f"{rob}_{method}_db{i}_{phase}_result.csv"
                        )
                        if not os.path.exists(file_path):
                            continue

                        try:
                            df = pd.read_csv(file_path)
                            zero_counts = {}
                            for col in columns_to_check:
                                if col in df.columns:
                                    zero_counts[col] = int((df[col] == 0).sum())
                                else:
                                    # Without the column the file cannot be
                                    # validated; report it instead of passing it.
                                    print(f"Warning: column '{col}' not found in {file_path}; file cannot be validated")

                            if any(count > 0 for count in zero_counts.values()):
                                zero_rows.append({
                                    'query_id': query_id,
                                    'robustness': rob,
                                    'method': method,
                                    'db_instance': i,
                                    'phase': phase,
                                    'file_path': file_path,
                                    'Row_Count_Zeros': zero_counts.get('Row Count', 0)
                                })
                        except Exception as e:
                            # Best effort: one unreadable file must not abort the scan.
                            print(f"Error reading file {file_path}: {str(e)}")

    if zero_rows:
        output_file = os.path.join(base_path, "0_sample_testing_time_check", "zero_return.csv")
        with open(output_file, 'w', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=[
                'query_id', 'robustness', 'method', 'db_instance',
                'phase', 'file_path', 'Row_Count_Zeros'
            ])
            writer.writeheader()
            writer.writerows(zero_rows)
        print(f"Found {len(zero_rows)} files with zero rows. Results saved to {output_file}")

        unique_query_ids = sorted(set(row['query_id'] for row in zero_rows))
        print("ERROR query IDs:", unique_query_ids)
    else:
        print("No files with zero rows found.")

def main():
    """Run the sample-workload zero-row check relative to the current directory."""
    check_zero_row_results(".")


if __name__ == "__main__":
    main()

No files with zero rows found.


#### Check sample workload generation

In [3]:
import os
import pandas as pd
from glob import glob
import csv
from collections import defaultdict

def find_all_query_ids(base_path):
    """
    Automatically detect all query IDs from the directory structure.

    A query ID is the part of a directory name between the ``imdb_`` prefix and
    the ``_sample`` suffix, for directories located in
    ``<base_path>/0_sample_repo``. Taking everything between prefix and suffix
    (instead of the second ``_``-separated token) keeps IDs that themselves
    contain underscores intact, which matches how ``process_all_files``
    rebuilds the path as ``imdb_{query_id}_sample``.

    Args:
        base_path: Base directory path where all sample directories are located

    Returns:
        set: Set of unique query IDs found in the directory structure
    """
    prefix, suffix = "imdb_", "_sample"
    pattern = os.path.join(base_path, "0_sample_repo", f"{prefix}*{suffix}")

    query_ids = set()
    for dir_path in glob(pattern):
        # Only directories can hold a sample repository; ignore stray files.
        if not os.path.isdir(dir_path):
            continue
        dir_name = os.path.basename(dir_path)
        query_id = dir_name[len(prefix):-len(suffix)]
        if not query_id:
            print(f"Warning: Could not extract query ID from directory: {dir_path}")
            continue
        query_ids.add(query_id)

    return query_ids

def extract_params_from_csv(file_path, param_keys):
    """
    Look up a set of parameters in a ``Key``/``Value`` CSV file.

    Args:
        file_path: Path to the CSV file; it is read with ``pandas.read_csv``
            and must provide ``Key`` and ``Value`` columns.
        param_keys: Iterable of parameter names to look up in ``Key``.

    Returns:
        dict mapping each requested name to the ``Value`` of its first matching
        row (``None`` when no row matches), or ``None`` if reading or lookup
        raised; the error is printed in that case.
    """
    try:
        table = pd.read_csv(file_path)
        extracted = {}
        for key in param_keys:
            matches = table.loc[table['Key'] == key, 'Value']
            if matches.empty:
                extracted[key] = None
            else:
                # Only the first occurrence of a key is used.
                extracted[key] = matches.iloc[0]
        return extracted
    except Exception as e:
        print(f"Error processing file {file_path}: {str(e)}")
        return None

def process_all_files(base_path, query_id):
    """
    Collect parameter counts for one query across all sample configurations.

    For every combination of robustness method (category/random/sliding),
    database instance (1/4) and method (cardinality/csv/kepler) the metadata
    file
    ``0_sample_repo/imdb_<query_id>_sample/<robustness>/db_instance_<i>/<method>/inputs/metadata/<query_id>.csv``
    is looked up below ``base_path``. Existing files are parsed with
    ``extract_params_from_csv``; absent ones are recorded as missing.

    Args:
        base_path: Base directory path
        query_id: Query ID to match in file paths

    Returns:
        tuple: (list of found file results, list of missing file records)
    """
    params_to_extract = [
        'original_testing_params',
        'original_50_training_params',
        'distinct_testing_params',
        'distinct_50_training_params'
    ]
    robustness_methods = ['category', 'random', 'sliding']
    instance_numbers = [1, 4]
    # 'cardinality_full' is currently left out of the checked methods.
    methods = ['cardinality', 'csv', 'kepler']

    found, missing = [], []

    for rob_method in robustness_methods:
        for instance in instance_numbers:
            for method in methods:
                metadata_csv = os.path.join(
                    base_path,
                    "0_sample_repo",
                    f"imdb_{query_id}_sample",
                    rob_method,
                    f"db_instance_{instance}",
                    method,
                    "inputs",
                    "metadata",
                    f"{query_id}.csv"
                )

                if not os.path.exists(metadata_csv):
                    missing.append({
                        'Query ID': query_id,
                        'Robustness Method': rob_method,
                        'Instance': instance,
                        'Method': method
                    })
                    continue

                try:
                    params = extract_params_from_csv(metadata_csv, params_to_extract)
                    # A falsy result (None on read failure) produces no record.
                    if params:
                        found.append({
                            'query_id': query_id,
                            'robustness_method': rob_method,
                            'instance': instance,
                            'method': method,
                            **params
                        })
                except Exception as e:
                    print(f"Error processing file {metadata_csv}: {str(e)}")

    return found, missing

def save_results_to_csv(all_results, output_file):
    """
    Write the collected parameter records to one CSV file.

    Args:
        all_results: Iterable of dicts with the keys ``query_id``,
            ``robustness_method``, ``instance`` and ``method``; the four
            ``*_params`` keys are optional and default to an empty cell.
        output_file: Path of the CSV file to (over)write.
    """
    identity_columns = {
        'Query ID': 'query_id',
        'Robustness Method': 'robustness_method',
        'Instance': 'instance',
        'Method': 'method',
    }
    param_columns = [
        'original_testing_params',
        'original_50_training_params',
        'distinct_testing_params',
        'distinct_50_training_params',
    ]
    header = list(identity_columns) + param_columns

    with open(output_file, 'w', newline='') as handle:
        out = csv.DictWriter(handle, fieldnames=header, extrasaction='ignore')
        out.writeheader()

        for record in all_results:
            # Identity fields are mandatory (KeyError if absent) ...
            row = {column: record[key] for column, key in identity_columns.items()}
            # ... while parameter fields fall back to an empty string.
            for column in param_columns:
                row[column] = record.get(column, '')
            out.writerow(row)

def save_missing_files_to_csv(missing_files, output_file):
    """
    Write the records of missing metadata files to a CSV file.

    Args:
        missing_files: Iterable of dicts keyed by ``Query ID``,
            ``Robustness Method``, ``Instance`` and ``Method``
        output_file: Path to the output CSV file (overwritten)
    """
    columns = ['Query ID', 'Robustness Method', 'Instance', 'Method']

    with open(output_file, 'w', newline='') as handle:
        out = csv.DictWriter(handle, fieldnames=columns)
        out.writeheader()
        for entry in missing_files:
            out.writerow(entry)

def main():
    """
    Process every detected query ID and write the summary CSV files.

    Writes ``all_params_results.csv`` and ``missing_files.csv`` into
    ``0_sample_repo``. Rows of the results file whose
    ``original_50_training_params`` is below 50 are additionally saved to
    ``missing_50_params.csv``. Finally prints the query IDs for which no
    metadata file was missing.
    """
    base_path = "."
    results_output_file = "0_sample_repo/all_params_results.csv"
    missing_files_output = "0_sample_repo/missing_files.csv"

    query_ids = find_all_query_ids(base_path)
    ordered_ids = sorted(query_ids)
    print(f"Found {len(query_ids)} query IDs: {ordered_ids}")

    all_results, all_missing_files = [], []
    for query_id in ordered_ids:
        found, missing = process_all_files(base_path, query_id)
        all_results += found
        all_missing_files += missing

    # Persist both summaries before analysing them.
    save_results_to_csv(all_results, results_output_file)
    save_missing_files_to_csv(all_missing_files, missing_files_output)
    print(f"Results saved to {results_output_file}")
    print(f"Missing files information saved to {missing_files_output}")

    try:
        # The results are read back from disk, so the comparison below runs on
        # the dtypes pandas infers from the CSV.
        df = pd.read_csv(results_output_file)
        missing_50 = df[df['original_50_training_params'] < 50]

        if len(missing_50):
            missing_50_output = "0_sample_repo/missing_50_params.csv"
            missing_50.to_csv(missing_50_output, index=False)
            print(f"Found {len(missing_50)} rows with missing 50 training parameters, saved to {missing_50_output}")
    except Exception as e:
        print(f"Error while analyzing CSV: {str(e)}")

    missing_query_ids = {record['Query ID'] for record in all_missing_files}
    non_missing_query_ids = sorted(
        query_id for query_id in query_ids if query_id not in missing_query_ids
    )
    print(f"{len(non_missing_query_ids)} Non-missing query_ids: {non_missing_query_ids}")


if __name__ == "__main__":
    main()

Found 32 query IDs: ['1-0', '10-0', '11-0', '12-0', '13-0', '14-0', '15-0', '16-0', '17-0', '18-0', '19-0', '2-0', '20-0', '21-0', '22-0', '23-0', '24-0', '25-0', '26-0', '27-0', '28-0', '3-0', '30-0', '31-0', '32-0', '33-0', '4-0', '5-0', '6-0', '7-0', '8-0', '9-0']
Results saved to 0_sample_repo/all_params_results.csv
Missing files information saved to 0_sample_repo/missing_files.csv
Found 11 rows with missing 50 training parameters, saved to 0_sample_repo/missing_50_params.csv
31 Non-missing query_ids: ['1-0', '10-0', '11-0', '12-0', '13-0', '14-0', '15-0', '17-0', '18-0', '19-0', '2-0', '20-0', '21-0', '22-0', '23-0', '24-0', '25-0', '26-0', '27-0', '28-0', '3-0', '30-0', '31-0', '32-0', '33-0', '4-0', '5-0', '6-0', '7-0', '8-0', '9-0']


#### check sample training

In [4]:
import os

# Query IDs "1-0" .. "33-0", leaving out 16-0 and 29-0.
query_ids = [f"{i}-0" for i in range(1, 34) if i not in {16, 29}]
# robustness_types = ['category', 'sliding', 'random', 'cardinality_full']
robustness_types = ['category', 'sliding', 'random']
instance_ids = [1, 4]
methods = ['cardinality', 'csv', 'kepler']

# List every (query, robustness, db instance, method) combination whose
# training_50 candidate_metadata.json is absent under outputs/hints.
# The header is printed even when nothing is missing.
print("Missing")
for query_id in query_ids:
    for robustness in robustness_types:
        for i in instance_ids:
            for method in methods:
                file_path = f"0_sample_repo/imdb_{query_id}_sample/{robustness}/db_instance_{i}/{method}/outputs/hints/{query_id}/training_50/candidate_metadata.json"
                exists = os.path.exists(file_path)
                if not exists:
                    print(f"{query_id} {robustness} db{i} {method}")

Missing
28-0 sliding db1 cardinality
28-0 sliding db1 csv
28-0 sliding db1 kepler
28-0 sliding db4 cardinality
28-0 sliding db4 csv
28-0 sliding db4 kepler


#### check sample robust plan

Empty results:
1. 13-0,category,4,csv
2. 14-0,category,4,csv
3. 15-0,sliding,1,csv
4. 16-0,category,1,csv
5. 16-0,category,4,csv
6. 19-0,sliding,1,csv
7. 20-0,category,4,csv
8. 23-0,sliding,1,csv
9. 24-0,sliding,1,csv
10. 26-0,category,4,csv
11. 33-0,category,4,csv

In [5]:
import os

# Hand-picked subset of query IDs to check in this cell.
query_ids = ['17-0', '14-0', '3-0', '4-0', '30-0', '19-0']
robustness_types = ['category', 'random', 'sliding']
instance_ids = [1, 4]
methods = ['cardinality', 'csv', 'kepler']

# List every (query, robustness, db instance, method) combination whose
# training_50 metadata.json is absent under outputs/results.
# The header is printed even when nothing is missing.
print("Missing")
for query_id in query_ids:
    for robustness in robustness_types:
        for i in instance_ids:
            for method in methods:
                file_path = f"0_sample_repo/imdb_{query_id}_sample/{robustness}/db_instance_{i}/{method}/outputs/results/{query_id}/training_50/metadata.json"
                exists = os.path.exists(file_path)
                if not exists:
                    print(f"{query_id} {robustness} db{i} {method}")

Missing
14-0 category db4 csv
19-0 category db1 cardinality
19-0 category db1 csv
19-0 category db1 kepler
19-0 category db4 cardinality
19-0 category db4 csv
19-0 category db4 kepler
19-0 random db1 cardinality
19-0 random db1 csv
19-0 random db1 kepler
19-0 random db4 cardinality
19-0 random db4 csv
19-0 random db4 kepler
19-0 sliding db1 cardinality
19-0 sliding db1 csv
19-0 sliding db1 kepler
19-0 sliding db4 cardinality
19-0 sliding db4 csv
19-0 sliding db4 kepler


#### Robust plan statistics (robust plan set size and test-set plan count)

In [5]:
import json
import os
import pandas as pd
from pathlib import Path

def process_json_files(query_ids, methods, training_sizes, robustness_types, instance_ids):
    """
    Record the size of the ``plan_cover`` list for every sample configuration.

    For each combination of robustness type, database instance, query, method
    and training size the file
    ``.../execution_output/imdbloadbase_<query_id>_metadata.json`` below
    ``0_sample_repo`` is loaded and ``len(data[query_id]["plan_cover"])`` is
    stored. Missing files and files that cannot be parsed (invalid JSON,
    missing key, wrong type) are reported on stdout and skipped.

    If at least one file was processed, the table is written to
    ``0_sample_analysis/plan_cover_lengths.csv``.

    Note: a later cell of this notebook defines another ``process_json_files``
    with a three-argument signature; re-run this cell before calling it.

    Args:
        query_ids: Query IDs to inspect.
        methods: Method directory names.
        training_sizes: Training sizes (used as ``training_<size>``).
        robustness_types: Robustness directory names.
        instance_ids: Database instance numbers (used as ``db_instance_<i>``).
    """
    rows = []

    for robustness in robustness_types:
        for db_i in instance_ids:
            for query_id in query_ids:
                for method in methods:
                    for training_size in training_sizes:
                        file_path = f"0_sample_repo/imdb_{query_id}_sample/{robustness}/db_instance_{db_i}/{method}/outputs/results/{query_id}/training_{training_size}/execution_output/imdbloadbase_{query_id}_metadata.json"

                        if not os.path.exists(file_path):
                            print(f"File not found: {file_path}")
                            continue

                        try:
                            with open(file_path, 'r') as f:
                                data = json.load(f)

                            plan_cover = data[str(query_id)]["plan_cover"]
                            rows.append({
                                'query-id': query_id,
                                "robustness": robustness,
                                "db_instance": db_i,
                                'method': method,
                                'training_size': training_size,
                                'plan_cover_length': len(plan_cover)
                            })
                        except (json.JSONDecodeError, KeyError, TypeError) as e:
                            print(f"Error processing file {file_path}: {str(e)}")

    if not rows:
        print("No valid files were found to process")
        return

    # One row per successfully parsed metadata file.
    df = pd.DataFrame(rows)
    output_dir = "0_sample_analysis"
    os.makedirs(output_dir, exist_ok=True)

    output_file = f"{output_dir}/plan_cover_lengths.csv"
    df.to_csv(output_file, index=False)
    print(f"Results saved to {output_file}")

# Configuration: the query subset and the sample dimensions to scan.
# These are module-level names, so they stay visible to later cells.
query_ids = ['17-0', '14-0', '3-0', '4-0']
robustness_types = ['category', 'random', 'sliding']
instance_ids = [1, 4]
methods = ['cardinality', 'kepler', 'csv']
training_sizes = [50]

# Writes 0_sample_analysis/plan_cover_lengths.csv when any file was parsed.
process_json_files(query_ids, methods, training_sizes, robustness_types, instance_ids)

Error processing file 0_sample_repo/imdb_14-0_sample/category/db_instance_4/csv/outputs/results/14-0/training_50/execution_output/imdbloadbase_14-0_metadata.json: 'plan_cover'
Results saved to 0_sample_analysis/plan_cover_lengths.csv


In [8]:
import os
import re
import pandas as pd
from pathlib import Path

def analyze_confidence_ranges(query_id):
    """
    Collect the min/max prediction confidence of one query over all sample setups.

    For every combination of robustness type (category/random/sliding),
    database instance (1/4), method (cardinality/kepler/csv) and training size
    (50) the directory
    ``.../outputs/evaluation/<query_id>/training_<size>/confidence_0/predictions/``
    below ``0_sample_repo`` is searched for files named
    ``<query_id>_*_batch_0.csv``. The ``confidence`` columns of all matching
    files are pooled and their minimum and maximum recorded. Missing values in
    the column are ignored, because ``min``/``max`` over a list containing NaN
    depend on element order.

    Args:
        query_id: Query ID, e.g. ``"17-0"``.

    Returns:
        list of dicts with the keys ``query_id``, ``robustness``,
        ``db_instance``, ``method``, ``training_size``, ``min_confidence`` and
        ``max_confidence``; combinations without usable data are omitted and
        reported on stdout.
    """
    robustness_types = ['category', 'random', 'sliding']
    instance_ids = [1, 4]
    methods = ['cardinality', 'kepler', 'csv']
    training_sizes = [50]
    results = []

    # Raw f-string so "\." is a regex escape and not an invalid string escape;
    # re.escape keeps regex metacharacters in the query ID literal. The pattern
    # depends only on query_id, so it is compiled once.
    pattern = re.compile(rf"{re.escape(str(query_id))}_.*_batch_0\.csv$")

    for robustness in robustness_types:
        for db_i in instance_ids:
            for method in methods:
                for training_size in training_sizes:
                    base_path = f'0_sample_repo/imdb_{query_id}_sample/{robustness}/db_instance_{db_i}/{method}/outputs/evaluation/{query_id}/training_{training_size}/confidence_0/predictions/'

                    if not os.path.exists(base_path):
                        print(f"Warning: Directory not found: {base_path}")
                        continue

                    matching_files = [
                        os.path.join(base_path, file)
                        for file in sorted(os.listdir(base_path))
                        if pattern.match(file)
                    ]

                    if not matching_files:
                        print(f"Warning: No matching files found for {method} - training_{training_size}")
                        continue

                    all_confidences = []
                    for file in matching_files:
                        try:
                            df = pd.read_csv(file)
                        except Exception as e:
                            # Best effort: skip unreadable prediction files.
                            print(f"Error reading file {file}: {e}")
                            continue
                        if 'confidence' in df.columns:
                            all_confidences.extend(df['confidence'].dropna().tolist())
                        else:
                            print(f"Warning: No confidence column in {file}")

                    if all_confidences:
                        results.append({
                            'query_id': query_id,
                            "robustness": robustness,
                            "db_instance": db_i,
                            'method': method,
                            'training_size': training_size,
                            'min_confidence': min(all_confidences),
                            'max_confidence': max(all_confidences)
                        })

    return results

# Build the confidence-range table for a hand-picked subset of queries and
# store it as 0_sample_analysis/confidence_range.csv (overwritten on each run).
if __name__ == "__main__":
    query_ids = ['17-0', '14-0', '3-0', '4-0']
    all_results = []

    for query_id in query_ids:
        all_results.extend(analyze_confidence_ranges(query_id))

    # One row per (query, robustness, db instance, method, training size)
    # for which confidence values were found.
    results_df = pd.DataFrame(all_results)
    output_dir = '0_sample_analysis'
    os.makedirs(output_dir, exist_ok=True)
    output_file = os.path.join(output_dir, 'confidence_range.csv')
    results_df.to_csv(output_file, index=False, mode='w')
    print(f"Results saved to {output_file}")

Results saved to 0_sample_analysis/confidence_range.csv


In [7]:
import os
import pandas as pd
import json
from collections import Counter, defaultdict

def analyze_workload_files(directory):
    """
    Count how often each plan occurs in the sample workload files.

    Only files whose name ends with ``training_size_50.csv`` are read. The
    first four ``_``-separated tokens of the file name are taken as query ID,
    robustness type, database instance and method.

    Args:
        directory: Directory that holds the workload CSV files; each file must
            provide a ``plan_id`` column.

    Returns:
        dict: ``{query_id: {"<robustness>_<db>_<method>": {plan_id: count}}}``
    """
    plan_stats = defaultdict(lambda: defaultdict(dict))

    for name in os.listdir(directory):
        if not name.endswith('training_size_50.csv'):
            continue

        tokens = name.split('_')
        query_id = tokens[0]
        robustness = tokens[1]
        db_i = tokens[2]
        method = tokens[3]

        workload = pd.read_csv(os.path.join(directory, name))
        occurrences = Counter(workload['plan_id'])
        plan_stats[query_id][f"{robustness}_{db_i}_{method}"] = dict(occurrences)

    return dict(plan_stats)

def main():
    """
    Write the plan statistics of the sample workload to a JSON file.

    Reads the workload files in ``0_sample_plans_0`` and stores the per-query
    plan counts in ``0_sample_analysis/sample_plan_statistics.json``.
    """
    # sample plans
    input_directory = '0_sample_plans_0'
    output_file = '0_sample_analysis/sample_plan_statistics.json'
    result_dict = analyze_workload_files(input_directory)

    # Create the analysis directory here so this cell also works on a fresh
    # checkout, as the other analysis cells do.
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    with open(output_file, 'w') as f:
        json.dump(result_dict, f, indent=2)


if __name__ == "__main__":
    main()

### Generate training data

#### get training metadata (length of robust plan set)

In [1]:
import json
import os
import pandas as pd
from pathlib import Path

def process_json_files(query_ids, methods, training_sizes):
    """
    Record the size of the ``plan_cover`` list for every original-workload run.

    For each combination of query, method and training size the file
    ``0_finished_repo/imdb_<query_id>_original/<method>/outputs/results/<query_id>/training_<size>/execution_output/imdbloadbase_<query_id>_metadata.json``
    is loaded and ``len(data[query_id]["plan_cover"])`` is stored. Missing
    files and files that cannot be parsed (invalid JSON, missing key, wrong
    type) are reported on stdout and skipped.

    If at least one file was processed, the table is written to
    ``0_original_analysis/plan_cover_lengths.csv``.

    Note: an earlier cell of this notebook defines a five-argument
    ``process_json_files`` for the sample workload; this definition shadows it
    once the cell has run.

    Args:
        query_ids: Query IDs to inspect.
        methods: Method directory names.
        training_sizes: Training sizes (used as ``training_<size>``).
    """
    rows = []

    for query_id in query_ids:
        for method in methods:
            for training_size in training_sizes:
                file_path = f"0_finished_repo/imdb_{query_id}_original/{method}/outputs/results/{query_id}/training_{training_size}/execution_output/imdbloadbase_{query_id}_metadata.json"

                if not os.path.exists(file_path):
                    print(f"File not found: {file_path}")
                    continue

                try:
                    with open(file_path, 'r') as f:
                        data = json.load(f)

                    plan_cover = data[str(query_id)]["plan_cover"]
                    rows.append({
                        'query-id': query_id,
                        'method': method,
                        'training_size': training_size,
                        'plan_cover_length': len(plan_cover)
                    })
                except (json.JSONDecodeError, KeyError, TypeError) as e:
                    print(f"Error processing file {file_path}: {str(e)}")

    if not rows:
        print("No valid files were found to process")
        return

    # One row per successfully parsed metadata file.
    df = pd.DataFrame(rows)
    output_dir = "0_original_analysis"
    os.makedirs(output_dir, exist_ok=True)

    output_file = f"{output_dir}/plan_cover_lengths.csv"
    df.to_csv(output_file, index=False)
    print(f"Results saved to {output_file}")

# Configuration: methods and training sizes to scan for the original workload.
# These are module-level names, so they stay visible to later cells.
methods = ['cardinality', 'kepler', 'csv']
training_sizes = [50, 400]

# Query IDs "1-0" .. "33-0", leaving out 29-0.
query_ids = [f'{i}-0' for i in range(1, 34) if i != 29]
# Writes 0_original_analysis/plan_cover_lengths.csv when any file was parsed.
process_json_files(query_ids, methods, training_sizes)

File not found: 0_finished_repo/imdb_3-0_original/kepler/outputs/results/3-0/training_400/execution_output/imdbloadbase_3-0_metadata.json
Error processing file 0_finished_repo/imdb_4-0_original/cardinality/outputs/results/4-0/training_400/execution_output/imdbloadbase_4-0_metadata.json: 'plan_cover'
File not found: 0_finished_repo/imdb_4-0_original/kepler/outputs/results/4-0/training_400/execution_output/imdbloadbase_4-0_metadata.json
File not found: 0_finished_repo/imdb_4-0_original/csv/outputs/results/4-0/training_400/execution_output/imdbloadbase_4-0_metadata.json
File not found: 0_finished_repo/imdb_6-0_original/cardinality/outputs/results/6-0/training_400/execution_output/imdbloadbase_6-0_metadata.json
File not found: 0_finished_repo/imdb_6-0_original/kepler/outputs/results/6-0/training_400/execution_output/imdbloadbase_6-0_metadata.json
File not found: 0_finished_repo/imdb_6-0_original/csv/outputs/results/6-0/training_400/execution_output/imdbloadbase_6-0_metadata.json
File not f

#### check training directories

In [14]:
import os
import glob

def check_training_directories(base_dir="0_finished_repo"):
    """
    Find query/method combinations whose training_50 result metadata is missing.

    For every directory ``<base_dir>/imdb_<query_id>_original`` and every
    sub-directory in it (treated as a method) the file
    ``<method>/outputs/results/<query_id>/training_50/metadata.json`` is
    expected to exist. The training_400 check is currently disabled.

    The query ID is taken from the directory name itself (the text between
    ``imdb_`` and ``_original``), so the result does not depend on how many
    underscores ``base_dir`` contains.

    Parameters:
    base_dir (str): Base directory to start search from

    Returns:
    dict: ``{query_id: {method: [names of missing training sets]}}``; empty
    when nothing is missing
    """
    prefix, suffix = "imdb_", "_original"
    missing_dirs = {}

    # Sorted traversal keeps the report order stable between runs.
    query_dirs = sorted(glob.glob(os.path.join(base_dir, f"{prefix}*{suffix}")))

    for query_dir in query_dirs:
        if not os.path.isdir(query_dir):
            continue
        query_id = os.path.basename(query_dir)[len(prefix):-len(suffix)]

        for method in sorted(os.listdir(query_dir)):
            # Stray files inside a query directory are not methods.
            if not os.path.isdir(os.path.join(query_dir, method)):
                continue

            results_path = os.path.join(query_dir, method, "outputs", "results", query_id)
            training_50_path = os.path.join(results_path, "training_50", "metadata.json")

            # name -> True when the expected metadata file is absent
            paths_status = {
                "training_50": not os.path.exists(training_50_path)
            }

            if any(paths_status.values()):
                missing_dirs.setdefault(query_id, {})[method] = [
                    name for name, is_missing in paths_status.items() if is_missing
                ]

    return missing_dirs

def print_missing_dirs(results):
    """
    Report the missing training directories on stdout.

    Prints a single confirmation line when ``results`` is empty. Otherwise the
    raw mapping is printed first, followed by one line per query/method pair,
    restricted to the methods cardinality, csv and kepler.

    Parameters:
    results (dict): Results from check_training_directories
    """
    reported_methods = ('cardinality', 'csv', 'kepler')

    if results:
        print("\nMissing training directories:", results)

        for query_id in sorted(results):
            for method, missing in results[query_id].items():
                joined = ", ".join(missing)
                if method in reported_methods:
                    print(f"Query {query_id} - Method {method}: Missing {joined}")
    else:
        print("\nAll required training directories exist.")

def main():
    """
    Check the default repository (``0_finished_repo``) for missing training
    results and report them.

    After the detailed report, the total number of query/method combinations
    with missing data is printed when there is at least one.
    """
    results = check_training_directories()
    print_missing_dirs(results)

    # Print total count of issues (one issue per query/method entry).
    total_issues = sum(len(methods) for methods in results.values())
    if total_issues:
        print(f"\nTotal issues: {total_issues}")


if __name__ == "__main__":
    main()


All required training directories exist.


#### original: check the min-max confidence for each query_id

In [4]:
import os
import re
import pandas as pd
from pathlib import Path

def analyze_confidence_ranges(query_id):
    """
    Collect the min/max prediction confidence of one original-workload query.

    For every method (cardinality/csv/kepler) and training size ("50") the
    directory
    ``imdb_<query_id>_original/<method>/outputs/evaluation/<query_id>/training_<size>/confidence_0/predictions/``
    is searched for files named ``<query_id>_*_batch_0.csv``. The
    ``confidence`` columns of all matching files are pooled and their minimum
    and maximum recorded. Missing values in the column are ignored, because
    ``min``/``max`` over a list containing NaN depend on element order.

    NOTE(review): the path is relative to the current directory and has no
    ``0_finished_repo/`` prefix, unlike the other original-workload cells —
    confirm the intended working directory.

    Note: an earlier cell defines an ``analyze_confidence_ranges`` for the
    sample workload; this definition shadows it once the cell has run.

    Args:
        query_id: Query ID, e.g. ``"1-0"``.

    Returns:
        list of dicts with the keys ``query_id``, ``method``,
        ``training_size`` (a string here), ``min_confidence`` and
        ``max_confidence``; combinations without usable data are omitted and
        reported on stdout.
    """
    methods = ['cardinality', 'csv', 'kepler']
    training_sizes = ['50']
    results = []

    # Raw f-string so "\." is a regex escape and not an invalid string escape;
    # re.escape keeps regex metacharacters in the query ID literal. The pattern
    # depends only on query_id, so it is compiled once.
    pattern = re.compile(rf"{re.escape(str(query_id))}_.*_batch_0\.csv$")

    for method in methods:
        for training_size in training_sizes:
            base_path = f'imdb_{query_id}_original/{method}/outputs/evaluation/{query_id}/training_{training_size}/confidence_0/predictions/'

            if not os.path.exists(base_path):
                print(f"Warning: Directory not found: {base_path}")
                continue

            matching_files = [
                os.path.join(base_path, file)
                for file in sorted(os.listdir(base_path))
                if pattern.match(file)
            ]

            if not matching_files:
                print(f"Warning: No matching files found for {method} - training_{training_size}")
                continue

            all_confidences = []
            for file in matching_files:
                try:
                    df = pd.read_csv(file)
                except Exception as e:
                    # Best effort: skip unreadable prediction files.
                    print(f"Error reading file {file}: {e}")
                    continue
                if 'confidence' in df.columns:
                    all_confidences.extend(df['confidence'].dropna().tolist())
                else:
                    print(f"Warning: No confidence column in {file}")

            if all_confidences:
                results.append({
                    'query_id': query_id,
                    'method': method,
                    'training_size': training_size,
                    'min_confidence': min(all_confidences),
                    'max_confidence': max(all_confidences)
                })

    return results

# Build the confidence-range table for the original workload and store it as
# 0_original_analysis/confidence_range.csv (overwritten on each run).
if __name__ == "__main__":
    # Query IDs 1-0 .. 33-0; 29-0 is not in the list.
    query_ids = ['1-0', '2-0', '3-0', '4-0', '5-0', '6-0', '7-0', '8-0', '9-0', '10-0',
             '11-0', '12-0', '13-0', '14-0', '15-0', '16-0', '17-0', '18-0', '19-0', '20-0',
             '21-0', '22-0', '23-0', '24-0', '25-0', '26-0', '27-0', '28-0', '30-0',
             '31-0', '32-0', '33-0']
    all_results = []

    for query_id in query_ids:
        all_results.extend(analyze_confidence_ranges(query_id))

    # One row per (query, method, training size) for which confidence values
    # were found.
    results_df = pd.DataFrame(all_results)
    output_dir = '0_original_analysis'
    os.makedirs(output_dir, exist_ok=True)
    output_file = os.path.join(output_dir, 'confidence_range.csv')
    results_df.to_csv(output_file, index=False, mode='w')
    print(f"Results saved to {output_file}")

Results saved to 0_original_analysis/confidence_range.csv


#### original & mixture unique plan count

In [5]:
import os
import pandas as pd
import json
from collections import Counter, defaultdict

def analyze_workload_files(directory, training_size=50):
    """Count how often each plan id occurs, per query and per method.

    Scans ``directory`` for CSV files whose name contains ``workload`` and
    ends with ``training_size_<training_size>.csv``. The query id is the first
    ``_``-separated token of the file name and the method is the third one
    (e.g. ``3-0_workload_cardinality_training_size_50.csv``).

    Args:
        directory (str): Folder holding the plan CSV files.
        training_size (int): Training size encoded in the file-name suffix.
            Defaults to 50.

    Returns:
        dict: ``{query_id: {method: {plan_id: occurrences}}}`` made of plain
        dicts, so looking up a missing key never creates an entry.

    Raises:
        KeyError: If a matching file has no ``plan_id`` column.
    """
    suffix = f'training_size_{training_size}.csv'
    results = {}

    # Sorted so the result (and any JSON dumped from it) has a stable key order
    # instead of depending on the file-system listing order.
    for filename in sorted(os.listdir(directory)):
        if 'workload' not in filename or not filename.endswith(suffix):
            continue

        tokens = filename.split('_')
        query_id, method = tokens[0], tokens[2]

        df = pd.read_csv(os.path.join(directory, filename))
        results.setdefault(query_id, {})[method] = dict(Counter(df['plan_id']))

    return results

def main():
    """Dump plan-usage statistics for the mixture and the original plans.

    For each input folder, runs ``analyze_workload_files`` and writes the
    result as indented JSON into ``0_original_analysis``.
    """
    output_dir = '0_original_analysis'
    # Create the folder here so this cell does not depend on another cell
    # having created it first.
    os.makedirs(output_dir, exist_ok=True)

    jobs = [
        ('0_mixture_plans', 'mixture_plan_statistics.json'),      # mixture plans
        ('0_original_plans_0', 'original_plan_statistics.json'),  # original plans
    ]

    for input_directory, output_name in jobs:
        result_dict = analyze_workload_files(input_directory)
        output_file = os.path.join(output_dir, output_name)
        with open(output_file, 'w') as f:
            json.dump(result_dict, f, indent=2)

# Run the plan statistics export only when this code is executed directly
# rather than imported.
if __name__ == "__main__":
    main()

#### original training time execution

In [None]:
import os
import json
import csv
from pathlib import Path

def process_training_times():
    """Summarise the ``training_50`` timing metadata of every finished query.

    For each ``imdb_<query_id>_...`` folder under ``0_finished_repo`` and each
    method, reads four metadata JSON files and writes one row with the
    extracted timings to ``0_original_analysis/training_time.csv``. The two
    ``*_seconds`` values are divided by 60; every value is rounded to two
    decimals. A (query, method) pair is skipped, with a message, when one of
    its files is missing, is not valid JSON or lacks the expected key.
    Rows are ordered by query id, comparing numeric parts as numbers.
    """
    base_dir = Path("0_finished_repo")
    output_dir = Path("0_original_analysis")
    training_dir = "training_50"
    fieldnames = [
        "query_id",
        "method",
        "candidate plan generation time (mins)",
        "robust plan set generation (mins)",
        "model prediction time (ms)",
        "model train time (ms)",
        "model predict time (ms)"
    ]

    def read_json(path):
        """Parse one metadata JSON file."""
        with open(path) as f:
            return json.load(f)

    def query_order(row):
        """Sort key placing e.g. "2-0" before "10-0" (plain string order would not)."""
        return tuple(
            (0, int(piece), "") if piece.isdecimal() else (1, 0, piece)
            for piece in row["query_id"].split("-")
        )

    os.makedirs(output_dir, exist_ok=True)
    results = []

    for dir_name in os.listdir(base_dir):
        if not dir_name.startswith("imdb_"):
            continue

        query_id = dir_name.split("_")[1]
        base_path = base_dir / dir_name
        if not base_path.is_dir():
            # A stray file named imdb_* holds no metadata; opening paths below
            # it would raise an error the handler underneath does not catch.
            continue

        for method in ['cardinality', 'kepler', 'csv']:
            outputs = base_path / method / "outputs"

            try:
                # get candidate plan generation time
                candidate_data = read_json(
                    outputs / "hints" / query_id / training_dir / "candidate_metadata.json")
                candidate_time = candidate_data["candidate_plan_generation_time_seconds"] / 60

                # get training data generation time
                training_data = read_json(
                    outputs / "results" / query_id / training_dir / "metadata.json")
                training_time = training_data["training_data_all_time_seconds"] / 60

                # get model prediction time
                eval_data = read_json(
                    outputs / "evaluation" / query_id / training_dir / "confidence_0" / "metadata.json")
                prediction_time = eval_data["model_prediction_time"]

                # get training & testing time
                model_data = read_json(
                    outputs / "evaluation_single" / query_id / training_dir / "confidence_0" / "metadata.json")
                model_train_time = model_data["model_train_time"]
                model_test_time = model_data["model_predict_time (already * 200)"]
            except (FileNotFoundError, json.JSONDecodeError, KeyError) as e:
                # Name the query/method: a bare KeyError or JSON error message
                # does not say which file it came from.
                print(f"ERROR ({query_id}, {method}): {type(e).__name__}: {e}")
                continue

            results.append({
                "query_id": query_id,
                "method": method,
                "candidate plan generation time (mins)": round(candidate_time, 2),
                "robust plan set generation (mins)": round(training_time, 2),
                "model prediction time (ms)": round(prediction_time, 2),
                "model train time (ms)": round(model_train_time, 2),
                "model predict time (ms)": round(model_test_time, 2)
            })

    # training_time.csv
    output_file = output_dir / "training_time.csv"
    with open(output_file, 'w', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        # sorted() is stable, so methods keep their loop order within a query.
        writer.writerows(sorted(results, key=query_order))

# Build training_time.csv only when this code is executed directly rather
# than imported.
if __name__ == "__main__":
    process_training_times()

In [2]:
import os
import json
import csv
from pathlib import Path

def process_training_times():
    """Summarise the ``training_400`` timing metadata of selected queries.

    Only the query ids '1-0', '2-0', '3-0', '5-0' and '7-0' are considered.
    For each matching ``imdb_<query_id>_...`` folder under ``0_finished_repo``
    and each method, reads three metadata JSON files and writes one row with
    the extracted timings to ``0_original_analysis/training_time_400.csv``.
    The two ``*_seconds`` values are divided by 60; every value is rounded to
    two decimals. A (query, method) pair is skipped, with a message, when one
    of its files is missing, is not valid JSON or lacks the expected key.
    Rows are ordered by query id, comparing numeric parts as numbers.
    """
    base_dir = Path("0_finished_repo")
    output_dir = Path("0_original_analysis")
    training_dir = "training_400"
    selected_queries = ['1-0', '2-0', '3-0', '5-0', '7-0']
    fieldnames = [
        "query_id",
        "method",
        "candidate plan generation time (mins)",
        "robust plan set generation (mins)",
        "model prediction time (ms)"
    ]

    def read_json(path):
        """Parse one metadata JSON file."""
        with open(path) as f:
            return json.load(f)

    def query_order(row):
        """Sort key placing e.g. "2-0" before "10-0" (plain string order would not)."""
        return tuple(
            (0, int(piece), "") if piece.isdecimal() else (1, 0, piece)
            for piece in row["query_id"].split("-")
        )

    os.makedirs(output_dir, exist_ok=True)
    results = []

    for dir_name in os.listdir(base_dir):
        if not dir_name.startswith("imdb_"):
            continue

        query_id = dir_name.split("_")[1]
        if query_id not in selected_queries:
            continue

        base_path = base_dir / dir_name
        if not base_path.is_dir():
            # A stray file named imdb_* holds no metadata; opening paths below
            # it would raise an error the handler underneath does not catch.
            continue

        for method in ['cardinality', 'kepler', 'csv']:
            outputs = base_path / method / "outputs"

            try:
                # get candidate plan generation time
                candidate_data = read_json(
                    outputs / "hints" / query_id / training_dir / "candidate_metadata.json")
                candidate_time = candidate_data["candidate_plan_generation_time_seconds"] / 60

                # get training data generation time
                training_data = read_json(
                    outputs / "results" / query_id / training_dir / "metadata.json")
                training_time = training_data["training_data_all_time_seconds"] / 60

                # get model prediction time
                eval_data = read_json(
                    outputs / "evaluation" / query_id / training_dir / "confidence_0" / "metadata.json")
                prediction_time = eval_data["model_prediction_time"]
            except (FileNotFoundError, json.JSONDecodeError, KeyError) as e:
                # Name the query/method: a bare KeyError or JSON error message
                # does not say which file it came from.
                print(f"ERROR ({query_id}, {method}): {type(e).__name__}: {e}")
                continue

            results.append({
                "query_id": query_id,
                "method": method,
                "candidate plan generation time (mins)": round(candidate_time, 2),
                "robust plan set generation (mins)": round(training_time, 2),
                "model prediction time (ms)": round(prediction_time, 2)
            })

    # training_time_400.csv
    output_file = output_dir / "training_time_400.csv"
    with open(output_file, 'w', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        # sorted() is stable, so methods keep their loop order within a query.
        writer.writerows(sorted(results, key=query_order))

# Build training_time_400.csv only when this code is executed directly rather
# than imported.
if __name__ == "__main__":
    process_training_times()

ERROR: [Errno 2] No such file or directory: '0_finished_repo/imdb_3-0_original/kepler/outputs/results/3-0/training_400/metadata.json'
ERROR: [Errno 2] No such file or directory: '0_finished_repo/imdb_3-0_original/csv/outputs/evaluation/3-0/training_400/confidence_0/metadata.json'


#### sample training time execution

In [6]:
import os
import json
import csv
from pathlib import Path

def process_training_times():
    """Summarise the ``training_50`` timing metadata of the sample experiment.

    Only the query ids '3-0', '4-0', '14-0' and '17-0' are considered. For each
    matching ``imdb_<query_id>_...`` folder under ``0_sample_repo``, each
    robustness type, db instance (1 and 4) and method, reads three metadata
    JSON files and writes one row with the extracted timings to
    ``0_sample_analysis/training_time.csv``. The two ``*_seconds`` values are
    divided by 60; every value is rounded to two decimals. A combination is
    skipped, with a message, when one of its files is missing, is not valid
    JSON or lacks the expected key. Rows are ordered by query id, comparing
    numeric parts as numbers.
    """
    base_dir = Path("0_sample_repo")
    output_dir = Path("0_sample_analysis")
    training_dir = "training_50"
    selected_queries = ['3-0', '4-0', '14-0', '17-0']
    fieldnames = [
        "query_id",
        "robustness",
        "db_instance",
        "method",
        "candidate plan generation time (mins)",
        "robust plan set generation (mins)",
        "model prediction time (ms)"
    ]

    def read_json(path):
        """Parse one metadata JSON file."""
        with open(path) as f:
            return json.load(f)

    def query_order(row):
        """Sort key placing e.g. "3-0" before "14-0" (plain string order would not)."""
        return tuple(
            (0, int(piece), "") if piece.isdecimal() else (1, 0, piece)
            for piece in row["query_id"].split("-")
        )

    os.makedirs(output_dir, exist_ok=True)
    results = []

    for dir_name in os.listdir(base_dir):
        if not dir_name.startswith("imdb_"):
            continue

        query_id = dir_name.split("_")[1]
        if query_id not in selected_queries:
            continue

        base_path = base_dir / dir_name
        if not base_path.is_dir():
            # A stray file named imdb_* holds no metadata; opening paths below
            # it would raise an error the handler underneath does not catch.
            continue

        for robustness in ['category', 'random', 'sliding']:
            for db_i in [1, 4]:
                for method in ['cardinality', 'kepler', 'csv']:
                    outputs = base_path / robustness / f"db_instance_{db_i}" / method / "outputs"

                    try:
                        # get candidate plan generation time
                        candidate_data = read_json(
                            outputs / "hints" / query_id / training_dir / "candidate_metadata.json")
                        candidate_time = candidate_data["candidate_plan_generation_time_seconds"] / 60

                        # get training data generation time
                        training_data = read_json(
                            outputs / "results" / query_id / training_dir / "metadata.json")
                        training_time = training_data["training_data_all_time_seconds"] / 60

                        # get model prediction time
                        eval_data = read_json(
                            outputs / "evaluation" / query_id / training_dir / "confidence_0" / "metadata.json")
                        prediction_time = eval_data["model_prediction_time"]
                    except (FileNotFoundError, json.JSONDecodeError, KeyError) as e:
                        # Name the combination: a bare KeyError or JSON error
                        # message does not say which file it came from.
                        print(
                            f"ERROR ({query_id}, {robustness}, db_instance_{db_i}, {method}): "
                            f"{type(e).__name__}: {e}"
                        )
                        continue

                    results.append({
                        "query_id": query_id,
                        "robustness": robustness,
                        "db_instance": db_i,
                        "method": method,
                        "candidate plan generation time (mins)": round(candidate_time, 2),
                        "robust plan set generation (mins)": round(training_time, 2),
                        "model prediction time (ms)": round(prediction_time, 2)
                    })

    # training_time.csv
    output_file = output_dir / "training_time.csv"
    with open(output_file, 'w', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        # sorted() is stable, so rows keep their loop order within a query.
        writer.writerows(sorted(results, key=query_order))

# Build the sample training_time.csv only when this code is executed directly
# rather than imported.
if __name__ == "__main__":
    process_training_times()

ERROR: [Errno 2] No such file or directory: '0_sample_repo/imdb_14-0_sample/category/db_instance_4/csv/outputs/results/14-0/training_50/metadata.json'


### Generate plans

#### sample plans conclusion

In [4]:
import pandas as pd
import json
import re
import os
from itertools import product

def process_data(query_id, training_size, confidence_threshold):
    """
    Export the instantiated queries and chosen plans of the sample experiment.

    For every robustness type, db instance and method, finds the prediction
    CSV whose name starts with ``<query_id>_`` and ends with ``_batch_0.csv``,
    substitutes each row's parameters for the ``@param<i>`` placeholders of the
    query template stored in the testing JSON, and writes the resulting
    ``query`` / ``plan_id`` / ``plan_content`` rows to
    ``0_sample_plans_<confidence_threshold>/``. Combinations whose predictions
    folder, prediction file or testing JSON is missing are reported and skipped.

    Args:
        query_id (str): The ID of the query to process
        training_size (int): The training size to process
        confidence_threshold (str): Confidence threshold used in the folder names
    """
    methods = ['cardinality', 'csv', 'kepler']
    robustness_types = ['category', 'random', 'sliding']
    instance_ids = [1, 4]
    output_columns = ['query', 'plan_id', 'plan_content']

    for robustness in robustness_types:
        for db_i in instance_ids:
            for method in methods:
                instance_root = f'0_sample_repo/imdb_{query_id}_sample/{robustness}/db_instance_{db_i}/{method}'
                predictions_path = f'{instance_root}/outputs/evaluation/{query_id}/training_{training_size}/confidence_{confidence_threshold}/predictions/'

                # Pick the prediction file; sorted so the choice does not
                # depend on directory listing order if several files match.
                predictions_file = None
                try:
                    for filename in sorted(os.listdir(predictions_path)):
                        if filename.startswith(f'{query_id}_') and filename.endswith('_batch_0.csv'):
                            predictions_file = os.path.join(predictions_path, filename)
                            break
                except FileNotFoundError:
                    print(f"Directory not found for {robustness} db{db_i} {method}, query_id={query_id}, training_size={training_size}")
                    continue

                if predictions_file is None:
                    print(f"No prediction file found for {method}, query_id={query_id}, training_size={training_size}")
                    continue

                predictions_df = pd.read_csv(predictions_file)

                # Read the testing data JSON holding the query template
                testing_json_path = f'{instance_root}/inputs/testing/{query_id}_testing_original.json'
                try:
                    with open(testing_json_path, 'r') as f:
                        query_data = json.load(f)
                except FileNotFoundError:
                    print(f"Testing JSON not found for query_id={query_id}")
                    continue

                results = []
                for _, pred_row in predictions_df.iterrows():
                    # NOTE(review): eval() runs arbitrary code taken from the
                    # predictions CSV. ast.literal_eval would be safer if the
                    # params are always plain literals - confirm before switching.
                    params_list = eval(pred_row['params'])

                    query = query_data[query_id]['query']
                    for i, param in enumerate(params_list):
                        value = str(param).strip()
                        # A callable replacement inserts the value verbatim; a
                        # plain string would be parsed as a regex template and
                        # fail (or be altered) when the value has a backslash.
                        query = re.sub(rf"@param{i}\b", lambda _match, value=value: value, query)

                    results.append({
                        'query': query,
                        'plan_id': pred_row['plan_id'],
                        'plan_content': pred_row['plan_content']
                    })

                # Explicit columns keep the header even when there are no rows,
                # so the file stays readable with pd.read_csv.
                output_df = pd.DataFrame(results, columns=output_columns)
                os.makedirs(f'0_sample_plans_{confidence_threshold}', exist_ok=True)
                output_filename = f'0_sample_plans_{confidence_threshold}/{query_id}_{robustness}_db{db_i}_{method}_training_size_{training_size}.csv'
                output_df.to_csv(output_filename, index=False)
                print(f"Generated: {output_filename}")

# Experiment grid: the queries, training sizes and confidence thresholds to export
query_ids = ['30-0']
training_sizes = [50]
confidence_thresholds = ["0"]

# Every (query, training size, threshold) triple, last list varying fastest
combinations = [
    (qid, size, threshold)
    for qid in query_ids
    for size in training_sizes
    for threshold in confidence_thresholds
]

# Report how much work is queued
print(f"Processing {len(combinations)} combinations...")

# Export the plans of each triple in turn
for query_id, training_size, confidence_threshold in combinations:
    print(f"\nProcessing query_id={query_id}, training_size={training_size}, confidence_threshold={confidence_threshold}")
    process_data(query_id, training_size, confidence_threshold)

Processing 1 combinations...

Processing query_id=30-0, training_size=50, confidence_threshold=0
Generated: 0_sample_plans_0/30-0_category_db1_cardinality_training_size_50.csv
Generated: 0_sample_plans_0/30-0_category_db1_csv_training_size_50.csv
Generated: 0_sample_plans_0/30-0_category_db1_kepler_training_size_50.csv
Generated: 0_sample_plans_0/30-0_category_db4_cardinality_training_size_50.csv
Generated: 0_sample_plans_0/30-0_category_db4_csv_training_size_50.csv
Generated: 0_sample_plans_0/30-0_category_db4_kepler_training_size_50.csv
Generated: 0_sample_plans_0/30-0_random_db1_cardinality_training_size_50.csv
Generated: 0_sample_plans_0/30-0_random_db1_csv_training_size_50.csv
Generated: 0_sample_plans_0/30-0_random_db1_kepler_training_size_50.csv
Generated: 0_sample_plans_0/30-0_random_db4_cardinality_training_size_50.csv
Generated: 0_sample_plans_0/30-0_random_db4_csv_training_size_50.csv
Generated: 0_sample_plans_0/30-0_random_db4_kepler_training_size_50.csv
Generated: 0_sample

#### original plans conclusion

In [3]:
import pandas as pd
import json
import re
import os
from itertools import product

def process_data(query_id, training_size, confidence_threshold):
    """
    Export the instantiated queries and chosen plans of the original workload.

    For each processed method, finds the prediction CSV whose name starts with
    ``<query_id>_`` and ends with ``_batch_0.csv``, substitutes each row's
    parameters for the ``@param<i>`` placeholders of the query template stored
    in the testing JSON, and writes the resulting ``query`` / ``plan_id`` /
    ``plan_content`` rows to ``0_original_plans_<confidence_threshold>/``.
    Methods whose predictions folder, prediction file or testing JSON is
    missing are reported and skipped.

    Args:
        query_id (str): The ID of the query to process
        training_size (int): The training size to process
        confidence_threshold (str): Confidence threshold used in the folder names
    """
    # Only 'cardinality' is processed here; 'csv' and 'kepler' are disabled.
    # methods = ['cardinality', 'csv', 'kepler']
    methods = ['cardinality']
    output_columns = ['query', 'plan_id', 'plan_content']

    for method in methods:
        method_root = f'0_finished_repo/imdb_{query_id}_original/{method}'
        predictions_path = f'{method_root}/outputs/evaluation/{query_id}/training_{training_size}/confidence_{confidence_threshold}/predictions/'

        # Pick the prediction file; sorted so the choice does not depend on
        # directory listing order if several files match.
        predictions_file = None
        try:
            for filename in sorted(os.listdir(predictions_path)):
                if filename.startswith(f'{query_id}_') and filename.endswith('_batch_0.csv'):
                    predictions_file = os.path.join(predictions_path, filename)
                    break
        except FileNotFoundError:
            print(f"Directory not found for {method}, query_id={query_id}, training_size={training_size}")
            continue

        if predictions_file is None:
            print(f"No prediction file found for {method}, query_id={query_id}, training_size={training_size}")
            continue

        predictions_df = pd.read_csv(predictions_file)

        # Read the testing data JSON holding the query template
        testing_json_path = f'{method_root}/inputs/testing/{query_id}_testing_original.json'
        try:
            with open(testing_json_path, 'r') as f:
                query_data = json.load(f)
        except FileNotFoundError:
            print(f"Testing JSON not found for query_id={query_id}")
            continue

        results = []
        for _, pred_row in predictions_df.iterrows():
            # NOTE(review): eval() runs arbitrary code taken from the
            # predictions CSV. ast.literal_eval would be safer if the params
            # are always plain literals - confirm before switching.
            params_list = eval(pred_row['params'])

            query = query_data[query_id]['query']
            for i, param in enumerate(params_list):
                value = str(param).strip()
                # A callable replacement inserts the value verbatim; a plain
                # string would be parsed as a regex template and fail (or be
                # altered) when the value has a backslash.
                query = re.sub(rf"@param{i}\b", lambda _match, value=value: value, query)

            results.append({
                'query': query,
                'plan_id': pred_row['plan_id'],
                'plan_content': pred_row['plan_content']
            })

        # Explicit columns keep the header even when there are no rows, so the
        # file stays readable with pd.read_csv.
        output_df = pd.DataFrame(results, columns=output_columns)
        os.makedirs(f'0_original_plans_{confidence_threshold}', exist_ok=True)
        output_filename = f'0_original_plans_{confidence_threshold}/{query_id}_workload_{method}_training_size_{training_size}.csv'
        output_df.to_csv(output_filename, index=False)
        print(f"Generated: {output_filename}")

# Experiment grid: the queries, training sizes and confidence thresholds to export
query_ids = ['3-0']
training_sizes = [400]
confidence_thresholds = ["0"]

# Every (query, training size, threshold) triple, last list varying fastest
combinations = [
    (qid, size, threshold)
    for qid in query_ids
    for size in training_sizes
    for threshold in confidence_thresholds
]

# Report how much work is queued
print(f"Processing {len(combinations)} combinations...")

# Export the plans of each triple in turn
for query_id, training_size, confidence_threshold in combinations:
    print(f"\nProcessing query_id={query_id}, training_size={training_size}, confidence_threshold={confidence_threshold}")
    process_data(query_id, training_size, confidence_threshold)

Processing 1 combinations...

Processing query_id=3-0, training_size=400, confidence_threshold=0
Generated: 0_original_plans_0/3-0_workload_cardinality_training_size_400.csv


#### workload plans conclusion

In [3]:
import pandas as pd
import json
import re
import os
from itertools import product

def process_data(query_id, test_method, train_method, training_size, confidence_threshold):
    """
    Export instantiated queries and plans for a cross-method workload test.

    Reads the ``_batch_0.csv`` prediction file of the workload test
    (``<test_method>_test_<train_method>_train``) and the one of the original
    ``test_method`` run, checks row by row that both list the same ``params``,
    substitutes the parameters for the ``@param<i>`` placeholders of the query
    template from the testing JSON, and writes one CSV per source into
    ``0_workload_plans``. Any error is reported and ends the call without
    raising.

    Args:
        query_id (str): The ID of the query to process
        test_method (str): Testing method
        train_method (str): Training method
        training_size (int): The training size to process
        confidence_threshold (str): Confidence threshold
    """
    workload_path = f'0_workload_testing/{query_id}/{test_method}_test_{train_method}_train/training_{training_size}/confidence_{confidence_threshold}/predictions/'
    original_path = f'imdb_{query_id}_original/{test_method}/outputs/evaluation/{query_id}/training_{training_size}/confidence_{confidence_threshold}/predictions/'
    output_columns = ['query', 'plan_id', 'plan_content']

    def find_prediction_file(folder):
        """Return the matching prediction file in ``folder``, or None."""
        # Sorted so the choice does not depend on directory listing order.
        for filename in sorted(os.listdir(folder)):
            if filename.startswith(f'{query_id}_') and filename.endswith('_batch_0.csv'):
                return os.path.join(folder, filename)
        return None

    def instantiate_queries(predictions_df, template):
        """Fill the template with each row's params and keep the row's plan."""
        rows = []
        for _, row in predictions_df.iterrows():
            # NOTE(review): eval() runs arbitrary code taken from the
            # predictions CSV. ast.literal_eval would be safer if the params
            # are always plain literals - confirm before switching.
            params_list = eval(row['params'])

            query = template
            for i, param in enumerate(params_list):
                value = str(param).strip()
                # A callable replacement inserts the value verbatim; a plain
                # string would be parsed as a regex template and fail (or be
                # altered) when the value has a backslash.
                query = re.sub(rf"@param{i}\b", lambda _match, value=value: value, query)

            rows.append({
                'query': query,
                'plan_id': row['plan_id'],
                'plan_content': row['plan_content']
            })
        return rows

    try:
        workload_file = find_prediction_file(workload_path)
        original_file = find_prediction_file(original_path)

        if workload_file is None or original_file is None:
            print(f"Files not found for query_id={query_id}")
            return

        workload_df = pd.read_csv(workload_file)
        original_df = pd.read_csv(original_file)

        # Query data
        query_json_path = f'imdb_{query_id}_original/{test_method}/inputs/testing/{query_id}_testing_original.json'
        try:
            with open(query_json_path, 'r') as f:
                query_data = json.load(f)
        except FileNotFoundError:
            print(f"Testing JSON not found for query_id={query_id}")
            return

        # Compare params order
        workload_params = workload_df['params'].tolist()
        original_params = original_df['params'].tolist()
        if len(workload_params) != len(original_params):
            # zip() below stops at the shorter list, so surplus rows would
            # otherwise go unnoticed.
            print(
                f"Warning: row count differs for query_id={query_id} "
                f"(workload={len(workload_params)}, original={len(original_params)}); "
                f"only the common prefix is compared"
            )
        for i, (w_param, o_param) in enumerate(zip(workload_params, original_params)):
            if w_param != o_param:
                print(f"Mismatch at position {i}:")
                print(f"Workload : {w_param}")
                print(f"Original: {o_param}")
                print("---")
                raise ValueError(f"params mismatch at position {i}")

        template = query_data[query_id]['query']
        workload_results = instantiate_queries(workload_df, template)
        original_results = instantiate_queries(original_df, template)

        # Create output directory if it doesn't exist
        os.makedirs('0_workload_plans', exist_ok=True)

        # Save workload results (explicit columns keep the header when empty)
        workload_output = pd.DataFrame(workload_results, columns=output_columns)
        workload_filename = f'0_workload_plans/{query_id}_{test_method}_test_{train_method}_train_size_{training_size}.csv'
        workload_output.to_csv(workload_filename, index=False)

        # Save original results: same method used for testing and training
        original_output = pd.DataFrame(original_results, columns=output_columns)
        original_filename = f'0_workload_plans/{query_id}_{test_method}_test_{test_method}_train_size_{training_size}.csv'
        original_output.to_csv(original_filename, index=False)

        print(f"Generated: {workload_filename}")
        print(f"Generated: {original_filename}")

    except Exception as e:
        # Deliberate best effort: report and move on to the next combination.
        print(f"Error processing query_id={query_id}: {str(e)}")


# Experiment grid for the cross-method workload comparison
query_ids = ['13-0', '17-0', '19-0', '23-0', '33-0']
test_methods = ['cardinality', 'kepler']
train_methods = ['cardinality', 'kepler']
# Defined here so the cell runs on a fresh kernel instead of relying on a
# `training_sizes` left behind by another cell; 50 matches the recorded output.
training_sizes = [50]
confidence_threshold = "0"

# Generate all combinations where testing and training methods differ
combinations = []
for query_id in query_ids:
    for test_method in test_methods:
        for train_method in train_methods:
            if test_method != train_method:
                combinations.append((query_id, test_method, train_method, training_sizes[0]))

print(f"Processing {len(combinations)} combinations...")

# Process each combination
for query_id, test_method, train_method, training_size in combinations:
    print(f"\nProcessing query_id={query_id}, test_method={test_method}, train_method={train_method}, training_size={training_size}")
    process_data(query_id, test_method, train_method, training_size, confidence_threshold)

Processing 10 combinations...

Processing query_id=13-0, test_method=cardinality, train_method=kepler, training_size=50
Generated: 0_workload_plans/13-0_cardinality_test_kepler_train_size_50.csv
Generated: 0_workload_plans/13-0_cardinality_test_cardinality_train_size_50.csv

Processing query_id=13-0, test_method=kepler, train_method=cardinality, training_size=50
Generated: 0_workload_plans/13-0_kepler_test_cardinality_train_size_50.csv
Generated: 0_workload_plans/13-0_kepler_test_kepler_train_size_50.csv

Processing query_id=17-0, test_method=cardinality, train_method=kepler, training_size=50
Generated: 0_workload_plans/17-0_cardinality_test_kepler_train_size_50.csv
Generated: 0_workload_plans/17-0_cardinality_test_cardinality_train_size_50.csv

Processing query_id=17-0, test_method=kepler, train_method=cardinality, training_size=50
Generated: 0_workload_plans/17-0_kepler_test_cardinality_train_size_50.csv
Generated: 0_workload_plans/17-0_kepler_test_kepler_train_size_50.csv

Processin

#### mixture plans conclusions

In [4]:
import pandas as pd
import json
import re
import os
from itertools import product

def process_data(query_id, training_size):
    """
    Instantiate the mixture-test queries for one query_id / training_size pair.

    For every method in the hard-coded list, the predictions CSV found under
    ``0_mixture_test/{query_id}/{method}/training_{training_size}/confidence_0/predictions/``
    (name starting with ``{query_id}_`` and ending with ``_batch_0.csv``) is read.
    Each row's ``params`` are substituted into the ``@paramN`` placeholders of the
    query template stored in
    ``0_mixture_test/{query_id}/{query_id}_mixture_test.json``, and the resulting
    query is written together with the row's ``plan_id`` and ``plan_content`` to
    ``0_mixture_plans/{query_id}_workload_{method}_training_size_{training_size}.csv``.

    A missing predictions directory, predictions file or testing JSON is
    reported with ``print`` and that method is skipped.

    Args:
        query_id (str): The ID of the query to process
        training_size (int): The training size to process
    """
    import ast  # local import: the cell's import block does not provide ast

    # Define the methods to process - now using the new set of methods
    # methods = ['cardinality', 'csv', 'kepler']
    methods = ['cardinality']

    # One pass over the template: the digits after "@param" select the value.
    placeholder = re.compile(r"@param(\d+)\b")

    for method in methods:
        # Form the predictions file path using the new structure
        predictions_path = f'0_mixture_test/{query_id}/{method}/training_{training_size}/confidence_0/predictions/'

        try:
            # Sorted so the chosen file does not depend on directory listing order.
            candidates = sorted(os.listdir(predictions_path))
        except FileNotFoundError:
            print(f"Directory not found for {method}, query_id={query_id}, training_size={training_size}")
            continue

        predictions_file = next(
            (
                os.path.join(predictions_path, filename)
                for filename in candidates
                if filename.startswith(f'{query_id}_') and filename.endswith('_batch_0.csv')
            ),
            None,
        )
        if predictions_file is None:
            print(f"No prediction file found for {method}, query_id={query_id}, training_size={training_size}")
            continue

        # Read and process the predictions file
        predictions_df = pd.read_csv(predictions_file)

        # Read the testing data JSON
        testing_json_path = f'0_mixture_test/{query_id}/{query_id}_mixture_test.json'
        try:
            with open(testing_json_path, 'r') as f:
                query_data = json.load(f)
        except FileNotFoundError:
            print(f"Testing JSON not found for query_id={query_id}")
            continue

        # Process each prediction row
        results = []
        for _, pred_row in predictions_df.iterrows():
            # NOTE(review): this used to be eval(); literal_eval parses the same
            # list literal without executing arbitrary code from the CSV.
            # Assumes the params column holds a plain Python literal - confirm.
            params_list = ast.literal_eval(pred_row['params'])
            values = {
                str(index): str(value).strip()
                for index, value in enumerate(params_list)
            }

            template = query_data[query_id]['query']

            # A callable replacement inserts the value verbatim: backslashes in a
            # parameter are not read as regex escapes, and placeholder-looking
            # text inside a value is not substituted a second time. Placeholders
            # without a matching parameter are left as they are.
            query = placeholder.sub(
                lambda match: values.get(match.group(1), match.group(0)),
                template,
            )

            results.append({
                'query': query,
                'plan_id': pred_row['plan_id'],
                'plan_content': pred_row['plan_content']
            })

        # Save results to CSV
        output_df = pd.DataFrame(results)
        os.makedirs('0_mixture_plans', exist_ok=True)
        output_filename = f'0_mixture_plans/{query_id}_workload_{method}_training_size_{training_size}.csv'
        output_df.to_csv(output_filename, index=False)
        print(f"Generated: {output_filename}")

# Inputs for this run: which queries and which training sizes to process.
query_ids = ['3-0']
training_sizes = [400]

# Cartesian product of the two lists, query-major.
combinations = [
    (query_id, training_size)
    for query_id in query_ids
    for training_size in training_sizes
]

# Report how much work is queued.
print(f"Processing {len(combinations)} combinations...")

# Announce and process one pair at a time.
for query_id, training_size in combinations:
    print(f"\nProcessing query_id={query_id}, training_size={training_size}")
    process_data(query_id=query_id, training_size=training_size)

Processing 1 combinations...

Processing query_id=3-0, training_size=400
Generated: 0_mixture_plans/3-0_workload_cardinality_training_size_400.csv


### Others

#### Generate single-param testing files from the original testing JSON

In [2]:
import json
import os
import shutil

def process_json_file(query_id, method):
    """
    Write a copy of a query's testing JSON that keeps only its first params entry.

    Reads ``{query_id}_testing_original.json`` from
    ``0_finished_repo/imdb_{query_id}_original/{method}/inputs/testing`` and
    writes ``{query_id}_testing_original_single.json`` next to it.

    Args:
        query_id (str): Key of the query inside the JSON file (also part of the path).
        method (str): Method sub-directory to read from and write to.

    Returns:
        bool: True when the output file was written, False when anything
        failed (the error is printed, not raised).
    """
    testing_dir = os.path.join(
        "0_finished_repo",
        f"imdb_{query_id}_original",
        method,
        "inputs",
        "testing",
    )
    source_file = os.path.join(testing_dir, f"{query_id}_testing_original.json")
    target_file = os.path.join(testing_dir, f"{query_id}_testing_original_single.json")

    try:
        if not os.path.exists(source_file):
            raise FileNotFoundError(f"Input file not found: {source_file}")

        # Load the full testing file.
        with open(source_file, 'r', encoding='utf-8') as source:
            payload = json.load(source)

        if query_id not in payload:
            raise KeyError(f"Query ID {query_id} not found in the JSON data")

        entry = payload[query_id]
        if "params" not in entry:
            raise KeyError(f"'params' not found in data[{query_id}]")

        # Truncate to the first parameter set; an empty list stays empty.
        all_params = entry["params"]
        if len(all_params) > 0:
            entry["params"] = [all_params[0]]

        os.makedirs(os.path.dirname(target_file), exist_ok=True)
        with open(target_file, 'w', encoding='utf-8') as target:
            json.dump(payload, target, indent=4, ensure_ascii=False)

        print(f"Successfully created: {target_file}")
        return True

    except Exception as error:
        # Every failure is reported and turned into a False result.
        print(f"Error processing file: {str(error)}")
        return False

def main():
    """Run process_json_file for queries 1-0 .. 33-0 (29-0 excluded) and every method.

    Prints one status line per (query, method) pair based on the boolean that
    process_json_file returns.
    """
    for number in range(1, 34):
        if number == 29:
            continue
        query_id = f"{number}-0"
        for method in ('cardinality', 'csv', 'kepler'):
            if process_json_file(query_id, method):
                print("Processing completed successfully")
            else:
                print("Processing failed")


if __name__ == "__main__":
    main()

Successfully created: 0_finished_repo/imdb_1-0_original/cardinality/inputs/testing/1-0_testing_original_single.json
Processing completed successfully
Successfully created: 0_finished_repo/imdb_1-0_original/csv/inputs/testing/1-0_testing_original_single.json
Processing completed successfully
Successfully created: 0_finished_repo/imdb_1-0_original/kepler/inputs/testing/1-0_testing_original_single.json
Processing completed successfully
Successfully created: 0_finished_repo/imdb_2-0_original/cardinality/inputs/testing/2-0_testing_original_single.json
Processing completed successfully
Successfully created: 0_finished_repo/imdb_2-0_original/csv/inputs/testing/2-0_testing_original_single.json
Processing completed successfully
Successfully created: 0_finished_repo/imdb_2-0_original/kepler/inputs/testing/2-0_testing_original_single.json
Processing completed successfully
Successfully created: 0_finished_repo/imdb_3-0_original/cardinality/inputs/testing/3-0_testing_original_single.json
Processing

#### query template (1 predicate with multiple join_table_alias)

In [11]:
import os
import json

# Template ids "1-0" .. "33-0" with "29-0" left out.
query_ids = {f"{number}-0" for number in range(1, 34) if number != 29}
overall = set()

for query_id in query_ids:
    file_path = f"imdb_input/original_template/{query_id}.json"
    try:
        with open(file_path, 'r') as f:
            data = json.load(f)
    except FileNotFoundError:
        print(f"File not found for query {query_id}")
        continue

    predicates = data[query_id]['predicates']

    # A query qualifies as soon as one predicate lists more than one join alias.
    if any(len(predicate['join_tables_alias']) > 1 for predicate in predicates):
        overall.add(query_id)

# String sort, so "10-0" comes before "3-0".
print("Overall:", sorted(overall))
print("Ignore:", sorted(query_ids - overall))

Overall: ['1-0', '10-0', '11-0', '12-0', '14-0', '15-0', '16-0', '18-0', '19-0', '20-0', '21-0', '22-0', '23-0', '24-0', '25-0', '26-0', '27-0', '28-0', '3-0', '30-0', '31-0', '33-0', '4-0', '5-0', '6-0', '7-0', '8-0', '9-0']
Ignore: ['13-0', '17-0', '2-0', '32-0']


#### query template (same table with multiple predicates)

In [2]:
import os

import re

# Template ids "1-0" .. "33-0" with "29-0" left out.
query_ids = [f"{i}-0" for i in range(1, 34) if i != 29]
overall = []


def has_and_keyword(line):
    """Return True when ``line`` contains ``AND`` as a standalone word.

    A plain substring test also fires on words that merely contain the letters
    (``BRAND``, ``ANDERSON``); the word boundaries rule those out while still
    accepting forms such as ``AND(``. The match stays case-sensitive, as the
    substring test was. An ``AND`` that is a separate word inside a quoted
    literal is still counted.
    """
    return re.search(r"\bAND\b", line) is not None


for query_id in query_ids:
    file_path = f"0_finished_repo/imdb_{query_id}_original/csv/inputs/PQO/join_predicates/{query_id}_50_training.txt"
    try:
        with open(file_path, 'r') as f:
            # Only the first line of the join-predicates file is inspected.
            first_line = f.readline().strip()
    except FileNotFoundError:
        print(f"File not found for query {query_id}")
        continue

    contains_and = has_and_keyword(first_line)
    print(f"{query_id}: {first_line}")
    print(f"Contains AND: {contains_and}\n")
    if contains_and:
        overall.append(query_id)

print("Overall:", overall)

1-0: ["(mc.note LIKE '%(200%)%' OR mc.note LIKE '%(France)%') AND mc.note NOT LIKE '%(USA)%'", "ct.kind = 'production companies'", "it.info = 'votes'"],
Contains AND: True

2-0: ["cn.country_code = '[us]'", "k.keyword = 'marvel-cinematic-universe'"],
Contains AND: False

3-0: ["k.keyword LIKE '%sequel%'", "mi.info IN ('Horror', 'Action', 'Sci-Fi', 'Thriller', 'Crime', 'War')", "t.production_year > 2000"],
Contains AND: False

4-0: ["it.info = 'rating'", "k.keyword LIKE '%sequel%'", "mi_idx.info > '6.5'", "t.production_year > 2010"],
Contains AND: False

5-0: ["ct.kind = 'distributors'", "mc.note LIKE '%(VHS)%' AND mc.note LIKE '%(Japan)%'", "mi.info IN ('Horror', 'Action', 'Sci-Fi', 'Thriller', 'Crime', 'War')", "t.production_year > 2000"],
Contains AND: True

6-0: ["k.keyword = 'sequel'", "n.name LIKE '%G%'", "t.production_year > 2005"],
Contains AND: False

7-0: ["an.name LIKE 'H%'", "it.info = 'mini biography'", "lt.link = 'features'", "n.gender = 'm' OR (n.gender = 'f' AND n.name L

#### query template OR condition check

In [14]:
import os
import json
import glob

def search_or_queries(directory_path):
    """
    Collect the query templates in a directory whose SQL contains the OR keyword.

    Every ``*.json`` file directly inside ``directory_path`` is loaded; the file
    name without its extension is used as the query id. A file is reported when
    ``data[query_id]["query"]`` is a string containing ``OR`` as a standalone
    word, whatever whitespace (spaces, tabs, line breaks) surrounds it.

    Args:
        directory_path (str): Directory holding the template JSON files.

    Returns:
        list[tuple[str, str]]: ``(query_id, query)`` pairs in glob order.
        Files that cannot be read or parsed are reported with ``print`` and
        skipped.
    """
    import re  # local import: this cell only imports os, json and glob

    # Word boundaries accept "\nOR " and "(a OR\nb)" but not "ORDER" or "FOR".
    or_keyword = re.compile(r"\bOR\b")

    matches = []
    search_pattern = os.path.join(directory_path, '*.json')

    for json_file in glob.glob(search_pattern):
        try:
            query_id = os.path.splitext(os.path.basename(json_file))[0]

            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)

            if query_id in data and "query" in data[query_id]:
                query = data[query_id]["query"]
                if isinstance(query, str) and or_keyword.search(query):
                    matches.append((query_id, query))

        except Exception as e:
            # Best effort: one bad file must not stop the scan.
            print(f"ERROR {json_file}: {str(e)}")

    return matches

def main():
    """Print every template under imdb_input/original_template reported by search_or_queries.

    Results are printed sorted by (query id, query text), after a header line
    giving the number of matches.
    """
    results = search_or_queries("imdb_input/original_template")

    print(f"\nfound {len(results)} contains OR predicate:")
    for query_id, query in sorted(results):
        # One call, same two output lines as printing name and query separately.
        print(f"\nname: {query_id}\nquery: {query}")


if __name__ == "__main__":
    main()


found 12 contains OR predicate:

name: 1-0
query: SELECT *
 FROM company_type AS ct,
 info_type AS it,
 movie_companies AS mc,
 movie_info_idx AS mi_idx,
 title AS t
 WHERE ct.id = mc.company_type_id
 AND t.id = mc.movie_id
 AND t.id = mi_idx.movie_id
 AND mc.movie_id = mi_idx.movie_id
 AND it.id = mi_idx.info_type_id
 AND ct.kind = '@param0'
 AND it.info = '@param1'
 AND mc.note NOT LIKE '@param2'
 AND (mc.note LIKE '@param3' OR mc.note LIKE '@param4');

name: 11-0
query: SELECT *
FROM company_name AS cn,
 company_type AS ct,
 keyword AS k,
 link_type AS lt,
 movie_companies AS mc,
 movie_keyword AS mk,
 movie_link AS ml,
 title AS t
WHERE lt.id = ml.link_type_id
 AND ml.movie_id = t.id
 AND t.id = mk.movie_id
 AND mk.keyword_id = k.id
 AND t.id = mc.movie_id
 AND mc.company_type_id = ct.id
 AND mc.company_id = cn.id
 AND ml.movie_id = mk.movie_id
 AND ml.movie_id = mc.movie_id
 AND mk.movie_id = mc.movie_id
AND cn.country_code != '@param0'
AND (cn.name LIKE '@param1' OR cn.name LIKE

#### pqo table alias

In [14]:
import json
import ast

def get_aliases(query_id):
    """
    Print and return the predicate aliases and the PQO table prefixes of a query.

    Aliases come from the ``predicates`` list of the cardinality testing JSON
    (``original_alias`` when present, otherwise ``alias``). Table prefixes come
    from the first line of the ``{query_id}_50_training.txt`` join-predicates
    file: the text before the first ``.`` of each predicate, stripped of
    ``(``, ``)`` and ``"``.

    Args:
        query_id (str): Query id used both in the paths and as the JSON key.

    Returns:
        tuple[list, list]: ``(alias_list, table_names)``.
    """
    inputs_dir = f"imdb_{query_id}_original/cardinality/inputs"

    with open(f"{inputs_dir}/testing/{query_id}_testing_original.json", 'r') as testing_file:
        testing_data = json.load(testing_file)

    # pred["alias"] is always looked up, even when "original_alias" exists.
    alias_list = [
        pred.get("original_alias", pred["alias"])
        for pred in testing_data[query_id]["predicates"]
    ]
    print(f'"{query_id}": {alias_list}')

    with open(f"{inputs_dir}/PQO/join_predicates/{query_id}_50_training.txt", 'r') as join_file:
        first_line = join_file.readline().strip()
    parsed_line = ast.literal_eval(first_line)

    def table_prefix(predicate_text):
        # Text before the first dot, without surrounding brackets/quotes.
        return predicate_text.split('.')[0].strip('()"')

    table_names = []
    for entry in parsed_line:
        if isinstance(entry, str):
            table_names.append(table_prefix(entry))
        elif isinstance(entry, list):
            # Nested list: only string items that contain a dot are used.
            table_names.extend(
                table_prefix(item)
                for item in entry
                if isinstance(item, str) and '.' in item
            )

    print(f'"{query_id}": {table_names}\n')

    return alias_list, table_names

# Queries 22-0 .. 33-0 without 29-0; each call prints its own two lines.
for query_id in (f"{number}-0" for number in (22, 23, 24, 25, 26, 27, 28, 30, 31, 32, 33)):
    aliases, tables = get_aliases(query_id)

"22-0": ['cn', 'it1', 'it2', 'k', 'kt', 'mc', 'mc', 'mi', 'mi_idx', 't']
"22-0": ['cn', 'it', 'it', 'k', 'kt', 'mc', 'mi', 'mi_idx', 't']

"23-0": ['cct', 'cn', 'it', 'kt', 'mi', 'mi', 'mi', 't']
"23-0": ['mi', 'cct', 'cn', 'it', 'kt', 't']

"24-0": ['ci', 'cn', 'it', 'k', 'mi', 'mi', 'n', 'n', 'rt', 't']
"24-0": ['mi', 'ci', 'cn', 'it', 'k', 'n', 'rt', 't']

"25-0": ['ci', 'it1', 'it2', 'k', 'mi', 'n']
"25-0": ['ci', 'it', 'it', 'k', 'mi', 'n']

"26-0": ['cct1', 'cct2', 'chn', 'chn', 'it2', 'k', 'kt', 'mi_idx', 't']
"26-0": ['chn', 'cct', 'cct', 'it', 'k', 'kt', 'mi_idx', 't']

"27-0": ['cct1', 'cct2', 'cn', 'cn', 'cn', 'ct', 'k', 'lt', 'mi', 't', 't']
"27-0": ['cn', 'cct', 'cct', 'ct', 'k', 'lt', 'mi', 't']

"28-0": ['cct1', 'cct2', 'cn', 'it1', 'it2', 'k', 'kt', 'mc', 'mc', 'mi', 'mi_idx', 't']
"28-0": ['cct', 'cct', 'cn', 'it', 'it', 'k', 'kt', 'mc', 'mi', 'mi_idx', 't']

"30-0": ['cct1', 'cct2', 'ci', 'it1', 'it2', 'k', 'mi', 'n', 't']
"30-0": ['cct', 'cct', 'ci', 'it', 'it', 'k',

#### Generate mixture queries

In [2]:
import json
import glob
import random
from pathlib import Path

def process_files(query_id, sample_size=200, seed=2024):
    """
    Build the mixture test set for one query from its three per-method testing files.

    Loads ``inputs/testing/{query_id}_testing_original.json`` from the
    ``cardinality``, ``csv`` and ``kepler`` sub-directories of
    ``imdb_{query_id}_original``, checks them with ``verify_consistency``,
    concatenates their ``params`` lists in that order, shuffles the result and
    keeps the first ``sample_size`` entries.

    The shuffle uses a private ``random.Random(seed)``. It produces the same
    order as seeding the module-level generator with the same seed, but leaves
    the process-wide random state untouched.

    Args:
        query_id (str): Query id used in the paths and as the JSON key.
        sample_size (int | None): Number of parameter sets to keep; ``None``
            keeps all of them. Defaults to 200.
        seed (int): Seed for the shuffle. Defaults to 2024.

    Returns:
        dict: ``{query_id: {"query": ..., "predicates": ..., "params": ...}}``
        with query and predicates taken from the cardinality file.

    Raises:
        ValueError: If ``sample_size`` is negative, or if ``verify_consistency``
            rejects the three datasets.
    """
    if sample_size is not None and sample_size < 0:
        raise ValueError(f"sample_size must be non-negative, got {sample_size}")

    base_path = Path(f"imdb_{query_id}_original")

    # Testing file path is relative to each component directory
    test_path = f"inputs/testing/{query_id}_testing_original.json"

    # Same order as the merge below: cardinality, csv, kepler.
    datasets = []
    for component in ("cardinality", "csv", "kepler"):
        with open(base_path / component / test_path) as f:
            datasets.append(json.load(f))
    card_test_data = datasets[0]

    # Verify consistency of query and predicates across all datasets
    if not verify_consistency(datasets, query_id):
        raise ValueError(f"Inconsistent query or predicates for {query_id}")

    # Merge params from all sources
    all_params = [
        param_set
        for dataset in datasets
        for param_set in dataset[query_id]['params']
    ]

    # Deterministic shuffle, then keep the leading sample_size parameter sets.
    random.Random(seed).shuffle(all_params)
    sampled_params = all_params[:sample_size]

    # Build result in required format
    return {
        query_id: {
            "query": card_test_data[query_id]['query'],  # verified equal across files
            "predicates": card_test_data[query_id]['predicates'],
            "params": sampled_params
        }
    }

def verify_consistency(data_list, query_id):
    """
    Check that the per-method testing datasets describe the same query.

    The query text must be equal in all three datasets. Predicates are compared
    only between the first two (cardinality and csv); the third dataset's
    predicates are not looked at.

    Args:
        data_list: List containing [card_test_data, csv_test_data, kepler_test_data]
        query_id: The query ID being processed
    Returns:
        True if both checks pass, False otherwise (including when fewer than
        three datasets are supplied)
    """
    # All three datasets are required.
    if len(data_list) < 3:
        return False

    card_data, csv_data, kepler_data = (data_list[position] for position in range(3))

    def field(dataset, key):
        # Value stored under `key` for this query in one dataset.
        return dataset[query_id][key]

    queries_match = (
        field(card_data, 'query')
        == field(csv_data, 'query')
        == field(kepler_data, 'query')
    )

    # Deliberately limited to cardinality vs csv.
    predicates_match = field(card_data, 'predicates') == field(csv_data, 'predicates')

    return queries_match and predicates_match

def process_all_queries():
    """Build and save the mixture test file for every imdb_*_original directory.

    Query ids are the second underscore-separated field of each name matching
    ``imdb_*_original`` in the current directory. Each result from
    process_files is written to ``0_mixture_test/{qid}/{qid}_mixture_test.json``.
    """
    query_ids = [dir_name.split('_')[1] for dir_name in glob.glob("imdb_*_original")]
    print(query_ids)

    for qid in query_ids:
        mixture = process_files(qid)

        # One directory and one JSON file per query.
        target_dir = Path(f"0_mixture_test/{qid}")
        target_dir.mkdir(parents=True, exist_ok=True)
        target_file = target_dir / f"{qid}_mixture_test.json"

        with open(target_file, 'w') as handle:
            json.dump(mixture, handle, indent=2)

# Execute the processing
# process_all_queries()

['33-0']
