### Prerequisite: the split-rules file `PQO_to_Kepler_split.json` must be present in the working directory

In [17]:
import json
import os

def load_json_file(file_path):
    """Read the UTF-8 text file at ``file_path`` and return its parsed JSON content."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        parsed = json.loads(handle.read())
    return parsed

def extract_params(value, param_splits, param_index):
    """
    Cut one parameter value out of ``value`` using a list of delimiters.

    - Two delimiters ``[a, b]``: the text after the first ``a`` up to the next ``b``.
    - Three delimiters ``[a, b, c]``: the text after the first ``a``, then after the
      first ``b`` within that remainder, up to the next ``c``.
    - Any other number of delimiters: returns ``None``.

    Values for ``param_index`` 14 and above (@param14, @param15) are returned as ``int``;
    all others are returned as the extracted string.
    An ``IndexError`` propagates when a leading delimiter is absent from ``value``.
    """
    rule_count = len(param_splits)
    if rule_count not in (2, 3):
        return None

    # Everything to the right of the first delimiter.
    remainder = value.split(param_splits[0])[1]
    if rule_count == 3:
        # The three-delimiter form narrows once more before the closing cut.
        remainder = remainder.split(param_splits[1])[1]

    # The last delimiter always marks where the value ends.
    extracted = remainder.split(param_splits[-1])[0]

    return int(extracted) if param_index >= 14 else extracted

def process_single_dataset(data, split_data):
    """
    Build one parameter list per entry of ``data`` (training or testing set).

    For every value in ``data`` the split rules stored in ``split_data`` under the
    keys ``"@param0"`` .. ``"@param15"`` are applied in order via ``extract_params``;
    results that come back as ``None`` are left out. The returned list follows the
    iteration order of ``data``.
    """
    def params_for(entry):
        # Lazily evaluate @param0..@param15 in order, then drop the None results.
        candidates = (
            extract_params(entry, split_data[f"@param{idx}"], idx)
            for idx in range(16)
        )
        return [candidate for candidate in candidates if candidate is not None]

    return [params_for(data[key]) for key in data.keys()]

def process_queries(method_path, split_data):
    """
    Extract parameters from the 29-0 training and testing query files and save them.

    Reads ``29-0_training_50.json`` and ``29-0_testing.json`` from
    ``<method_path>/inputs/PQO/query``, runs ``process_single_dataset`` on each, and
    writes the results to ``training_params.json`` and ``testing_params.json``
    directly under ``method_path``. Both inputs are loaded and processed before
    either output file is written.
    """
    query_dir = os.path.join(method_path, 'inputs', 'PQO', 'query')
    input_names = ('29-0_training_50.json', '29-0_testing.json')
    output_names = ("training_params.json", "testing_params.json")

    # Load everything first, then transform, so a bad input aborts before any write.
    loaded = [load_json_file(os.path.join(query_dir, name)) for name in input_names]
    extracted = [process_single_dataset(dataset, split_data) for dataset in loaded]

    # json.dump keeps its default ensure_ascii=True, so non-ASCII characters are
    # written as \uXXXX escapes.
    for output_name, params in zip(output_names, extracted):
        with open(os.path.join(method_path, output_name), 'w', encoding='utf-8') as sink:
            json.dump(params, sink, indent=4)

def main():
    """
    Run parameter extraction for the hard-coded 'kepler' method directory.

    The split rules are read from 'PQO_to_Kepler_split.json' in the current
    working directory.
    """
    split_data = load_json_file('PQO_to_Kepler_split.json')

    # The method directory is fixed here; edit the literal to target another method.
    process_queries('kepler', split_data)

    print("Processing complete. training_params.json and testing_params.json have been created.")

# Run the extraction only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()

Processing complete. training_params.json and testing_params.json have been created.


In [13]:
import json
import os
import collections

def get_literal_frequencies(literals):
    """
    Count how many times each literal occurs.

    Each literal is serialized with ``json.dumps`` and that string is used as the key.
    Returns a ``collections.defaultdict(int)`` mapping serialized literal -> count.
    """
    counts = collections.defaultdict(int)

    for serialized in map(json.dumps, literals):
        counts[serialized] = counts[serialized] + 1

    return counts

def process_params_and_store_frequency(query_id, train_params, test_params, output_dir, train_size_list=(50, 400)):
    """
    Count how often each parameter list occurs and write the counts to JSON files.

    Frequencies are computed for the full training set, for the first ``n`` training
    entries of every ``n`` in ``train_size_list``, and for the test set. Each result
    is written under ``<output_dir>/frequency``:

    - ``<query_id>_train_<size>_freq.json`` for every training size
    - ``<query_id>_test_freq.json`` for the test set

    Args:
        query_id: The ID of the query; used as the output file-name prefix.
        train_params: List of parameter lists from the training set.
        test_params: List of parameter lists from the testing set.
        output_dir: Directory under which the ``frequency`` folder is created.
        train_size_list: Training-prefix sizes to process. Sizes larger than
            ``len(train_params)`` are skipped with a printed warning. ``None`` is
            treated as "no extra sizes".

    Returns:
        ``(train_dict_dict, test_dict)`` where ``train_dict_dict`` maps a training
        size to ``{json.dumps(literal): count}`` and ``test_dict`` is the same kind
        of mapping for the test set. Keys appear in first-occurrence order.
    """
    os.makedirs(output_dir, exist_ok=True)

    # get_literal_frequencies already holds exactly one key per distinct literal, so
    # its result is the frequency table. Going through set(map(tuple, ...)) as before
    # made the key order depend on set iteration order and merged literals that
    # compare equal as tuples but serialize differently (e.g. [1] and [1.0]).
    train_dict_dict = {len(train_params): dict(get_literal_frequencies(train_params))}
    test_dict = dict(get_literal_frequencies(test_params))

    for train_size in train_size_list or ():
        if train_size > len(train_params):
            print(f"Warning: Requested train_size {train_size} is larger than available training data {len(train_params)}")
            continue

        # Frequencies over the first `train_size` training entries only.
        train_dict_dict[train_size] = dict(get_literal_frequencies(train_params[:train_size]))

    # Create output directory
    output_dir_path = os.path.join(output_dir, "frequency")
    os.makedirs(output_dir_path, exist_ok=True)

    # Save train frequencies for each size
    for train_size, train_dict in train_dict_dict.items():
        train_output_file = os.path.join(output_dir_path, f"{query_id}_train_{train_size}_freq.json")
        with open(train_output_file, 'w', encoding='utf-8') as train_file:
            json.dump(train_dict, train_file, indent=4)

    # Save test frequencies
    test_output_file = os.path.join(output_dir_path, f"{query_id}_test_freq.json")
    with open(test_output_file, 'w', encoding='utf-8') as test_file:
        json.dump(test_dict, test_file, indent=4)

    return train_dict_dict, test_dict

def main():
    """
    Compute and store parameter frequencies for query 29-0 of the 'kepler' method.

    Loads ``training_params.json`` and ``testing_params.json`` from the method
    directory and writes the frequency files under ``<method_path>/frequency``.
    """
    # Load params
    method_path = 'kepler'

    # Context managers close the files promptly; a bare json.load(open(...)) leaves
    # the handle open until it is garbage collected.
    with open(os.path.join(method_path, 'training_params.json'), 'r', encoding='utf-8') as f:
        training_params = json.load(f)
    with open(os.path.join(method_path, 'testing_params.json'), 'r', encoding='utf-8') as f:
        testing_params = json.load(f)

    # Process for query 29-0
    query_id = '29-0'
    process_params_and_store_frequency(
        query_id=query_id,
        train_params=training_params,
        test_params=testing_params,
        output_dir=method_path,
        train_size_list=[50]
    )

    print(f"Processing complete. Frequency files have been created in {os.path.join(method_path, 'frequency')}")

# Run the frequency computation only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()

Processing complete. Frequency files have been created in kepler/frequency


In [None]:
import json
import os
from shutil import copyfile

def process_json_files(method, query_id='29-0'):
    """
    Attach the extracted parameter lists to the query template and write the
    training/testing input files.

    Reads ``<query_id>.json`` from the current working directory plus
    ``training_params.json`` and ``testing_params.json`` from ``method``. The
    template's ``[query_id]['params']`` entry is set to the training parameters for
    the files under ``<method>/inputs/training`` and to the testing parameters for
    the files under ``<method>/inputs/testing``. All files in a directory receive
    the same content.

    Args:
        method: Method directory (e.g. ``'kepler'``) holding the params files and
            receiving the ``inputs`` tree.
        query_id: Key inside the template and prefix of every file name.

    Raises:
        KeyError: if ``query_id`` is not a key of the template file.
    """
    def _read_json(path):
        # Read one JSON document as UTF-8, consistently for all three inputs.
        with open(path, 'r', encoding='utf-8') as f:
            return json.load(f)

    def _with_params(template, params):
        # Build a fresh outer and inner dict. A shallow .copy() would share the
        # nested dict, so setting 'params' for testing would also rewrite the
        # training payload (and the loaded template itself).
        return {**template, query_id: {**template[query_id], 'params': params}}

    def _write_json(payload, directory, file_names):
        # Write the same payload to each file name in the directory.
        for file_name in file_names:
            with open(os.path.join(directory, file_name), 'w', encoding='utf-8') as f:
                json.dump(payload, f, indent=2)

    # Read the original query template
    original_data = _read_json(f'{query_id}.json')

    # Read the training and testing parameters
    training_params = _read_json(os.path.join(method, 'training_params.json'))
    testing_params = _read_json(os.path.join(method, 'testing_params.json'))

    # Create the directory structure
    training_dir = os.path.join(method, 'inputs', 'training')
    testing_dir = os.path.join(method, 'inputs', 'testing')
    os.makedirs(training_dir, exist_ok=True)
    os.makedirs(testing_dir, exist_ok=True)

    # Build and save the training files
    training_data = _with_params(original_data, training_params)
    _write_json(training_data, training_dir, (
        f'{query_id}_training_distinct_50.json',
        f'{query_id}_training_original_50.json',
    ))

    # Build and save the testing files
    testing_data = _with_params(original_data, testing_params)
    _write_json(testing_data, testing_dir, (
        f'{query_id}_testing_distinct.json',
        f'{query_id}_testing_original.json',
        f'{query_id}_testing_original_single.json',
    ))

# Build the training/testing input files only when this code is executed directly.
if __name__ == '__main__':
    method = 'kepler'  # Replace with the actual method name
    process_json_files(method)

In [20]:
import json
import re

def process_query_instances(method, query_id="29-0", count=251):
    """
    Instantiate the query template once per parameter list and save the results.

    Reads ``<method>/inputs/training/<query_id>_training_original_50.json``, takes the
    ``query`` template and the ``params`` lists stored under ``query_id``, and replaces
    every ``@param<i>`` placeholder with the stripped string form of the i-th
    parameter. The instances are written to ``query_instances.json`` in the current
    working directory and also returned.

    Args:
        method: Method directory containing the ``inputs/training`` tree.
        query_id: Key inside the input file and prefix of its file name.
        count: Stop after this many instances have been generated. If the value is
            never reached (including 0 or negative values) every parameter list is
            processed.

    Returns:
        List of instantiated query strings, in the order of the parameter lists.
    """
    # Read the JSON file
    file_path = f"{method}/inputs/training/{query_id}_training_original_50.json"
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    query = data[query_id]["query"]
    param_list = data[query_id]["params"]

    # One pass over the template: "@param1" must not match inside "@param10"
    # (hence the trailing \b), and text inserted for one placeholder is never
    # scanned again for further placeholders.
    placeholder = re.compile(r"@param([0-9]+)\b")

    query_instances = []

    for params in param_list:
        replacements = {str(i): str(param).strip() for i, param in enumerate(params)}

        # A callable replacement inserts the value verbatim. A plain string would be
        # treated as a template, so backslashes in a parameter (e.g. "C:\dir") would
        # raise re.error or be rewritten. Placeholders without a matching parameter
        # are left untouched.
        query_instances.append(
            placeholder.sub(lambda m: replacements.get(m.group(1), m.group(0)), query)
        )

        if len(query_instances) == count:
            break

    # Save to JSON file
    with open('query_instances.json', 'w', encoding='utf-8') as f:
        json.dump(query_instances, f, indent=2)

    return query_instances

# Example usage: generate the query instances for one method when run directly.
if __name__ == "__main__":
    method = "kepler"  # Replace with the actual method name
    instances = process_query_instances(method)

In [22]:
import json

def compare_query_instances(method, query_id="29-0"):
    """
    Check the generated query instances against the PQO training queries.

    The values of ``<method>/inputs/PQO/query/<query_id>_training_50.json`` are compared,
    position by position and ignoring leading/trailing whitespace, with the list stored
    in ``query_instances.json`` (current working directory).

    Returns ``True`` when both sides have the same length and every pair matches.
    Otherwise prints the length mismatch or the first differing pair and returns
    ``False``.
    """
    # Reference queries: only the values of the PQO mapping matter.
    pqo_file_path = f"{method}/inputs/PQO/query/{query_id}_training_50.json"
    with open(pqo_file_path, 'r') as pqo_handle:
        expected = [*json.load(pqo_handle).values()]

    # Queries produced by the instantiation step.
    with open("query_instances.json", 'r') as generated_handle:
        produced = json.load(generated_handle)

    if len(expected) != len(produced):
        print(f"Length mismatch: PQO has {len(expected)} queries, Generated has {len(produced)} queries")
        return False

    # Index of the first pair that differs after stripping, or None if all agree.
    first_diff = next(
        (
            position
            for position, (want, got) in enumerate(zip(expected, produced))
            if want.strip() != got.strip()
        ),
        None,
    )

    if first_diff is not None:
        print(f"Mismatch at index {first_diff}:")
        print(f"PQO query: {expected[first_diff]}")
        print(f"Generated query: {produced[first_diff]}")
        return False

    print("All queries match!")
    return True

# Example usage: verify the generated instances for one method when run directly.
if __name__ == "__main__":
    method = "kepler"  # Replace with the actual method name
    are_equal = compare_query_instances(method)

All queries match!
