In [1]:
from create_value_linking_dataset import ValueLinkingDatasetProcessor
import json
import os

# Build the value-linking dataset for the BIRD dev split: for every example,
# extract the tables, columns and values referenced by its gold SQL and store
# them next to the question and its evidence.
#
# The encoding is passed explicitly so reading the dataset does not depend on
# the platform's default locale encoding (JSON files are expected to be UTF-8).
# NOTE: the variable is called `train_data` although it holds the dev split.
with open("assets/dev_20240627/dev.json", "r", encoding="utf-8") as f:
    train_data = json.load(f)
with open("assets/dev_20240627/dev_tables.json", "r", encoding="utf-8") as f:
    schema_data = json.load(f)
processor = ValueLinkingDatasetProcessor(schema_data)


results = []
for query in train_data:
    sql_query = query["SQL"]
    db_id = query["db_id"]
    # Extract tables, columns, and values for the SQL query
    tables, columns, values = processor.extract_tables_columns_and_values(sql_query, db_id)
    results.append({
        "question": query["question"],
        "SQL": sql_query,
        "tables": tables,
        "columns": columns,
        "values": values,
        "db_id": db_id,
        "source": "bird_dev",
        "evidence": query["evidence"],
    })

# Save results to a JSON file
output_file_path = "assets/value_linking_dataset_bird_dev.json"
with open(output_file_path, "w", encoding="utf-8") as outfile:
    json.dump(results, outfile, indent=4)

print(f"Value linkning dataset has been saved to {output_file_path}")

Value linkning dataset has been saved to assets/value_linking_dataset_bird_dev.json


In [2]:

# Build the value-linking dataset for the Spider dev split. Spider stores the
# gold SQL under the "query" key and has no evidence text, so "evidence" is
# written as an empty string.
#
# The encoding is passed explicitly so reading the dataset does not depend on
# the platform's default locale encoding (JSON files are expected to be UTF-8).
# NOTE: the variable is called `train_data` although it holds the dev split.
with open("assets/spider_data/dev.json", "r", encoding="utf-8") as f:
    train_data = json.load(f)
with open("assets/spider_data/tables.json", "r", encoding="utf-8") as f:
    schema_data = json.load(f)
# Initialize the ValueLinkingDatasetProcessor with schema data
processor = ValueLinkingDatasetProcessor(schema_data)


results = []
for query in train_data:
    sql_query = query["query"]
    db_id = query["db_id"]
    # Extract tables, columns, and values for the SQL query
    tables, columns, values = processor.extract_tables_columns_and_values(sql_query, db_id)
    results.append({
        "question": query["question"],
        "SQL": sql_query,
        "tables": tables,
        "columns": columns,
        "values": values,
        "db_id": db_id,
        "source": "spider_dev",
        "evidence": "",
    })

# Save results to a JSON file
output_file_path = "assets/value_linking_dataset_spider_dev.json"
with open(output_file_path, "w", encoding="utf-8") as outfile:
    json.dump(results, outfile, indent=4)

print(f"Value linkning dataset has been saved to {output_file_path}")


Value linkning dataset has been saved to assets/value_linking_dataset_spider_dev.json


In [3]:

# Build the value-linking dataset for the Spider test split, then write the
# companion "list" file via processor.format_value_strings. Spider stores the
# gold SQL under the "query" key and has no evidence text, so "evidence" is
# written as an empty string.
#
# The encoding is passed explicitly so reading the dataset does not depend on
# the platform's default locale encoding (JSON files are expected to be UTF-8).
# NOTE: the variable is called `train_data` although it holds the test split.
with open("assets/spider_data/test.json", "r", encoding="utf-8") as f:
    train_data = json.load(f)
with open("assets/spider_data/test_tables.json", "r", encoding="utf-8") as f:
    schema_data = json.load(f)
# Initialize the ValueLinkingDatasetProcessor with schema data
processor = ValueLinkingDatasetProcessor(schema_data)


results = []
for query in train_data:
    sql_query = query["query"]
    db_id = query["db_id"]
    # Extract tables, columns, and values for the SQL query
    tables, columns, values = processor.extract_tables_columns_and_values(sql_query, db_id)
    results.append({
        "question": query["question"],
        "SQL": sql_query,
        "tables": tables,
        "columns": columns,
        "values": values,
        "db_id": db_id,
        "source": "spider_test",
        "evidence": "",
    })

# Save results to a JSON file
output_file_path = "assets/value_linking_dataset_spider_test.json"
with open(output_file_path, "w", encoding="utf-8") as outfile:
    json.dump(results, outfile, indent=4)

print(f"Value linkning dataset has been saved to {output_file_path}")

# Derive the list-formatted file from the dataset that was just written.
output_file_path_list = "assets/value_linking_dataset_list_spider_test.json"
processor.format_value_strings(output_file_path, output_file_path_list)

Value linkning dataset has been saved to assets/value_linking_dataset_spider_test.json


In [4]:
# Build the value-linking dataset for the BIRD train split: for every example,
# extract the tables, columns and values referenced by its gold SQL.
#
# The encoding is passed explicitly so reading the dataset does not depend on
# the platform's default locale encoding (JSON files are expected to be UTF-8).
with open("assets/train/train.json", "r", encoding="utf-8") as f:
    train_data = json.load(f)
with open("assets/train/train_tables.json", "r", encoding="utf-8") as f:
    schema_data = json.load(f)
# Initialize the ValueLinkingDatasetProcessor with schema data
processor = ValueLinkingDatasetProcessor(schema_data)


results = []
for query in train_data:
    sql_query = query["SQL"]
    db_id = query["db_id"]
    # Extract tables, columns, and values for the SQL query
    tables, columns, values = processor.extract_tables_columns_and_values(sql_query, db_id)
    results.append({
        "question": query["question"],
        "SQL": sql_query,
        "tables": tables,
        "columns": columns,
        "values": values,
        "db_id": db_id,
        "source": "bird_train",
        # Every other source writes an "evidence" key; emit it here as well so
        # merged records share one schema. Falls back to "" when an example
        # carries no evidence.
        "evidence": query.get("evidence", ""),
    })

# Save results to a JSON file
output_file_path = "assets/value_linking_dataset_bird_train.json"
with open(output_file_path, "w", encoding="utf-8") as outfile:
    json.dump(results, outfile, indent=4)

print(f"Value linkning dataset has been saved to {output_file_path}")

Value linkning dataset has been saved to assets/value_linking_dataset_bird_train.json


In [4]:
# Filter each per-source dataset, merge the filtered files, and attach a
# `values_list` field to every merged record. BIRD train is NOT included here;
# the result is written to ..._no_bird_train.json.
#
# NOTE(review): `processor` is not created in this cell; it is whichever
# instance the most recently executed cell left in the kernel. Confirm that
# filter_json_file / format_value_strings do not depend on that instance's
# schema, or create the processor explicitly here.
input_paths = [
    "assets/value_linking_dataset_bird_dev.json",
    "assets/value_linking_dataset_spider_dev.json",
    "assets/value_linking_dataset_spider_test.json",
]
output_paths = [
    "assets/value_linking_dataset_bird_dev_valid_values.json",
    "assets/value_linking_dataset_spider_dev_valid_values.json",
    "assets/value_linking_dataset_spider_test_valid_values.json",
]

for input_path, output_path in zip(input_paths, output_paths):
    processor.filter_json_file(input_path, output_path)
    # Print the number of records in the output file
    with open(output_path, "r") as f:
        data = json.load(f)
    print(f"Number of records in {output_path}: {len(data)}")

output_file_path = "assets/value_linking_valid_values.json"
# Merge the filtered output files and dump them to output_file_path
merged_data = []
for output_path in output_paths:
    with open(output_path, "r") as f:
        data = json.load(f)
        merged_data.extend(data)
with open(output_file_path, "w") as outfile:
    json.dump(merged_data, outfile, indent=4)
output_file_path_list = "assets/value_linking_valid_values_list.json"
processor.format_value_strings(output_file_path, output_file_path_list)

with open(output_file_path, "r") as f:
    data = json.load(f)
    print(f"Number of records in {output_file_path}: {len(data)}")

with open(output_file_path_list, "r") as f:
    data_list = json.load(f)
    # Report the size of the list file itself (previously this printed
    # len(data), which could never reveal a mismatch between the two files).
    print(f"Number of records in {output_file_path_list}: {len(data_list)}")

# The two files are paired positionally below; zip() would silently drop the
# surplus entries if they ever differed in length, so fail loudly instead.
if len(data) != len(data_list):
    raise ValueError(
        f"Record count mismatch: {output_file_path} has {len(data)} records "
        f"but {output_file_path_list} has {len(data_list)}"
    )

final_records = []
for record, list_values in zip(data, data_list):
    # Add the list as a field to the record
    record["values_list"] = list_values
    final_records.append(record)
# Save results to a JSON file
output_file_path = "assets/value_linking_valid_values_no_bird_train.json"
with open(output_file_path, "w") as outfile:
    json.dump(final_records, outfile, indent=4)

# Print the number of records in the output file
with open(output_file_path, "r") as f:
    data = json.load(f)
    print(f"Number of records in {output_file_path}: {len(data)}")

Number of records in assets/value_linking_dataset_bird_dev_valid_values.json: 976
Number of records in assets/value_linking_dataset_spider_dev_valid_values.json: 122
Number of records in assets/value_linking_dataset_spider_test_valid_values.json: 229
Number of records in assets/value_linking_valid_values.json: 1327
Number of records in assets/value_linking_valid_values_list.json: 1327
Number of records in assets/value_linking_valid_values_no_bird_train.json: 1327


In [6]:
# Filter each per-source dataset (including BIRD train), merge the filtered
# files, and attach a `values_list` field to every merged record.
#
# NOTE(review): `processor` is not created in this cell; it is whichever
# instance the most recently executed cell left in the kernel. Confirm that
# filter_json_file / format_value_strings do not depend on that instance's
# schema, or create the processor explicitly here.
input_paths = [
    "assets/value_linking_dataset_bird_dev.json",
    "assets/value_linking_dataset_spider_dev.json",
    "assets/value_linking_dataset_spider_test.json",
    "assets/value_linking_dataset_bird_train.json"
]
output_paths = [
    "assets/value_linking_dataset_bird_dev_valid_values.json",
    "assets/value_linking_dataset_spider_dev_valid_values.json",
    "assets/value_linking_dataset_spider_test_valid_values.json",
    "assets/value_linking_dataset_bird_train_valid_values.json"
]

for input_path, output_path in zip(input_paths, output_paths):
    processor.filter_json_file(input_path, output_path)
    # Print the number of records in the output file
    with open(output_path, "r") as f:
        data = json.load(f)
    print(f"Number of records in {output_path}: {len(data)}")

output_file_path = "assets/value_linking_valid_values.json"
# Merge the four filtered output files and dump them to output_file_path
merged_data = []
for output_path in output_paths:
    with open(output_path, "r") as f:
        data = json.load(f)
        merged_data.extend(data)
with open(output_file_path, "w") as outfile:
    json.dump(merged_data, outfile, indent=4)
output_file_path_list = "assets/value_linking_valid_values_list.json"
processor.format_value_strings(output_file_path, output_file_path_list)

with open(output_file_path, "r") as f:
    data = json.load(f)
    print(f"Number of records in {output_file_path}: {len(data)}")

with open(output_file_path_list, "r") as f:
    data_list = json.load(f)
    # Report the size of the list file itself (previously this printed
    # len(data), which could never reveal a mismatch between the two files).
    print(f"Number of records in {output_file_path_list}: {len(data_list)}")

# The two files are paired positionally below; zip() would silently drop the
# surplus entries if they ever differed in length, so fail loudly instead.
if len(data) != len(data_list):
    raise ValueError(
        f"Record count mismatch: {output_file_path} has {len(data)} records "
        f"but {output_file_path_list} has {len(data_list)}"
    )

final_records = []
for record, list_values in zip(data, data_list):
    # Add the list as a field to the record
    record["values_list"] = list_values
    final_records.append(record)
# Save results to a JSON file. This is the same path as the intermediate merged
# file written above, so that file is overwritten with the enriched records.
output_file_path = "assets/value_linking_valid_values.json"
with open(output_file_path, "w") as outfile:
    json.dump(final_records, outfile, indent=4)

# Print the number of records in the output file
with open(output_file_path, "r") as f:
    data = json.load(f)
    print(f"Number of records in {output_file_path}: {len(data)}")

Number of records in assets/value_linking_dataset_bird_dev_valid_values.json: 976
Number of records in assets/value_linking_dataset_spider_dev_valid_values.json: 122
Number of records in assets/value_linking_dataset_spider_test_valid_values.json: 229
Number of records in assets/value_linking_dataset_bird_train_valid_values.json: 6286
Number of records in assets/value_linking_valid_values.json: 7613
Number of records in assets/value_linking_valid_values_list.json: 7613
Number of records in assets/value_linking_valid_values.json: 7613


In [5]:
from create_value_linking_dataset import ValueLinkingDatasetProcessor
# Run the question-value filter over the dataset built without BIRD train.
# (Presumably keeps records whose values occur in the question text — confirm
# against ValueLinkingDatasetProcessor.filter_json_by_question_values.)
input_file = "assets/value_linking_valid_values_no_bird_train.json"
output_file = "assets/value_linking_valid_values_exact_no_bird_train.json"
ValueLinkingDatasetProcessor.filter_json_by_question_values(input_file, output_file)

Final number of records: 1006


In [8]:
from create_value_linking_dataset import ValueLinkingDatasetProcessor
# Run the question-value filter over the full merged dataset (BIRD train
# included). (Presumably keeps records whose values occur in the question
# text — confirm against
# ValueLinkingDatasetProcessor.filter_json_by_question_values.)
input_file = "assets/value_linking_valid_values.json"
output_file = "assets/value_linking_valid_values_exact.json"
ValueLinkingDatasetProcessor.filter_json_by_question_values(input_file, output_file)

Final number of records: 5547


In [6]:
from create_value_linking_dataset import ValueLinkingDatasetProcessor

# Produce the typo variant of the exact-match dataset (no BIRD train) by
# handing it to ValueLinkingDatasetProcessor.introduce_typos_in_question.
input_file = "assets/value_linking_valid_values_exact_no_bird_train.json"
output_file = "assets/value_linking_valid_values_typos.json"
ValueLinkingDatasetProcessor.introduce_typos_in_question(input_file, output_file)

Final number of records: 1006


In [None]:
from create_value_linking_dataset import ValueLinkingDatasetProcessor
# Produce the synonym variant of the full exact-match dataset by handing it to
# ValueLinkingDatasetProcessor.generate_synonyms_with_vllm_parsed.
input_file = "assets/value_linking_valid_values_exact.json"
output_file = "assets/value_linking_valid_values_synonyms.json"
ValueLinkingDatasetProcessor.generate_synonyms_with_vllm_parsed(input_file, output_file)

In [3]:
from create_value_linking_dataset import ValueLinkingDatasetProcessor
# Export the exact-match dataset (no BIRD train) into the CHESS data folder
# through ValueLinkingDatasetProcessor.prepare_data_chess.
input_file = "assets/value_linking_valid_values_exact_no_bird_train.json"
output_path = "CHESS/data/value_linking/value_linking_valid_values_exact_no_bird_train.json"
ValueLinkingDatasetProcessor.prepare_data_chess(input_file, output_path)

In [3]:
from create_value_linking_dataset import ValueLinkingDatasetProcessor
# Hand the exact-match dataset (no BIRD train) and a target folder to
# ValueLinkingDatasetProcessor.copy_databases.
# NOTE(review): output_folder is an absolute, user-specific path; the cell only
# runs unchanged on a machine with that directory layout.
input_file = "assets/value_linking_valid_values_exact_no_bird_train.json"
output_folder = "/data/hdd1/users/akouk/value_linking/fresh_value_linking/experimental-analysis-of-value-inking/OmniSQL/value_linking/databases"
ValueLinkingDatasetProcessor.copy_databases(input_file, output_folder)

In [2]:
import json
def merge_json_files(input_file_paths: list[str], output_file_path: str):
    """
    Concatenate the record lists stored in several JSON files into one file.

    Each input file is parsed as JSON and its items are appended, in the order
    the paths are given, to a single list. That list is then written to
    ``output_file_path`` as JSON with a four-space indent.

    Args:
        input_file_paths: Paths of the JSON files to read, in merge order.
        output_file_path: Destination path for the merged JSON list.
    """
    merged = []
    for path in input_file_paths:
        with open(path, 'r', encoding='utf-8') as src:
            # In-place concatenation keeps the records in input order.
            merged += json.load(src)

    with open(output_file_path, 'w', encoding='utf-8') as dst:
        json.dump(merged, dst, indent=4)
        
# Combine the BIRD dev, Spider dev and Spider test schema files into a single
# tables file.
input_file_paths = [
    "assets/dev_20240627/dev_tables.json",
    "assets/spider_data/tables.json",
    "assets/spider_data/test_tables.json",
]
output_file_path = "assets/merged_tables.json"

merge_json_files(
    input_file_paths=input_file_paths,
    output_file_path=output_file_path,
)

In [3]:
# Paths for restricting the merged schema file to the databases present in the
# CHESS data folder.
input_file = "assets/merged_tables.json"
# NOTE(review): "databses" looks like a misspelling of "databases" — confirm
# the directory on disk really has this name before running.
input_folder = "CHESS/data/value_linking/databses"
output_file = "CHESS/data/value_linking/tables.json"

import json
from pathlib import Path

def filter_json_by_existing_db_ids(
    merged_json_path: str,
    databases_root_folder: str,
    output_json_path: str
):
    """
    Keep only the records whose 'db_id' names a subdirectory of a folder.

    The immediate subdirectories of ``databases_root_folder`` define the set of
    accepted ids. Records from ``merged_json_path`` whose 'db_id' is missing,
    falsy, or not in that set are dropped; the remaining records are written,
    in their original order, to ``output_json_path`` with a four-space indent.

    Args:
        merged_json_path: Path to the input JSON file (list of records).
        databases_root_folder: Path to the folder containing one subdirectory
                               per database.
        output_json_path: Path where the filtered JSON file will be saved.
    """
    # Only directories count; plain files inside the root are ignored.
    known_db_ids = {
        entry.name
        for entry in Path(databases_root_folder).iterdir()
        if entry.is_dir()
    }

    with open(merged_json_path, 'r', encoding='utf-8') as src:
        candidates = json.load(src)

    kept = [
        rec
        for rec in candidates
        if (db_id := rec.get("db_id")) and db_id in known_db_ids
    ]

    with open(output_json_path, 'w', encoding='utf-8') as dst:
        json.dump(kept, dst, indent=4)

# Write the schema records whose db_id has a matching database directory.
filter_json_by_existing_db_ids(input_file, input_folder, output_file)

In [None]:
from create_value_linking_dataset import ValueLinkingDatasetProcessor
# Call the OpenSearch-SQL prompt generator with three paths: the preprocessed
# dev file and two few-shot question files (one under Bird/, one under
# value_linking/). NOTE(review): which of these are inputs and which is the
# output is not visible here — check generate_prompts_for_eval_open_search.
ValueLinkingDatasetProcessor.generate_prompts_for_eval_open_search(
    "OpenSearch-SQL/value_linking/data_preprocess/dev.json",
    "OpenSearch-SQL/Bird/fewshot/questions.json",
    "OpenSearch-SQL/value_linking/fewshot/questions.json",
)

[2025-05-11 20:09:39,235] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/akouk/miniconda3/envs/tolis/bin/../lib/gcc/x86_64-conda-linux-gnu/11.2.0/../../../../x86_64-conda-linux-gnu/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/home/akouk/miniconda3/envs/tolis/bin/../lib/gcc/x86_64-conda-linux-gnu/11.2.0/../../../../x86_64-conda-linux-gnu/bin/ld: /home/akouk/miniconda3/envs/tolis/lib/libcufile.so: undefined reference to `pthread_rwlock_trywrlock@GLIBC_2.2.5'
/home/akouk/miniconda3/envs/tolis/bin/../lib/gcc/x86_64-conda-linux-gnu/11.2.0/../../../../x86_64-conda-linux-gnu/bin/ld: /home/akouk/miniconda3/envs/tolis/lib/libcufile.so: undefined reference to `pthread_getspecific@GLIBC_2.2.5'
/home/akouk/miniconda3/envs/tolis/bin/../lib/gcc/x86_64-conda-linux-gnu/11.2.0/../../../../x86_64-conda-linux-gnu/bin/ld: /home/akouk/miniconda3/envs/tolis/lib/libcufile.so: undefined reference to `pthread_rwlock_timedrdlock@GLIBC_2.2.5'
/home/akouk/miniconda3/envs/tolis/bin/../lib/gcc/x86_64-conda-linux-gnu/11.2.0/../../

: 

In [1]:
import json

# Count how many value entries in the exact-match dataset contain at least one
# space character.
with open('assets/value_linking_valid_values_exact.json', 'r') as f:
    data = json.load(f)

# One unit per value entry (not per record) whose "value" string has a space.
count_with_spaces = sum(
    1
    for item in data
    for val_obj in item["values"]
    if " " in val_obj["value"]
)
print(count_with_spaces)

2713
