In [1]:
from utils.query_handler import QueryHandler

In [2]:
# Notebook-level handler. Note that worker_process() further down constructs
# its own QueryHandler on every call and does not use this instance.
handler = QueryHandler()

In [3]:
from utils.dataset_creator import QueryTemplateGenerator, MimicSchema, BasicSelectGenerator

In [4]:
# Build the schema helper, generate its sample values, then print its status.
schema = MimicSchema()
# NOTE(review): the recorded output reports a default sample file being found,
# so this presumably loads cached samples instead of regenerating — confirm in
# utils.dataset_creator.
schema.generate_sample_values()
schema.check_status()


        Current Status:
            max_workers set: 35
            sample_size set: 150
            sample values stored: False
        
Default Sample File found at /home/aaryan/Documents/Ashoka/Sem_8/Capstone_Thesis/NL2SQL_MIMIC/data/custom_dataset/sample_data/default.json

        Current Status:
            max_workers set: 35
            sample_size set: 150
            sample values stored: True
        


In [5]:
# Both generators are constructed from the same schema object.
template = QueryTemplateGenerator(schema)
simple_query_generator = BasicSelectGenerator(schema)

In [6]:
# Spot-check a single filter. In the recorded run this returned a 2-tuple of
# (SQL condition string, natural-language description of that condition).
template.random_filter("admissions", "discharge_location")

("discharge_location = 'HEALTHCARE FACILITY'",
 'discharge location matching HEALTHCARE FACILITY')

In [7]:
import random
from tqdm import tqdm

import pandas as pd

# NOTE(review): store, sql_errors, function_errors and queries are not read by
# any later cell visible in this notebook — confirm before removing them.
store = []
sql_errors = 0
function_errors = 0
queries = []

# Ask the generator for 1000 queries and load them into a DataFrame; the
# recorded output shows `question` and `query` columns.
query_df = pd.DataFrame.from_dict(simple_query_generator.generate(1000))
query_df

Generating Queries: 100%|██████████| 1000/1000 [00:00<00:00, 27740.84it/s]


Unnamed: 0,question,query
0,What are the hadm id of patient procedures whe...,SELECT hadm_id FROM procedures_icd WHERE chart...
1,"Get icd version, icd code for procedure codes ...","SELECT icd_version, icd_code FROM d_icd_proced..."
2,Get all items for omr where seq num more than ...,SELECT * FROM omr WHERE seq_num > 16 OR subjec...
3,"Show me hadm id, subject id for patient diagno...","SELECT hadm_id, subject_id FROM diagnoses_icd ..."
4,"What are the storetime, value, hadm id of date...","SELECT storetime, value, hadm_id FROM datetime..."
...,...,...
995,Get emar id for emar detail where product code...,SELECT emar_id FROM emar_detail WHERE product_...
996,List hadm id from ICU stays give me 100 examples.,SELECT hadm_id FROM icustays LIMIT 100 ;
997,"List valuenum, warning from chart entries.","SELECT valuenum, warning FROM chartevents ;"
998,Show me caregiver id for caregiver where careg...,SELECT caregiver_id FROM caregiver WHERE careg...


In [8]:
import multiprocessing
from tqdm import tqdm

In [9]:
def worker_process(query):
    """Run one query on a QueryHandler built just for this call.

    Returns a dict describing the outcome:
      - success: ``{'success': True, 'result': (result, query)}``
      - failure: ``{'success': False, 'result': query, 'error': str(exc)}``

    Any ``Exception`` raised while building the handler or executing the
    query is caught and reported through the failure dict instead of
    propagating.
    """
    try:
        # A new handler is constructed on every call rather than shared.
        outcome = QueryHandler().pretty_execute(query)
    except Exception as exc:
        return {'success': False, 'result': query, 'error': str(exc)}
    return {'success': True, 'result': (outcome, query)}

In [10]:
def parallel_query_execution(queries, max_workers=4):
    """Execute queries in parallel and collect the successful results.

    Every query is handed to ``worker_process`` through a
    ``multiprocessing.Pool``. Results are consumed with ``imap_unordered``,
    so they arrive in completion order, not input order; each returned entry
    therefore carries its own query string.

    Args:
        queries: Iterable of query strings. It is materialised into a list,
            so generators are accepted as well as sequences.
        max_workers: Number of pool processes. ``None`` selects
            ``min(4, multiprocessing.cpu_count())``.

    Returns:
        List of ``(result, query)`` tuples for the queries whose worker
        reported success. Failed queries are reported on stdout and are
        not included in the returned list.
    """
    if max_workers is None:
        max_workers = min(4, multiprocessing.cpu_count())  # Conservative for SQLite

    # Materialise once so len() works for any iterable and the progress-bar
    # total matches what is actually submitted.
    queries = list(queries)

    results = []
    failed_queries = []

    # Skip spawning worker processes when there is nothing to run.
    if queries:
        with multiprocessing.Pool(max_workers) as pool:
            # Process with progress bar
            with tqdm(total=len(queries), desc="Executing queries") as pbar:
                for result in pool.imap_unordered(worker_process, queries, chunksize=5):
                    if result['success']:
                        output, query = result['result']
                        results.append((output, query))
                    else:
                        failed_queries.append(result['result'])
                        # tqdm.write keeps the message from breaking the bar.
                        tqdm.write(f"Query failed: {result['error']}")
                    pbar.update(1)

    print(f"\nCompleted {len(results)}/{len(queries)} queries successfully")
    print(f"Failed queries: {len(failed_queries)}")
    return results

In [11]:
# Run every generated SQL string through the worker pool (default max_workers=4).
# NOTE(review): the recorded output stops at 16/1000 with no completion
# summary, so this run may not have finished — confirm before relying on
# `results` in later cells.
results = parallel_query_execution(list(query_df["query"]))

Executing queries:   2%|▏         | 16/1000 [00:14<14:18,  1.15it/s]

: 

In [None]:
# Preview every 50th successful result: the query text, then its output.
# The loop is bounded by len(results) rather than a fixed 1000 because
# parallel_query_execution omits failed queries, so `results` can hold fewer
# entries than were submitted and a fixed bound would raise IndexError.
for i in range(0, len(results), 50):
    query_result, query_text = results[i]
    print(query_text)
    display(query_result)

In [None]:
import sqlite3
import pandas as pd
from datetime import datetime

def save_sqlite_schema_to_markdown(db_path, output_file='database_schema.md'):
    """
    Save complete SQLite database schema to a Markdown file with:
    - Table structures
    - Column details
    - Foreign keys
    - The creation SQL stored in sqlite_master for each table

    Args:
        db_path: Path of the SQLite database, passed to ``sqlite3.connect``.
        output_file: Path of the Markdown report; opened in write mode, so an
            existing file is overwritten.

    Tables whose names match ``sqlite_%`` are skipped. A confirmation line is
    printed once the report has been written.
    """
    # sqlite3's connection context manager only commits or rolls back; it does
    # not close the connection, so the connection is closed explicitly here.
    conn = sqlite3.connect(db_path)
    try:
        with open(output_file, 'w', encoding='utf-8') as f:
            # Write Markdown header
            f.write("# SQLite Database Schema Report\n\n")
            f.write(f"**Generated**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
            f.write(f"**Database**: `{db_path}`\n")
            f.write("\n---\n\n")

            # Get all tables together with their creation SQL in one query,
            # which avoids a string-built lookup per table later on.
            tables = pd.read_sql("""
                SELECT name, sql
                FROM sqlite_master
                WHERE type='table'
                AND name NOT LIKE 'sqlite_%'
                ORDER BY name;
            """, conn)

            for table, creation_sql in zip(tables['name'], tables['sql']):
                # Quote the identifier so names containing spaces, quotes or
                # keywords do not break the PRAGMA statements.
                quoted = '"' + table.replace('"', '""') + '"'

                # Write table header
                f.write(f"## Table: `{table}`\n\n")

                # Get and write table info
                table_info = pd.read_sql(f"PRAGMA table_info({quoted});", conn)
                f.write("### Columns\n\n")
                f.write(table_info.to_markdown(index=False, tablefmt="github") + "\n\n")

                # Get foreign keys if they exist
                fkeys = pd.read_sql(f"PRAGMA foreign_key_list({quoted});", conn)
                if not fkeys.empty:
                    f.write("### Foreign Keys\n\n")
                    f.write(fkeys.to_markdown(index=False, tablefmt="github") + "\n\n")

                f.write(f"### Creation SQL\n\n```sql\n{creation_sql}\n```\n\n")
                f.write("---\n\n")

        print(f"Markdown schema saved to {output_file}")
    finally:
        conn.close()

# Usage
# NOTE(review): both paths are relative, so the result depends on the
# notebook's current working directory.
save_sqlite_schema_to_markdown('data/mimic_data/mimic4.db', 'mimic4_schema.md')