# Setup / Parsing

In [None]:
# Download data
# `-p` keeps the cell re-runnable when the directories already exist.
!mkdir -p data
!mkdir -p output
# `-f` fails on HTTP errors instead of saving an error page as the archive; `-L` follows redirects.
!curl -fL -o data/schemapile-perm-sqlfiles.tar.gz https://zenodo.org/records/12682521/files/schemapile-perm-sqlfiles.tar.gz
# The archive was saved under data/, so it has to be read from there.
# NOTE(review): the trailing argument selects the archive member `data/sqlfiles_permissive`;
# confirm that this matches the layout inside the tarball.
!tar -xf data/schemapile-perm-sqlfiles.tar.gz data/sqlfiles_permissive

In [None]:
# Parse SQLGlot 11
import sys

# `%pip` installs into the environment of the running kernel (`!pip` uses whatever is first on PATH).
%pip install sqlparse pyarrow duckdb sqlglot==11.4.1
# `-p` keeps the cell re-runnable when the directory already exists.
!mkdir -p output/sqlglot-11
# Run the script with the kernel's interpreter so it sees the packages installed above.
!{sys.executable} sqlglot_eval_11.py

In [None]:
# Parse SQLGlot 25
import sys

# `%pip` installs into the environment of the running kernel (`!pip` uses whatever is first on PATH).
# This replaces any previously installed sqlglot version in that environment.
%pip install sqlglot==25.5.1
# `-p` keeps the cell re-runnable when the directory already exists.
!mkdir -p output/sqlglot-25
# Run the script with the kernel's interpreter so it sees the package installed above.
!{sys.executable} sqlglot_eval_25.py

In [None]:
# Parse pglast baseline
import sys

# `%pip` installs into the environment of the running kernel (`!pip` uses whatever is first on PATH).
%pip install pglast==6.2
# `-p` keeps the cell re-runnable when the directory already exists.
!mkdir -p output/pglast
# Run the script with the kernel's interpreter so it sees the package installed above.
!{sys.executable} pglast_eval.py

# Eval

In [250]:
import os
import json
import duckdb
import pandas as pd

In [251]:
# Connection to the DuckDB database file used by the evaluation cells below.
con = duckdb.connect(database="output/ddl_bench.ddb")

In [266]:
# Load every SQLGlot 11 result file and materialise the records as DuckDB table `sqlglot_11`.
output = "output/sqlglot-11/"

files = os.listdir(output)

records = []  # not named `all`, which would shadow the builtin for the rest of the session
count = 0
for file in files:
    # Skip Jupyter artifacts (any entry whose name contains ".ipynb").
    if ".ipynb" in file:
        continue
    # Close each result file right after reading instead of leaking one handle per file.
    with open(os.path.join(output, file)) as fh:
        # Element 2 of each result file holds the list of per-statement records.
        ast = json.load(fh)[2]
    records.extend(ast)
    # Running total of the `parsed` values over all records.
    count += sum(a['parsed'] for a in ast)

# `df` is referenced by name inside the SQL statement below, so the variable name must stay.
df = pd.DataFrame.from_records(records)
con.execute("CREATE OR REPLACE TABLE sqlglot_11 AS SELECT * FROM df")
count

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

1395799

In [243]:
# Load every SQLGlot 25 result file and materialise the records as DuckDB table `sqlglot_25`.
output = "output/sqlglot-25/"

files = os.listdir(output)

records = []  # not named `all`, which would shadow the builtin for the rest of the session
count = 0
for file in files:
    # Skip Jupyter artifacts (any entry whose name contains ".ipynb").
    if ".ipynb" in file:
        continue
    # Close each result file right after reading instead of leaking one handle per file.
    with open(os.path.join(output, file)) as fh:
        # Element 2 of each result file holds the list of per-statement records.
        ast = json.load(fh)[2]
    records.extend(ast)
    # Running total of the `parsed` values over all records.
    count += sum(a['parsed'] for a in ast)

# `df` is referenced by name inside the SQL statement below, so the variable name must stay.
df = pd.DataFrame.from_records(records)
con.execute("CREATE OR REPLACE TABLE sqlglot_25 AS SELECT * FROM df")

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

<duckdb.duckdb.DuckDBPyConnection at 0x8f13184f0>

In [245]:
# Load every pglast result file and materialise the records as DuckDB table `pglast`.
output = "output/pglast/"

files = os.listdir(output)

records = []  # not named `all`, which would shadow the builtin for the rest of the session
count = 0
for file in files:
    # Skip Jupyter artifacts (any entry whose name contains ".ipynb").
    if ".ipynb" in file:
        continue
    # Close each result file right after reading instead of leaking one handle per file.
    with open(os.path.join(output, file)) as fh:
        # Element 2 of each result file holds the list of per-statement records.
        ast = json.load(fh)[2]
    records.extend(ast)
    # Running total of the `parsed` values over all records.
    count += sum(a['parsed'] for a in ast)

# `df` is referenced by name inside the SQL statement below, so the variable name must stay.
df = pd.DataFrame.from_records(records)
con.execute("CREATE OR REPLACE TABLE pglast AS SELECT * FROM df")
count

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

1263565

In [269]:
# Agreement of SQLGlot 11 with the pglast baseline, restricted to statements pglast parsed
# (p.parsed = 1) and matched on file_id and statement_nr = statement_id.
#   parsed          : sum of SQLGlot `parsed` divided by sum of pglast `parsed`
#   tables, columns : share of rows where the case-insensitive intersection of both name lists
#                     is as long as pglast's list (denominator: rows with a non-null pglast list)
#   unique..foreign : share of joined rows where both tools report the same constraint count
# NOTE(review): if list_intersect de-duplicates its result, a pglast list containing repeated
# names can never match its own length — confirm against the data.
(
    con
    .execute("""SELECT sum(s.parsed)/sum(p.parsed) as parsed, 
sum((len(list_intersect(list_transform(p.table_list, x -> lower(x)), list_transform(s.table_list, x -> lower(x)))) = len(p.table_list))::int) / sum((p.table_list is not null)::int) as tables, 
sum((len(list_intersect(list_transform(p.columns_list, x -> lower(x)), list_transform(s.column_list, x -> lower(x)))) = len(p.columns_list))::int) / sum((p.columns_list is not null)::int) as columns,
sum((p.num_ctr_unique = s.num_ctr_unique)::int) / count(*) as unique,
sum((p.num_ctr_notnull = s.num_ctr_notnull)::int) / count(*) as notnull,
sum((p.num_ctr_primary = s.num_ctr_primary)::int) / count(*) as primary,
sum((p.num_ctr_foreign = s.num_ctr_foreign)::int) / count(*) as foreign,
FROM pglast as p INNER JOIN sqlglot_11 as s 
ON p.file_id = s.file_id AND p.statement_nr = s.statement_id 
WHERE p.parsed = 1
""")
    .fetchdf()
)

Unnamed: 0,parsed,tables,columns,unique,notnull,primary,foreign
0,0.93404,0.778929,0.857807,0.982371,0.974702,0.970468,0.962652


In [264]:
# Overall SQLGlot 11 parse success: sum of `parsed`, row count, and their ratio.
(
    con
    .execute("""SELECT sum(parsed) as parsed, count(*) as total, sum(parsed)/count(*) as success_rate FROM sqlglot_11""")
    .fetchdf()
)

Unnamed: 0,parsed,total,success_rate
0,2282491.0,2713000,0.841316


In [258]:
# Agreement of SQLGlot 25 with the pglast baseline, restricted to statements pglast parsed
# (p.parsed = 1) and matched on file_id and statement_nr = statement_id.
#   parsed          : sum of SQLGlot `parsed` divided by sum of pglast `parsed`
#   tables, columns : share of rows where the case-insensitive intersection of both name lists
#                     is as long as pglast's list (denominator: rows with a non-null pglast list)
#   unique..foreign : share of joined rows where both tools report the same constraint count
# NOTE(review): if list_intersect de-duplicates its result, a pglast list containing repeated
# names can never match its own length — confirm against the data.
(
    con
    .execute("""SELECT sum(s.parsed)/sum(p.parsed) as parsed, 
sum((len(list_intersect(list_transform(p.table_list, x -> lower(x)), list_transform(s.table_list, x -> lower(x)))) = len(p.table_list))::int) / sum((p.table_list is not null)::int) as tables, 
sum((len(list_intersect(list_transform(p.columns_list, x -> lower(x)), list_transform(s.column_list, x -> lower(x)))) = len(p.columns_list))::int) / sum((p.columns_list is not null)::int) as columns,
sum((p.num_ctr_unique = s.num_ctr_unique)::int) / count(*) as unique,
sum((p.num_ctr_notnull = s.num_ctr_notnull)::int) / count(*) as notnull,
sum((p.num_ctr_primary = s.num_ctr_primary)::int) / count(*) as primary,
sum((p.num_ctr_foreign = s.num_ctr_foreign)::int) / count(*) as foreign,
FROM pglast as p INNER JOIN sqlglot_25 as s 
ON p.file_id = s.file_id AND p.statement_nr = s.statement_id 
WHERE p.parsed = 1
""")
    .fetchdf()
)

Unnamed: 0,parsed,tables,columns,unique,notnull,primary,foreign
0,0.978623,0.954494,0.942057,0.999555,0.982652,0.984404,0.989454


In [259]:
# Agreement of SQLGlot 25 with the pglast baseline, restricted to statements pglast parsed
# (p.parsed = 1) and matched on file_id and statement_nr = statement_id.
# NOTE(review): this query is identical to the SQLGlot 25 comparison in the preceding cell;
# consider removing one of the two.
(
    con
    .execute("""SELECT sum(s.parsed)/sum(p.parsed) as parsed, 
sum((len(list_intersect(list_transform(p.table_list, x -> lower(x)), list_transform(s.table_list, x -> lower(x)))) = len(p.table_list))::int) / sum((p.table_list is not null)::int) as tables, 
sum((len(list_intersect(list_transform(p.columns_list, x -> lower(x)), list_transform(s.column_list, x -> lower(x)))) = len(p.columns_list))::int) / sum((p.columns_list is not null)::int) as columns,
sum((p.num_ctr_unique = s.num_ctr_unique)::int) / count(*) as unique,
sum((p.num_ctr_notnull = s.num_ctr_notnull)::int) / count(*) as notnull,
sum((p.num_ctr_primary = s.num_ctr_primary)::int) / count(*) as primary,
sum((p.num_ctr_foreign = s.num_ctr_foreign)::int) / count(*) as foreign,
FROM pglast as p INNER JOIN sqlglot_25 as s 
ON p.file_id = s.file_id AND p.statement_nr = s.statement_id 
WHERE p.parsed = 1
""")
    .fetchdf()
)

Unnamed: 0,parsed,tables,columns,unique,notnull,primary,foreign
0,0.978623,0.954494,0.942057,0.999555,0.982652,0.984404,0.989454


In [265]:
# Overall SQLGlot 25 parse success: sum of `parsed`, row count, and their ratio.
(
    con
    .execute("""SELECT sum(parsed) as parsed, count(*) as total, sum(parsed)/count(*) as success_rate FROM sqlglot_25""")
    .fetchdf()
)

Unnamed: 0,parsed,total,success_rate
0,2630500.0,2713000,0.969591
