# Imports and file details

In [1]:
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.compute as pc

import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's default theme, then switch the axes style to "ticks".
sns.set_theme()
sns.set_style("ticks")

import os
import sys
# Put the parent directory first on the module search path
# (presumably so the `src_new` package imported below resolves — confirm
# against the repository layout).
sys.path.insert(0,'..')

import pandas as pd
import json
import numpy as np

import dask.dataframe as dd
from dask.dataframe.utils import make_meta

# Display configuration: show wide frames in full instead of truncating them.
pd.set_option('display.max_rows', 500)
pd.set_option('display.width', 1000)
# None means "no limit" for both the number of columns and the cell width.
# (-1 for max_colwidth is deprecated and rejected by recent pandas releases.)
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.max_colwidth', None)

from src_new.utils import SQLParserSchema, PGLastSchema, get_file_encodings, SQLGlotSchema, SimpleDDLParserSchema

from dask.diagnostics import ProgressBar
# Register a global dask progress bar; the "[####] | 100% Completed" lines
# in the cell outputs below are printed by it.
ProgressBar().register()

  pd.set_option('max_colwidth', -1)


In [2]:
def read_partial(parser, schema_type, cols):
    """Load selected columns of one parser's Parquet output as a dask DataFrame.

    Parameters
    ----------
    parser : str
        Which parser's output to read: 'sqlparser', 'pglast', 'sqlglot'
        or 'simple' (stored under the 'simpleddlparser' directory).
    schema_type : str
        'file' reads '../out_new/<parser dir>/' using the parser's
        ``file_level_schema``; 'stmt' reads '../out_new/<parser dir>_details/'
        using its ``statement_list_sch``.
    cols : list of str
        Column names passed to ``dd.read_parquet(columns=...)``.

    Returns
    -------
    The DataFrame returned by ``dd.read_parquet``.

    Raises
    ------
    ValueError
        If ``parser`` or ``schema_type`` is not one of the supported values.
    """
    # Validate both selectors first so that a typo gives a clear error instead
    # of an UnboundLocalError further down.
    if parser not in ('sqlparser', 'pglast', 'sqlglot', 'simple'):
        raise ValueError(
            "unknown parser %r; expected 'sqlparser', 'pglast', 'sqlglot' or 'simple'" % (parser,))
    if schema_type not in ('file', 'stmt'):
        raise ValueError(
            "unknown schema_type %r; expected 'file' or 'stmt'" % (schema_type,))

    _outdir = '../out_new/'

    if parser == 'sqlparser':
        p = SQLParserSchema()
        _outdir += 'sqlparser'
    elif parser == 'pglast':
        p = PGLastSchema()
        _outdir += 'pglast'
    elif parser == 'sqlglot':
        p = SQLGlotSchema()
        _outdir += 'sqlglot'
    else:  # 'simple'
        p = SimpleDDLParserSchema()
        _outdir += 'simpleddlparser'

    if schema_type == 'file':
        _schema = p.file_level_schema
        _outdir += '/'
    else:  # 'stmt'
        _schema = p.statement_list_sch
        _outdir += '_details/'

    ddf = dd.read_parquet(_outdir, columns=cols, schema=_schema, split_row_groups=True,
                          calculate_divisions=True, engine='pyarrow')
    return ddf

In [3]:
# Read file details: every file in the directory is parsed as JSON and its
# items are collected into one flat list, which becomes one DataFrame.
fd = []
for filename in os.listdir('../out_new/filedetails/'):
    full_filename = os.path.join('../out_new/filedetails/', filename)
    with open(full_filename, 'r') as fi:
        # Avoid naming the payload `dict`: that would rebind the builtin for
        # the rest of the notebook session.
        fd.extend(json.load(fi))

filedetails_df = pd.DataFrame(fd)
del fd  # the raw list is no longer needed once the frame exists

# SDP

In [4]:
# Load the simple-ddl-parser results at both granularities, reading every
# column declared in the corresponding schema.
_sdp_schema = SimpleDDLParserSchema()
sdp_ddf = read_partial(parser='simple', schema_type='file',
                       cols=_sdp_schema.file_level_schema.names)
sdp_details_ddf = read_partial(parser='simple', schema_type='stmt',
                               cols=_sdp_schema.statement_list_sch.names)

## File-level success rate (SR)

In [5]:
# Number of files per `parsed_file` status.
sdp_status_counts = sdp_ddf.groupby('parsed_file')['parsed_file'].count().compute()

# Only status 1 counts as a success here; every other status is a failure.
_is_success = sdp_status_counts.index == 1
sdp_sr = pd.DataFrame({
    'success': [0, 1],
    'count': [
        sdp_status_counts[~_is_success].sum(),
        sdp_status_counts[_is_success].sum(),
    ],
})
# Share of all known files (one row per file_id in the file-details table).
sdp_sr['percentage'] = sdp_sr['count'] / filedetails_df['file_id'].count()
display(sdp_sr)

[########################################] | 100% Completed | 809.55 ms


Unnamed: 0,success,count,percentage
0,0,308265,0.826102
1,1,64891,0.173898


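Success rate when files that raised an exception but still produced results (tables, columns or constraints) are also counted as successes.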

In [6]:
# Keep status 0 and 1 unconditionally, and status 2 only when at least one
# table, column or constraint was still extracted.
_status = sdp_ddf.parsed_file
_has_results = (
    (sdp_ddf.num_distinct_tables > 0) |
    (sdp_ddf.num_distinct_columns > 0) |
    (sdp_ddf.num_constraints > 0)
)
sdp_exc_ddf = sdp_ddf[(_status == 0) | (_status == 1) | ((_status == 2) & _has_results)]

sdp_exc_counts = sdp_exc_ddf.groupby('parsed_file')['parsed_file'].count().compute()

# Status 0 is the failure row; every remaining status in the filtered frame is a success.
_is_failure = sdp_exc_counts.index == 0
# NOTE(review): status-2 files without any results are removed by the filter
# above, so they appear in neither row and the two percentages do not sum
# to 1 — confirm this is intended.
sdp_exc_sr = pd.DataFrame({
    'success': [0, 1],
    'count': [
        sdp_exc_counts[_is_failure].sum(),
        sdp_exc_counts[~_is_failure].sum(),
    ],
})
sdp_exc_sr['percentage'] = sdp_exc_sr['count'] / filedetails_df['file_id'].count()
display(sdp_exc_sr)

[########################################] | 100% Completed | 1.29 sms


Unnamed: 0,success,count,percentage
0,0,37783,0.101253
1,1,223337,0.598508


In [7]:
# Cross-tabulate the parse status against the value-error flag.
# The columns are selected with a list: selecting several groupby columns
# with a bare tuple (`[...]['a', 'b']`) is deprecated in pandas and no longer
# accepted by pandas 2.
sdp_ddf.groupby(['parsed_file','value_error_present'])[['parsed_file','value_error_present']].count().compute()
# NOTE(review): the figures below do not match the rendered output of this
# cell; they look like leftovers from an earlier run — confirm or remove.
# 1 4.5k
# 0 19.4k

[########################################] | 100% Completed | 655.76 ms


Unnamed: 0_level_0,Unnamed: 1_level_0,parsed_file,value_error_present
parsed_file,value_error_present,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,37714,37714
1,0,62894,62894
1,1,1997,1997
2,0,270482,270482
0,1,69,69


## Matching with pglast

In [8]:
# (Re)load the file-level results of both parsers, with all schema columns.
pglast_ddf = read_partial(
    parser='pglast',
    schema_type='file',
    cols=PGLastSchema().file_level_schema.names,
)
sdp_ddf = read_partial(
    parser='simple',
    schema_type='file',
    cols=SimpleDDLParserSchema().file_level_schema.names,
)

Join the file-level results of both parsers on `file_id` and write the joined table to disk.

In [10]:
# Reload both frames so that re-running this cell always starts from the
# unfiltered, un-renamed data (the names are rebound below).
pglast_ddf = read_partial('pglast','file',PGLastSchema().file_level_schema.names)
sdp_ddf = read_partial('simple','file',SimpleDDLParserSchema().file_level_schema.names)

# filtering: an SDP file counts as parsed when its status is 1, or when its
# status is 2 but at least one table, column or constraint was extracted.
sdp_ddf = sdp_ddf[(sdp_ddf.parsed_file == 1) |
                    ((sdp_ddf.parsed_file == 2) &
                    ((sdp_ddf.num_distinct_tables > 0) |
                     (sdp_ddf.num_distinct_columns > 0) |
                     (sdp_ddf.num_constraints > 0)
                    )
                  )]
pglast_ddf = pglast_ddf[pglast_ddf.parsed_file == 1]

# renaming cols: prefix every column with its parser name, except the join
# key. Dict comprehensions are used so this does not depend on the builtin
# name `dict`, which may have been rebound in the notebook namespace.
sdp_ddf = sdp_ddf.rename(
    columns={col: (col if col == 'file_id' else 'sdp_' + col) for col in sdp_ddf.columns})
pglast_ddf = pglast_ddf.rename(
    columns={col: (col if col == 'file_id' else 'pglast_' + col) for col in pglast_ddf.columns})

# Outer join keeps files parsed by only one of the two parsers; the `_merge`
# indicator column records which side(s) each file came from.
pglast_joined_sdp= dd.merge(
    sdp_ddf,
    pglast_ddf,
    on=['file_id'],
    how='outer',indicator=True,suffixes=['_sdp','_pglast'])

# Arrow schema of the joined table: the key, then both parsers' file-level
# fields under the same prefixes as above, then the merge indicator.
# NOTE(review): pandas produces the merge indicator as a categorical column,
# while it is declared as a string here — confirm the cast on write is intended.
join_schema = pa.schema(
    [pa.field("file_id", pa.string())]
    + [pa.field("sdp_" + field.name, field.type)
       for field in SimpleDDLParserSchema().file_level_schema
       if field.name != 'file_id']
    + [pa.field("pglast_" + field.name, field.type)
       for field in PGLastSchema().file_level_schema
       if field.name != 'file_id']
    + [pa.field("_merge", pa.string())]
)

# The join is expensive, so it is only written when no previous result exists;
# delete the directory to force a rebuild.
join_out_dir = '../out_new/join__parsed__pglast_sdp/'
if not os.path.isdir(join_out_dir):
    pglast_joined_sdp.to_parquet(join_out_dir, schema=join_schema, overwrite=True)

[########################################] | 100% Completed | 64.63 s


In [11]:
# Read the persisted join back from disk using the schema built for it.
joined_ddf = dd.read_parquet(
    '../out_new/join__parsed__pglast_sdp/',
    engine='pyarrow',
    schema=join_schema,
    split_row_groups=True,
    calculate_divisions=True,
)

In [12]:
# Files per merge outcome: 'both', 'left_only' (SDP only), 'right_only' (pglast only).
# better than sqlglot
_merge_counts = joined_ddf.groupby('_merge')['_merge'].count()
_merge_counts.compute()

[########################################] | 100% Completed | 702.05 ms


_merge
both          54573 
left_only     168764
right_only    9364  
Name: _merge, dtype: int64

Of the files parsed by both parsers, how many also agree on the number of extracted tables?
Result: worse than sqlglot.

In [13]:
# Restrict to files that both parsers handled.
parsed_both_ddf = joined_ddf[joined_ddf['_merge'] == 'both']

# removing create seq stmt cause apparently it affects num tables
import ast


def _count_create_seq(counter_str):
    """Return the 'CreateSeqStmt' count found in a pglast counter string.

    `counter_str` is evaluated with ``ast.literal_eval``; each resulting item
    is checked for a 'CreateSeqStmt' key and the value of the first match is
    returned. Returns 0 when no item has that key.
    """
    for entry in ast.literal_eval(counter_str):
        if 'CreateSeqStmt' in entry:
            return entry['CreateSeqStmt']
    return 0


# Computed straight from the one column it needs; no scratch column holding
# Python lists is added to the frame (its declared 'string' dtype did not
# describe list values).
parsed_both_ddf['pglast_num_create_seq'] = parsed_both_ddf['pglast_counter_str'].apply(
    _count_create_seq,
    meta=('pglast_num_create_seq', 'float'))

parsed_both_ddf['sdp_nt'] = parsed_both_ddf['sdp_num_distinct_tables']
# Subtract the sequence statements from pglast's table count before comparing.
parsed_both_ddf['pglast_nt'] = parsed_both_ddf['pglast_num_distinct_tables'] - parsed_both_ddf['pglast_num_create_seq']

parsed_both_ddf[parsed_both_ddf.sdp_nt == parsed_both_ddf.pglast_nt]['file_id'].count().compute()

[########################################] | 100% Completed | 8.91 ss


30746

Match on number of distinct columns

In [14]:
# Files for which both parsers report the same number of distinct columns.
_same_ncols = (
    parsed_both_ddf['pglast_num_distinct_columns']
    == parsed_both_ddf['sdp_num_distinct_columns']
)
parsed_both_ddf[_same_ncols]['file_id'].count().compute()

[########################################] | 100% Completed | 9.06 ss


45879

Match on constraints

notnull: 31360
unique: 47930
primary: 30069
foreign: 16188
all: 4541

In [15]:
# One boolean mask per constraint kind: do both parsers report the same count?
_notnull_match = parsed_both_ddf['sdp_num_ctr_notnull'] == parsed_both_ddf['pglast_num_ctr_notnull']
_unique_match = parsed_both_ddf['sdp_num_ctr_unique'] == parsed_both_ddf['pglast_num_ctr_unique']
_primary_match = parsed_both_ddf['sdp_num_ctr_primary'] == parsed_both_ddf['pglast_num_ctr_primary']
_foreign_match = parsed_both_ddf['sdp_num_ctr_foreign'] == parsed_both_ddf['pglast_num_ctr_foreign']

# Count the files where all four kinds agree at once.
_all_ctr_match = _notnull_match & _unique_match & _primary_match & _foreign_match
parsed_both_ddf[_all_ctr_match]['file_id'].count().compute()

[########################################] | 100% Completed | 9.97 ss


4541

Match on table list

In [16]:
def _table_list_key(names):
    """Order-insensitive key: the names sorted, stringified and joined with '|'."""
    return "|".join(map(str, sorted(names)))


# Only rows where both parsers produced a table list can be compared.
parsed_both_ddf_tl = parsed_both_ddf.dropna(subset=['sdp_table_list','pglast_table_list'])

# Build one comparable key column per parser.
for _list_col, _key_col in (('sdp_table_list', 'sdp_tl'), ('pglast_table_list', 'pglast_tl')):
    parsed_both_ddf_tl[_key_col] = parsed_both_ddf_tl.apply(
        lambda row, col=_list_col: _table_list_key(row[col]),
        axis=1,
        meta=(_key_col, 'string'))

# Files whose two keys are identical, i.e. the same set of table names in any order.
_same_tables = parsed_both_ddf_tl['pglast_tl'] == parsed_both_ddf_tl['sdp_tl']
parsed_both_ddf_tl[_same_tables]['file_id'].count().compute()

[########################################] | 100% Completed | 12.79 s


26105

Match on col list

In [18]:
def _column_list_key(row, source_col):
    """Sort the entries of row[source_col], stringify them and join them with '|'."""
    return "|".join([str(name) for name in sorted(row[source_col])])


# Only rows where both parsers produced a column list can be compared.
parsed_both_ddf_cl = parsed_both_ddf.dropna(subset=['sdp_column_list','pglast_columns_list'])

parsed_both_ddf_cl['sdp_cl'] = parsed_both_ddf_cl.apply(
    _column_list_key,
    axis=1,
    args=('sdp_column_list',),
    meta=('sdp_cl', 'string'))
parsed_both_ddf_cl['pglast_cl'] = parsed_both_ddf_cl.apply(
    _column_list_key,
    axis=1,
    args=('pglast_columns_list',),
    meta=('pglast_cl', 'string'))

# Files whose two keys are identical, i.e. the same column names in any order.
_same_columns = parsed_both_ddf_cl['pglast_cl'] == parsed_both_ddf_cl['sdp_cl']
parsed_both_ddf_cl[_same_columns]['file_id'].count().compute()

[########################################] | 100% Completed | 11.41 s


43555