# Imports and file details

In [1]:
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.compute as pc

import matplotlib.pyplot as plt
import seaborn as sns

sns.set_theme()
sns.set_style("ticks")

import os
import sys
sys.path.insert(0,'..')

import pandas as pd
import json
import numpy as np

import dask.dataframe as dd
from dask.dataframe.utils import make_meta

# Notebook display settings: render wide frames and long SQL strings in full.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', None)  # None = no limit on displayed columns
pd.set_option('display.width', 1000)
pd.set_option('display.expand_frame_repr', False)
# None is the supported "do not truncate" value; the legacy -1 sentinel is
# deprecated and rejected by newer pandas releases.
pd.set_option('display.max_colwidth', None)

from src.utils import SQLParserSchema, PGLastSchema, get_file_encodings, SQLGlotSchema, SimpleDDLParserSchema,TIDBMysqlSchema

from dask.diagnostics import ProgressBar
ProgressBar().register()

  pd.set_option('max_colwidth', -1)


In [4]:
def read_partial(parser, schema_type, cols):
    """Lazily read selected columns of one parser's parquet output with dask.

    Parameters
    ----------
    parser : str
        Which parser's output to read: 'sqlparser', 'pglast', 'sqlglot',
        'simple' or 'tidb'.
    schema_type : str
        'file' reads ``../out/<parser_dir>/`` using the schema object's
        ``file_level_schema``; 'stmt' reads ``../out/<parser_dir>_details/``
        using its ``statement_list_sch``.
    cols : list of str
        Column names forwarded to ``dd.read_parquet(columns=...)``.

    Returns
    -------
    The dask DataFrame returned by ``dd.read_parquet``.

    Raises
    ------
    ValueError
        If ``parser`` or ``schema_type`` is not one of the supported values.
    """
    # schema_type -> (attribute on the schema object, directory suffix)
    schema_layouts = {
        'file': ('file_level_schema', '/'),
        'stmt': ('statement_list_sch', '_details/'),
    }
    if schema_type not in schema_layouts:
        raise ValueError(
            "Unknown schema_type %r; expected one of %s"
            % (schema_type, sorted(schema_layouts))
        )
    schema_attr, dir_suffix = schema_layouts[schema_type]

    # Only the selected schema class is instantiated.
    if parser == 'sqlparser':
        schema_provider, parser_dir = SQLParserSchema(), 'sqlparser'
    elif parser == 'pglast':
        schema_provider, parser_dir = PGLastSchema(), 'pglast'
    elif parser == 'sqlglot':
        schema_provider, parser_dir = SQLGlotSchema(), 'sqlglot'
    elif parser == 'simple':
        schema_provider, parser_dir = SimpleDDLParserSchema(), 'simpleddlparser'
    elif parser == 'tidb':
        schema_provider, parser_dir = TIDBMysqlSchema(), 'tidb_mysql'
    else:
        # Previously fell through and failed later with an UnboundLocalError.
        raise ValueError(
            "Unknown parser %r; expected one of "
            "['pglast', 'simple', 'sqlglot', 'sqlparser', 'tidb']" % (parser,)
        )

    _schema = getattr(schema_provider, schema_attr)
    _outdir = '../out/' + parser_dir + dir_suffix

    ddf = dd.read_parquet(_outdir,columns=cols, schema=_schema, split_row_groups=True, calculate_divisions=True,engine='pyarrow')
    return ddf

In [6]:
# Read file details: collect the records from every JSON file found in
# ../out/filedetails/ into a single pandas DataFrame.
filedetails_dir = '../out/filedetails/'
file_records = []
for details_name in os.listdir(filedetails_dir):
    with open(os.path.join(filedetails_dir, details_name), 'r') as details_file:
        # Each file's top-level JSON value is iterated and its items are
        # appended. The loaded value is deliberately NOT bound to the name
        # `dict`, which would shadow the builtin for the rest of the notebook.
        file_records.extend(json.load(details_file))

filedetails_df = pd.DataFrame(file_records)
del file_records  # release the intermediate list; only the DataFrame is kept

# Trying to join errors

In [5]:
# Lazily load every statement-level column for pglast and sqlglot.
pglast_details = read_partial(
    parser='pglast',
    schema_type='stmt',
    cols=PGLastSchema().statement_list_sch.names,
)
sqlglot_details = read_partial(
    parser='sqlglot',
    schema_type='stmt',
    cols=SQLGlotSchema().statement_list_sch.names,
)

Number of statements parsable by both pglast and sqlglot: 6,775,428

In [None]:
# Inner-join the pglast and sqlglot statement tables on (file_id, statement_nr)
# so every row carries both parsers' `parsed` flag, then count the rows where
# both flags equal 1.
pglast_flags = read_partial('pglast','stmt',['file_id','statement_nr','parsed'])
sqlglot_flags = read_partial('sqlglot','stmt',['file_id','statement_nr','parsed'])

pglast_joined_sqlglot = dd.merge(
    pglast_flags,
    sqlglot_flags,
    on=['file_id','statement_nr'],
    how='inner',
    indicator=True,
    suffixes=['_pglast','_sqlglot'],
)

parsed_by_both = (
    (pglast_joined_sqlglot['parsed_pglast'] == 1)
    & (pglast_joined_sqlglot['parsed_sqlglot'] == 1)
)
pglast_joined_sqlglot[parsed_by_both].count().compute()

Statements parsed by sqlglot using postgres and not parsed by pglast: only 60893

In [None]:
# Same join as before, additionally carrying sqlglot's `successful_dialect`,
# to count statements that sqlglot parsed with the 'postgres' dialect while
# pglast did not parse them.
pglast_flags = read_partial('pglast','stmt',['file_id','statement_nr','parsed'])
sqlglot_flags = read_partial('sqlglot','stmt',['file_id','statement_nr','parsed','successful_dialect'])

pglast_joined_sqlglot = dd.merge(
    pglast_flags,
    sqlglot_flags,
    on=['file_id','statement_nr'],
    how='inner',
    indicator=True,
    suffixes=['_pglast','_sqlglot'],
)

sqlglot_postgres_only = (
    (pglast_joined_sqlglot['parsed_pglast'] == 0)
    & (pglast_joined_sqlglot['parsed_sqlglot'] == 1)
    & (pglast_joined_sqlglot['successful_dialect'] == 'postgres')
)
pglast_joined_sqlglot[sqlglot_postgres_only].count().compute()

Statements parsed by pglast and not by sqlglot: 657,045

In [None]:
# Join the two statement tables again and count the statements that pglast
# parsed (flag == 1) but sqlglot did not (flag == 0).
pglast_flags = read_partial('pglast','stmt',['file_id','statement_nr','parsed'])
sqlglot_flags = read_partial('sqlglot','stmt',['file_id','statement_nr','parsed','successful_dialect'])

pglast_joined_sqlglot = dd.merge(
    pglast_flags,
    sqlglot_flags,
    on=['file_id','statement_nr'],
    how='inner',
    indicator=True,
    suffixes=['_pglast','_sqlglot'],
)

pglast_only = (
    (pglast_joined_sqlglot['parsed_pglast'] == 1)
    & (pglast_joined_sqlglot['parsed_sqlglot'] == 0)
)
pglast_joined_sqlglot[pglast_only].count().compute()

# All three parsers (pglast, sqlglot, tidb)

In [13]:
# Step 1: pglast x sqlglot on (file_id, statement_nr); the overlapping `parsed`
# columns become parsed_pglast / parsed_sqlglot.
pglast_flags = read_partial('pglast','stmt',['file_id','statement_nr','parsed'])
sqlglot_flags = read_partial('sqlglot','stmt',['file_id','statement_nr','parsed','successful_dialect'])
pglast_joined_sqlglot = dd.merge(
    pglast_flags,
    sqlglot_flags,
    on=['file_id','statement_nr'],
    how='inner',
    suffixes=['_pglast','_sqlglot'],
)

# Step 2: add the tidb flags. No column name collides at this point, so the
# tidb flag keeps the plain name `parsed`.
tidb_flags = read_partial('tidb','stmt',['file_id','statement_nr','parsed'])
pglast_sqlglot_tidb = dd.merge(
    pglast_joined_sqlglot,
    tidb_flags,
    on=['file_id','statement_nr'],
    how='inner',
    indicator=True,
    suffixes=['_1','_tidb'],
)

# Step 3: parsed_count = row-wise sum of the three parser flags.
pglast_sqlglot_tidb = pglast_sqlglot_tidb.assign(
    parsed_count=pglast_sqlglot_tidb[['parsed_pglast','parsed_sqlglot','parsed']].sum(axis=1)
)

# Per-column non-null counts for each parsed_count value.
counts_by_parsed_count = pglast_sqlglot_tidb.groupby('parsed_count').count()
display(counts_by_parsed_count.compute())

[########################################] | 100% Completed | 121.00 s


Unnamed: 0_level_0,file_id,statement_nr,parsed_pglast,parsed_sqlglot,successful_dialect,parsed,_merge
parsed_count,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,776998,776998,776998,776998,14,776998,776998
1,3973316,3973316,3973316,3973316,118655,3973316,3973316
2,10367606,10367606,10367606,10367606,8315466,10367606,10367606
3,5522971,5522971,5522971,5522971,85327,5522971,5522971


Statements parsed by both sqlglot and tidb but not by pglast

In [14]:
# Rows where pglast did not parse the statement but both sqlglot and tidb did
# (`parsed` is the tidb flag in the three-way frame).
pglast_missed = pglast_sqlglot_tidb['parsed_pglast'] == 0
sqlglot_ok = pglast_sqlglot_tidb['parsed_sqlglot'] == 1
tidb_ok = pglast_sqlglot_tidb['parsed'] == 1

not_pglast = pglast_sqlglot_tidb[pglast_missed & sqlglot_ok & tidb_ok]
not_pglast.count().compute()

[########################################] | 100% Completed | 129.61 s


file_id               8911878
statement_nr          8911878
parsed_pglast         8911878
parsed_sqlglot        8911878
successful_dialect    7987327
parsed                8911878
_merge                8911878
parsed_count          8911878
dtype: int64

In [15]:
# Preview the first five sqlglot statement rows.
sqlglot_details.head(n=5)

[########################################] | 100% Completed | 1.00 ss


Unnamed: 0,file_id,statement_id,statement_nr,statement_type,sqlglot_statement_type,successful_dialect,parsed,parsed_num_errors,parse_error,parse_error_start_context,parse_error_highlight,original
0,0,4326188482576572732,0,DROP,Drop,,1,0,,,,DROP TABLE IF EXISTS CUSTOMER CASCADE
1,0,8860407258675620898,1,CREATE,Create,postgres,1,2,Expecting )|Invalid expression / Unexpected token|,CREATE TABLE CUSTOMER(id |,SERIAL|,"CREATE TABLE CUSTOMER(id SERIAL, uuid VARCHAR(200) UNIQUE NOT NULL, firstname VARCHAR(30) NOT NULL, lastname VARCHAR(30) ,email VARCHAR(50),\n contact_number VARCHAR(30) UNIQUE NOT NULL, password VARCHAR(255) NOT NULL, salt VARCHAR(255) NOT NULL ,PRIMARY KEY(id))"
2,0,56769391187729576,2,DROP,Drop,,1,0,,,,DROP TABLE IF EXISTS CATEGORY CASCADE
3,0,6382270680951650118,3,CREATE,Create,postgres,1,2,Expecting )|Invalid expression / Unexpected token|,CREATE TABLE CATEGORY(id |,SERIAL|,"CREATE TABLE CATEGORY(id SERIAL, uuid VARCHAR(200) UNIQUE NOT NULL,category_name VARCHAR(255), PRIMARY KEY (id))"
4,0,-2023644074340248865,4,DROP,Drop,,1,0,,,,DROP TABLE IF EXISTS STATE CASCADE
