# Prep & Read all

In [1]:
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.compute as pc

import matplotlib.pyplot as plt
import seaborn as sns

sns.set_theme()
sns.set_style("ticks")

import os
import sys
sys.path.insert(0,'..')

import pandas as pd
import json
import numpy as np

import dask.dataframe as dd
from dask.dataframe.utils import make_meta

# Notebook display settings: show up to 500 rows, every column, no wrapping
# of wide frames, and untruncated cell contents.
pd.set_option('display.max_rows', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
# None means "no limit". The former value -1 was deprecated in pandas 1.0
# and is rejected by current pandas releases.
pd.set_option('display.max_colwidth', None)

from src_new.utils import SQLParserSchema, PGLastSchema, get_file_encodings, SQLGlotSchema, SimpleDDLParserSchema,TIDBMysqlSchema, JSqlParserSchema, RustParserSchema

from dask.diagnostics import ProgressBar
ProgressBar().register()

def read_partial(parser, schema_type, cols):
    """Lazily read one parser's parquet output as a dask DataFrame.

    Args:
        parser: Parser key; one of 'sqlparser', 'pglast', 'sqlglot',
            'simple', 'tidb', 'jsqlparser', 'rustparser'.
        schema_type: 'file' reads ``../out_new/<dir>/`` with the parser's
            ``file_level_schema``; 'stmt' reads ``../out_new/<dir>_details/``
            with its ``statement_list_sch``.
        cols: Column names to load.

    Returns:
        The dask DataFrame returned by ``dd.read_parquet``.

    Raises:
        ValueError: If ``parser`` or ``schema_type`` is not a known key.
    """
    # Parser key -> directory stem under ../out_new/.
    parser_dirs = {
        'sqlparser': 'sqlparser',
        'pglast': 'pglast',
        'sqlglot': 'sqlglot',
        'simple': 'simpleddlparser',
        'tidb': 'tidb_mysql',
        'jsqlparser': 'jsqlparser',
        'rustparser': 'rustparser',
    }
    # Schema type -> (schema attribute on the parser object, directory suffix).
    schema_kinds = {
        'file': ('file_level_schema', '/'),
        'stmt': ('statement_list_sch', '_details/'),
    }

    # Fail early with a clear message instead of hitting an unbound local
    # further down.
    if parser not in parser_dirs:
        raise ValueError(
            "unknown parser %r; expected one of %s" % (parser, sorted(parser_dirs)))
    if schema_type not in schema_kinds:
        raise ValueError(
            "unknown schema_type %r; expected one of %s" % (schema_type, sorted(schema_kinds)))

    schema_classes = {
        'sqlparser': SQLParserSchema,
        'pglast': PGLastSchema,
        'sqlglot': SQLGlotSchema,
        'simple': SimpleDDLParserSchema,
        'tidb': TIDBMysqlSchema,
        'jsqlparser': JSqlParserSchema,
        'rustparser': RustParserSchema,
    }
    p = schema_classes[parser]()

    schema_attr, dir_suffix = schema_kinds[schema_type]
    _schema = getattr(p, schema_attr)
    _outdir = '../out_new/' + parser_dirs[parser] + dir_suffix

    return dd.read_parquet(
        _outdir,
        columns=cols,
        schema=_schema,
        split_row_groups=True,
        calculate_divisions=True,
        engine='pyarrow',
    )

# Read file details
# Collect the entries of every JSON file in the filedetails directory into
# one flat list (presumably one record per SQL file -- confirm against the
# writer), then turn that list into a DataFrame.
filedetails_dir = '../out_new/filedetails/'
fd = []
for file in os.listdir(filedetails_dir):
    full_filename = os.path.join(filedetails_dir, file)
    with open(full_filename, 'r') as fi:
        # Not named `dict`: that would shadow the builtin for the rest of
        # the notebook.
        fd.extend(json.load(fi))

filedetails_df = pd.DataFrame.from_dict(fd)
del fd

# None means "no limit"; negative values are rejected by current pandas.
pd.set_option('display.max_colwidth', None)


## Schema Prep

In [2]:
# schema prep
# join_schema accumulates the pyarrow schema of the file-level join: the
# shared file_id key first, then each parser's columns under a prefix.
join_schema = pa.schema([])
join_schema = join_schema.append(pa.field("file_id", pa.string()))

# pglast prep

pglast_ddf = read_partial('pglast', 'file', PGLastSchema().file_level_schema.names)

import ast


def _count_create_seq(counter_str):
    """Return the 'CreateSeqStmt' value stored in a pglast counter string.

    The string is parsed with ast.literal_eval; the first element that
    contains the key 'CreateSeqStmt' supplies the value. Returns 0 when the
    string is None or empty, evaluates to None, or has no such element.
    """
    if counter_str is None:
        return 0
    counters = ast.literal_eval(counter_str or 'None')
    if counters is None:
        # Empty string or the literal 'None': nothing to iterate over.
        return 0
    for entry in counters:
        if 'CreateSeqStmt' in entry:
            return entry['CreateSeqStmt']
    return 0


# Computed in one pass, so no temporary list-valued column (which would need
# object rather than string meta) has to be added and dropped again.
pglast_ddf['num_create_seq'] = pglast_ddf.apply(
    lambda row: _count_create_seq(row['counter_str']),
    axis=1,
    meta=('num_create_seq', 'float'))

pglast_ddf['num_tables_without_create_seq'] = pglast_ddf['num_distinct_tables'] - pglast_ddf['num_create_seq']

# Flatten each list column into a sorted, '|'-separated string (None stays
# None). Note that pglast's column list is called 'columns_list'.
for list_col, str_col in [
    ('table_list', 'table_list_str'),
    ('columns_list', 'column_list_str'),
    ('schema_list', 'schema_list_str'),
    ('view_list', 'view_list_str'),
    ('db_list', 'db_list_str'),
]:
    pglast_ddf[str_col] = pglast_ddf.apply(
        # list_col is bound as a default so each lambda keeps its own column.
        lambda row, list_col=list_col:
        "|".join(str(x) for x in sorted(row[list_col])) if row[list_col] is not None else None,
        axis=1,
        meta=(str_col, 'string'))

# A global named `dict` from an earlier cell may shadow the builtin that
# later cells call; remove it if present.
try:
    del dict
except NameError:
    pass

# Prefix every column with 'pglast_' but keep the join key 'file_id' as is.
pglast_ddf = pglast_ddf.rename(columns={
    col: ("pglast_" + col).replace("pglast_file_id", "file_id")
    for col in pglast_ddf.columns
})

for item in PGLastSchema().file_level_schema:
    if item.name != 'file_id':
        join_schema = join_schema.append(pa.field("pglast_" + item.name, item.type))
join_schema = join_schema.append(pa.field("pglast_num_create_seq", pa.float64()))
join_schema = join_schema.append(pa.field("pglast_num_tables_without_create_seq", pa.float64()))

for str_col in ('table_list_str', 'column_list_str', 'schema_list_str',
                'view_list_str', 'db_list_str'):
    join_schema = join_schema.append(pa.field("pglast_" + str_col, pa.string()))



In [3]:
# sdp prep

sdp_ddf = read_partial('simple', 'file', SimpleDDLParserSchema().file_level_schema.names)


def _sdp_parsed_indulgent(row):
    """Return 1 if parsed_file is 1, or is 2 with any table, column or constraint found; else 0."""
    fully_parsed = row['parsed_file'] == 1
    found_something = (
        (row['num_distinct_tables'] > 0)
        | (row['num_distinct_columns'] > 0)
        | (row['num_constraints'] > 0)
    )
    partially_parsed = (row['parsed_file'] == 2) & found_something
    return 1 if (fully_parsed | partially_parsed) else 0


sdp_ddf['parsed_file_indulgent'] = sdp_ddf.apply(
    _sdp_parsed_indulgent,
    axis=1,
    meta=('parsed_file_indulgent', 'int'))

# Flatten each list column into a sorted, '|'-separated string (None stays None).
sdp_list_kinds = ('table', 'column', 'schema', 'view', 'db')
for kind in sdp_list_kinds:
    sdp_ddf[kind + '_list_str'] = sdp_ddf.apply(
        # src is bound as a default so each lambda keeps its own column.
        lambda row, src=kind + '_list':
        "|".join(str(x) for x in sorted(row[src])) if row[src] is not None else None,
        axis=1,
        meta=(kind + '_list_str', 'string'))

# renaming cols: prefix everything with 'sdp_' except the join key 'file_id'
sdp_ddf = sdp_ddf.rename(columns={
    col: ("sdp_" + col).replace("sdp_file_id", "file_id")
    for col in sdp_ddf.columns
})

# Extend the join schema in the same order as the columns above.
for schema_field in SimpleDDLParserSchema().file_level_schema:
    if schema_field.name == 'file_id':
        continue
    join_schema = join_schema.append(pa.field("sdp_" + schema_field.name, schema_field.type))

join_schema = join_schema.append(pa.field("sdp_parsed_file_indulgent", pa.int64()))

for kind in sdp_list_kinds:
    join_schema = join_schema.append(pa.field("sdp_" + kind + "_list_str", pa.string()))


In [4]:
sqlglot_ddf = read_partial('sqlglot', 'file', SQLGlotSchema().file_level_schema.names)

# List columns come in two families: the plain ones and the 'postgres_' ones.
# Each is flattened into a sorted, '|'-separated string (None stays None).
sqlglot_list_cols = [
    family + kind + '_list'
    for family in ('', 'postgres_')
    for kind in ('table', 'column', 'schema', 'view', 'db')
]

for src in sqlglot_list_cols:
    sqlglot_ddf[src + '_str'] = sqlglot_ddf.apply(
        # src is bound as a default so each lambda keeps its own column.
        lambda row, src=src:
        "|".join(str(x) for x in sorted(row[src])) if row[src] is not None else None,
        axis=1,
        meta=(src + '_str', 'string'))

# Prefix every column with 'sqlglot_' but keep the join key 'file_id' as is.
sqlglot_ddf = sqlglot_ddf.rename(columns={
    col: ("sqlglot_" + col).replace("sqlglot_file_id", "file_id")
    for col in sqlglot_ddf.columns
})

# Extend the join schema in the same order as the columns above.
for schema_field in SQLGlotSchema().file_level_schema:
    if schema_field.name == 'file_id':
        continue
    join_schema = join_schema.append(pa.field("sqlglot_" + schema_field.name, schema_field.type))

for src in sqlglot_list_cols:
    join_schema = join_schema.append(pa.field("sqlglot_" + src + "_str", pa.string()))

In [5]:
rust_ddf = read_partial('rustparser', 'file', RustParserSchema().file_level_schema.names)

# List columns come in two families: the plain ones and the 'postgres_' ones.
# Each is flattened into a sorted, '|'-separated string (None stays None).
rust_list_cols = [
    family + kind + '_list'
    for family in ('', 'postgres_')
    for kind in ('table', 'column', 'schema', 'view', 'db')
]

for src in rust_list_cols:
    rust_ddf[src + '_str'] = rust_ddf.apply(
        # src is bound as a default so each lambda keeps its own column.
        lambda row, src=src:
        "|".join(str(x) for x in sorted(row[src])) if row[src] is not None else None,
        axis=1,
        meta=(src + '_str', 'string'))

# Prefix every column with 'rustparser_' but keep the join key 'file_id' as is.
rust_ddf = rust_ddf.rename(columns={
    col: ("rustparser_" + col).replace("rustparser_file_id", "file_id")
    for col in rust_ddf.columns
})

# Extend the join schema in the same order as the columns above.
for schema_field in RustParserSchema().file_level_schema:
    if schema_field.name == 'file_id':
        continue
    join_schema = join_schema.append(pa.field("rustparser_" + schema_field.name, schema_field.type))

for src in rust_list_cols:
    join_schema = join_schema.append(pa.field("rustparser_" + src + "_str", pa.string()))

In [6]:
tidb_ddf = read_partial('tidb', 'file', TIDBMysqlSchema().file_level_schema.names)

# Flatten each list column into a sorted, '|'-separated string (None stays None).
tidb_list_kinds = ('table', 'column', 'schema', 'view', 'db')
for kind in tidb_list_kinds:
    tidb_ddf[kind + '_list_str'] = tidb_ddf.apply(
        # src is bound as a default so each lambda keeps its own column.
        lambda row, src=kind + '_list':
        "|".join(str(x) for x in sorted(row[src])) if row[src] is not None else None,
        axis=1,
        meta=(kind + '_list_str', 'string'))

# renaming cols: prefix everything with 'tidb_' except the join key 'file_id'
tidb_ddf = tidb_ddf.rename(columns={
    col: ("tidb_" + col).replace("tidb_file_id", "file_id")
    for col in tidb_ddf.columns
})

# Extend the join schema in the same order as the columns above.
for schema_field in TIDBMysqlSchema().file_level_schema:
    if schema_field.name == 'file_id':
        continue
    join_schema = join_schema.append(pa.field("tidb_" + schema_field.name, schema_field.type))

for kind in tidb_list_kinds:
    join_schema = join_schema.append(pa.field("tidb_" + kind + "_list_str", pa.string()))


In [7]:
jsql_ddf = read_partial('jsqlparser', 'file', JSqlParserSchema().file_level_schema.names)

# Flatten each list column into a sorted, '|'-separated string (None stays None).
jsql_list_kinds = ('table', 'column', 'schema', 'view', 'db')
for kind in jsql_list_kinds:
    jsql_ddf[kind + '_list_str'] = jsql_ddf.apply(
        # src is bound as a default so each lambda keeps its own column.
        lambda row, src=kind + '_list':
        "|".join(str(x) for x in sorted(row[src])) if row[src] is not None else None,
        axis=1,
        meta=(kind + '_list_str', 'string'))

# renaming cols: prefix everything with 'jsql_' except the join key 'file_id'
jsql_ddf = jsql_ddf.rename(columns={
    col: ("jsql_" + col).replace("jsql_file_id", "file_id")
    for col in jsql_ddf.columns
})

# Extend the join schema in the same order as the columns above.
for schema_field in JSqlParserSchema().file_level_schema:
    if schema_field.name == 'file_id':
        continue
    join_schema = join_schema.append(pa.field("jsql_" + schema_field.name, schema_field.type))

for kind in jsql_list_kinds:
    join_schema = join_schema.append(pa.field("jsql_" + kind + "_list_str", pa.string()))


### Stmt level

In [8]:
# schema prep
# stmt_join_schema accumulates the pyarrow schema of the statement-level
# join: the two key columns first, then each parser's columns under a prefix.
stmt_join_schema = pa.schema([])
stmt_join_schema = stmt_join_schema.append(pa.field("file_id", pa.string()))
stmt_join_schema = stmt_join_schema.append(pa.field("statement_nr", pa.string()))


def _prefix_stmt_details(details_ddf, prefix, schema):
    """Prefix one parser's statement-level frame and list its schema fields.

    Args:
        details_ddf: Statement-level dask DataFrame with 'file_id' and
            'statement_nr' columns.
        prefix: Prefix for every non-key column, e.g. 'jsql_'.
        schema: The parser's statement-level pyarrow schema.

    Returns:
        (renamed_ddf, fields): the frame with prefixed columns and a
        string-typed 'statement_nr', plus the prefixed pyarrow fields for
        every schema column that is present in the frame and is not a key.
    """
    key_cols = ('file_id', 'statement_nr', 'statement_id')
    fields = [
        pa.field(prefix + item.name, item.type)
        for item in schema
        if item.name not in key_cols and item.name in details_ddf.columns
    ]
    # A dict comprehension instead of dict(zip(...)): it does not need the
    # builtin `dict`, so no `del dict` workaround is required here.
    renamed = details_ddf.rename(columns={
        col: (prefix + col)
        .replace(prefix + "file_id", "file_id")
        .replace(prefix + "statement_nr", "statement_nr")
        for col in details_ddf.columns
    })
    renamed['statement_nr'] = renamed['statement_nr'].apply(
        lambda x: str(x), meta=('statement_nr', 'string'))
    return renamed, fields


jsql_details_ddf = read_partial('jsqlparser','stmt',['file_id','statement_nr','parsed','parse_error','parse_error_with_brackets','valid_ansi','valid_oracle','valid_mysql','valid_postgres','valid_sqlserver'])
jsql_details_ddf, _stmt_fields = _prefix_stmt_details(
    jsql_details_ddf, "jsql_", JSqlParserSchema().statement_list_sch)
for _stmt_field in _stmt_fields:
    stmt_join_schema = stmt_join_schema.append(_stmt_field)

pglast_details_ddf = read_partial('pglast','stmt',['file_id','statement_nr','parsed','parse_error'])
pglast_details_ddf, _stmt_fields = _prefix_stmt_details(
    pglast_details_ddf, "pglast_", PGLastSchema().statement_list_sch)
for _stmt_field in _stmt_fields:
    stmt_join_schema = stmt_join_schema.append(_stmt_field)

tidb_details_ddf = read_partial('tidb','stmt',['file_id','statement_nr','parsed','parse_error'])
tidb_details_ddf, _stmt_fields = _prefix_stmt_details(
    tidb_details_ddf, "tidb_", TIDBMysqlSchema().statement_list_sch)
for _stmt_field in _stmt_fields:
    stmt_join_schema = stmt_join_schema.append(_stmt_field)

# The rust and sqlglot outputs are read by 'statement_id'; it is copied into
# 'statement_nr' so these frames share the key name of the others.
rust_details_ddf = read_partial('rustparser','stmt',['file_id','statement_id','parsed','parsed_generic','parsed_ansi','parsed_postgres','parsed_mysql','parsed_ms','dialect','parse_error'])
rust_details_ddf['statement_nr'] = rust_details_ddf['statement_id']
rust_details_ddf = rust_details_ddf[['file_id','statement_nr','parsed','parsed_generic','parsed_ansi','parsed_postgres','parsed_mysql','parsed_ms','dialect','parse_error']]
rust_details_ddf, _stmt_fields = _prefix_stmt_details(
    rust_details_ddf, "rust_", RustParserSchema().statement_list_sch)
for _stmt_field in _stmt_fields:
    stmt_join_schema = stmt_join_schema.append(_stmt_field)

sqlglot_details_ddf = read_partial('sqlglot','stmt',['file_id','statement_id','parsed','parsed_none','parsed_postgres','parsed_mysql','parsed_tsql','dialect','parse_error','file_parse_error_start_context','file_parse_error_highlight'])
sqlglot_details_ddf['statement_nr'] = sqlglot_details_ddf['statement_id']
sqlglot_details_ddf = sqlglot_details_ddf[['file_id','statement_nr','parsed','parsed_none','parsed_postgres','parsed_mysql','parsed_tsql','dialect','parse_error','file_parse_error_start_context','file_parse_error_highlight']]
sqlglot_details_ddf, _stmt_fields = _prefix_stmt_details(
    sqlglot_details_ddf, "sqlglot_", SQLGlotSchema().statement_list_sch)
for _stmt_field in _stmt_fields:
    stmt_join_schema = stmt_join_schema.append(_stmt_field)


## Read

In [9]:
# Lazily load the file-level join of all parsers, restricted to the columns
# collected in join_schema.
joined_ddf = dd.read_parquet(
    '../out_new/join__parsed__all/',
    columns=join_schema.names,
    schema=join_schema,
    split_row_groups=True,
    calculate_divisions=True,
    engine='pyarrow',
)

In [13]:
# Lazily load the statement-level join of all parsers, restricted to the
# columns collected in stmt_join_schema.
stmt_joined_ddf = dd.read_parquet(
    '../out_new/stmt_join__parsed__all_final/',
    columns=stmt_join_schema.names,
    schema=stmt_join_schema,
    split_row_groups=True,
    calculate_divisions=True,
    engine='pyarrow',
)

In [11]:
# Per-column non-null counts over the files where pglast_parsed_file == 1
# or tidb_parsed_file == 1.
joined_ddf[(joined_ddf.pglast_parsed_file == 1) | (joined_ddf.tidb_parsed_file == 1)].count().compute()

[########################################] | 100% Completed | 19.58 s


file_id                                          256429
pglast_file_path                                 256429
pglast_errors_at_read                            256429
pglast_num_statements                            63937 
pglast_parsed_file                               256429
pglast_file_parse_error                          192492
pglast_num_distinct_tables                       63937 
pglast_table_list                                63261 
pglast_num_distinct_columns                      63937 
pglast_columns_list                              60823 
pglast_num_distinct_schemas                      63937 
pglast_schema_list                               10907 
pglast_num_distinct_dbs                          63937 
pglast_db_list                                   1158  
pglast_view_list                                 2381  
pglast_num_constraints                           63937 
pglast_num_ctr_notnull                           63937 
pglast_num_ctr_unique                           

Including sqlparser

In [11]:
# excluding SDP
# Plain assignment: both names refer to the same dask DataFrame object, no
# copy is made.
# NOTE(review): the column assignment below may therefore also show up on
# stmt_joined_ddf -- confirm this is intended.
stmt_parsed_ddf = stmt_joined_ddf

# Statement-level "parsed" flags of the five compared parsers.
sum_cols = ["pglast_parsed", "sqlglot_parsed", "tidb_parsed", "rust_parsed", "jsql_parsed"]

# Row-wise sum of the flags; presumably each flag is 0/1, making this the
# number of parsers that parsed the statement -- TODO confirm.
stmt_parsed_ddf['parsed_by_number'] = stmt_parsed_ddf[sum_cols].sum(axis=1)

In [12]:
# Schema of the statement-level join extended by the parsed_by_number column
# and, further down, by the sqlparser columns.
sqlparser_join_schema = stmt_join_schema
sqlparser_join_schema = sqlparser_join_schema.append(pa.field("parsed_by_number", pa.int64()))


sqlparser_cols = ['file_id','statement_nr','statement_type','has_comments','original','Token.Keyword']
# NOTE(review): this reads from '../out/', whereas the other inputs live
# under '../out_new/' -- confirm the path is intended.
sqlparser_details_ddf = dd.read_parquet(
    '../out/sqlparser_details',
    columns=sqlparser_cols,
    schema=SQLParserSchema().statement_list_sch,
    split_row_groups=True,
    calculate_divisions=True,
    engine='pyarrow',
)

for item in SQLParserSchema().statement_list_sch:
    is_key = item.name in ('file_id', 'statement_nr', 'statement_id')
    if not is_key and item.name in sqlparser_details_ddf.columns:
        sqlparser_join_schema = sqlparser_join_schema.append(pa.field("sqlparser_" + item.name, item.type))

# Prefix every column with 'sqlparser_' except the two join keys. A dict
# comprehension is used instead of dict(zip(...)): it does not need the
# builtin `dict`, so no `del dict` workaround is required here.
sqlparser_details_ddf = sqlparser_details_ddf.rename(columns={
    col: ("sqlparser_" + col)
    .replace("sqlparser_file_id", "file_id")
    .replace("sqlparser_statement_nr", "statement_nr")
    for col in sqlparser_details_ddf.columns
})

# Both sides must carry the join key as a string.
stmt_parsed_ddf['statement_nr'] = stmt_parsed_ddf['statement_nr'].apply(lambda x: str(x), meta=('statement_nr', 'string'))
sqlparser_details_ddf['statement_nr'] = sqlparser_details_ddf['statement_nr'].apply(lambda x: str(x), meta=('statement_nr', 'string'))

stmt_parsed_ddf_merged_sqlparser = dd.merge(
    stmt_parsed_ddf,
    sqlparser_details_ddf,
    on=['file_id', 'statement_nr'],
    how='inner',
    indicator=False,
    suffixes=[None, '_sqlparser'])
#stmt_parsed_ddf_merged_sqlparser.to_parquet('../out_new/stmt_join__parsed__all_incl_sqlparser/',schema=sqlparser_join_schema,overwrite=True)

In [13]:
# Lazily load the statement-level join that includes the sqlparser columns.
stmt_level_incl_sqlparser = dd.read_parquet(
    '../out_new/stmt_join__parsed__all_incl_sqlparser/',
    columns=sqlparser_join_schema.names,
    schema=sqlparser_join_schema,
    split_row_groups=True,
    calculate_divisions=True,
    engine='pyarrow',
)

# Errors - file level

In [14]:
# Number of non-null file_id values in the statement-level join.
stmt_level_incl_sqlparser['file_id'].count().compute()

[########################################] | 100% Completed | 9.36 ss


24624837

### Reading tidb file-ids left that are difficult to parse

In [14]:
# file_id is read as a string so it can be merged with the string file_id
# of the other frames.
still_not_parsed_by_tidb = pd.read_csv('tidb_files_left_difficult_to_parse.csv',dtype={'file_id': str})


In [10]:
# excluding SDP
# File-level columns kept for the error analysis, grouped by parser
# (pglast, sqlglot, tidb, rustparser, jsql).
cols = ["file_id",
        "pglast_parsed_file", "pglast_file_parse_error", "pglast_errors_at_read", "pglast_original", "pglast_original_highlight",
        "sqlglot_parsed", "sqlglot_parsed_none", "sqlglot_parsed_postgres", "sqlglot_parsed_mysql", "sqlglot_parsed_tsql", "sqlglot_dialect", "sqlglot_parse_error", "sqlglot_file_parse_error_start_context", "sqlglot_file_parse_error_highlight", "sqlglot_errors_at_read",
        "tidb_parsed_file", "tidb_file_parse_error", "tidb_errors_at_read",
        "rustparser_parsed", "rustparser_parsed_generic", "rustparser_parsed_ansi", "rustparser_parsed_postgres", "rustparser_parsed_mysql", "rustparser_parsed_ms", "rustparser_dialect", "rustparser_parse_error", "rustparser_errors_at_read",
        "jsql_errors_at_read", "jsql_parsed_file", "jsql_file_parse_error", "jsql_file_parse_error_with_brackets"
]

# One file-level "parsed" flag per parser.
sum_cols = ["pglast_parsed_file", "sqlglot_parsed", "tidb_parsed_file", "rustparser_parsed", "jsql_parsed_file"]

parsed_ddf = joined_ddf[cols]

# Row-wise sum of the flags; presumably each flag is 0/1, making this the
# number of parsers that parsed the file -- TODO confirm.
parsed_ddf['parsed_by_number'] = parsed_ddf[sum_cols].sum(axis=1)

In [11]:
# Materialise, as a pandas DataFrame, the files whose parsed flags sum to 0,
# using dask's multiprocessing scheduler.
not_parsable = parsed_ddf[parsed_ddf.parsed_by_number == 0].compute(scheduler='processes')
# Number of such files (non-null file_id values).
not_parsable['file_id'].count()

[########################################] | 100% Completed | 6.51 ss


89693

In [17]:
# Keep only the unparsable files that also appear in the tidb
# "difficult to parse" list (inner join on file_id).
not_parsable_tidb_hard = not_parsable.merge(
    still_not_parsed_by_tidb,
    on='file_id',
    how='inner',
    suffixes=('_left', '_right'),
)
not_parsable_tidb_hard['file_id'].count()

50419

### Adding jsql files that are difficult to parse

In [18]:
# file_id is read as a string so it can be merged with the string file_id
# of the other frames.
still_not_parsed_by_jsql = pd.read_csv('jsqlparser_files_left_difficult_to_parse.csv',dtype={'file_id': str})

In [19]:
# Narrow further to the files that also appear in the jsqlparser
# "difficult to parse" list (inner join on file_id).
not_parsable_tidb_and_jsql_hard = not_parsable_tidb_hard.merge(
    still_not_parsed_by_jsql,
    on='file_id',
    how='inner',
    suffixes=('_left', '_right'),
)
not_parsable_tidb_and_jsql_hard['file_id'].count()

35634

# Checking for a correlation between file details and parsability

In [13]:
# Materialise only the three columns needed for the correlation checks as a
# pandas DataFrame.
parsed_subset_file_corr = parsed_ddf[['file_id','parsed_by_number', 'pglast_errors_at_read']].compute()

[                                        ] | 0% Completed | 1.09 ms

[########################################] | 100% Completed | 3.93 sms


Char counts/line counts (aka file size)

In [14]:
import warnings  # kept: later cells may rely on this name being imported


def _bucket_by_upper_bounds(values, upper_bounds):
    """Assign each value a 1-based size bucket.

    Bucket 1 covers 0..upper_bounds[0]; bucket i covers
    upper_bounds[i-2]+1..upper_bounds[i-1]; both ends are inclusive. Values
    matching no range get bucket len(upper_bounds) + 1.

    Args:
        values: pandas Series of counts.
        upper_bounds: Ascending inclusive upper bounds of the buckets.

    Returns:
        numpy array of bucket numbers, aligned with ``values``.
    """
    lower_bounds = [0] + [bound + 1 for bound in upper_bounds[:-1]]
    # inclusive='both' replaces the boolean form, which was deprecated in
    # pandas 1.3 and is rejected by pandas 2.x.
    conditions = [
        values.between(low, high, inclusive='both')
        for low, high in zip(lower_bounds, upper_bounds)
    ]
    bucket_numbers = list(range(1, len(upper_bounds) + 1))
    return np.select(conditions, bucket_numbers, default=len(upper_bounds) + 1)


# Character-count buckets 1..7, with 8 for anything above 500000.
filedetails_df['char_count_bucket'] = _bucket_by_upper_bounds(
    filedetails_df['char_nr'],
    [1000, 5000, 25000, 100000, 200000, 300000, 500000],
)

# Line-count buckets 1..6, with 7 for anything above 50000.
filedetails_df['line_count_bucket'] = _bucket_by_upper_bounds(
    filedetails_df['line_nr'],
    [50, 150, 500, 1000, 10000, 50000],
)

In [15]:
# Attach the file details (including the two bucket columns) to the
# per-file parse counts; merge() defaults to an inner join on file_id.
file_correlation = parsed_subset_file_corr.merge(filedetails_df,on=['file_id'])

In [17]:
# Preview the first rows of the merged frame.
file_correlation.head()

Unnamed: 0,file_id,parsed_by_number,pglast_errors_at_read,file_path,file_size,line_nr,char_nr,encoding,encoding_confidence,char_count_bucket,line_count_bucket
0,502790,0,0,../sqlfiles/502790_qsystem_H2.sql,50684,863,50684,utf-8,0.99,4,4
1,494894,1,0,../sqlfiles/494894_init.sql,709,22,709,ascii,1.0,1,1
2,286801,1,0,../sqlfiles/286801_basededatos.sql,15004,258,15004,utf-8,0.99,3,3
3,202803,4,0,../sqlfiles/202803_integrityDBCreation.sql,9458,173,9458,ascii,1.0,3,3
4,435266,1,0,../sqlfiles/435266_openmrs_warehouse_mw_epilepsy_initial.sql,5457,117,5457,ascii,1.0,3,2


In [16]:
def _parsed_share_per_bucket(frame, bucket_col, total_col):
    """Count files per (bucket, parsed_by_number) and express them as a share of the bucket.

    Returns (totals, shares): `totals` holds the number of files per bucket
    in `total_col`; `shares` has one row per (bucket, parsed_by_number) with
    the file count and its percentage of the bucket total.
    """
    totals = (
        frame.groupby([bucket_col], as_index=False)['file_id']
        .count()
        .rename(columns={'file_id': total_col})
    )
    shares = (
        frame.groupby([bucket_col, 'parsed_by_number'], as_index=False)['file_id']
        .count()
        .rename(columns={'file_id': 'parsed_count_per_bucket'})
    )
    shares = shares.merge(totals, on=[bucket_col])
    shares['parsed_count_per_bucket_perc'] = shares['parsed_count_per_bucket'] * 100 / shares[total_col]
    # The total was only needed to compute the percentage.
    return totals, shares.drop(columns=[total_col])


line_counts, line_count_corr = _parsed_share_per_bucket(
    file_correlation, 'line_count_bucket', 'line_count_total_per_bucket')

char_counts, char_count_corr = _parsed_share_per_bucket(
    file_correlation, 'char_count_bucket', 'char_count_total_per_bucket')

In [83]:
#display(line_count_corr)
# for bucket 6 -> 94 files (46% of the bucket) aren't parsable by any parser, compared to roughly 25% for the other buckets

#display(char_count_corr)
# no useful information from this

Difficulty at read

In [88]:
# Share of files with / without pglast read-time errors, within each
# "parsed by N parsers" group.
# The original cell started with a bare groupby/count expression whose result
# was discarded (it was not the last expression of the cell); it is removed.

# Denominator: number of files per parsed_by_number value.
parsed_counts = (
    parsed_subset_file_corr
    .groupby(['parsed_by_number'], as_index=False)['file_id']
    .count()
    .rename(columns={'file_id': 'parsed_by_number_total'})
)

# Numerator: number of files per (parsed_by_number, pglast_errors_at_read) pair.
errors_at_read_corr = (
    parsed_subset_file_corr
    .groupby(['parsed_by_number', 'pglast_errors_at_read'], as_index=False)['file_id']
    .count()
    .rename(columns={'file_id': 'parsed_count_per_errors'})
)

errors_at_read_corr = errors_at_read_corr.merge(parsed_counts, on=['parsed_by_number'])

errors_at_read_corr['parsed_count_perc'] = (
    errors_at_read_corr['parsed_count_per_errors'] * 100
    / errors_at_read_corr['parsed_by_number_total']
)
# The total was only needed to compute the percentage.
errors_at_read_corr = errors_at_read_corr.drop(columns=['parsed_by_number_total'])
display(errors_at_read_corr)

Unnamed: 0,parsed_by_number,pglast_errors_at_read,parsed_count_per_errors,parsed_count_perc
0,0,0,89542,99.831648
1,0,1,151,0.168352
2,1,0,134890,99.704339
3,1,1,400,0.295661
4,2,0,62819,99.907756
5,2,1,58,0.092244
6,3,0,38780,99.961335
7,3,1,15,0.038665
8,4,0,34507,99.976822
9,4,1,8,0.023178


Encoding

In [92]:
# Number of files per detected encoding (denominator for the shares below).
# NOTE: despite its name, 'encoding_total_per_parsed_by_number' is the total per encoding.
encodings = (
    file_correlation
    .groupby(['encoding'], as_index=False)['file_id']
    .count()
    .rename(columns={'file_id': 'encoding_total_per_parsed_by_number'})
)

# Number of files per (encoding, parsed_by_number), with the per-encoding total attached.
encoding_corr = (
    file_correlation
    .groupby(['encoding', 'parsed_by_number'], as_index=False)['file_id']
    .count()
    .rename(columns={'file_id': 'parsed_count_per_encoding'})
    .merge(encodings, on=['encoding'])
)
encoding_corr['parsed_count_perc'] = (
    encoding_corr['parsed_count_per_encoding'] * 100
    / encoding_corr['encoding_total_per_parsed_by_number']
)

# The total column is deliberately kept so the displayed table shows the denominator.

# Show only the rows for files that no parser could parse.
display(encoding_corr.loc[encoding_corr['parsed_by_number'] == 0])

Unnamed: 0,encoding,parsed_by_number,parsed_count_per_encoding,encoding_total_per_parsed_by_number,parsed_count_perc
0,Big5,0,18,41,43.902439
6,CP932,0,1,2,50.0
8,CP949,0,16,30,53.333333
13,EUC-JP,0,17,73,23.287671
19,EUC-KR,0,578,1151,50.217202
25,GB2312,0,714,1230,58.04878
31,IBM866,0,7,13,53.846154
34,ISO-8859-1,0,2158,4192,51.479008
40,ISO-8859-5,0,1,4,25.0
44,ISO-8859-7,0,5,6,83.333333


# TODO - check whether the number of statements in a file correlates with the parse counts

# Errors - stmt level

In [14]:
# excluding SDP (presumably the SimpleDDLParser results -- no SDP column is summed below)
# NOTE(review): this is a plain alias, not a copy. If item assignment on this
# frame type updates the object in place, the column added below also becomes
# visible through stmt_joined_ddf -- confirm whether that is intended.
stmt_parsed_ddf = stmt_joined_ddf

# Per-parser flag columns that are summed per statement.
sum_cols = ["pglast_parsed", "sqlglot_parsed", "tidb_parsed", "rust_parsed", "jsql_parsed"]

# Row-wise sum of the flag columns listed above.
stmt_parsed_ddf['parsed_by_number'] = stmt_parsed_ddf[sum_cols].sum(axis=1)

In [15]:
# Count the statement rows where pglast_parsed == 1 or tidb_parsed == 1.
stmt_parsed_ddf[(stmt_parsed_ddf.pglast_parsed == 1) | (stmt_parsed_ddf.tidb_parsed == 1)]['file_id'].count().compute()

[########################################] | 100% Completed | 43.74 s


23609965

In [19]:
# Number of distinct file_ids that have at least one statement row with
# pglast_parsed == 0 and tidb_parsed == 0.
stmt_parsed_ddf[(stmt_parsed_ddf.pglast_parsed == 0) & (stmt_parsed_ddf.tidb_parsed == 0)]['file_id'].nunique().compute(scheduler='processes')

[########################################] | 100% Completed | 23.66 s


114167

In [18]:
# Number of statement rows per parsed_by_number value.
stmt_parsed_ddf.groupby('parsed_by_number')['parsed_by_number'].count().compute(scheduler='processes')

[########################################] | 100% Completed | 31.62 s


parsed_by_number
0.0    454392  
1.0    1109560 
2.0    1890663 
3.0    2318162 
4.0    14289614
5.0    4565115 
Name: parsed_by_number, dtype: int64

In [19]:
# Statements with parsed_by_number == 0, i.e. the flag columns summed to zero.
stmt_not_parsable_by_any = stmt_parsed_ddf[stmt_parsed_ddf.parsed_by_number == 0]
# Number of distinct files that contain at least one such statement.
stmt_not_parsable_by_any['file_id'].nunique().compute(scheduler='processes')

# 76 713

[########################################] | 100% Completed | 30.15 s


76713

Joining with sqlparser info

In [20]:
# Number of statement rows per parsed_by_number value in the frame that also
# carries the sqlparser columns.
stmt_level_incl_sqlparser.groupby('parsed_by_number')['parsed_by_number'].count().compute(scheduler='processes')

[########################################] | 100% Completed | 3.35 ss


parsed_by_number
0    454389  
1    1109370 
2    1890300 
3    2318040 
4    14287811
5    4564927 
Name: parsed_by_number, dtype: int64

In [21]:
# Keep only the statements with parsed_by_number == 0.
# NOTE(review): the name says "jsqlparser" but the source frame is stmt_level_incl_sqlparser.
not_parsable_incl_jsqlparser = stmt_level_incl_sqlparser[stmt_level_incl_sqlparser.parsed_by_number == 0]

In [76]:
# excluding {INSTALL:DROP_TABLES} + {INSTALL:DB_OPTIONS} -> is this a PHP thing? (see https://github.com/intelliants/eSyndiCat/tree/master)

# Literal (non-regex), case-insensitive match on the original statement text.
_has_install_placeholder = not_parsable_incl_jsqlparser.sqlparser_original.str.contains(
    "{INSTALL:", regex=False, case=False)
not_parsable_incl_jsqlparser = not_parsable_incl_jsqlparser[~(_has_install_placeholder)]

# Row count of what is left, per parsed_by_number value.
(
    not_parsable_incl_jsqlparser
    .groupby('parsed_by_number')['parsed_by_number']
    .count()
    .compute(scheduler='processes')
)

[########################################] | 100% Completed | 62.72 s


parsed_by_number
0    435639
Name: parsed_by_number, dtype: int64

In [77]:
# Number of unparsed statements per sqlparser_statement_type value.
not_parsable_incl_jsqlparser.groupby('sqlparser_statement_type')['sqlparser_statement_type'].count().compute(scheduler='processes')

# as hardest to parse we can consider the "UNKNOWN"
# can also merge with not_parsable_tidb_and_jsql_hard

[########################################] | 100% Completed | 58.18 s


sqlparser_statement_type
ALTER                  30728 
COMMIT                 171   
CREATE                 90702 
CREATE OR REPLACE      519   
DELETE                 709   
DROP                   9629  
INSERT                 13148 
MERGE                  39    
SELECT                 11235 
UNKNOWN                276558
UPDATE                 2112  
REPLACE                7     
START                  33    
ROLLBACK               46    
CREATE \nOR REPLACE    1     
CREATE  OR REPLACE     2     
Name: sqlparser_statement_type, dtype: int64

Getting keywords in not parsed statements

In [78]:
# Materialize the keyword column of the unparsed statements for the counting loop below.
getting_keywords = not_parsable_incl_jsqlparser['sqlparser_Token.Keyword'].compute(scheduler='processes')

[########################################] | 100% Completed | 68.96 s


In [None]:
from collections import Counter

# Tally how often each keyword occurs across all unparsed statements.
# Every element of getting_keywords is iterated as a collection of keywords;
# Counter.update(iterable) adds one per element, which replaces the previous
# per-item `update({item: 1})` call without changing the resulting counts.
kword_counter = Counter()
for lst in getting_keywords:
    kword_counter.update(lst)
kword_counter.most_common(50)

### Looking at GO keyword => GO is not a part of the TSQL language. It's a batch separator used by SQLCMD and SSMS.


In [80]:
# Flag the unparsed statements whose keyword list contains "GO" and collect
# the distinct files they belong to.
# NOTE(review): go_test is an alias of not_parsable_incl_jsqlparser, not a copy;
# the 'ok' column assigned below may therefore also show up on that frame.
go_test = not_parsable_incl_jsqlparser
# Column-wise map over the keyword column instead of a row-wise apply(axis=1):
# same 0/1 result, without building a row object for every statement.
go_test['ok'] = go_test['sqlparser_Token.Keyword'].map(
    lambda keywords: 1 if "GO" in keywords else 0,
    meta=('ok', 'int'))
go_file_ids = go_test[go_test.ok == 1]['file_id'].unique().compute(scheduler='processes')
go_file_ids.count()

# 6674

[########################################] | 100% Completed | 68.06 s


6674

In [81]:
# Move the index of the computed file-id result into a column so it can be merged on file_id.
x = go_file_ids.reset_index()

# File-level parse counts, restricted (merge on file_id) to the files in x.
test_go_file_level = parsed_ddf[['file_id','parsed_by_number']].compute()
test_go_file_level = test_go_file_level.merge(x, on=['file_id'])

#test_go_file_level.count()
# 6674

# Number of those files per file-level parsed_by_number value.
test_go_file_level.groupby('parsed_by_number')['file_id'].count()

# but are there some that are parsed though??

[########################################] | 100% Completed | 5.26 ss


parsed_by_number
0    6667
1    7   
Name: file_id, dtype: int64

In [82]:
# Counter-check: statements that at least one parser accepted
# (parsed_by_number > 0) and whose keyword list contains "GO".
go_test2 = stmt_level_incl_sqlparser[stmt_level_incl_sqlparser.parsed_by_number > 0]
# Column-wise map over the keyword column instead of a row-wise apply(axis=1):
# same 0/1 result, without building a row object for every statement.
go_test2['ok'] = go_test2['sqlparser_Token.Keyword'].map(
    lambda keywords: 1 if "GO" in keywords else 0,
    meta=('ok', 'int'))

# Keep only the statements flagged as containing GO.
go_test2 = go_test2[go_test2.ok == 1]

# go_file_ids2.count() yep, 1913
# 6674

In [83]:
# Distinct files containing a parsed statement flagged with GO.
# NOTE(review): go_test2 was already reduced to ok == 1 where it was built, so
# the filter here looks redundant -- kept so the cell also works on an unfiltered frame.
go_file_ids2 = go_test2[go_test2.ok == 1]['file_id'].unique().compute(scheduler='processes')

[########################################] | 100% Completed | 225.14 s


In [85]:
# Move the index of the computed file-id result into a column so it can be merged on file_id.
y = go_file_ids2.reset_index()

# File-level parse counts, restricted (merge on file_id) to the files in y.
test_go_file_level2 = parsed_ddf[['file_id','parsed_by_number']].compute()
test_go_file_level2 = test_go_file_level2.merge(y, on=['file_id'])

# Number of those files per file-level parsed_by_number value.
test_go_file_level2.groupby('parsed_by_number')['file_id'].count()

[########################################] | 100% Completed | 4.95 sms


parsed_by_number
0    1121
1    824 
Name: file_id, dtype: int64

In [84]:
# Per-column non-null counts of the GO-flagged parsed statements.
go_test2.count().compute(scheduler='processes')

[########################################] | 100% Completed | 208.44 s


file_id                                   3048
statement_nr                              3048
jsql_parsed                               3048
jsql_parse_error                          3048
jsql_parse_error_with_brackets            3048
jsql_valid_ansi                           728 
jsql_valid_oracle                         728 
jsql_valid_mysql                          728 
jsql_valid_postgres                       728 
jsql_valid_sqlserver                      728 
pglast_parsed                             3048
pglast_parse_error                        3041
tidb_parsed                               3048
tidb_parse_error                          3029
rust_parsed                               3048
rust_parsed_generic                       0   
rust_parsed_ansi                          0   
rust_parsed_postgres                      0   
rust_parsed_mysql                         0   
rust_parsed_ms                            0   
rust_dialect                              18  
rust_parse_er

### Nevertheless can exclude GO for now

In [87]:
# Drop every unparsed statement whose keyword list contains "GO".
# NOTE(review): excluding_go starts as an alias of not_parsable_incl_jsqlparser,
# not a copy; the 'go_ok' column assigned below may also show up on that frame.
excluding_go = not_parsable_incl_jsqlparser
# Column-wise map over the keyword column instead of a row-wise apply(axis=1):
# same 0/1 result, without building a row object for every statement.
excluding_go['go_ok'] = excluding_go['sqlparser_Token.Keyword'].map(
    lambda keywords: 1 if "GO" in keywords else 0,
    meta=('go_ok', 'int'))
excluding_go = excluding_go[excluding_go.go_ok == 0]

# Number of unparsed statements that remain.
excluding_go['file_id'].count().compute(scheduler='processes')

[########################################] | 100% Completed | 75.07 s


365015

Getting keywords again

In [88]:
from collections import Counter

# Keyword frequencies again, now without the statements that contain GO.
getting_keywords = excluding_go['sqlparser_Token.Keyword'].compute(scheduler='processes')

# Counter.update(iterable) adds one per element, which replaces the previous
# per-item `update({item: 1})` call without changing the resulting counts.
kword_counter = Counter()
for lst in getting_keywords:
    kword_counter.update(lst)
kword_counter.most_common(50)

[########################################] | 100% Completed | 75.82 s


[('TABLE', 124961),
 ('KEY', 84015),
 ('NOT NULL', 79380),
 ('PRIMARY', 77128),
 ('FROM', 58348),
 ('CONSTRAINT', 53384),
 ('REFERENCES', 47943),
 ('FOREIGN', 47592),
 ('table', 42769),
 ('DEFAULT', 41844),
 ('ON', 40019),
 ('INTO', 35868),
 ('NULL', 32247),
 ('WHERE', 30392),
 ('SET', 29028),
 ('BEGIN', 28951),
 ('DELIMITER', 28866),
 ('VALUES', 28202),
 ('ADD', 27722),
 ('END', 27517),
 ('IF', 26106),
 ('key', 24234),
 ('INDEX', 23306),
 ('from', 23304),
 ('COPY', 20084),
 ('stdin', 19925),
 ('primary', 19109),
 ('PROCEDURE', 18670),
 ('THEN', 18621),
 ('UNIQUE', 18596),
 ('TABLESPACE', 18397),
 ('into', 17864),
 ('not null', 17744),
 ('constraint', 17077),
 ('PCTFREE', 16788),
 ('where', 16474),
 ('INITRANS', 16195),
 ('INITIAL', 15915),
 ('MAXEXTENTS', 15645),
 ('NEXT', 15642),
 ('MINEXTENTS', 15582),
 ('END IF', 15572),
 ('on', 15487),
 ('AUTO_INCREMENT', 15035),
 ('ENABLE', 14838),
 ('EXISTS', 14833),
 ('MAXTRANS', 14647),
 ('AND', 14554),
 ('PCTINCREASE', 14445),
 ('foreign', 14

### Looking further at ENABLE

In [89]:
# Narrow the frame to the identifying columns, the sqlparser columns and the
# per-parser error columns.
_stmt_error_columns = [
    'file_id',
    'statement_nr',
    'sqlparser_original',
    'sqlparser_statement_type',
    'sqlparser_Token.Keyword',
    'jsql_parse_error',
    'jsql_parse_error_with_brackets',
    'pglast_parse_error',
    'tidb_parse_error',
    'rust_parse_error',
    'sqlglot_parse_error',
    'sqlglot_file_parse_error_start_context',
    'sqlglot_file_parse_error_highlight',
]
excluding_go = excluding_go[_stmt_error_columns]

In [93]:
# Count the unparsed, GO-free statements whose keyword list contains "ENABLE".
# NOTE(review): enable_test is an alias of excluding_go, not a copy; the
# 'enable_ok' column assigned below may also show up on that frame.
enable_test = excluding_go
# Column-wise map over the keyword column instead of a row-wise apply(axis=1):
# same 0/1 result, without building a row object for every statement.
enable_test['enable_ok'] = enable_test['sqlparser_Token.Keyword'].map(
    lambda keywords: 1 if "ENABLE" in keywords else 0,
    meta=('enable_ok', 'int'))

enable_test[enable_test.enable_ok == 1]['file_id'].count().compute(scheduler='processes')


[########################################] | 100% Completed | 162.50 s


14838

In [96]:
# seeing if it's parsed anywhere
# Statements accepted by at least one parser (parsed_by_number > 0).
enable_test_check = stmt_level_incl_sqlparser[stmt_level_incl_sqlparser.parsed_by_number > 0]

# Both flags use a column-wise map over the keyword column instead of a
# row-wise apply(axis=1): same 0/1 results, without building a row object for
# every statement.
enable_test_check['enable_ok'] = enable_test_check['sqlparser_Token.Keyword'].map(
    lambda keywords: 1 if "ENABLE" in keywords else 0,
    meta=('enable_ok', 'int'))

enable_test_check['modify_check'] = enable_test_check['sqlparser_Token.Keyword'].map(
    lambda keywords: 1 if "MODIFY" in keywords else 0,
    meta=('modify_check', 'int'))

# Parsed statements that contain ENABLE but not MODIFY.
enable_test_check[(enable_test_check.enable_ok == 1) & (enable_test_check.modify_check == 0)]['file_id'].count().compute(scheduler='processes')

# roughly 36k parsed with ENABLE in list; jsqlparser should be capable of doing it, seems like sqlglot can do it too if it's a MODIFY type and not a new ctr

# roughly 17k without modify

[########################################] | 100% Completed | 293.53 s


17027

In [99]:
# Of the parsed statements with ENABLE and without MODIFY, count those where jsql_parsed == 0.
enable_test_check[(enable_test_check.enable_ok == 1) & (enable_test_check.modify_check == 0) &
                  (enable_test_check.jsql_parsed == 0)]['file_id'].count().compute(scheduler='processes')

[########################################] | 100% Completed | 335.45 s


237

### Removing ENABLE, looking further

In [101]:
# Additionally drop every statement whose keyword list contains "ENABLE".
# NOTE(review): excluding_go_and_enable starts as an alias of excluding_go, not a
# copy; the 'enable_ok' column assigned below may also show up on that frame.
excluding_go_and_enable = excluding_go
# Column-wise map over the keyword column instead of a row-wise apply(axis=1):
# same 0/1 result, without building a row object for every statement.
excluding_go_and_enable['enable_ok'] = excluding_go_and_enable['sqlparser_Token.Keyword'].map(
    lambda keywords: 1 if "ENABLE" in keywords else 0,
    meta=('enable_ok', 'int'))
# Build the mask from the frame being filtered. The original indexed with
# excluding_go.enable_ok, which only worked because both names referred to the
# same object.
excluding_go_and_enable = excluding_go_and_enable[excluding_go_and_enable.enable_ok == 0]

# Number of unparsed statements that remain.
excluding_go_and_enable['file_id'].count().compute(scheduler='processes')

[########################################] | 100% Completed | 74.55 s


350177

In [102]:
# to pandas: materialize the filtered statements so the following cells work on
# the computed result instead of the lazy frame
excluding_go_and_enable_pd = excluding_go_and_enable.compute(scheduler='processes')

[########################################] | 100% Completed | 79.43 s


In [105]:
# keywords
from collections import Counter

# Keyword frequencies once more, now without the GO and ENABLE statements.
getting_keywords = excluding_go_and_enable_pd['sqlparser_Token.Keyword']

# Counter.update(iterable) adds one per element, which replaces the previous
# per-item `update({item: 1})` call without changing the resulting counts.
kword_counter = Counter()
for lst in getting_keywords:
    kword_counter.update(lst)
kword_counter.most_common(20)

[('TABLE', 110326),
 ('NOT NULL', 74341),
 ('KEY', 72692),
 ('PRIMARY', 67804),
 ('FROM', 58200),
 ('REFERENCES', 44037),
 ('FOREIGN', 43712),
 ('table', 42587),
 ('CONSTRAINT', 42327),
 ('ON', 38325),
 ('DEFAULT', 35908),
 ('INTO', 35772),
 ('NULL', 31757),
 ('WHERE', 30245),
 ('BEGIN', 28868),
 ('SET', 28840),
 ('DELIMITER', 28787),
 ('VALUES', 28091),
 ('END', 27430),
 ('IF', 26057)]

In [109]:
# Keep only the remaining unparsed statements whose keyword list contains "VALUES".
# NOTE(review): test_values starts as an alias of excluding_go_and_enable_pd, so
# the 'values_ok' column is added to that frame as well.
test_values = excluding_go_and_enable_pd
# Column-wise map over the keyword column instead of a row-wise apply(axis=1):
# same 0/1 result, without building a row object for every statement.
test_values['values_ok'] = test_values['sqlparser_Token.Keyword'].map(
    lambda keywords: 1 if "VALUES" in keywords else 0)
test_values = test_values[test_values.values_ok == 1]

# Going back to file level - reducing amount of files using stmt level

In [20]:
# Number of non-null file_id values in the "hard" file-level frame.
not_parsable_tidb_and_jsql_hard['file_id'].count()

# have 35 634 difficult files -> trying to see statement counts at least

35634

In [21]:
# Per file: how many statements were parsed by at least one parser and how
# many by none.
# NOTE(review): this is an alias of stmt_parsed_ddf, not a copy; the
# 'parsed_by_something' column assigned below may also show up on that frame.
stmt__for_file_level_join = stmt_parsed_ddf

# 1 when parsed_by_number > 0, else 0. A vectorized comparison replaces the
# previous row-wise apply(axis=1) and yields the same integer flag.
stmt__for_file_level_join['parsed_by_something'] = (
    stmt__for_file_level_join['parsed_by_number'] > 0
).astype('int')

stmt__for_file_level_join = stmt__for_file_level_join[['file_id','statement_nr','parsed_by_something']]

# Statement count per (file_id, parsed_by_something); a file has at most two rows.
stmt__for_file_level_join_pd = stmt__for_file_level_join.groupby(['file_id','parsed_by_something'])['statement_nr'].count().compute(scheduler='processes')

[########################################] | 100% Completed | 153.44 s


In [22]:
# Turn the (file_id, parsed_by_something) group keys back into regular columns.
stmt__for_file_level_join_pd = stmt__for_file_level_join_pd.reset_index()

In [150]:
# Per-column non-null counts of the per-file statement summary.
stmt__for_file_level_join_pd.count()

file_id                419711
parsed_by_something    419711
statement_nr           419711
dtype: int64

In [23]:
# Attach the per-file statement counts to the "hard" files.
# how='left' keeps every row of not_parsable_tidb_and_jsql_hard; the statement
# summary can hold more than one row per file_id (one per parsed_by_something
# value), so a file may appear more than once in the result.
merged_file_stmt_info = not_parsable_tidb_and_jsql_hard.merge(
    stmt__for_file_level_join_pd,
    on=['file_id'],how='left'
)

In [24]:
# Number of merged rows per parsed_by_something value.
merged_file_stmt_info.groupby('parsed_by_something')['file_id'].count()

parsed_by_something
0    30317
1    31668
Name: file_id, dtype: int64

In [25]:
import warnings
# Kept so that other cells which still pass a boolean `inclusive` stay quiet;
# this cell no longer needs it because it passes inclusive='both'.
warnings.filterwarnings("ignore", 'Boolean inputs to the `inclusive` argument are deprecated in favour of `both` or `neither`.')

# Rows carrying the count of statements that no parser could parse.
# .copy() makes the filtered frame independent, so the column assignment below
# no longer triggers pandas' "set on a copy of a slice" warning.
not_parsed_merged_stmt_info = merged_file_stmt_info[merged_file_stmt_info.parsed_by_something == 0].copy()

# Inclusive (low, high) bounds for buckets 1..7; anything that falls outside all
# of them gets bucket 8.
_unparsed_bucket_bounds = [(0, 1), (2, 3), (4, 5), (6, 10), (11, 20), (21, 45), (46, 1000)]
_unparsed_stmt_counts = not_parsed_merged_stmt_info['statement_nr']

not_parsed_merged_stmt_info['unparsed_stmts_bucket'] = np.select(
    # inclusive='both' is the non-deprecated spelling of the former inclusive=True.
    [_unparsed_stmt_counts.between(low, high, inclusive='both') for low, high in _unparsed_bucket_bounds],
    list(range(1, len(_unparsed_bucket_bounds) + 1)),
    default=8
)
not_parsed_merged_stmt_info.groupby('unparsed_stmts_bucket')['file_id'].count()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  not_parsed_merged_stmt_info['unparsed_stmts_bucket'] = np.select(


unparsed_stmts_bucket
1    12941
2    6611 
3    2762 
4    3029 
5    2429 
6    1300 
7    1235 
8    10   
Name: file_id, dtype: int64

In [37]:
# Error-message columns to keep, with the number of leading characters each one
# is truncated to so the table stays readable.
_error_preview_widths = {
    'pglast_file_parse_error': 50,
    'pglast_original_highlight': 50,
    'sqlglot_parse_error': 50,
    'sqlglot_file_parse_error_start_context': 50,
    'sqlglot_file_parse_error_highlight': 50,
    'tidb_file_parse_error': 50,
    'rustparser_parse_error': 150,
    'jsql_file_parse_error': 80,
    'jsql_file_parse_error_with_brackets': 80,
}

# Work on a copy restricted to the identifying columns plus the error columns.
not_parsed_merged_stmt_info_relevant_cols = not_parsed_merged_stmt_info.copy()
not_parsed_merged_stmt_info_relevant_cols = not_parsed_merged_stmt_info_relevant_cols[
    ['file_id', 'statement_nr', *_error_preview_widths]
]

# Truncate each error column to its preview width.
for _column, _width in _error_preview_widths.items():
    not_parsed_merged_stmt_info_relevant_cols[_column] = (
        not_parsed_merged_stmt_info_relevant_cols[_column].str.slice(0, _width)
    )

In [38]:
# Preview the first rows of the truncated error columns.
not_parsed_merged_stmt_info_relevant_cols.head()

Unnamed: 0,file_id,statement_nr,pglast_file_parse_error,pglast_original_highlight,sqlglot_parse_error,sqlglot_file_parse_error_start_context,sqlglot_file_parse_error_highlight,tidb_file_parse_error,rustparser_parse_error,jsql_file_parse_error,jsql_file_parse_error_with_brackets
2,622076,1,"syntax error at or near ""IDENTITY"", at index 46","IDENTITY(1,1),\n Name NVARCHAR(255) NOT NULL\n);",Expecting (|Expecting )|Invalid expression / Unexp,"PRIMARY KEY IDENTITY(1,1),\n Title NVARCHAR(255",REFERENCES|Authors|,"line 2 column 32 near ""IDENTITY(1,1),\n Name NVA","PARSER_ERROR|Query parsing failed.\n\tsql parser error: Expected ',' or ')' after column definition, found: IDENTITY","Encountered unexpected token: ""INSERT"" ""INSERT""\n at line 15, column 1.\n\nWas e","Encountered unexpected token: ""INSERT"" ""INSERT""\n at line 15, column 1.\n\nWas e"
3,396373,1,"syntax error at or near ""TRAN"", at index 45","TRAN\nDECLARE @nInterCo\t\t\t\t\tINT\n,@stredfPrecision\t",Invalid expression / Unexpected token|,---spEXPExpenseSheetHdrCreateGLJournal\nBEGIN TRAN\n,@|,"line 1 column 1 near ""---spEXPExpenseSheetHdrCreat","PARSER_ERROR|Query parsing failed.\n\tsql parser error: Expected end of statement, found: TRAN","Encountered unexpected token: ""TRAN"" <S_IDENTIFIER>\n at line 2, column 7.\n\nWa","Encountered unexpected token: ""TRAN"" <S_IDENTIFIER>\n at line 2, column 7.\n\nWa"
5,581137,1,"syntax error at or near ""if"", at index 267",==*/\n\n\nif exists (select 1\n from sys.sysreferenc,Required keyword: 'true' missing for <class 'sqlgl,keyid = object_id('Textbook.ApprovalDeclaration'),alter|,"line 7 column 3 near ""if exists (select 1\n from","PARSER_ERROR|Query parsing failed.\n\tsql parser error: Expected an SQL statement, found: if","Encountered unexpected token: ""go"" <S_IDENTIFIER>\n at line 12, column 1.\n\nWas","Encountered unexpected token: ""go"" <S_IDENTIFIER>\n at line 12, column 1.\n\nWas"
7,666969,55,"syntax error at or near ""OFF"", at index 416",OFF;\nGO\nUSE [ffstats];\nGO\nIF SCHEMA_ID(N'dbo') IS,Invalid expression / Unexpected token|,StatsData.edmx\n-- --------------------------------,USE|,"line 9 column 26 near ""OFF;\nGO\nUSE [ffstats];\nGO\nI","PARSER_ERROR|Query parsing failed.\n\tsql parser error: Expected equals sign or TO, found: OFF","Encountered unexpected token: ""OFF"" ""OFF""\n at line 9, column 23.\n\nWas expecti","Encountered unexpected token: ""OFF"" ""OFF""\n at line 9, column 23.\n\nWas expecti"
9,59263,1,"syntax error at or near ""/"", at index 19",y/\nDROP VIEW v_author/\nDROP VIEW v_book/\n\nDROP TRI,Invalid expression / Unexpected token|,DROP VIEW v_library|,/|,"line 1 column 20 near ""/\nDROP VIEW v_author/\nDROP","PARSER_ERROR|Query parsing failed.\n\tsql parser error: Expected end of statement, found: /","Encountered unexpected token: ""/"" ""/""\n at line 1, column 20.\n\nWas expecting o","Encountered unexpected token: ""/"" ""/""\n at line 1, column 20.\n\nWas expecting o"
