In [None]:
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.compute as pc

import matplotlib.pyplot as plt
import seaborn as sns

sns.set_theme()
sns.set_style("ticks")

import os
import sys
sys.path.insert(0,'..')

import pandas as pd
import json
import numpy as np

import dask.dataframe as dd
from dask.dataframe.utils import make_meta

# Widen pandas' display limits so wide comparison frames are shown in full.
# - max_columns is set once (the original set 500 and then overrode it with None).
# - max_colwidth uses None for "unlimited": the legacy value -1 is deprecated
#   since pandas 1.0 and rejected by pandas 2.x.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.max_colwidth', None)

from src_new.utils import SQLParserSchema, PGLastSchema, get_file_encodings, SQLGlotSchema, SimpleDDLParserSchema

from dask.diagnostics import ProgressBar
ProgressBar().register()

def read_partial(parser, schema_type, cols):
    """Open one parser's parquet output as a dask DataFrame.

    Args:
        parser: Which parser's results to read; one of 'sqlparser', 'pglast',
            'sqlglot' or 'simple'.
        schema_type: 'file' for the file-level dataset, 'stmt' for the
            statement-level dataset stored in the '<parser>_details/' directory.
        cols: Column names forwarded to ``dd.read_parquet(columns=...)``.

    Returns:
        The dask DataFrame returned by ``dd.read_parquet``.

    Raises:
        ValueError: If ``parser`` or ``schema_type`` is not one of the
            supported values (previously this surfaced as an
            UnboundLocalError further down).
    """
    # Directory name used on disk for each parser's output.
    dir_by_parser = {
        'sqlparser': 'sqlparser',
        'pglast': 'pglast',
        'sqlglot': 'sqlglot',
        'simple': 'simpleddlparser',
    }

    # Validate both arguments before touching any schema class so that a typo
    # fails fast with a readable message.
    if parser not in dir_by_parser:
        raise ValueError(
            "unknown parser %r; expected one of %s" % (parser, sorted(dir_by_parser)))
    if schema_type not in ('file', 'stmt'):
        raise ValueError(
            "unknown schema_type %r; expected 'file' or 'stmt'" % (schema_type,))

    if parser == 'sqlparser':
        p = SQLParserSchema()
    elif parser == 'pglast':
        p = PGLastSchema()
    elif parser == 'sqlglot':
        p = SQLGlotSchema()
    else:
        p = SimpleDDLParserSchema()

    if schema_type == 'file':
        _schema = p.file_level_schema
        dir_suffix = '/'
    else:
        _schema = p.statement_list_sch
        dir_suffix = '_details/'

    # NOTE(review): the original concatenated '../../data' + '<parser>' without
    # a separator, producing e.g. '../../datapglast/'. The other loaders in this
    # notebook read from '../../data/<name>/', so the separator is added here;
    # confirm against the actual directory layout.
    _outdir = '../../data/' + dir_by_parser[parser] + dir_suffix

    return dd.read_parquet(
        _outdir,
        columns=cols,
        schema=_schema,
        split_row_groups=True,
        calculate_divisions=True,
        engine='pyarrow',
    )

# Read file details: every JSON file in the directory holds a collection of
# records; all records are flattened into a single DataFrame.
filedetails_dir = '../../data/filedetails/'
fd = []
# sorted() makes the row order reproducible (os.listdir order is arbitrary).
for file in sorted(os.listdir(filedetails_dir)):
    # Explicit encoding so the result does not depend on the platform default.
    with open(os.path.join(filedetails_dir, file), 'r', encoding='utf-8') as fi:
        # Avoids binding the loaded object to the name `dict`, which would
        # shadow the builtin for the rest of the kernel session.
        fd.extend(json.load(fi))

filedetails_df = pd.DataFrame(fd)
del fd

In [129]:
# read chatgpt

# Columns expected in each ChatGPT result record. The list is not applied
# below; it is kept because it is a notebook-level name.
cols = ['file_id','error','finish_reason','orig_response','used_tokens', 'table_list', 'column_list', 'schema_list', 'db_list',
        'view_list', 'num_ctr_notnull', 'num_ctr_unique', 'num_ctr_primary', 'num_ctr_foreign']

# Read the ChatGPT results: each JSON file contributes one record.
chatgpt_dir = '../../data/chatgpt/'
fd = []
# sorted() makes the row order reproducible (os.listdir order is arbitrary).
for file in sorted(os.listdir(chatgpt_dir)):
    # Explicit encoding so the result does not depend on the platform default.
    with open(os.path.join(chatgpt_dir, file), 'r', encoding='utf-8') as fi:
        # Appended directly instead of via a variable named `dict`, which
        # would shadow the builtin for the rest of the kernel session.
        fd.append(json.load(fi))

chatgpt_df = pd.DataFrame(fd)
del fd

chatgpt_df.count()

file_id            2441
error              2441
finish_reason      2441
orig_response      2441
used_tokens        2441
table_list         2441
column_list        2441
schema_list        2441
db_list            2441
view_list          2441
num_ctr_notnull    2441
num_ctr_unique     2441
num_ctr_primary    2441
num_ctr_foreign    2441
dtype: int64

In [130]:
# Number of ChatGPT result records per value of the `error` column.
chatgpt_df.groupby('error').file_id.count()

error
0    2435
1    5   
2    1   
Name: file_id, dtype: int64

In [131]:
import ast


def _count_create_seq(counter_str):
    """Return the 'CreateSeqStmt' count stored in a serialised statement counter.

    ``counter_str`` is parsed with ``ast.literal_eval`` and is expected to
    yield an iterable of mappings keyed by statement type; the value of the
    first mapping containing 'CreateSeqStmt' is returned. Missing, empty or
    non-string input (None, '', NaN) counts as 0 instead of raising, which the
    original did for an empty string.
    """
    if not isinstance(counter_str, str) or not counter_str.strip():
        return 0.0
    counters = ast.literal_eval(counter_str) or []
    matches = [item for item in counters if 'CreateSeqStmt' in item]
    # Returned as float so the values agree with the declared dask meta.
    return float(matches[0]['CreateSeqStmt']) if matches else 0.0


def _join_sorted_upper(values):
    """Upper-case, sort and '|'-join a list of names; None stays None.

    Gives an order- and case-insensitive canonical string for list comparison.
    """
    if values is None:
        return None
    return "|".join(sorted(item.upper() for item in values))


pglast_ddf = read_partial('pglast', 'file', PGLastSchema().file_level_schema.names)

# Computed straight from `counter_str`; the original went through a temporary
# 'x' column whose declared meta ('string') did not match the lists it held.
pglast_ddf['num_create_seq'] = pglast_ddf['counter_str'].apply(
    _count_create_seq,
    meta=('num_create_seq', 'float'))

# Table count with the CREATE SEQUENCE statement count subtracted.
pglast_ddf['num_tables_without_create_seq'] = pglast_ddf['num_distinct_tables'] - pglast_ddf['num_create_seq']

# (source list column, derived canonical-string column). The target names are
# referenced by later cells, so they are kept exactly as before.
for source_col, target_col in [
    ('table_list', 'table_list_str'),
    ('columns_list', 'column_list_str'),
    ('view_list', 'view_list_str'),
    ('schema_list', 'sch_list_str'),
    ('db_list', 'db_list_str'),
]:
    pglast_ddf[target_col] = pglast_ddf[source_col].apply(
        _join_sorted_upper,
        meta=(target_col, 'string'))

In [132]:
def _list_len(values):
    """Return len(values), treating a missing (None) list as length 0."""
    return len(values) if values is not None else 0


# (list column, derived count column). Column-wise Series.map replaces five
# copy-pasted row-wise apply(axis=1) calls; it produces the same values
# without building a Series per row and also works on an empty frame.
for list_col, count_col in [
    ('table_list', 'num_tables'),
    ('column_list', 'num_cols'),
    ('db_list', 'num_dbs'),
    ('schema_list', 'num_schs'),
    ('view_list', 'num_vws'),
]:
    chatgpt_df[count_col] = chatgpt_df[list_col].map(_list_len)

In [133]:
def _canonical_list_str(values):
    """Upper-case, sort and '|'-join a list of names; None stays None.

    Gives an order- and case-insensitive canonical string for list comparison.
    """
    if values is None:
        return None
    return "|".join(sorted(item.upper() for item in values))


# (list column, derived canonical-string column). Column-wise Series.map
# replaces five copy-pasted row-wise apply(axis=1) calls.
for list_col, str_col in [
    ('table_list', 'table_list_str'),
    ('column_list', 'column_list_str'),
    ('view_list', 'view_list_str'),
    ('db_list', 'db_list_str'),
    ('schema_list', 'schema_list_str'),
]:
    chatgpt_df[str_col] = chatgpt_df[list_col].map(_canonical_list_str)

In [None]:
# Keep only ChatGPT records whose `error` value is 0
# (presumably the successful responses -- TODO confirm the error-code meaning).
chatgpt_df = chatgpt_df.loc[chatgpt_df['error'] == 0]

# Inner join on file_id; columns present on both sides are disambiguated with
# the '_pglast' / '_chatgpt' suffixes. .compute() materialises the result.
pglast_joined_chatgpt = dd.merge(
    pglast_ddf,
    chatgpt_df,
    how='inner',
    on=['file_id'],
    suffixes=['_pglast', '_chatgpt'],
    indicator=False,
).compute()

In [135]:
# Columns needed for the pglast-vs-ChatGPT comparison.
# .copy() makes `subset` an independent frame: a later cell assigns columns
# on it, which would otherwise emit SettingWithCopyWarning.
subset = pglast_joined_chatgpt[
    [   "file_id", "num_create_seq", # for pglast
        "num_tables_without_create_seq", "num_distinct_tables", # for pglast
        "num_tables", # for chatgpt
        "num_distinct_columns","num_distinct_schemas","num_distinct_dbs", # for pglast
        "num_cols","num_dbs","num_schs","num_vws", # for chatgpt
        "table_list_chatgpt","table_list_str_chatgpt", "table_list_str_pglast",
        "column_list_str_chatgpt", "column_list_str_pglast",
        "view_list_str_chatgpt", "view_list_str_pglast",
        "db_list_str_chatgpt", "db_list_str_pglast",
        "schema_list_str", "sch_list_str", # chatgpt on the left, pglast on the right
        "num_ctr_notnull_chatgpt","num_ctr_notnull_pglast",
        "num_ctr_unique_chatgpt","num_ctr_unique_pglast",
        "num_ctr_primary_chatgpt","num_ctr_primary_pglast",
        "num_ctr_foreign_chatgpt","num_ctr_foreign_pglast"
    ]
].copy()


In [153]:
# Non-null value count for every column of `subset`.
subset.count()

file_id                          2435
num_create_seq                   2435
num_tables_without_create_seq    2435
num_distinct_tables              2435
num_tables                       2435
num_distinct_columns             2435
num_distinct_schemas             2435
num_distinct_dbs                 2435
num_cols                         2435
num_dbs                          2435
num_schs                         2435
num_vws                          2435
table_list_chatgpt               2435
table_list_str_chatgpt           2435
table_list_str_pglast            2433
column_list_str_chatgpt          2435
column_list_str_pglast           2342
view_list_str_chatgpt            2435
view_list_str_pglast             54  
db_list_str_chatgpt              2435
db_list_str_pglast               43  
schema_list_str                  2435
sch_list_str                     1218
num_ctr_notnull_chatgpt          2435
num_ctr_notnull_pglast           2435
num_ctr_unique_chatgpt           2435
num_ctr_uniq

In [136]:
# Cast every integer column to float.
# is_integer_dtype matches all integer widths, whereas `dtype == int` only
# matches the platform's default integer type. Rebinding `subset` through
# astype avoids assigning columns one by one into a frame that is a slice.
int_cols = [c for c in subset.columns if pd.api.types.is_integer_dtype(subset[c].dtype)]
subset = subset.astype({c: float for c in int_cols})

In [137]:
# Files where pglast reports no tables, or where its table count minus the
# CREATE SEQUENCE count equals the ChatGPT table count.
subset.loc[
    (subset['num_tables_without_create_seq'] == subset['num_tables'])
    | (subset['num_distinct_tables'] == 0),
    'file_id'
].count()

# no create seq -> 1337
# with create seq 11 96

1337

In [138]:
# Files where the pglast and ChatGPT column counts are equal
# (no special-casing of files with zero pglast columns).
subset.loc[
    subset['num_distinct_columns'] == subset['num_cols'],
    'file_id'
].count()

1376

In [139]:
# Files where the pglast and ChatGPT schema counts are equal
# (no special-casing of files with zero pglast schemas).
subset.loc[
    subset['num_distinct_schemas'] == subset['num_schs'],
    'file_id'
].count()

1564

In [140]:
# Files where the pglast and ChatGPT database counts are equal
# (no zero-count special case is applied).
subset.loc[
    subset['num_distinct_dbs'] == subset['num_dbs'],
    'file_id'
].count()

732

In [141]:
# Files where ChatGPT lists no views, or where both canonical view-list
# strings are identical.
subset.loc[
    (subset['view_list_str_pglast'] == subset['view_list_str_chatgpt'])
    | (subset['num_vws'] == 0),
    'file_id'
].count()

2368

In [142]:
# Files where pglast reports no tables, or where both canonical table-list
# strings are identical.
subset.loc[
    (subset['table_list_str_chatgpt'] == subset['table_list_str_pglast'])
    | (subset['num_distinct_tables'] == 0),
    'file_id'
].count()

1131

In [143]:
# Files where pglast reports no columns, or where both canonical column-list
# strings are identical.
subset.loc[
    (subset['column_list_str_chatgpt'] == subset['column_list_str_pglast'])
    | (subset['num_distinct_columns'] == 0),
    'file_id'
].count()

1443

In [144]:
# Files where pglast reports no databases, or where both canonical
# database-list strings are identical.
subset.loc[
    (subset['db_list_str_chatgpt'] == subset['db_list_str_pglast'])
    | (subset['num_distinct_dbs'] == 0),
    'file_id'
].count()

2434

In [145]:
# Files where pglast reports no schemas, or where the ChatGPT schema-list
# string (schema_list_str) equals the pglast one (sch_list_str).
subset.loc[
    (subset['schema_list_str'] == subset['sch_list_str'])
    | (subset['num_distinct_schemas'] == 0),
    'file_id'
].count()

1782

In [146]:
# Files where both sides report the same number of NOT NULL constraints
# (no special-casing of files with zero pglast constraints).
subset.loc[
    subset['num_ctr_notnull_chatgpt'] == subset['num_ctr_notnull_pglast'],
    'file_id'
].count()

641

In [152]:
# First rows where the NOT NULL constraint counts differ between the two
# sides, shown next to the file id for inspection.
subset.loc[
    subset['num_ctr_notnull_chatgpt'] != subset['num_ctr_notnull_pglast'],
    ['file_id', 'num_ctr_notnull_chatgpt', 'num_ctr_notnull_pglast']
].head()

Unnamed: 0,file_id,num_ctr_notnull_chatgpt,num_ctr_notnull_pglast
1,510377,13,11.0
2,511137,4,3.0
3,511246,1,3.0
4,511567,10,7.0
5,511967,0,19.0


In [147]:
# Files where both sides report the same number of UNIQUE constraints
# (no special-casing of files with zero pglast constraints).
subset.loc[
    subset['num_ctr_unique_chatgpt'] == subset['num_ctr_unique_pglast'],
    'file_id'
].count()

1633

In [148]:
# Files where both sides report the same number of PRIMARY KEY constraints
# (no special-casing of files with zero pglast constraints).
subset.loc[
    subset['num_ctr_primary_chatgpt'] == subset['num_ctr_primary_pglast'],
    'file_id'
].count()

1558

In [149]:
# Files where both sides report the same number of FOREIGN KEY constraints
# (no special-casing of files with zero pglast constraints).
subset.loc[
    subset['num_ctr_foreign_chatgpt'] == subset['num_ctr_foreign_pglast'],
    'file_id'
].count()

1715