# Imports and file details

In [1]:
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.compute as pc

import matplotlib.pyplot as plt
import seaborn as sns

sns.set_theme()
sns.set_style("ticks")

import os
import sys
sys.path.insert(0,'..')

import pandas as pd
import json
import numpy as np

import dask.dataframe as dd
from dask.dataframe.utils import make_meta

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', -1)

from src_new.utils import SQLParserSchema, PGLastSchema, get_file_encodings, SQLGlotSchema, SimpleDDLParserSchema,TIDBMysqlSchema

from dask.diagnostics import ProgressBar
ProgressBar().register()

  pd.set_option('max_colwidth', -1)


In [2]:
def read_partial(parser, schema_type, cols):
    _outdir = '../out_new/'

    if parser == 'sqlparser':
        p = SQLParserSchema()
        _outdir += 'sqlparser'
    elif parser == 'pglast':
        p = PGLastSchema()
        _outdir += 'pglast'
    elif parser == 'sqlglot':
        p = SQLGlotSchema()
        _outdir += 'sqlglot'
    elif parser == 'simple':
        p = SimpleDDLParserSchema()
        _outdir += 'simpleddlparser'
    elif parser == 'tidb':
        p = TIDBMysqlSchema()
        _outdir += 'tidb_mysql'
    
    if schema_type == 'file':
        _schema = p.file_level_schema
        _outdir += '/'
    elif schema_type == 'stmt':
        _schema = p.statement_list_sch
        _outdir += '_details/'
    
    ddf = dd.read_parquet(_outdir,columns=cols, schema=_schema, split_row_groups=True, calculate_divisions=True,engine='pyarrow')
    return ddf

In [3]:
# Read file details
fd = []
for file in os.listdir('../out_new/filedetails/'):
        full_filename = "%s/%s" % ('../out_new/filedetails/', file)
        with open(full_filename,'r') as fi:
            dict = json.load(fi)
            for item in dict:
                  fd.append(item)

filedetails_df = pd.DataFrame.from_dict(fd)
del fd

In [4]:
pglast_ddf = read_partial('pglast','file',PGLastSchema().file_level_schema.names)
pglast_details_ddf = read_partial('pglast','stmt',PGLastSchema().statement_list_sch.names)

In [5]:
pglast_details_ddf.groupby('parsed')['parsed'].count().compute(scheduler='processes')

[########################################] | 100% Completed | 3.01 ss


parsed
0    13573275
1    11054231
Name: parsed, dtype: int64

In [6]:
parsed = pglast_details_ddf[pglast_details_ddf.parsed == 1]
parsed['file_id'].nunique().compute(scheduler='processes')

[########################################] | 100% Completed | 4.00 ss


295066

In [7]:
not_parsed = pglast_details_ddf[pglast_details_ddf.parsed == 0]
not_parsed['file_id'].nunique().compute(scheduler='processes')

[########################################] | 100% Completed | 4.12 ss


309224

# Err analysis at stmt level

In [8]:
def get_stmt_and_file_count(dask_df):
    x = dask_df['file_id'].nunique().compute(scheduler='processes')
    y = dask_df['file_id'].count().compute(scheduler='processes')
    return y,x

## Cutting down
SCHEMA HERE

In [9]:
not_parsed[
    (not_parsed.parse_error.str.contains("syntax error at or near",case=False,regex=False))
]['file_id'].count().compute(scheduler='processes')

[########################################] | 100% Completed | 31.60 s


13515500

In [10]:
not_parsed = not_parsed[
    ['file_id','statement_nr','parse_error','original','original_highlight']
]

In [11]:
no_syntax_err = not_parsed[
    ~(not_parsed.parse_error.str.contains("syntax error at or near",case=False,regex=False))
]

In [12]:
not_parsed['trunc_err'] = not_parsed.apply(
    lambda row:
    row['parse_error'].split('syntax error at or near')[1].strip().split(',')[0].strip() if "syntax error at or near" in row['parse_error'] else row['parse_error'][0:70], 
    axis=1,
    meta=('trunc_err', 'string'))

In [13]:
not_parsed_schema = pa.schema([])
not_parsed_schema = not_parsed_schema.append(pa.field("file_id",pa.string()))
not_parsed_schema = not_parsed_schema.append(pa.field("statement_nr",pa.int64()))
not_parsed_schema = not_parsed_schema.append(pa.field("trunc_err",pa.string()))
not_parsed_schema = not_parsed_schema.append(pa.field("parse_error",pa.string()))
not_parsed_schema = not_parsed_schema.append(pa.field("original_highlight",pa.string()))
not_parsed_schema = not_parsed_schema.append(pa.field("original",pa.string()))

In [14]:
not_parsed.to_parquet('../out_new/pglast_details_not_parsed_only/',schema=not_parsed_schema,overwrite=True,
                      compute_kwargs={'scheduler':'processes'})

[########################################] | 100% Completed | 84.82 s


## Reading only not parsed

In [15]:
not_parsed_trunc = dd.read_parquet('../out_new/pglast_details_not_parsed_only/', columns = not_parsed_schema.names ,schema=not_parsed_schema, split_row_groups=True, calculate_divisions=True,engine='pyarrow')

In [16]:
print(get_stmt_and_file_count(not_parsed_trunc))

[########################################] | 100% Completed | 4.63 ss
[########################################] | 100% Completed | 4.01 ss
(13573275, 309224)


In [18]:
errcounts = not_parsed_trunc['trunc_err'].value_counts().compute(scheduler='processes')
err_df = pd.DataFrame({'err':errcounts.index, 'cnt':errcounts.values})

[########################################] | 100% Completed | 4.89 ss


### MYSQL separator

In [20]:
mysql_check = not_parsed_trunc[
    (not_parsed_trunc.trunc_err == '"`"')
]
print(get_stmt_and_file_count(mysql_check))

[########################################] | 100% Completed | 5.28 ss
[########################################] | 100% Completed | 4.56 ss
(11007489, 163161)


In [21]:
mysql_check.head()

[########################################] | 100% Completed | 409.01 ms


Unnamed: 0,file_id,statement_nr,trunc_err,parse_error,original_highlight,original
0,503958,0,"""`""","syntax error at or near ""`"", at index 15","ABLE `sys_menu` (\n `menu_id` bigint NOT NULL AUTO_INCREMENT,\n `parent_id` bigint COMMENT '父菜单ID，一级菜单为0',\n `name` varchar(50) COMMENT '菜单名称',\n `url` varchar(200) COMMENT '菜单URL',\n `perms` varchar(500) COMMENT '授权(多个用逗号分隔，如：user:list,user:create)',\n `type` int COMMENT '类型 0：目录 1：菜单 2：按钮',\n `icon` varchar(50) COMMENT '菜单图标',\n `order_num` int COMMENT '排序',\n PRIMARY KEY (`menu_id`)\n) ENGINE=InnoDB DEFAULT CHARSET=utf8 COMMENT='菜单管理';","-- 菜单\nCREATE TABLE `sys_menu` (\n `menu_id` bigint NOT NULL AUTO_INCREMENT,\n `parent_id` bigint COMMENT '父菜单ID，一级菜单为0',\n `name` varchar(50) COMMENT '菜单名称',\n `url` varchar(200) COMMENT '菜单URL',\n `perms` varchar(500) COMMENT '授权(多个用逗号分隔，如：user:list,user:create)',\n `type` int COMMENT '类型 0：目录 1：菜单 2：按钮',\n `icon` varchar(50) COMMENT '菜单图标',\n `order_num` int COMMENT '排序',\n PRIMARY KEY (`menu_id`)\n) ENGINE=InnoDB DEFAULT CHARSET=utf8 COMMENT='菜单管理';"
1,503958,1,"""`""","syntax error at or near ""`"", at index 15","ABLE `sys_dept` (\n `dept_id` bigint NOT NULL AUTO_INCREMENT,\n `parent_id` bigint COMMENT '上级部门ID，一级部门为0',\n `name` varchar(50) COMMENT '部门名称',\n `order_num` int COMMENT '排序',\n `del_flag` tinyint DEFAULT 0 COMMENT '是否删除 -1：已删除 0：正常',\n PRIMARY KEY (`dept_id`)\n) ENGINE=InnoDB DEFAULT CHARSET=utf8 COMMENT='部门管理';","-- 部门\nCREATE TABLE `sys_dept` (\n `dept_id` bigint NOT NULL AUTO_INCREMENT,\n `parent_id` bigint COMMENT '上级部门ID，一级部门为0',\n `name` varchar(50) COMMENT '部门名称',\n `order_num` int COMMENT '排序',\n `del_flag` tinyint DEFAULT 0 COMMENT '是否删除 -1：已删除 0：正常',\n PRIMARY KEY (`dept_id`)\n) ENGINE=InnoDB DEFAULT CHARSET=utf8 COMMENT='部门管理';"
2,503958,2,"""`""","syntax error at or near ""`"", at index 13","TE TABLE `sys_user` (\n `user_id` bigint NOT NULL AUTO_INCREMENT,\n `username` varchar(50) NOT NULL COMMENT '用户名',\n `password` varchar(100) COMMENT '密码',\n `salt` varchar(20) COMMENT '盐',\n `email` varchar(100) COMMENT '邮箱',\n `mobile` varchar(100) COMMENT '手机号',\n `status` tinyint COMMENT '状态 0：禁用 1：正常',\n `dept_id` bigint(20) COMMENT '部门ID',\n `create_time` datetime COMMENT '创建时间',\n PRIMARY KEY (`user_id`),\n UNIQUE INDEX (`username`)\n) ENGINE=InnoDB DEFAULT CHARSET=utf8 COMMENT='系统用户';","-- 系统用户\nCREATE TABLE `sys_user` (\n `user_id` bigint NOT NULL AUTO_INCREMENT,\n `username` varchar(50) NOT NULL COMMENT '用户名',\n `password` varchar(100) COMMENT '密码',\n `salt` varchar(20) COMMENT '盐',\n `email` varchar(100) COMMENT '邮箱',\n `mobile` varchar(100) COMMENT '手机号',\n `status` tinyint COMMENT '状态 0：禁用 1：正常',\n `dept_id` bigint(20) COMMENT '部门ID',\n `create_time` datetime COMMENT '创建时间',\n PRIMARY KEY (`user_id`),\n UNIQUE INDEX (`username`)\n) ENGINE=InnoDB DEFAULT CHARSET=utf8 COMMENT='系统用户';"
3,503958,3,"""`""","syntax error at or near ""`"", at index 18","TE TABLE `sys_user_token` (\n `user_id` bigint(20) NOT NULL,\n `token` varchar(100) NOT NULL COMMENT 'token',\n `expire_time` datetime DEFAULT NULL COMMENT '过期时间',\n `update_time` datetime DEFAULT NULL COMMENT '更新时间',\n PRIMARY KEY (`user_id`),\n UNIQUE KEY `token` (`token`)\n) ENGINE=InnoDB DEFAULT CHARSET=utf8 COMMENT='系统用户Token';","-- 系统用户Token\nCREATE TABLE `sys_user_token` (\n `user_id` bigint(20) NOT NULL,\n `token` varchar(100) NOT NULL COMMENT 'token',\n `expire_time` datetime DEFAULT NULL COMMENT '过期时间',\n `update_time` datetime DEFAULT NULL COMMENT '更新时间',\n PRIMARY KEY (`user_id`),\n UNIQUE KEY `token` (`token`)\n) ENGINE=InnoDB DEFAULT CHARSET=utf8 COMMENT='系统用户Token';"
4,503958,4,"""`""","syntax error at or near ""`"", at index 15","ABLE `sys_role` (\n `role_id` bigint NOT NULL AUTO_INCREMENT,\n `role_name` varchar(100) COMMENT '角色名称',\n `remark` varchar(100) COMMENT '备注',\n `dept_id` bigint(20) COMMENT '部门ID',\n `create_time` datetime COMMENT '创建时间',\n PRIMARY KEY (`role_id`)\n) ENGINE=InnoDB DEFAULT CHARSET=utf8 COMMENT='角色';","-- 角色\nCREATE TABLE `sys_role` (\n `role_id` bigint NOT NULL AUTO_INCREMENT,\n `role_name` varchar(100) COMMENT '角色名称',\n `remark` varchar(100) COMMENT '备注',\n `dept_id` bigint(20) COMMENT '部门ID',\n `create_time` datetime COMMENT '创建时间',\n PRIMARY KEY (`role_id`)\n) ENGINE=InnoDB DEFAULT CHARSET=utf8 COMMENT='角色';"


### Writing down without mysql

In [22]:
no_mysql = not_parsed_trunc[
    ~(not_parsed_trunc.trunc_err == '"`"')
]

In [23]:
no_mysql.to_parquet('../out_new/pglast_details_not_parsed_only_without_mysql/',schema=not_parsed_schema,overwrite=True,
                      compute_kwargs={'scheduler':'processes'})

[########################################] | 100% Completed | 20.39 s


## Reading without mysql

In [24]:
not_parsed_no_mysql = dd.read_parquet('../out_new/pglast_details_not_parsed_only_without_mysql/', columns = not_parsed_schema.names ,schema=not_parsed_schema, split_row_groups=True, calculate_divisions=True,engine='pyarrow')

In [25]:
print(get_stmt_and_file_count(not_parsed_no_mysql))

[########################################] | 100% Completed | 4.04 ss
[########################################] | 100% Completed | 5.83 ss
(2565786, 247094)


In [26]:
errcounts = not_parsed_no_mysql['trunc_err'].value_counts().compute(scheduler='processes')
err_df = pd.DataFrame({'err':errcounts.index, 'cnt':errcounts.values})

[########################################] | 100% Completed | 5.37 ss


### UNLOCK

In [28]:
unlock_check = not_parsed_no_mysql[
    (not_parsed_no_mysql.trunc_err.str.contains('UNLOCK',case=False,regex=False))
]
print(get_stmt_and_file_count(unlock_check))

[########################################] | 100% Completed | 4.48 ss
[########################################] | 100% Completed | 4.95 ss
(450253, 46582)


In [29]:
unlock_check.head()

[########################################] | 100% Completed | 115.06 ms


Unnamed: 0,file_id,statement_nr,trunc_err,parse_error,original_highlight,original
99,503990,19,"""UNLOCK""","syntax error at or near ""UNLOCK"", at index 0",;,UNLOCK TABLES;
104,504003,18,"""UNLOCK""","syntax error at or near ""UNLOCK"", at index 0",;,UNLOCK TABLES;
105,504003,28,"""UNLOCK""","syntax error at or near ""UNLOCK"", at index 0",;,UNLOCK TABLES;
106,504003,38,"""UNLOCK""","syntax error at or near ""UNLOCK"", at index 0",;,UNLOCK TABLES;
107,504003,48,"""UNLOCK""","syntax error at or near ""UNLOCK"", at index 0",;,UNLOCK TABLES;


### HASHTAG

In [32]:
no_unlock = not_parsed_no_mysql[
    ~(not_parsed_no_mysql.trunc_err.str.contains('UNLOCK',case=False,regex=False))
]
hashtag_check = no_unlock[
    (no_unlock.trunc_err == '"#"')
]
print(get_stmt_and_file_count(hashtag_check))

[########################################] | 100% Completed | 5.93 ss
[########################################] | 100% Completed | 4.98 ss
(149320, 14357)


In [34]:
hashtag_check.sample(frac=0.6).head()

[########################################] | 100% Completed | 113.61 ms


Unnamed: 0,file_id,statement_nr,trunc_err,parse_error,original_highlight,original
11722,507553,13,"""#""","syntax error at or near ""#"", at index 0",;,# Dump of table company_oauth\n# ------------------------------------------------------------\n\nDROP TABLE IF EXISTS `company_oauth`;
5879,506212,72,"""#""","syntax error at or near ""#"", at index 0",;,"#\n# Litter table new columns and allow birthdate to be null\n#\nALTER TABLE `Litter` ADD COLUMN `_litterType_key` INT(11) NOT NULL AFTER `version` , \nADD COLUMN `harvestDate` DATE NULL DEFAULT NULL AFTER `_litterType_key`,\nADD COLUMN `numberHarvested` INT(11) NULL DEFAULT NULL AFTER `harvestDate` ;"
3650,505399,119,"""#""","syntax error at or near ""#"", at index 0",;,"#\n# Table structure of table `tecno_world_woocommerce_tax_rates`\n#\n\nCREATE TABLE `tecno_world_woocommerce_tax_rates` (\n `tax_rate_id` bigint(20) unsigned NOT NULL AUTO_INCREMENT,\n `tax_rate_country` varchar(2) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',\n `tax_rate_state` varchar(200) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',\n `tax_rate` varchar(8) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',\n `tax_rate_name` varchar(200) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',\n `tax_rate_priority` bigint(20) unsigned NOT NULL,\n `tax_rate_compound` int(1) NOT NULL DEFAULT 0,\n `tax_rate_shipping` int(1) NOT NULL DEFAULT 1,\n `tax_rate_order` bigint(20) unsigned NOT NULL,\n `tax_rate_class` varchar(200) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',\n PRIMARY KEY (`tax_rate_id`),"
3595,505399,64,"""#""","syntax error at or near ""#"", at index 0",;,"#\n# Data contents of table `tecno_world_wc_admin_note_actions`\n#\nINSERT INTO `tecno_world_wc_admin_note_actions` ( `action_id`, `note_id`, `name`, `label`, `query`, `status`, `is_primary`) VALUES\n(1, 1, 'learn-more', 'Learn more', 'https://woocommerce.wordpress.com/', 'actioned', 0),\n(2, 2, 'open-customizer', 'Open Customizer', 'customize.php', 'actioned', 0),\n(3, 3, 'connect', 'Connect', '?page=wc-addons&section=helper', 'actioned', 0) ;"
15237,509010,10,"""#""","syntax error at or near ""#"", at index 0",;,#Remover uma coluna em aluno\nALTER TABLE aluno DROP cpf;


### (

In [35]:
no_hashtag = no_unlock[
    ~(no_unlock.trunc_err == '"#"')
]
parant_check = no_hashtag[
    (no_hashtag.trunc_err == '"("')
]
print(get_stmt_and_file_count(parant_check))

[########################################] | 100% Completed | 6.77 ss
[########################################] | 100% Completed | 6.87 ss
(127732, 18756)


In [48]:
int_parant_check = no_hashtag[
    (no_hashtag.trunc_err == '"("') &
    (
        (
            (no_hashtag.original.str.contains('int\(\d+\)',regex=True,case=False)) |
            (no_hashtag.original.str.contains('INT\(\d+\)',regex=True,case=False))
        ) |
        (
            (no_hashtag.original_highlight.str.contains('int\(\d+\)',regex=True,case=False)) |
            (no_hashtag.original_highlight.str.contains('INT\(\d+\)',regex=True,case=False))
        ) |
        (
            (no_hashtag.original_highlight.str.contains('integer\(\d+\)',regex=True,case=False)) |
            (no_hashtag.original_highlight.str.contains('INTEGER\(\d+\)',regex=True,case=False))
        ) |
        (
            (no_hashtag.original_highlight.str.contains('UNSIGNED',regex=True,case=False))
        )
    )   
]
print(get_stmt_and_file_count(int_parant_check))

[########################################] | 100% Completed | 17.18 s
[########################################] | 100% Completed | 22.27 s
(99984, 14276)


In [49]:
index_in_create_table_check = no_hashtag[
    (no_hashtag.trunc_err == '"("') &
    ~(
        (
            (no_hashtag.original.str.contains('int\(\d+\)',regex=True,case=False)) |
            (no_hashtag.original.str.contains('INT\(\d+\)',regex=True,case=False))
        ) |
        (
            (no_hashtag.original_highlight.str.contains('int\(\d+\)',regex=True,case=False)) |
            (no_hashtag.original_highlight.str.contains('INT\(\d+\)',regex=True,case=False))
        ) |
        (
            (no_hashtag.original_highlight.str.contains('integer\(\d+\)',regex=True,case=False)) |
            (no_hashtag.original_highlight.str.contains('INTEGER\(\d+\)',regex=True,case=False))
        ) |
        (
            (no_hashtag.original_highlight.str.contains('UNSIGNED',regex=True,case=False))
        )
    ) &
    (
        (no_hashtag.original.str.contains('CREATE TABLE',regex=False,case=False)) &
        (no_hashtag.original.str.contains('INDEX',regex=False,case=False))
    )
]
print(get_stmt_and_file_count(index_in_create_table_check))

[########################################] | 100% Completed | 18.79 s
[########################################] | 100% Completed | 18.89 s
(2418, 1258)


In [50]:
alter_table_add_index_check = no_hashtag[
    (no_hashtag.trunc_err == '"("') &
    ~(
        (
            (no_hashtag.original.str.contains('int\(\d+\)',regex=True,case=False)) |
            (no_hashtag.original.str.contains('INT\(\d+\)',regex=True,case=False))
        ) |
        (
            (no_hashtag.original_highlight.str.contains('int\(\d+\)',regex=True,case=False)) |
            (no_hashtag.original_highlight.str.contains('INT\(\d+\)',regex=True,case=False))
        ) |
        (
            (no_hashtag.original_highlight.str.contains('integer\(\d+\)',regex=True,case=False)) |
            (no_hashtag.original_highlight.str.contains('INTEGER\(\d+\)',regex=True,case=False))
        ) |
        (
            (no_hashtag.original_highlight.str.contains('UNSIGNED',regex=True,case=False))
        )
    ) &
    ~(
        (no_hashtag.original.str.contains('CREATE TABLE',regex=False,case=False)) &
        (no_hashtag.original.str.contains('INDEX',regex=False,case=False))
    ) &
    (
        (no_hashtag.original.str.contains('alter table',regex=False,case=False)) &
        (no_hashtag.original.str.contains('add index',regex=False,case=False))
    )
]
print(get_stmt_and_file_count(alter_table_add_index_check))

[########################################] | 100% Completed | 19.78 s
[########################################] | 100% Completed | 19.63 s
(836, 235)


In [53]:
alter_table_other_check = no_hashtag[
    (no_hashtag.trunc_err == '"("') &
    ~(
        (
            (no_hashtag.original.str.contains('int\(\d+\)',regex=True,case=False)) |
            (no_hashtag.original.str.contains('INT\(\d+\)',regex=True,case=False))
        ) |
        (
            (no_hashtag.original_highlight.str.contains('int\(\d+\)',regex=True,case=False)) |
            (no_hashtag.original_highlight.str.contains('INT\(\d+\)',regex=True,case=False))
        ) |
        (
            (no_hashtag.original_highlight.str.contains('integer\(\d+\)',regex=True,case=False)) |
            (no_hashtag.original_highlight.str.contains('INTEGER\(\d+\)',regex=True,case=False))
        ) |
        (
            (no_hashtag.original_highlight.str.contains('UNSIGNED',regex=True,case=False))
        )
    ) &
    ~(
        (no_hashtag.original.str.contains('CREATE TABLE',regex=False,case=False)) &
        (no_hashtag.original.str.contains('INDEX',regex=False,case=False))
    ) &
    ~(
        (no_hashtag.original.str.contains('alter table',regex=False,case=False)) &
        (no_hashtag.original.str.contains('add index',regex=False,case=False))
    ) &
    (
        (no_hashtag.original.str.contains('alter table',regex=False,case=False))
    )
]
print(get_stmt_and_file_count(alter_table_other_check))

[########################################] | 100% Completed | 18.07 s
[########################################] | 100% Completed | 19.15 s
(15910, 2267)


In [58]:
alter_table_other_check.sample(frac=0.6).head().head()

[########################################] | 100% Completed | 471.01 ms


Unnamed: 0,file_id,statement_nr,trunc_err,parse_error,original_highlight,original
4137,505577,114,"""(""","syntax error at or near ""("", at index 48",(\n CONSTRAINT SE_REGISTRATIONS_EVENTS_FK02 \n FOREIGN KEY (REGISTRATION_ID) \n REFERENCES DB_CORE.SE_REGISTRATIONS (REGISTRATION_ID)\n ENABLE VALIDATE);,ALTER TABLE DB_CORE.SE_REGISTRATIONS_EVENTS ADD (\n CONSTRAINT SE_REGISTRATIONS_EVENTS_FK02 \n FOREIGN KEY (REGISTRATION_ID) \n REFERENCES DB_CORE.SE_REGISTRATIONS (REGISTRATION_ID)\n ENABLE VALIDATE);
1346,504557,19,"""(""","syntax error at or near ""("", at index 33","(\n CONSTRAINT FK854E14CE10A7F131 \n FOREIGN KEY (BANKLIBRARY_ID) \n REFERENCES KT.T_BANKLIBRARY (ID)\n ENABLE VALIDATE,\n CONSTRAINT FK854E14CE53BDD502 \n FOREIGN KEY (USER_ID) \n REFERENCES KT.SYS_USER (ID)\n ENABLE VALIDATE,\n CONSTRAINT FK854E14CE760A509C \n FOREIGN KEY (FKQRUSER_ID) \n REFERENCES KT.SYS_USER (ID)\n ENABLE VALIDATE,\n CONSTRAINT FK854E14CE8A7CBCAE \n FOREIGN KEY (ACCOUNT_ACCOUNTID_) \n REFERENCES KT.SYS_ACCOUNT (ACCOUNTID_)\n ENABLE VALIDATE,\n CONSTRAINT FK854E14CEC5A7F31","ALTER TABLE KT.T_ACCOUNTDEAL ADD (\n CONSTRAINT FK854E14CE10A7F131 \n FOREIGN KEY (BANKLIBRARY_ID) \n REFERENCES KT.T_BANKLIBRARY (ID)\n ENABLE VALIDATE,\n CONSTRAINT FK854E14CE53BDD502 \n FOREIGN KEY (USER_ID) \n REFERENCES KT.SYS_USER (ID)\n ENABLE VALIDATE,\n CONSTRAINT FK854E14CE760A509C \n FOREIGN KEY (FKQRUSER_ID) \n REFERENCES KT.SYS_USER (ID)\n ENABLE VALIDATE,\n CONSTRAINT FK854E14CE8A7CBCAE \n FOREIGN KEY (ACCOUNT_ACCOUNTID_) \n REFERENCES KT.SYS_ACCOUNT (ACCOUNTID_)\n ENABLE VALIDATE,\n CONSTRAINT FK854E14CEC5A7F31 \n FOREIGN KEY (ACCOUNTDEAL_ID) \n REFERENCES KT.T_ACCOUNTDEAL (ID)\n ENABLE VALIDATE,\n CONSTRAINT FK854E14CEECE7B6FA \n FOREIGN KEY (CHECKUSER_ID) \n REFERENCES KT.SYS_USER (ID)\n ENABLE VALIDATE,\n CONSTRAINT FK854E14CEFAA8AE9F \n FOREIGN KEY (HKUSER_ID) \n REFEREN"
4125,505577,102,"""(""","syntax error at or near ""("", at index 47",(\n CONSTRAINT SE_NOTIFICATIONS_USERS_FK01 \n FOREIGN KEY (NOTIFICATION_ID) \n REFERENCES DB_CORE.SE_NOTIFICATIONS (NOTIFICATION_ID)\n ENABLE VALIDATE);,ALTER TABLE DB_CORE.SE_NOTIFICATIONS_USERS ADD (\n CONSTRAINT SE_NOTIFICATIONS_USERS_FK01 \n FOREIGN KEY (NOTIFICATION_ID) \n REFERENCES DB_CORE.SE_NOTIFICATIONS (NOTIFICATION_ID)\n ENABLE VALIDATE);
13612,508376,32,"""(""","syntax error at or near ""("", at index 23",(telClient NUMBER(12) NOT NULL);,ALTER TABLE Client ADD (telClient NUMBER(12) NOT NULL);
4117,505577,94,"""(""","syntax error at or near ""("", at index 45",(\n CONSTRAINT SE_ROLES_PERMISSIONS_FK02 \n FOREIGN KEY (PERMISSION_ID) \n REFERENCES DB_CORE.SE_PERMISSIONS (PERMISSION_ID)\n ENABLE VALIDATE);,ALTER TABLE DB_CORE.SE_ROLES_PERMISSIONS ADD (\n CONSTRAINT SE_ROLES_PERMISSIONS_FK02 \n FOREIGN KEY (PERMISSION_ID) \n REFERENCES DB_CORE.SE_PERMISSIONS (PERMISSION_ID)\n ENABLE VALIDATE);


### AUTO_INCREMENT

In [63]:
no_parant = no_hashtag[
    ~(no_hashtag.trunc_err == '"("')
]
auto_incr_check = no_parant[
    (no_parant.trunc_err.str.contains("AUTO_INCREMENT",case=False,regex=False))
]
print(get_stmt_and_file_count(auto_incr_check))
#auto_incr_check.sample(frac=0.6).head()

[########################################] | 100% Completed | 5.56 ss
[########################################] | 100% Completed | 5.79 ss
(180370, 39171)


In [60]:
auto_incr_check.sample(frac=0.6).head()

[########################################] | 100% Completed | 118.96 ms


Unnamed: 0,file_id,statement_nr,trunc_err,parse_error,original_highlight,original
5405,506072,190,"""AUTO_INCREMENT""","syntax error at or near ""AUTO_INCREMENT"", at index 84","AUTO_INCREMENT,\n NAME VARCHAR(255) NOT NULL,\n DISPLAY_NAME VARCHAR(255) NOT NULL,\n DESCRIPTION VARCHAR(512),\n TENANT_ID INTEGER NOT NULL DEFAULT -1,\n PRIMARY KEY (SCOPE_ID)\n)ENGINE INNODB;","CREATE TABLE IF NOT EXISTS IDN_OAUTH2_SCOPE (\n SCOPE_ID INTEGER NOT NULL AUTO_INCREMENT,\n NAME VARCHAR(255) NOT NULL,\n DISPLAY_NAME VARCHAR(255) NOT NULL,\n DESCRIPTION VARCHAR(512),\n TENANT_ID INTEGER NOT NULL DEFAULT -1,\n PRIMARY KEY (SCOPE_ID)\n)ENGINE INNODB;"
15561,509135,0,"""AUTO_INCREMENT""","syntax error at or near ""AUTO_INCREMENT"", at index 58","AUTO_INCREMENT,\n FIRST_NAME \t\tVARCHAR(100) NOT NULL,\n LAST_NAME \t\tVARCHAR(50) NOT NULL, -- changed the size of lastname\n COMPANY_NAME \t\tVARCHAR(500),\n NATIVE_LANGUAGE \t\tVARCHAR(50),\n EMAIL \t\tVARCHAR(75) NOT NULL UNIQUE, -- changed length, added unique constraint\n PRIMARY_MOBILE \t\tVARCHAR(15) NOT NULL, -- changed length \t\n SECONDARY_MOBILE \t\tVARCHAR(15), -- changed length\n-- USER_NAME \t\tVARCHAR(50) NOT NULL UNIQUE,","CREATE TABLE USERS(\n USER_ID \t\tINT NOT NULL AUTO_INCREMENT,\n FIRST_NAME \t\tVARCHAR(100) NOT NULL,\n LAST_NAME \t\tVARCHAR(50) NOT NULL, -- changed the size of lastname\n COMPANY_NAME \t\tVARCHAR(500),\n NATIVE_LANGUAGE \t\tVARCHAR(50),\n EMAIL \t\tVARCHAR(75) NOT NULL UNIQUE, -- changed length, added unique constraint\n PRIMARY_MOBILE \t\tVARCHAR(15) NOT NULL, -- changed length \t\n SECONDARY_MOBILE \t\tVARCHAR(15), -- changed length\n-- USER_NAME \t\tVARCHAR(50) NOT NULL UNIQUE,\n-- PASSWORD \t\tVARCHAR(50) NOT NULL,\n USER_TYPE\t\t\t\tVARCHAR(50) NOT NULL, -- Added\n REMARKS \t\tTEXT,\n\tSTATUS\t\t\t\t\tVARCHAR(30) NOT NULL,\n MOBILE_VERIFIED_FLAG \tCHAR(1) NOT NULL DEFAULT 'N',\n ID_VERIFIED_FLAG"
6792,506480,7,"""AUTO_INCREMENT""","syntax error at or near ""AUTO_INCREMENT"", at index 71","AUTO_INCREMENT PRIMARY KEY,\n status\tVARCHAR(20) DEFAULT 'pending',\n\tname VARCHAR(200) DEFAULT 'none',\n user_id MEDIUMINT DEFAULT 1,\n street_address VARCHAR(200)\tDEFAULT 'none',\n city VARCHAR(200)\tDEFAULT 'none',\n state VARCHAR(50)\tDEFAULT 'none',\n zip VARCHAR(20)\tDEFAULT 'none',\n attention VARCHAR(200)\tDEFAULT 'none',\n active\tVARCHAR(20) DEFAULT 'true',\n dt TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,\n\n FOREIGN KEY (user_id) REFERENCES u","CREATE TABLE IF NOT EXISTS invoice(\n invoice_id\tMEDIUMINT \tNOT NULL AUTO_INCREMENT PRIMARY KEY,\n status\tVARCHAR(20) DEFAULT 'pending',\n\tname VARCHAR(200) DEFAULT 'none',\n user_id MEDIUMINT DEFAULT 1,\n street_address VARCHAR(200)\tDEFAULT 'none',\n city VARCHAR(200)\tDEFAULT 'none',\n state VARCHAR(50)\tDEFAULT 'none',\n zip VARCHAR(20)\tDEFAULT 'none',\n attention VARCHAR(200)\tDEFAULT 'none',\n active\tVARCHAR(20) DEFAULT 'true',\n dt TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,\n\n FOREIGN KEY (user_id) REFERENCES user(user_id)\n)ENGINE = InnoDB;"
26,503960,50,"""AUTO_INCREMENT""","syntax error at or near ""AUTO_INCREMENT"", at index 46","AUTO_INCREMENT PRIMARY KEY,\n\tpatient_fk INTEGER,\n\tsps_id\t\t\tVARCHAR(250) BINARY NOT NULL,\n\tstart_datetime\tDATETIME NOT NULL,\n\tstation_aet VARCHAR(250) BINARY NOT NULL,\n\tmodality\t VARCHAR(250) BINARY NOT NULL,\n\tperf_physician VARCHAR(250),\n\treq_proc_id \tVARCHAR(250) BINARY NOT NULL,\n\taccession_no\tVARCHAR(250) BINARY,\n\titem_attrs\t\tBLOB,\nFOREIGN KEY (patient_fk) REFERENCES patient(pk)\n);","CREATE TABLE mwl_item (\n\tpk \tINTEGER AUTO_INCREMENT PRIMARY KEY,\n\tpatient_fk INTEGER,\n\tsps_id\t\t\tVARCHAR(250) BINARY NOT NULL,\n\tstart_datetime\tDATETIME NOT NULL,\n\tstation_aet VARCHAR(250) BINARY NOT NULL,\n\tmodality\t VARCHAR(250) BINARY NOT NULL,\n\tperf_physician VARCHAR(250),\n\treq_proc_id \tVARCHAR(250) BINARY NOT NULL,\n\taccession_no\tVARCHAR(250) BINARY,\n\titem_attrs\t\tBLOB,\nFOREIGN KEY (patient_fk) REFERENCES patient(pk)\n);"
14260,508732,7,"""AUTO_INCREMENT""","syntax error at or near ""AUTO_INCREMENT"", at index 48","AUTO_INCREMENT,\n role_name VARCHAR(255)\n);","CREATE TABLE role (\n role_id INT PRIMARY KEY AUTO_INCREMENT,\n role_name VARCHAR(255)\n);"


### @

In [64]:
no_autoincr = no_parant[
    ~(no_parant.trunc_err.str.contains("AUTO_INCREMENT",case=False,regex=False))
]
arond_check = no_autoincr[
    (no_autoincr.trunc_err == '"@"')
]
print(get_stmt_and_file_count(arond_check))
#auto_incr_check.sample(frac=0.6).head()

[########################################] | 100% Completed | 6.33 ss
[########################################] | 100% Completed | 5.51 ss
(85724, 11244)


In [62]:
arond_check.sample(frac=0.6).head()

[########################################] | 100% Completed | 120.69 ms


Unnamed: 0,file_id,statement_nr,trunc_err,parse_error,original_highlight,original
15360,509022,351,"""@""","syntax error at or near ""@"", at index 4",@saved_cs_client = @@character_set_client;,SET @saved_cs_client = @@character_set_client;
14075,508659,33,"""@""","syntax error at or near ""@"", at index 47","""@""localhost"";","/*making users*/\nDROP USER IF EXISTS ""readonly""@""localhost"";"
7578,506766,2,"""@""","syntax error at or near ""@"", at index 4","@OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='ONLY_FULL_GROUP_BY,STRICT_TRANS_TABLES,NO_ZERO_IN_DATE,NO_ZERO_DATE,ERROR_FOR_DIVISION_BY_ZERO,NO_ENGINE_SUBSTITUTION';","SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='ONLY_FULL_GROUP_BY,STRICT_TRANS_TABLES,NO_ZERO_IN_DATE,NO_ZERO_DATE,ERROR_FOR_DIVISION_BY_ZERO,NO_ENGINE_SUBSTITUTION';"
1808,504759,129,"""@""","syntax error at or near ""@"", at index 4",@id := (SELECT LAST_INSERT_ID());,SET @id := (SELECT LAST_INSERT_ID());
3748,505426,0,"""@""","syntax error at or near ""@"", at index 4","@OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0;","SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0;"


### USE

In [73]:
no_arond = no_autoincr[
    ~(no_autoincr.trunc_err == '"@"')
]
use_check = no_arond[
    (no_arond.trunc_err.str.contains('USE',case=False,regex=False))
]
print(get_stmt_and_file_count(use_check))
#auto_incr_check.sample(frac=0.6).head()

[########################################] | 100% Completed | 6.30 ss
[########################################] | 100% Completed | 6.13 ss
(124250, 78847)


In [66]:
use_check.sample(frac=0.6).head()

[########################################] | 100% Completed | 108.12 ms


Unnamed: 0,file_id,statement_nr,trunc_err,parse_error,original_highlight,original
13826,508522,81,"""USE""","syntax error at or near ""USE"", at index 0",;,USE `android2k17`;
12510,507908,1,"""USE""","syntax error at or near ""USE"", at index 0",;,USE Hotel;
13919,508572,1,"""USE""","syntax error at or near ""USE"", at index 0",;,USE `desafio_3`;
15690,509150,1503,"""USE""","syntax error at or near ""USE"", at index 0",;,USE CURSORES;
17672,509761,1,"""USE""","syntax error at or near ""USE"", at index 0",;,USE `finalyearproject`;


### DELIMITER

In [74]:
no_use = no_arond[
    ~(no_arond.trunc_err.str.contains('USE',case=False,regex=False))
]
delimiter_check = no_use[
    (no_use.trunc_err == '"DELIMITER"')
]
print(get_stmt_and_file_count(delimiter_check))
#auto_incr_check.sample(frac=0.6).head()

[########################################] | 100% Completed | 6.29 ss
[########################################] | 100% Completed | 8.07 ss
(73288, 8597)


In [69]:
delimiter_check.sample(frac=0.6).head()

[########################################] | 100% Completed | 107.57 ms


Unnamed: 0,file_id,statement_nr,trunc_err,parse_error,original_highlight,original
8805,507198,260,"""DELIMITER""","syntax error at or near ""DELIMITER"", at index 0",;,DELIMITER ;
13199,508137,257,"""DELIMITER""","syntax error at or near ""DELIMITER"", at index 0",;,DELIMITER ;
6340,506361,158,"""DELIMITER""","syntax error at or near ""DELIMITER"", at index 0",;,"DELIMITER $$\n\n/*!50003 CREATE DEFINER=`root`@`localhost` PROCEDURE `table_reservation_detail_update_table_by_idTable_idReservation`(IN idReservation INT,IN idTable INT, IN idTableChange INT)\nBEGIN\n\tUPDATE table_reservation_detail\n\tSET table_reservation_detail.id_table_id=idTableChange\n\tWHERE table_reservation_detail.id_table_reservation_id=idReservation AND table_reservation_detail.id_table_id=idTable;\nEND */$$\nDELIMITER ;"
6954,506520,42,"""DELIMITER""","syntax error at or near ""DELIMITER"", at index 31",\nDELIMITER $$\nCREATE TRIGGER `order_user_BEFORE_INSERT` BEFORE INSERT ON `order_user` FOR EACH ROW BEGIN\n\tIF ((NEW.mode_pay != 'Cash on Delivery') AND (NEW.mode_pay != 'Online Banking')) THEN\n\t\tSIGNAL SQLSTATE '45000' SET MESSAGE_TEXT='Invalid Mode of Payment!';\n\tEND IF;\nEND\n$$\nDELIMITER ;,--\n-- Triggers `order_user`\n--\nDELIMITER $$\nCREATE TRIGGER `order_user_BEFORE_INSERT` BEFORE INSERT ON `order_user` FOR EACH ROW BEGIN\n\tIF ((NEW.mode_pay != 'Cash on Delivery') AND (NEW.mode_pay != 'Online Banking')) THEN\n\t\tSIGNAL SQLSTATE '45000' SET MESSAGE_TEXT='Invalid Mode of Payment!';\n\tEND IF;\nEND\n$$\nDELIMITER ;
14252,508723,8,"""DELIMITER""","syntax error at or near ""DELIMITER"", at index 30","\nDELIMITER $$\n--\n-- Procedures\n--\nCREATE DEFINER=`root`@`localhost` PROCEDURE `proc1` (IN `pid` INT(10)) SELECT P.*,S.songName,SI.singerName,M.movieName,S.language,S.genre FROM playlist P,songs S,singers SI,movies M WHERE playlistID=pid AND S.songId=P.songID AND S.movieID=M.movieID AND S.singerID=SI.singerID$$\n\nDELIMITER ;","--\n-- Database: `musicdb`\n--\n\nDELIMITER $$\n--\n-- Procedures\n--\nCREATE DEFINER=`root`@`localhost` PROCEDURE `proc1` (IN `pid` INT(10)) SELECT P.*,S.songName,SI.singerName,M.movieName,S.language,S.genre FROM playlist P,songs S,singers SI,movies M WHERE playlistID=pid AND S.songId=P.songID AND S.movieID=M.movieID AND S.singerID=SI.singerID$$\n\nDELIMITER ;"


### GO

In [75]:
no_delimiter = no_use[
    ~(no_use.trunc_err == '"DELIMITER"')
]
go_check = no_delimiter[
    (no_delimiter.trunc_err == '"GO"')
]
print(get_stmt_and_file_count(go_check))
#auto_incr_check.sample(frac=0.6).head()

[########################################] | 100% Completed | 6.89 ss
[########################################] | 100% Completed | 7.18 ss
(67834, 3388)


In [70]:
go_check.sample(frac=0.6).head()

[########################################] | 100% Completed | 116.43 ms


Unnamed: 0,file_id,statement_nr,trunc_err,parse_error,original_highlight,original
8512,507151,3,"""GO""","syntax error at or near ""GO"", at index 0",;,"GO\n\n-- --------------------------------------------------\n-- Dropping existing FOREIGN KEY constraints\n-- --------------------------------------------------\n\nIF OBJECT_ID(N'[dbo].[FK_CategoryGAME_CATEGORY]', 'F') IS NOT NULL\n ALTER TABLE [dbo].[GAME_CATEGORY] DROP CONSTRAINT [FK_CategoryGAME_CATEGORY];"
14968,508867,2,"""GO""","syntax error at or near ""GO"", at index 0",;,"GO\n------------------------------Script to create tables---------------------------------------\n\n\n------------------------------Script to create aiport table----------------------------------\nCREATE TABLE [dbo].[Airport](\n\t[Id] [int] IDENTITY(1,1) NOT NULL,\n\t[Name] [varchar](56) NULL,\n\t[Code] [varchar](3) NULL,\n\t[StateCode] [varchar](2) NULL,\n\t[CountryCode] [varchar](2) NULL,\n\t[CountryName] [varchar](32) NULL,\n CONSTRAINT [PK_AIRPORT] PRIMARY KEY CLUSTERED \n(\n\t[Id] ASC\n)WITH (PAD_INDEX = OFF, STATISTICS_NORECOMPUTE = OFF, IGNORE_DUP_KEY = OFF, ALLOW_ROW_LOCKS = ON, ALLOW_PAGE_LOCKS = ON, OPTIMIZE_FOR_SEQUENTIAL_KEY = OFF) ON [PRIMARY]\n) ON [PRIMARY]\nGO\n\n\n------------------------------Script to create transportation table---------------------------------\nCREATE TABLE [dbo].[Transportation]("
4686,505796,82,"""GO""","syntax error at or near ""GO"", at index 0",;,GO\n\n-- Creating foreign key on [TrainingCourseId] in table 'CourseBookings'\nALTER TABLE [dbo].[CourseBookings]\nADD CONSTRAINT [FK_TrainingCourseCourseBooking]\n FOREIGN KEY ([TrainingCourseId])\n REFERENCES [dbo].[TrainingCourses]\n ([Id])\n ON DELETE CASCADE ON UPDATE NO ACTION;
4620,505796,16,"""GO""","syntax error at or near ""GO"", at index 0",;,"GO\nIF OBJECT_ID(N'[dbo].[FK_AspNetUserTrainningCourseCoach]', 'F') IS NOT NULL\n ALTER TABLE [dbo].[TrainningCourseCoaches] DROP CONSTRAINT [FK_AspNetUserTrainningCourseCoach];"
4661,505796,57,"""GO""","syntax error at or near ""GO"", at index 0",;,GO\n\n-- Creating non-clustered index for FOREIGN KEY 'FK_TrainingCourseTimetableTrainningCourseCoach'\nCREATE INDEX [IX_FK_TrainingCourseTimetableTrainningCourseCoach]\nON [dbo].[TrainningCourseCoaches]\n ([TrainingCourseTimetableId]);


### [

In [76]:
no_go = no_delimiter[
    ~(no_delimiter.trunc_err == '"GO"')
]
tsql_check = no_go[
    (no_go.trunc_err == '"["')
]
print(get_stmt_and_file_count(tsql_check))
#auto_incr_check.sample(frac=0.6).head()

[########################################] | 100% Completed | 9.12 ss
[########################################] | 100% Completed | 9.36 ss
(56019, 20900)


In [72]:
tsql_check.sample(frac=0.6).head()

[########################################] | 100% Completed | 121.42 ms


Unnamed: 0,file_id,statement_nr,trunc_err,parse_error,original_highlight,original
17934,509823,45,"""[""","syntax error at or near ""["", at index 12",[Isik] ADD CONSTRAINT [AK_Isik_e_meil] UNIQUE ([e_meil])\n;,ALTER TABLE [Isik] ADD CONSTRAINT [AK_Isik_e_meil] UNIQUE ([e_meil])\n;
17941,509823,52,"""[""","syntax error at or near ""["", at index 12",[Laua_kategooria_tyyp] ADD CONSTRAINT [PK_Laua_kategooria_tyyp]\n\tPRIMARY KEY ([laua_kategooria_tyyp_kood])\n;,ALTER TABLE [Laua_kategooria_tyyp] ADD CONSTRAINT [PK_Laua_kategooria_tyyp]\n\tPRIMARY KEY ([laua_kategooria_tyyp_kood])\n;
13724,508482,0,"""[""","syntax error at or near ""["", at index 13","[dbo].[AnnouncementDocument] (\n [DocId] INT IDENTITY (1, 1) NOT NULL,\n [AnncId] INT NOT NULL,\n [Name] NVARCHAR (150) NOT NULL,\n [Path] NVARCHAR (350) NULL,\n [Size] FLOAT (53) NULL,\n CONSTRAINT [PK_AnnouncementDocument] PRIMARY KEY CLUSTERED ([DocId] ASC),\n CONSTRAINT [FK_AnnouncementDocument_Announcement] FOREIGN KEY ([AnncId]) REFERENCES [dbo].[Announcement] ([AnncId])\n);","CREATE TABLE [dbo].[AnnouncementDocument] (\n [DocId] INT IDENTITY (1, 1) NOT NULL,\n [AnncId] INT NOT NULL,\n [Name] NVARCHAR (150) NOT NULL,\n [Path] NVARCHAR (350) NULL,\n [Size] FLOAT (53) NULL,\n CONSTRAINT [PK_AnnouncementDocument] PRIMARY KEY CLUSTERED ([DocId] ASC),\n CONSTRAINT [FK_AnnouncementDocument_Announcement] FOREIGN KEY ([AnncId]) REFERENCES [dbo].[Announcement] ([AnncId])\n);"
16507,509338,0,"""[""","syntax error at or near ""["", at index 13","[relational].[Reviews]\n(\n\t--This is a surrogate key\n\treview_id INT NOT NULL PRIMARY KEY IDENTITY,\n\treview_date DATE NULL,\n\t--This enforces a constraint which not always holds\n\tmovie_id VARCHAR(20) NOT NULL FOREIGN KEY REFERENCES [relational].[Movies](movie_id),\n\t[user_id] VARCHAR(20) NULL,\n\tis_spoiler BIT NULL,\n\treview_text NVARCHAR(MAX) NULL,\n\trating DECIMAL(3,1) NULL,\n\treview_summary NVARCHAR(MAX) NULL\n)","CREATE TABLE [relational].[Reviews]\n(\n\t--This is a surrogate key\n\treview_id INT NOT NULL PRIMARY KEY IDENTITY,\n\treview_date DATE NULL,\n\t--This enforces a constraint which not always holds\n\tmovie_id VARCHAR(20) NOT NULL FOREIGN KEY REFERENCES [relational].[Movies](movie_id),\n\t[user_id] VARCHAR(20) NULL,\n\tis_spoiler BIT NULL,\n\treview_text NVARCHAR(MAX) NULL,\n\trating DECIMAL(3,1) NULL,\n\treview_summary NVARCHAR(MAX) NULL\n)"
17944,509823,55,"""[""","syntax error at or near ""["", at index 12",[Riik] ADD CONSTRAINT [PK_Riik]\n\tPRIMARY KEY ([kood])\n;,ALTER TABLE [Riik] ADD CONSTRAINT [PK_Riik]\n\tPRIMARY KEY ([kood])\n;


### ENGINE

In [77]:
no_tsql = no_go[
    ~(no_go.trunc_err == '"["')
]
engine_check = no_tsql[
    (no_tsql.trunc_err.str.contains('engine',case=False,regex=False))
]
print(get_stmt_and_file_count(engine_check))
#auto_incr_check.sample(frac=0.6).head()

[########################################] | 100% Completed | 10.47 s
[########################################] | 100% Completed | 8.33 ss
(48115, 6921)


In [78]:
engine_check.sample(frac=0.6).head()

[########################################] | 100% Completed | 256.60 ms


Unnamed: 0,file_id,statement_nr,trunc_err,parse_error,original_highlight,original
5691,506131,30,"""ENGINE""","syntax error at or near ""ENGINE"", at index 184",ENGINE=InnoDB CHARACTER SET='utf8';,"CREATE TABLE media_library_media_language\n(\n id CHAR(36) NOT NULL,\n name VARCHAR(120) NOT NULL,\n shortcut CHAR(3) NOT NULL,\n PRIMARY KEY (id),\n INDEX nameIndex (name)\n) ENGINE=InnoDB CHARACTER SET='utf8';"
2519,505004,12,"""ENGINE""","syntax error at or near ""ENGINE"", at index 197",ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE utf8_bin;,"create table ld_recipient (ld_messageid bigint not null, ld_name varchar(255) not null, ld_address varchar(255) not null, ld_mode varchar(255) not null, ld_type int not null, ld_read int not null) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE utf8_bin;"
5337,506072,100,"""ENGINE""","syntax error at or near ""ENGINE"", at index 258",)ENGINE INNODB;,"CREATE TABLE IF NOT EXISTS REG_CLUSTER_LOCK (\n REG_LOCK_NAME VARCHAR (20),\n REG_LOCK_STATUS VARCHAR (20),\n REG_LOCKED_TIME TIMESTAMP,\n REG_TENANT_ID INTEGER DEFAULT 0,\n PRIMARY KEY (REG_LOCK_NAME)\n)ENGINE INNODB;"
1102,504447,4,"""ENGINE""","syntax error at or near ""ENGINE"", at index 252",ENGINE=InnoDB DEFAULT CHARSET=latin1;,"CREATE TABLE lti_share_key (\n share_key_id varchar(32) NOT NULL,\n primary_consumer_key varchar(255) NOT NULL,\n primary_context_id varchar(255) NOT NULL,\n auto_approve tinyint(1) NOT NULL,\n expires datetime NOT NULL,\n PRIMARY KEY (share_key_id)\n) ENGINE=InnoDB DEFAULT CHARSET=latin1;"
6570,506385,10,"""engine""","syntax error at or near ""engine"", at index 130",\nengine=innodb;,"CREATE TABLE restaurante_forma_pagamento\n (\n restaurante_id BIGINT NOT NULL,\n forma_pagamento_id BIGINT NOT NULL\n )\nengine=innodb;"


### UNSIGNED

In [79]:
no_engine = no_tsql[
    ~(no_tsql.trunc_err.str.contains('engine',case=False,regex=False))
]
unsigned_check = no_engine[
    (no_engine.trunc_err.str.contains('signed',case=False,regex=False))
]
print(get_stmt_and_file_count(unsigned_check))
#auto_incr_check.sample(frac=0.6).head()

[########################################] | 100% Completed | 7.99 ss
[########################################] | 100% Completed | 6.28 ss
(57602, 7312)


In [80]:
unsigned_check.head()

[########################################] | 100% Completed | 106.19 ms


Unnamed: 0,file_id,statement_nr,trunc_err,parse_error,original_highlight,original
39,503965,2,"""UNSIGNED""","syntax error at or near ""UNSIGNED"", at index 47","UNSIGNED NOT NULL AUTO_INCREMENT,\n LOGINNAME VARCHAR(128) not null,\n PASSWORD VARCHAR(128) not null,\n ENABLED CHAR(1) default 'Y',\n LAST_UPDATE_DATE DATETIME,\n LAST_UPDATED_BY VARCHAR(128),\n CREATION_DATE DATETIME,\n CREATED_BY VARCHAR(128),\n PRIMARY KEY (ID)\n);","create table P_USERS\n(\n ID INT UNSIGNED NOT NULL AUTO_INCREMENT,\n LOGINNAME VARCHAR(128) not null,\n PASSWORD VARCHAR(128) not null,\n ENABLED CHAR(1) default 'Y',\n LAST_UPDATE_DATE DATETIME,\n LAST_UPDATED_BY VARCHAR(128),\n CREATION_DATE DATETIME,\n CREATED_BY VARCHAR(128),\n PRIMARY KEY (ID)\n);"
40,503965,3,"""UNSIGNED""","syntax error at or near ""UNSIGNED"", at index 51","UNSIGNED NOT NULL AUTO_INCREMENT,\n IDENTIFICATION VARCHAR(1024) NOT NULL COMMENT '受保护的资源定义,例如URL,业务层方法等。 ',\n R_TYPE \t\tCHAR(32)\t NOT NULL COMMENT 'URL : URL层资源， METHOD : 方法层资源， DAO ： DAO层资源 ',\n ENABLE \tCHAR(1)\t DEFAULT 'Y'\t COMMENT 'Y ： 资源有效， N ： 资源无效 ', \n DESCRIPTION VARCHAR(1000)\t\t\t COMMENT '备注信息 ', \n PRIMARY KEY (ID)\n);","CREATE TABLE P_RESOURCES\n(\n ID INT UNSIGNED NOT NULL AUTO_INCREMENT,\n IDENTIFICATION VARCHAR(1024) NOT NULL COMMENT '受保护的资源定义,例如URL,业务层方法等。 ',\n R_TYPE \t\tCHAR(32)\t NOT NULL COMMENT 'URL : URL层资源， METHOD : 方法层资源， DAO ： DAO层资源 ',\n ENABLE \tCHAR(1)\t DEFAULT 'Y'\t COMMENT 'Y ： 资源有效， N ： 资源无效 ', \n DESCRIPTION VARCHAR(1000)\t\t\t COMMENT '备注信息 ', \n PRIMARY KEY (ID)\n);"
41,503965,4,"""UNSIGNED""","syntax error at or near ""UNSIGNED"", at index 55","UNSIGNED NOT NULL AUTO_INCREMENT,\n NAME VARCHAR(128)\t\t\t COMMENT '受保护的资源相关的属性 ', \n A_TYPE \t\tCHAR(32)\t NOT NULL\t DEFAULT 'ROLE_' COMMENT 'ROLE : 角色类属性 ',\n ENABLE \tCHAR(1)\t DEFAULT 'Y'\t COMMENT 'Y ： 资源有效， N ： 资源无效 ', \n MEMO \t\tVARCHAR(2000)\t\t\t COMMENT '备注信息 ', \n PRIMARY KEY (ID)\n);","CREATE TABLE P_RES_ATTRIBUTE\n(\n ID INT UNSIGNED NOT NULL AUTO_INCREMENT,\n NAME VARCHAR(128)\t\t\t COMMENT '受保护的资源相关的属性 ', \n A_TYPE \t\tCHAR(32)\t NOT NULL\t DEFAULT 'ROLE_' COMMENT 'ROLE : 角色类属性 ',\n ENABLE \tCHAR(1)\t DEFAULT 'Y'\t COMMENT 'Y ： 资源有效， N ： 资源无效 ', \n MEMO \t\tVARCHAR(2000)\t\t\t COMMENT '备注信息 ', \n PRIMARY KEY (ID)\n);"
42,503965,5,"""UNSIGNED""","syntax error at or near ""UNSIGNED"", at index 54","UNSIGNED NOT NULL AUTO_INCREMENT,\n ATTRIBUTE_ID \tINT NOT NULL,\n RESOURCE_ID INT NOT NULL,\n PRIMARY KEY (ID),\n UNIQUE KEY RES_ATTR_MAP_U1 (RESOURCE_ID, ATTRIBUTE_ID)\n);","CREATE TABLE P_RES_ATTR_MAP\n(\n ID INT UNSIGNED NOT NULL AUTO_INCREMENT,\n ATTRIBUTE_ID \tINT NOT NULL,\n RESOURCE_ID INT NOT NULL,\n PRIMARY KEY (ID),\n UNIQUE KEY RES_ATTR_MAP_U1 (RESOURCE_ID, ATTRIBUTE_ID)\n);"
43,503965,6,"""UNSIGNED""","syntax error at or near ""UNSIGNED"", at index 57","UNSIGNED NOT NULL AUTO_INCREMENT,\n USER_ID \tINT not null,\n RES_ATTRIBUTE_ID INT not null,\n PRIMARY KEY (ID)\n);","create table P_USER_PERMISSION\n(\n ID INT UNSIGNED NOT NULL AUTO_INCREMENT,\n USER_ID \tINT not null,\n RES_ATTRIBUTE_ID INT not null,\n PRIMARY KEY (ID)\n);"


### Doing err counts again

In [81]:
no_sign = no_engine[
    ~(no_engine.trunc_err.str.contains('signed',case=False,regex=False))
]
unsigned_check = no_engine[
    (no_engine.trunc_err.str.contains('signed',case=False,regex=False))
]
print(get_stmt_and_file_count(no_sign))
#auto_incr_check.sample(frac=0.6).head()

[########################################] | 100% Completed | 6.99 ss
[########################################] | 100% Completed | 6.82 ss
(1145279, 131307)


In [82]:
errcounts = no_sign['trunc_err'].value_counts().compute(scheduler='processes')
err_df = pd.DataFrame({'err':errcounts.index, 'cnt':errcounts.values})

[########################################] | 100% Completed | 7.72 ss


### ORACLE STUFF

In [85]:
ora_check = no_sign[
    (no_sign.trunc_err.str.contains('initrans',case=False,regex=False)) |
    (no_sign.trunc_err.str.contains('pctfree',case=False,regex=False)) |
    (no_sign.trunc_err.str.contains('maxtrans',case=False,regex=False)) |
    (no_sign.trunc_err.str.contains('logging',case=False,regex=False)) |
    (no_sign.trunc_err.str.contains('maxtrans',case=False,regex=False))
]
print(get_stmt_and_file_count(ora_check))
#auto_incr_check.sample(frac=0.6).head()

[########################################] | 100% Completed | 7.23 ss
[########################################] | 100% Completed | 7.25 ss
(20548, 2333)


In [86]:
ora_check.sample(frac=0.6).head()

[########################################] | 100% Completed | 110.70 ms


Unnamed: 0,file_id,statement_nr,trunc_err,parse_error,original_highlight,original
11417,507469,2,"""pctfree""","syntax error at or near ""pctfree"", at index 246",pctfree 10\n initrans 1\n maxtrans 255\n storage\n (\n initial 2M\n next 2M\n minextents 1\n maxextents unlimited\n )\nnologging;,"-- Create table\ncreate table MODEL_CALCULATES\n(\n id_calc NUMBER not null,\n calc_type CHAR(1) not null,\n pnpt_id NUMBER(12) not null,\n rfpm_id VARCHAR2(8) not null,\n summ_all NUMBER(19,2) not null,\n date_stop DATE\n)\ntablespace DATA\n pctfree 10\n initrans 1\n maxtrans 255\n storage\n (\n initial 2M\n next 2M\n minextents 1\n maxextents unlimited\n )\nnologging;"
16958,509523,10,"""nologging""","syntax error at or near ""nologging"", at index 21",nologging;,alter index PID_ATTR nologging;
7292,506625,112,"""INITRANS""","syntax error at or near ""INITRANS"", at index 118",INITRANS 1 MAXTRANS 255 NOCACHE;,"-- R_ALARM_MAIL\nCREATE TABLE ""R_ALARM_MAIL""(\n\t""ID_MAIL"" Number(19,0) NOT NULL, \n\t""ID_ALARM"" Varchar2(100) NOT NULL \n) INITRANS 1 MAXTRANS 255 NOCACHE;"
17176,509569,375,"""PCTFREE""","syntax error at or near ""PCTFREE"", at index 244","PCTFREE 10 INITRANS 2 MAXTRANS 255 COMPUTE STATISTICS \n STORAGE(INITIAL 65536 NEXT 1048576 MINEXTENTS 1 MAXEXTENTS 2147483645\n PCTINCREASE 0 FREELISTS 1 FREELIST GROUPS 1 BUFFER_POOL DEFAULT FLASH_CACHE DEFAULT CELL_FLASH_CACHE DEFAULT)\n TABLESPACE ""SYSTEM"" ENABLE;","--------------------------------------------------------\n-- Constraints for Table COMPANY_ONE2ONE\n--------------------------------------------------------\n\n ALTER TABLE ""USERID"".""COMPANY_ONE2ONE"" ADD PRIMARY KEY (""ONE2ONE_IDX"")\n USING INDEX PCTFREE 10 INITRANS 2 MAXTRANS 255 COMPUTE STATISTICS \n STORAGE(INITIAL 65536 NEXT 1048576 MINEXTENTS 1 MAXEXTENTS 2147483645\n PCTINCREASE 0 FREELISTS 1 FREELIST GROUPS 1 BUFFER_POOL DEFAULT FLASH_CACHE DEFAULT CELL_FLASH_CACHE DEFAULT)\n TABLESPACE ""SYSTEM"" ENABLE;"
18548,510062,174,"""PCTFREE""","syntax error at or near ""PCTFREE"", at index 223","PCTFREE 10 INITRANS 2 MAXTRANS 255 COMPUTE STATISTICS \n STORAGE(INITIAL 65536 NEXT 1048576 MINEXTENTS 1 MAXEXTENTS 2147483645\n PCTINCREASE 0 FREELISTS 1 FREELIST GROUPS 1 BUFFER_POOL DEFAULT FLASH_CACHE DEFAULT CELL_FLASH_CACHE DEFAULT)\n TABLESPACE ""SYSTEM"" ENABLE;","--------------------------------------------------------\n-- Constraints for Table COMMENTS\n--------------------------------------------------------\n\n ALTER TABLE ""JBLOG"".""COMMENTS"" ADD PRIMARY KEY (""CMTNO"")\n USING INDEX PCTFREE 10 INITRANS 2 MAXTRANS 255 COMPUTE STATISTICS \n STORAGE(INITIAL 65536 NEXT 1048576 MINEXTENTS 1 MAXEXTENTS 2147483645\n PCTINCREASE 0 FREELISTS 1 FREELIST GROUPS 1 BUFFER_POOL DEFAULT FLASH_CACHE DEFAULT CELL_FLASH_CACHE DEFAULT)\n TABLESPACE ""SYSTEM"" ENABLE;"


### MODIFY

In [90]:
no_ora = no_sign[
    ~((no_sign.trunc_err.str.contains('initrans',case=False,regex=False)) |
    (no_sign.trunc_err.str.contains('pctfree',case=False,regex=False)) |
    (no_sign.trunc_err.str.contains('maxtrans',case=False,regex=False)) |
    (no_sign.trunc_err.str.contains('logging',case=False,regex=False)) |
    (no_sign.trunc_err.str.contains('maxtrans',case=False,regex=False)))
]
modify_check = no_ora[
    (no_ora.trunc_err == '"MODIFY"') &
    (no_ora.original.str.contains('ALTER TABLE',case=False,regex=False))
]
print(get_stmt_and_file_count(modify_check))
#auto_incr_check.sample(frac=0.6).head()

[########################################] | 100% Completed | 8.55 ss
[########################################] | 100% Completed | 8.14 ss
(37184, 3383)


In [89]:
modify_check.sample(frac=0.6).head()

[########################################] | 100% Completed | 107.05 ms


Unnamed: 0,file_id,statement_nr,trunc_err,parse_error,original_highlight,original
6911,506497,26,"""MODIFY""","syntax error at or near ""MODIFY"", at index 24",MODIFY `order_id` int(11) NOT NULL;,ALTER TABLE order_items MODIFY `order_id` int(11) NOT NULL;
6142,506269,220,"""MODIFY""","syntax error at or near ""MODIFY"", at index 42",MODIFY HSM_SEQ_NUM varchar2(4);,ALTER TABLE RCRA_HD_SEC_MATERIAL_ACTIVITY MODIFY HSM_SEQ_NUM varchar2(4);
17182,509569,381,"""MODIFY""","syntax error at or near ""MODIFY"", at index 36","MODIFY (""SALENOL"" NOT NULL ENABLE);","ALTER TABLE ""USERID"".""MONEY_TBL_02"" MODIFY (""SALENOL"" NOT NULL ENABLE);"
13962,508610,13,"""MODIFY""","syntax error at or near ""MODIFY"", at index 34","MODIFY (""ID_PRESSERV"" NOT NULL ENABLE);","ALTER TABLE ""SYSTEM"".""PREST_SERV"" MODIFY (""ID_PRESSERV"" NOT NULL ENABLE);"
1761,504754,3,"""MODIFY""","syntax error at or near ""MODIFY"", at index 32","MODIFY (""SUPPLIER_WEIGHT"" NOT NULL ENABLE);","ALTER TABLE ""SUPPLIER_IN_STOCK"" MODIFY (""SUPPLIER_WEIGHT"" NOT NULL ENABLE);"


### no modify, stop

In [91]:
no_modify = no_ora[
    ~((no_ora.trunc_err == '"MODIFY"') &
    (no_ora.original.str.contains('ALTER TABLE',case=False,regex=False)))
]

print(get_stmt_and_file_count(no_modify))
#auto_incr_check.sample(frac=0.6).head()

[########################################] | 100% Completed | 6.93 ss
[########################################] | 100% Completed | 6.94 ss
(1087547, 129901)


### TBD

In [84]:
err_df.head(100)

Unnamed: 0,err,cnt
0,"""MODIFY""",37240
1,"""$""",27632
2,"""`#""",25789
3,"""NOT""",25648
4,"""REPLACE""",25175
5,"""###""",24371
6,"""=@""",22106
7,"""IF""",21453
8,"""IDENTITY""",21429
9,"""IGNORE""",20481
