# Parsing WikiSQL

In this notebook we focus on:
* Exploring WikiSQL
* Loading it into a sqlite3 db
* Checking whether the query results can be parsed into the single-table, single-row format that ToTTo expects

In [63]:
# Imports
import sqlite3
import pandas as pd
import json
import re

# Constants
# SQLite database file that the queries in this notebook are run against
WIKISQL_DB_PATH = "../storage/datasets/wiki_sql/raw/train.db"
# JSON-lines file with one table description per line (its `id` and `header` fields are used below)
WIKISQL_JSON_PATH = "../storage/datasets/wiki_sql/raw/train.tables.jsonl"
# JSON-lines file with one query record per line
QUERIES_PATH = "../storage/datasets/wiki_sql/raw/train.jsonl"

## Reading WikiSQL

In [10]:
def run_query_on_connection():
    """Placeholder that takes no arguments, does nothing and returns ``None``."""
    return None

def connect_and_query(db_path, query):
    """Run a single query against a SQLite database and return all rows.

    A fresh connection is opened for the call and is always closed again,
    including when the query itself fails.

    Args:
        db_path: Path of the SQLite database file (passed to ``sqlite3.connect``).
        query: SQL statement to execute.

    Returns:
        The list of row tuples produced by ``fetchall()``.

    Raises:
        sqlite3.Error: If the statement cannot be executed.
    """
    con = sqlite3.connect(db_path)
    try:
        cur = con.cursor()
        cur.execute(query)
        return cur.fetchall()
    finally:
        # Close even when execute() raises, so a bad query does not leak the connection.
        con.close()

In [12]:
# Check the table names in the database:
# sqlite_master lists the schema objects; only the first five table names are printed
print(connect_and_query(WIKISQL_DB_PATH, "SELECT name FROM sqlite_master WHERE type='table';")[:5])

[('table_1_1000181_1',), ('table_1_10006830_1',), ('table_1_10007452_3',), ('table_1_10015132_1',), ('table_1_10015132_14',)]


As we can see, the table names have been replaced by special ids. Unfortunately, this loses information that could be used in explaining the query.

In [22]:
def concat_col_names_and_res(res, col_names):
    """Return a new list with ``col_names`` first, followed by every row of ``res``.

    Neither argument is modified.
    """
    return [col_names, *res]

# NOTE(review): query_with_col_names is not defined in any cell visible in this notebook —
# presumably it lived in a since-deleted cell; confirm before relying on Restart & Run All.
# Its return value is unpacked positionally into (res, col_names).
pd.DataFrame(concat_col_names_and_res(*query_with_col_names(WIKISQL_DB_PATH, "SELECT * FROM table_1_10007452_3;")))

Unnamed: 0,0,1,2,3,4,5
0,col0,col1,col2,col3,col4,col5
1,1992-93,gillig,phantom (high floor),444-464 (21),dd s50egr allison wb-400r,diesel
2,1996,gillig,phantom (high floor),465-467 (3),dd s50 allison wb-400r,diesel
3,1998,gillig,phantom (high floor),468-473 (6),dd s50 allison wb-400r,diesel
4,2000,gillig,advantage (low floor),474-481 (8),cummins isc allison wb-400r,diesel
5,2002,gillig,advantage (low floor),482-492 (11),cummins isl allison wb-400r,diesel
6,2010,nfi,ge40lfr,300-309 (10),ford triton v10 ise-thundervolt tb40-hg,hybrid
7,2011,nfi,c40lfr,310-329 (20),cummins westport isl-g allison wb-400r,cng


In [57]:
class wikiqsl_controller:
    """Query the WikiSQL SQLite database and prepend the real column titles.

    The generated queries address columns as ``col0``, ``col1``, ...; the
    human-readable headers are taken from the tables JSON-lines file, whose
    entries are matched on their ``id`` field.
    """

    # Captures the id part of a ``table_<id>`` name following FROM. The id may be
    # followed by whitespace, ";" or the end of the query string.
    _TABLE_ID_PATTERN = re.compile(r"FROM\s+table_(\w+)")

    def __init__(self, db_path, tables_path):
        """Remember the database path and load every table description.

        Args:
            db_path: Path of the SQLite database file.
            tables_path: Path of the JSON-lines file with one table
                description per line.
        """
        self.db_path = db_path

        self.table_info = []
        # Pin the encoding so parsing does not depend on the platform default.
        with open(tables_path, encoding="utf-8") as file_in:
            for line in file_in:
                # Blank lines (e.g. a trailing newline) would make json.loads fail.
                if line.strip():
                    self.table_info.append(json.loads(line))

    def connect_and_query(self, query):
        """Execute ``query`` on a fresh connection and return all rows.

        The connection is closed even when the query raises.
        """
        con = sqlite3.connect(self.db_path)
        try:
            cur = con.cursor()
            cur.execute(query)
            return cur.fetchall()
        finally:
            con.close()

    def find_json_table(self, table_id):
        """Return the first table description whose ``id`` equals ``table_id``, else ``None``."""
        return next(
            (table for table in self.table_info if table['id'] == table_id),
            None,
        )

    def find_col_names(self, table_id):
        """Return the ``header`` list of the table with the given id.

        Raises:
            KeyError: If no loaded table description has that id.
        """
        table = self.find_json_table(table_id)
        if table is None:
            raise KeyError(f"No table with id {table_id!r} found in the loaded table descriptions")
        return table['header']

    def query_with_title(self, query):
        """Run ``query`` and return its rows with the column titles as first row.

        The table id is extracted from the ``FROM table_<id>`` part of the
        query; underscores in it are turned into dashes to match the ids used
        in the table descriptions.

        Returns:
            A list whose first element is a tuple of all column titles of the
            table, followed by the result rows.

        Raises:
            AttributeError: If no ``FROM table_<id>`` can be found in ``query``.
            KeyError: If the extracted id has no table description.
        """
        match = self._TABLE_ID_PATTERN.search(query)
        if match is None:
            raise AttributeError(f"TableId could not be extracted from {query}")
        extracted_table_id = match.group(1).replace("_", "-")

        res = self.connect_and_query(query)
        col_names = self.find_col_names(extracted_table_id)
        res.insert(0, tuple(col_names))

        return res


In [59]:
# Build the controller from the database and the table-description file
wikisql = wikiqsl_controller(WIKISQL_DB_PATH, WIKISQL_JSON_PATH)
# The first returned row holds the column titles, so they show up as row 0 of the frame
pd.DataFrame(wikisql.query_with_title("SELECT * FROM table_1_10007452_3;"))

Unnamed: 0,0,1,2,3,4,5
0,Order Year,Manufacturer,Model,Fleet Series (Quantity),Powertrain (Engine/Transmission),Fuel Propulsion
1,1992-93,gillig,phantom (high floor),444-464 (21),dd s50egr allison wb-400r,diesel
2,1996,gillig,phantom (high floor),465-467 (3),dd s50 allison wb-400r,diesel
3,1998,gillig,phantom (high floor),468-473 (6),dd s50 allison wb-400r,diesel
4,2000,gillig,advantage (low floor),474-481 (8),cummins isc allison wb-400r,diesel
5,2002,gillig,advantage (low floor),482-492 (11),cummins isl allison wb-400r,diesel
6,2010,nfi,ge40lfr,300-309 (10),ford triton v10 ise-thundervolt tb40-hg,hybrid
7,2011,nfi,c40lfr,310-329 (20),cummins westport isl-g allison wb-400r,cng


## Parsing Queries

In [94]:
def sql_query_creator(table_id, sel_index, agg_index, conditions):
    """Build the SQL string for one WikiSQL query description.

    Args:
        table_id: Table id written with dashes (e.g. "1-10007452-3"); dashes
            are replaced by underscores and "table_" is prepended.
        sel_index: Index of the selected column, rendered as ``col<index>``.
        agg_index: Index into ``agg_ops``; 0 means no aggregation.
        conditions: Sequence of ``(column_index, operator_index, value)``
            triples that are joined with AND. May be empty.

    Returns:
        The query string. The table name is always followed by a space, also
        when there is no WHERE clause.

    Raises:
        IndexError: If ``agg_index`` or an operator index is out of range.
    """
    agg_ops = ['', 'MAX', 'MIN', 'COUNT', 'SUM', 'AVG']
    cond_ops = ['=', '>', '<', 'OP']

    def quote_literal(value):
        # Single-quoted SQL string literal with embedded quotes doubled.
        # Double quotes are avoided on purpose: SQLite resolves a double-quoted
        # token as an identifier first, so a value that equals a column name
        # would be compared against that column instead of the text.
        return "'" + str(value).replace("'", "''") + "'"

    table_name = f"table_{table_id.replace('-', '_')}"

    selected = f"col{sel_index}"
    if agg_index != 0:
        selected = f"{agg_ops[agg_index]}({selected})"

    # Keep the trailing space: the table name must stay delimited when no
    # WHERE clause is appended.
    rep = f"SELECT {selected} FROM {table_name} "

    if conditions:
        # NOTE(review): values are inserted with their original case — confirm
        # whether they need to be lowercased to match the stored data.
        rep += 'WHERE ' + ' AND '.join(
            f"col{col} {cond_ops[op]} {quote_literal(val)}"
            for col, op, val in conditions
        )
    return rep

In [64]:
# Load every query record (one JSON object per line).
# The encoding is pinned so parsing does not depend on the platform default,
# and blank lines are skipped instead of crashing json.loads.
queries = []
with open(QUERIES_PATH, encoding="utf-8") as file_in:
    for line in file_in:
        if line.strip():
            queries.append(json.loads(line))

In [99]:
# Take one example record and rebuild its SQL string from the JSON fields
json_query = queries[4]
query = sql_query_creator(json_query['table_id'], json_query['sql']['sel'], json_query['sql']['agg'], json_query['sql']['conds'])
# Print the generated SQL followed by the title row and the result rows.
# NOTE(review): the recorded result is (0,) although the table shown earlier has a lowercase
# 'cng' row — presumably a case mismatch with the "CNG" condition value; confirm.
print(query, wikisql.query_with_title(query))

SELECT COUNT(col3) FROM table_1_10007452_3 WHERE col5 = "CNG" [('Order Year', 'Manufacturer', 'Model', 'Fleet Series (Quantity)', 'Powertrain (Engine/Transmission)', 'Fuel Propulsion'), (0,)]


## Annotator

In [None]:
class Annotator:
    def __init__(self, db_path, tables_path, queries_path):
        self.db_path = db_path