In [None]:
# KaggleDBQA Dataset Analysis

import sys
import os
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Add the current directory to the Python path
#sys.path.append(os.getcwd())

# Import our processor
from process_kaggle_dbqa import KaggleDBQAProcessor

# Initialize the processor.
# No base_path is passed, so the processor falls back to its own default
# dataset location.
print("Initializing processor...")
processor = KaggleDBQAProcessor()

# Get available datasets
print("\nGetting available datasets...")
available_datasets = processor.get_available_datasets()
print("Available datasets:", available_datasets)

# Load and explore a specific dataset
print("\nLoading WorldSoccerDataBase dataset...")
dataset_name = "WorldSoccerDataBase"
df = processor.load_dataset(dataset_name)

# Analyze query patterns: character length of each SQL query string.
df['query_length'] = df['query'].str.len()

# Compare different splits of the same dataset.
print("\nComparing dataset splits...")
main_df = processor.load_dataset(dataset_name)
test_df = processor.load_dataset(dataset_name, split="test")
fewshot_df = processor.load_dataset(dataset_name, split="fewshot")

# Mapping of dataset name -> DataFrame, iterated below to build the summary.
all_datasets = processor.load_all_datasets()

# Create summary: one record per dataset, turned into a DataFrame in one go.
summary_data = []

# The loop variable is deliberately NOT called `df`: reusing that name would
# rebind the WorldSoccerDataBase frame loaded above (and drop its
# `query_length` column) for every later cell that reads `df`.
for name, dataset_df in all_datasets.items():
    summary_data.append({
        'Dataset': name,
        'Number of Queries': len(dataset_df),
        'Average Query Length': dataset_df['query'].str.len().mean(),
        # dropna=False keeps the count identical to len(series.unique()).
        'Unique DB IDs': dataset_df['db_id'].nunique(dropna=False)
    })

summary_df = pd.DataFrame(summary_data)
print("\nDataset Summary:")
print(summary_df)

Initializing processor...

Starting dataset search...
Current working directory: c:\Users\david\bodo\labeling_agent\datasets\kaggle
Script directory: c:\Users\david\bodo\labeling_agent\datasets\kaggle

Searching in the following locations:
  c:\Users\david\bodo\labeling_agent\datasets\kaggle
    exists: True
    is_dir: True
    is_file: False
    contents: ['kaggle_dbqa_analysis.ipynb', 'kaggle_dbqa_analysis.py', 'kaggle_dbqa_combined.csv', 'process_kaggle_dbqa.py', 'query_length_distribution.png', '__pycache__']
  c:\Users\david\bodo\labeling_agent\datasets\kaggle\KaggleDBQA
    exists: False
  c:\Users\david\bodo\labeling_agent\datasets\KaggleDBQA
    exists: False
  c:\Users\david\bodo\labeling_agent\KaggleDBQA
    exists: False
  c:\Users\david\bodo\KaggleDBQA
    exists: True
    is_dir: True
    is_file: False
    contents: ['.git', 'DATASETS.md', 'examples', 'KaggleDBQA_tables.json', 'LICENSE.md', 'overview.png', 'README.md']
  C:\Users\david\KaggleDBQA
    exists: False
  c:\U

FileNotFoundError: Dataset file not found: c:\Users\david\bodo\KaggleDBQA\WorldSoccerDataBase.json

In [11]:
# Render the per-dataset summary table (one row per dataset) built in the first cell.
summary_df

Unnamed: 0,Dataset,Number of Queries,Average Query Length,Unique DB IDs
0,GeoNuclearData,32,89.03125,1
1,GreaterManchesterCrime,27,99.148148,1
2,Pesticide,50,76.44,1
3,StudentMathScore,28,141.285714,1
4,TheHistoryofBaseball,39,111.0,1
5,USWildFires,37,79.324324,1
6,WhatCDHipHop,41,92.804878,1
7,WorldSoccerDataBase,18,73.388889,1


In [6]:
# Display basic information for the main split of `dataset_name`.
# `main_df` is bound exactly once to this dataset's main split and is never
# reassigned, so the figures printed here always describe `dataset_name`
# regardless of the order in which cells were executed.
print(f"\nDataset: {dataset_name}")
print(f"Number of queries: {len(main_df)}")
print("\nColumns:", main_df.columns.tolist())
print("\nFirst few rows:")
print(main_df.head())


Dataset: WorldSoccerDataBase
Number of queries: 18

Columns: ['db_id', 'query', 'query_toks', 'question', 'question_toks', 'sql', 'source_file', 'split']

First few rows:
                 db_id                                              query  \
0  WorldSoccerDataBase  SELECT PSH FROM football_data WHERE HomeTeam L...   
1  WorldSoccerDataBase  SELECT * FROM football_data WHERE Country = "S...   
2  WorldSoccerDataBase  SELECT * FROM football_data WHERE Season = "20...   
3  WorldSoccerDataBase        SELECT * FROM football_data WHERE B365D > 3   
4  WorldSoccerDataBase  SELECT DRAW_CLOSING FROM betfront WHERE MATCH ...   

                                          query_toks  \
0  [SELECT, PSH, FROM, football_data, WHERE, Home...   
1  [SELECT, *, FROM, football_data, WHERE, Countr...   
2  [SELECT, *, FROM, football_data, WHERE, Season...   
3  [SELECT, *, FROM, football_data, WHERE, B365D,...   
4  [SELECT, DRAW_CLOSING, FROM, betfront, WHERE, ...   

                            

In [7]:
# Main split of `dataset_name`, loaded in the first cell with
# processor.load_dataset(dataset_name).
main_df


Unnamed: 0,db_id,query,query_toks,question,question_toks,sql,source_file,split
0,WorldSoccerDataBase,SELECT PSH FROM football_data WHERE HomeTeam L...,"[SELECT, PSH, FROM, football_data, WHERE, Home...",What are the Pinnacle odds for Arsenal winning...,"[What, are, the, Pinnacle, odds, for, Arsenal,...","{'from': {'table_units': [['table_unit', 1]], ...",WorldSoccerDataBase.json,main
1,WorldSoccerDataBase,"SELECT * FROM football_data WHERE Country = ""S...","[SELECT, *, FROM, football_data, WHERE, Countr...",Which matches in Spain did the away team win?,"[Which, matches, in, Spain, did, the, away, te...","{'from': {'table_units': [['table_unit', 1]], ...",WorldSoccerDataBase.json,main
2,WorldSoccerDataBase,"SELECT * FROM football_data WHERE Season = ""20...","[SELECT, *, FROM, football_data, WHERE, Season...",Which Premier League matches ended in a draw i...,"[Which, Premier, League, matches, ended, in, a...","{'from': {'table_units': [['table_unit', 1]], ...",WorldSoccerDataBase.json,main
3,WorldSoccerDataBase,SELECT * FROM football_data WHERE B365D > 3,"[SELECT, *, FROM, football_data, WHERE, B365D,...",Which matches had draw odds from Bet365 higher...,"[Which, matches, had, draw, odds, from, Bet365...","{'from': {'table_units': [['table_unit', 1]], ...",WorldSoccerDataBase.json,main
4,WorldSoccerDataBase,SELECT DRAW_CLOSING FROM betfront WHERE MATCH ...,"[SELECT, DRAW_CLOSING, FROM, betfront, WHERE, ...",What were the closing odds for a draw in match...,"[What, were, the, closing, odds, for, a, draw,...","{'from': {'table_units': [['table_unit', 0]], ...",WorldSoccerDataBase.json,main
5,WorldSoccerDataBase,SELECT * FROM football_data WHERE (FTHG + FTAG...,"[SELECT, *, FROM, football_data, WHERE, (FTHG,...",Which games had no goals scored at full time?,"[Which, games, had, no, goals, scored, at, ful...","{'from': {'table_units': [['table_unit', 1]], ...",WorldSoccerDataBase.json,main
6,WorldSoccerDataBase,SELECT AwayTeam FROM football_data WHERE HomeT...,"[SELECT, AwayTeam, FROM, football_data, WHERE,...",What is the away team against Omiya Ardija in ...,"[What, is, the, away, team, against, Omiya, Ar...","{'from': {'table_units': [['table_unit', 1]], ...",WorldSoccerDataBase.json,main
7,WorldSoccerDataBase,SELECT count(*) FROM football_data WHERE Seaso...,"[SELECT, count(*), FROM, football_data, WHERE,...",How many matches in Spain in 2010?,"[How, many, matches, in, Spain, in, 2010?]","{'from': {'table_units': [['table_unit', 1]], ...",WorldSoccerDataBase.json,main
8,WorldSoccerDataBase,SELECT MATCH FROM betfront ORDER BY DRAW_OPENI...,"[SELECT, MATCH, FROM, betfront, ORDER, BY, DRA...",Which matches has the highest draw opening so ...,"[Which, matches, has, the, highest, draw, open...","{'from': {'table_units': [['table_unit', 0]], ...",WorldSoccerDataBase.json,main
9,WorldSoccerDataBase,SELECT YEAR FROM betfront GROUP BY YEAR ORDER ...,"[SELECT, YEAR, FROM, betfront, GROUP, BY, YEAR...",Which year has most matches?,"[Which, year, has, most, matches?]","{'from': {'table_units': [['table_unit', 0]], ...",WorldSoccerDataBase.json,main


In [12]:
# Test split of `dataset_name`, loaded in the first cell with
# processor.load_dataset(dataset_name, split="test").
test_df

Unnamed: 0,db_id,query,query_toks,question,question_toks,sql,source_file,split
0,WorldSoccerDataBase,SELECT AwayTeam FROM football_data WHERE HomeT...,"[SELECT, AwayTeam, FROM, football_data, WHERE,...",What is the away team against Omiya Ardija in ...,"[What, is, the, away, team, against, Omiya, Ar...","{'from': {'table_units': [['table_unit', 1]], ...",WorldSoccerDataBase_test.json,test
1,WorldSoccerDataBase,SELECT count(*) FROM football_data WHERE Seaso...,"[SELECT, count(*), FROM, football_data, WHERE,...",How many matches in Spain in 2010?,"[How, many, matches, in, Spain, in, 2010?]","{'from': {'table_units': [['table_unit', 1]], ...",WorldSoccerDataBase_test.json,test
2,WorldSoccerDataBase,SELECT MATCH FROM betfront ORDER BY DRAW_OPENI...,"[SELECT, MATCH, FROM, betfront, ORDER, BY, DRA...",Which matches has the highest draw opening so ...,"[Which, matches, has, the, highest, draw, open...","{'from': {'table_units': [['table_unit', 0]], ...",WorldSoccerDataBase_test.json,test
3,WorldSoccerDataBase,SELECT YEAR FROM betfront GROUP BY YEAR ORDER ...,"[SELECT, YEAR, FROM, betfront, GROUP, BY, YEAR...",Which year has most matches?,"[Which, year, has, most, matches?]","{'from': {'table_units': [['table_unit', 0]], ...",WorldSoccerDataBase_test.json,test
4,WorldSoccerDataBase,SELECT count(*) FROM football_data WHERE PSH !...,"[SELECT, count(*), FROM, football_data, WHERE,...",How many matches did Pinnacle have betting rec...,"[How, many, matches, did, Pinnacle, have, bett...","{'from': {'table_units': [['table_unit', 1]], ...",WorldSoccerDataBase_test.json,test
5,WorldSoccerDataBase,SELECT count(*) FROM football_data WHERE B365H...,"[SELECT, count(*), FROM, football_data, WHERE,...",How many matches did Bet365 gives higher home ...,"[How, many, matches, did, Bet365, gives, highe...","{'from': {'table_units': [['table_unit', 1]], ...",WorldSoccerDataBase_test.json,test
6,WorldSoccerDataBase,SELECT count(*) FROM football_data WHERE FTHG ...,"[SELECT, count(*), FROM, football_data, WHERE,...",How many games that the total number of goals ...,"[How, many, games, that, the, total, number, o...","{'from': {'table_units': [['table_unit', 1]], ...",WorldSoccerDataBase_test.json,test
7,WorldSoccerDataBase,SELECT max(B365A) FROM football_data,"[SELECT, max(B365A), FROM, football_data]",What is the highest home losing odds in Bet365...,"[What, is, the, highest, home, losing, odds, i...","{'from': {'table_units': [['table_unit', 1]], ...",WorldSoccerDataBase_test.json,test
8,WorldSoccerDataBase,SELECT count(*) FROM football_data WHERE FTHG ...,"[SELECT, count(*), FROM, football_data, WHERE,...",How many number of games ended in a 0-0 tie?,"[How, many, number, of, games, ended, in, a, 0...","{'from': {'table_units': [['table_unit', 1]], ...",WorldSoccerDataBase_test.json,test
9,WorldSoccerDataBase,SELECT count(Div) FROM football_data,"[SELECT, count(Div), FROM, football_data]",How many league division does football_data da...,"[How, many, league, division, does, football_d...","{'from': {'table_units': [['table_unit', 1]], ...",WorldSoccerDataBase_test.json,test


In [13]:
# Few-shot split of `dataset_name`, loaded in the first cell with
# processor.load_dataset(dataset_name, split="fewshot").
fewshot_df

Unnamed: 0,db_id,query,query_toks,question,question_toks,sql,source_file,split
0,WorldSoccerDataBase,SELECT PSH FROM football_data WHERE HomeTeam L...,"[SELECT, PSH, FROM, football_data, WHERE, Home...",What are the Pinnacle odds for Arsenal winning...,"[What, are, the, Pinnacle, odds, for, Arsenal,...","{'from': {'table_units': [['table_unit', 1]], ...",WorldSoccerDataBase_fewshot.json,fewshot
1,WorldSoccerDataBase,"SELECT * FROM football_data WHERE Country = ""S...","[SELECT, *, FROM, football_data, WHERE, Countr...",Which matches in Spain did the away team win?,"[Which, matches, in, Spain, did, the, away, te...","{'from': {'table_units': [['table_unit', 1]], ...",WorldSoccerDataBase_fewshot.json,fewshot
2,WorldSoccerDataBase,"SELECT * FROM football_data WHERE Season = ""20...","[SELECT, *, FROM, football_data, WHERE, Season...",Which Premier League matches ended in a draw i...,"[Which, Premier, League, matches, ended, in, a...","{'from': {'table_units': [['table_unit', 1]], ...",WorldSoccerDataBase_fewshot.json,fewshot
3,WorldSoccerDataBase,SELECT * FROM football_data WHERE B365D > 3,"[SELECT, *, FROM, football_data, WHERE, B365D,...",Which matches had draw odds from Bet365 higher...,"[Which, matches, had, draw, odds, from, Bet365...","{'from': {'table_units': [['table_unit', 1]], ...",WorldSoccerDataBase_fewshot.json,fewshot
4,WorldSoccerDataBase,SELECT DRAW_CLOSING FROM betfront WHERE MATCH ...,"[SELECT, DRAW_CLOSING, FROM, betfront, WHERE, ...",What were the closing odds for a draw in match...,"[What, were, the, closing, odds, for, a, draw,...","{'from': {'table_units': [['table_unit', 0]], ...",WorldSoccerDataBase_fewshot.json,fewshot
5,WorldSoccerDataBase,SELECT * FROM football_data WHERE (FTHG + FTAG...,"[SELECT, *, FROM, football_data, WHERE, (FTHG,...",Which games had no goals scored at full time?,"[Which, games, had, no, goals, scored, at, ful...","{'from': {'table_units': [['table_unit', 1]], ...",WorldSoccerDataBase_fewshot.json,fewshot


In [2]:
# Build a single frame spanning all datasets. Judging by the outputs of the
# cells below, the result has 544 rows and carries `dataset_name` and
# `query_id` columns in addition to the per-dataset columns.
# NOTE(review): which splits are included is decided inside
# aggregate_all_datasets — confirm against process_kaggle_dbqa.py.
combined_df = processor.aggregate_all_datasets()

In [33]:
# Positional row index inspected by this cell and the next few cells.
# NOTE(review): 509 is a hand-picked position; it only points at the same
# example for as long as combined_df keeps its current row order.
i = 509
# SQL query text of the selected row.
combined_df["query"].iloc[i]


'SELECT * FROM football_data WHERE Country = "Spain" AND FTR = "A"'

In [34]:
# Full record (all columns) at position `i`, which is set in an earlier cell.
combined_df.iloc[i]

db_id                                          WorldSoccerDataBase
query            SELECT * FROM football_data WHERE Country = "S...
query_toks       [SELECT, *, FROM, football_data, WHERE, Countr...
question             Which matches in Spain did the away team win?
question_toks    [Which, matches, in, Spain, did, the, away, te...
sql              {'from': {'table_units': [['table_unit', 1]], ...
source_file                               WorldSoccerDataBase.json
split                                                         main
dataset_name                                   WorldSoccerDataBase
query_id                                                       510
Name: 509, dtype: object

In [35]:
# Question text paired with the query at position `i`.
combined_df["question"].iloc[i]

'Which matches in Spain did the away team win?'

In [36]:
# The `sql` column for every row; the recorded output shows dict values and
# pandas abbreviates the display of long Series.
combined_df["sql"]

0      {'from': {'table_units': [['table_unit', 0]], ...
1      {'from': {'table_units': [['table_unit', 0]], ...
2      {'from': {'table_units': [['table_unit', 0]], ...
3      {'from': {'table_units': [['table_unit', 0]], ...
4      {'from': {'table_units': [['table_unit', 0]], ...
                             ...                        
539    {'from': {'table_units': [['table_unit', 1]], ...
540    {'from': {'table_units': [['table_unit', 1]], ...
541    {'from': {'table_units': [['table_unit', 1]], ...
542    {'from': {'table_units': [['table_unit', 0]], ...
543    {'from': {'table_units': [['table_unit', 1]], ...
Name: sql, Length: 544, dtype: object