<a href="https://colab.research.google.com/github/sharik31/SQL-Generator/blob/main/preprocessing_spider_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##  Convert train_spider.json to CSV with db_id, input, and output

In [1]:
import json
import csv

# Flatten the Spider training examples into a CSV with one row per example:
# the database id, the prefixed question, and the target SQL query.
# The JSON is decoded as UTF-8 explicitly so the result does not depend on the
# platform's default locale encoding (the CSV below is already pinned to UTF-8).
with open('/content/train_spider.json', encoding='utf-8') as f:
    train_data = json.load(f)

# newline='' lets the csv module control line endings itself.
with open('spider_train_with_dbid.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=['db_id', 'input', 'output'])
    writer.writeheader()

    for item in train_data:
        # Each question is prefixed with the fixed instruction text.
        writer.writerow({
            'db_id': item['db_id'],
            'input': f"translate to SQL: {item['question']}",
            'output': item['query'],
        })

print("Saved spider_train_with_dbid.csv with columns: db_id, input, output")


Saved spider_train_with_dbid.csv with columns: db_id, input, output


In [4]:
import pandas as pd
# Read the CSV back into a DataFrame for inspection.
df = pd.read_csv('/content/spider_train_with_dbid.csv')

In [5]:
# Preview the first rows of the loaded CSV.
df.head()

Unnamed: 0,db_id,input,output
0,department_management,translate to SQL: How many heads of the depart...,SELECT count(*) FROM head WHERE age > 56
1,department_management,"translate to SQL: List the name, born state an...","SELECT name , born_state , age FROM head ORD..."
2,department_management,"translate to SQL: List the creation year, name...","SELECT creation , name , budget_in_billions ..."
3,department_management,translate to SQL: What are the maximum and min...,"SELECT max(budget_in_billions) , min(budget_i..."
4,department_management,translate to SQL: What is the average number o...,SELECT avg(num_employees) FROM department WHER...


In [7]:
# NOTE(review): the recorded output of this cell shows schema text already
# embedded in `input`, which no visible cell produces, and execution count 6
# is absent — presumably left over from a removed cell. Confirm with
# Restart Kernel -> Run All.
df.head()

Unnamed: 0,db_id,input,output
0,department_management,translate to SQL: Table department (Department...,SELECT count(*) FROM head WHERE age > 56
1,department_management,translate to SQL: Table department (Department...,"SELECT name , born_state , age FROM head ORD..."
2,department_management,translate to SQL: Table department (Department...,"SELECT creation , name , budget_in_billions ..."
3,department_management,translate to SQL: Table department (Department...,"SELECT max(budget_in_billions) , min(budget_i..."
4,department_management,translate to SQL: Table department (Department...,SELECT avg(num_employees) FROM department WHER...


## Add Corresponding Table Schema from tables.json as Separate Column

In [8]:
import json
import pandas as pd

# Load the question/SQL rows from the CSV.
df = pd.read_csv('spider_train_with_dbid.csv')

# Decode the schema file as UTF-8 explicitly rather than relying on the
# platform's default locale encoding.
with open('tables.json', encoding='utf-8') as f:
    schemas = json.load(f)

# Index the schema entries by database id so each row can look up its schema.
schema_map = {schema['db_id']: schema for schema in schemas}

def serialize_schema(schema):
    """Render one database schema as a single line of text.

    Args:
        schema: Mapping describing a database. Table names are read from
            ``table_names_original`` (falling back to ``table_names``) and
            columns from ``column_names_original`` (falling back to
            ``column_names``). Each column entry is a two-item
            ``(table_index, column_name)`` pair.

    Returns:
        A string of the form ``"Table t1 (c1, c2) Table t2 (c3)"`` with
        tables in their listed order, separated by single spaces. Returns
        an empty string when the schema lists no tables.
    """
    table_names = schema.get('table_names_original', schema.get('table_names', []))
    columns = schema.get('column_names_original', schema.get('column_names', []))

    # One bucket of column names per table, addressed by table position.
    columns_by_table = [[] for _ in table_names]
    for table_idx, col_name in columns:
        # Keep only columns that point at a listed table. This drops the
        # entry with index -1 and, unlike a bare upper-bound check, any other
        # out-of-range index as well instead of failing on the lookup.
        if 0 <= table_idx < len(table_names):
            columns_by_table[table_idx].append(col_name)

    return " ".join(
        f"Table {table_name} ({', '.join(cols)})"
        for table_name, cols in zip(table_names, columns_by_table)
    )

def get_schema_text(row):
    """Return the serialized schema for the database named in ``row``.

    Reads ``row['db_id']`` and returns ``serialize_schema`` applied to the
    matching ``schema_map`` entry, or an empty string when the id has no entry.
    """
    key = row['db_id']
    if key not in schema_map:
        return ""
    return serialize_schema(schema_map[key])

# Add one serialized-schema string per row, looked up from the row's db_id.
df['schema'] = df.apply(get_schema_text, axis='columns')

# Write the augmented table to disk without the DataFrame index column.
df.to_csv('spider_train_with_separate_schema_column.csv', index=False)

print("CSV with separate 'schema' column saved as 'spider_train_with_separate_schema_column.csv'")


CSV with separate 'schema' column saved as 'spider_train_with_separate_schema_column.csv'


In [9]:
# Preview the first rows, which now include the added `schema` column.
df.head()

Unnamed: 0,db_id,input,output,schema
0,department_management,translate to SQL: How many heads of the depart...,SELECT count(*) FROM head WHERE age > 56,"Table department (Department_ID, Name, Creatio..."
1,department_management,"translate to SQL: List the name, born state an...","SELECT name , born_state , age FROM head ORD...","Table department (Department_ID, Name, Creatio..."
2,department_management,"translate to SQL: List the creation year, name...","SELECT creation , name , budget_in_billions ...","Table department (Department_ID, Name, Creatio..."
3,department_management,translate to SQL: What are the maximum and min...,"SELECT max(budget_in_billions) , min(budget_i...","Table department (Department_ID, Name, Creatio..."
4,department_management,translate to SQL: What is the average number o...,SELECT avg(num_employees) FROM department WHER...,"Table department (Department_ID, Name, Creatio..."


In [10]:
# Spot-check 10 randomly chosen rows. No random_state is passed, so the
# selection differs from run to run.
df.sample(n=10)

Unnamed: 0,db_id,input,output,schema
3860,insurance_policies,"translate to SQL: Among all the claims, which ...","SELECT Date_Claim_Made , Date_Claim_Settled F...","Table Customers (Customer_ID, Customer_Details..."
6336,e_government,translate to SQL: Find the name of organizatio...,SELECT organization_name FROM organizations WH...,"Table Addresses (address_id, line_1_number_bui..."
5852,tracking_share_transactions,translate to SQL: Show the average amount of t...,SELECT avg(amount_of_transaction) FROM TRANSAC...,"Table Investors (investor_id, Investor_details..."
6420,cre_Docs_and_Epenses,translate to SQL: Return the code of the docum...,SELECT document_type_code FROM Documents GROUP...,"Table Ref_Document_Types (Document_Type_Code, ..."
5765,dorm_1,translate to SQL: What is the first name and a...,"SELECT T1.fname , T1.age FROM student AS T1 J...","Table Student (StuID, LName, Fname, Age, Sex, ..."
1123,climbing,translate to SQL: Return the countries of the ...,SELECT Country FROM mountain WHERE Height > ...,"Table mountain (Mountain_ID, Name, Height, Pro..."
2030,gas_company,translate to SQL: What are the locations that ...,SELECT T3.location FROM station_company AS T1 ...,"Table company (Company_ID, Rank, Company, Head..."
314,product_catalog,translate to SQL: What are the name and public...,"SELECT t1.catalog_name , t1.date_of_publicati...","Table Attribute_Definitions (attribute_id, att..."
4049,student_1,translate to SQL: What are the first names of ...,SELECT DISTINCT T2.firstname FROM list AS T1 J...,"Table list (LastName, FirstName, Grade, Classr..."
6334,e_government,translate to SQL: Find the payment method code...,SELECT payment_method_code FROM parties GROUP ...,"Table Addresses (address_id, line_1_number_bui..."
