<a href="https://colab.research.google.com/github/brijeshrn/AI/blob/main/Text2SQLGraph_v1_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Result: 97% when run with the `table_schema1` schema file.

In [None]:
from google.colab import files
# Upload the schema YAML through Colab's file picker; the name the file is
# saved under is echoed in the cell output.
uploaded = files.upload()

Saving table_schema1.yaml to table_schema1 (2).yaml


In [None]:
import yaml

# Colab renames repeated uploads (e.g. "table_schema1 (2).yaml"), so a
# hard-coded name only works in the session that produced it. Use the YAML
# name reported by the upload cell, falling back to the previously hard-coded
# name when no YAML file is present in `uploaded`.
schema_path = next(
    (name for name in uploaded if name.lower().endswith((".yaml", ".yml"))),
    "table_schema1 (2).yaml",
)
with open(schema_path, "r", encoding="utf-8") as f:
    meta_dict = yaml.safe_load(f)

In [None]:
import yaml
from pprint import pprint  # For nice nested structure

# Pretty-print the parsed schema metadata to inspect its nested structure.
pprint(meta_dict)

{'tables': {'all_star': {'aliases': ['asg',
                                     'allstar',
                                     'all_star_game',
                                     'allstars'],
                         'columns': ['player_id',
                                     'year',
                                     'game_num',
                                     'team_id',
                                     'league_id',
                                     'starting_pos'],
                         'description': 'Details the records of baseball '
                                        'players selected as All-Stars in '
                                        'various years and their participation '
                                        'in All-Star games.',
                         'example_row': {'game_num': 1,
                                         'league_id': 'AL',
                                         'player_id': 'baberu01',
                                

In [None]:
def extract_table_embedding_info(meta_dict):
    """
    Build one embedding record per table from a schema metadata dictionary.

    Args:
        meta_dict: Mapping with a 'tables' key whose value maps each table
            name to its metadata (optional 'description', 'synonyms',
            'aliases' and 'tags' entries).

    Returns:
        A list of dicts, one per table in schema order, with the keys
        'table', 'description', 'synonyms', 'aliases', 'tags' and
        'embedding_text' (all of the former joined into a single string).

    Raises:
        KeyError: If meta_dict has no 'tables' key.
    """
    def _as_list(value):
        # A YAML key left empty loads as None and a lone scalar loads as a
        # str; normalise both so ', '.join() neither raises nor splits a
        # string into characters.
        if value is None:
            return []
        if isinstance(value, str):
            return [value]
        return [str(item) for item in value]

    table_embedding_items = []
    for table, meta in (meta_dict['tables'] or {}).items():
        meta = meta or {}  # a table declared without any metadata
        desc = meta.get('description') or ''
        synonyms = _as_list(meta.get('synonyms'))
        aliases = _as_list(meta.get('aliases'))
        tags = _as_list(meta.get('tags'))
        # Create a single string for embedding
        embedding_text = (
            f"Table: {table}. "
            f"Description: {desc}. "
            f"Synonyms: {', '.join(synonyms)}. "
            f"Aliases: {', '.join(aliases)}. "
            f"Tags: {', '.join(tags)}."
        )
        table_embedding_items.append({
            'table': table,
            'description': desc,
            'synonyms': synonyms,
            'aliases': aliases,
            'tags': tags,
            'embedding_text': embedding_text
        })
    return table_embedding_items

# Build the per-table embedding records and preview the text that will be embedded.
table_info_list = extract_table_embedding_info(meta_dict)
for t in table_info_list:
    print(t['embedding_text'])
print("Number of tables:", len(table_info_list))  # one record per table in the schema


Table: all_star. Description: Details the records of baseball players selected as All-Stars in various years and their participation in All-Star games.. Synonyms: All-Star records, All-Star appearances, All-Star game participants. Aliases: asg, allstar, all_star_game, allstars. Tags: baseball, All-Star game, sports, player participation.
Table: appearances. Description: Player participation in various positions for each game and season. Contains stats like games played in each fielding position, for every player-year-team.. Synonyms: player appearances, fielding appearances, games played. Aliases: apps, games_played, appear. Tags: baseball, fielding, player participation.
Table: batting. Description: Historical batting statistics for baseball players, recording their performance in various seasons.. Synonyms: hitting statistics, batting records. Aliases: bat, batstats, batting_stats. Tags: baseball, sports, statistics.
Table: batting_postseason. Description: Postseason batting statisti

In [None]:
import networkx as nx

def build_schema_graph(meta_dict):
    """
    Build a directed foreign-key graph of the schema.

    Every table under meta_dict['tables'] becomes a node and every foreign
    key becomes an edge FROM the referencing table TO the referenced table.

    Args:
        meta_dict: Mapping with a 'tables' key; each table's metadata may hold
            a 'foreign_keys' list of dicts with 'column', 'ref_table' and
            'ref_column'.

    Returns:
        nx.DiGraph whose edges carry 'column' and 'ref_column' (those of the
        last foreign key declared for that table pair) plus 'joins', the list
        of every (column, ref_column) pair declared between the two tables.
    """
    G = nx.DiGraph()
    for table, meta in meta_dict['tables'].items():
        G.add_node(table)
        # 'foreign_keys' may be missing, or present but empty (None) in the YAML.
        for fk in (meta or {}).get('foreign_keys') or []:
            ref_table = fk['ref_table']
            pair = (fk['column'], fk['ref_column'])
            # A DiGraph stores a single edge per (table, ref_table); adding it
            # again overwrites its attributes. Accumulate every join pair in
            # 'joins' so a second foreign key to the same table is not lost.
            joins = list(G[table][ref_table].get('joins', [])) if G.has_edge(table, ref_table) else []
            if pair not in joins:
                joins.append(pair)
            G.add_edge(table, ref_table, column=fk['column'], ref_column=fk['ref_column'], joins=joins)
    return G

# Build the foreign-key graph and print its nodes and edges for inspection.
G = build_schema_graph(meta_dict)
# Node/edge views as returned by networkx.
print("Tables in graph:", G.nodes)
print("Edges (FKs):", G.edges(data=True))
# Sizes of the graph.
print("Number of tables (nodes):", G.number_of_nodes())
print("Number of foreign key edges:", G.number_of_edges())
# The same nodes and edges again, materialised as plain lists.
print("List of tables (nodes):", list(G.nodes))
print("List of foreign key edges:", list(G.edges(data=True)))

Tables in graph: ['all_star', 'player', 'team', 'appearances', 'batting', 'batting_postseason', 'college', 'fielding', 'fielding_outfield', 'fielding_postseason', 'hall_of_fame', 'home_game', 'park', 'league', 'manager', 'manager_award', 'manager_award_vote', 'manager_half', 'pitching', 'pitching_postseason', 'player_award', 'player_award_vote', 'player_college', 'postseason', 'salary', 'team_franchise', 'team_half']
Edges (FKs): [('all_star', 'player', {'column': 'player_id', 'ref_column': 'player_id'}), ('all_star', 'team', {'column': 'league_id', 'ref_column': 'league_id'}), ('team', 'league', {'column': 'league_id', 'ref_column': 'league_id'}), ('appearances', 'player', {'column': 'player_id', 'ref_column': 'player_id'}), ('appearances', 'team', {'column': 'league_id', 'ref_column': 'league_id'}), ('batting', 'player', {'column': 'player_id', 'ref_column': 'player_id'}), ('batting', 'team', {'column': 'league_id', 'ref_column': 'league_id'}), ('batting_postseason', 'player', {'colu

In [None]:
# ---------- Configuration ----------
from openai import OpenAI
from google.colab import userdata
# Fetch the key through Colab's `userdata.get` (entry 'OPENAI_KEY') instead of
# hard-coding it in the notebook.
OPENAI_API_KEY = userdata.get('OPENAI_KEY')
# ---------- Client Setup ----------
# Shared client object passed to the embedding and chat-completion helpers below.
client = OpenAI(api_key=OPENAI_API_KEY)

In [None]:
import openai

# If using OpenAI v1.x (latest as of 2024)
def get_openai_embeddings(texts, client, model="text-embedding-3-large", batch_size=20, show_progress=True):
    """
    Embed a sequence of texts in batches through `client.embeddings.create`.

    Args:
        texts: Sequence of strings to embed.
        client: Object exposing `embeddings.create(input=..., model=...)`
            whose response has a `.data` list of items with `.embedding`.
        model: Embedding model name forwarded to the API.
        batch_size: Number of texts sent per request; must be >= 1.
        show_progress: Print one status line per completed batch.

    Returns:
        np.ndarray with exactly one row per input text, in input order.

    Raises:
        ValueError: If batch_size is smaller than 1.
        RuntimeError: If a batch request fails or returns a different number
            of vectors than texts sent. Skipping the batch instead would shift
            every later row, so row i would no longer belong to texts[i].
    """
    import numpy as np

    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    embeddings = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        span = f"{i+1}-{i+len(batch)}"
        try:
            response = client.embeddings.create(
                input=batch,
                model=model
            )
        except Exception as e:
            print(f"❌ Batch {span} failed: {e}")
            raise RuntimeError(f"Embedding batch {span} failed") from e
        batch_embeddings = [d.embedding for d in response.data]
        if len(batch_embeddings) != len(batch):
            raise RuntimeError(
                f"Embedding batch {span} returned {len(batch_embeddings)} vectors for {len(batch)} texts"
            )
        embeddings.extend(batch_embeddings)
        if show_progress:
            print(f"✅ Embedded [{span}] of {len(texts)}")
    return np.array(embeddings)


In [None]:
# Collect the text to embed for each table, in the same order as table_info_list.
embedding_texts = [t['embedding_text'] for t in table_info_list]
print("Number of embedding texts:", len(embedding_texts))
for i, text in enumerate(embedding_texts):
    print(f"{i+1}: {text[:80]}...")  # Preview first 80 chars of each

Number of embedding texts: 27
1: Table: all_star. Description: Details the records of baseball players selected a...
2: Table: appearances. Description: Player participation in various positions for e...
3: Table: batting. Description: Historical batting statistics for baseball players,...
4: Table: batting_postseason. Description: Postseason batting statistics for baseba...
5: Table: college. Description: Identifies colleges and links to players via player...
6: Table: fielding. Description: Fielding statistics for baseball players, covering...
7: Table: fielding_outfield. Description: Fielding appearances of outfield players,...
8: Table: fielding_postseason. Description: Postseason fielding statistics for base...
9: Table: hall_of_fame. Description: Records entries related to baseball players' n...
10: Table: home_game. Description: Information about home games played by baseball t...
11: Table: league. Description: Describes the professional baseball leagues, such as...
12: Table: 

In [None]:
# --- Embed every table description ---
# Row i of `embeddings` belongs to embedding_texts[i]; the retrieval step
# later maps row indices back to table names.
embeddings = get_openai_embeddings(
    embedding_texts,         # list of per-table texts built above
    client,                  # OpenAI client object created earlier
    model="text-embedding-3-large",
    batch_size=20,
    show_progress=True
)
print(embeddings.shape)


✅ Embedded [1-20] of 27
✅ Embedded [21-27] of 27
(27, 3072)


In [None]:
from google.colab import files
# Upload the CSV of natural-language questions and gold SQL; note that this
# rebinds `uploaded` from the earlier schema upload.
uploaded = files.upload()

Saving nl_gold_sql_queries.csv to nl_gold_sql_queries.csv


In [None]:
import pandas as pd

# Load the question / gold-SQL pairs; the filename must match the name shown
# in the upload cell's output.
df = pd.read_csv('nl_gold_sql_queries.csv')

# Show the first 5 rows as the cell result
df.head()

Unnamed: 0,Index,NLQ,GOLD_SQL
0,1,what is the full name and id of the college wi...,"SELECT T1.name_full , T1.college_id FROM coll..."
1,2,Find the full name and id of the college that ...,"SELECT T1.name_full , T1.college_id FROM coll..."
2,3,What is average salary of the players in the t...,SELECT avg(T1.salary) FROM salary AS T1 JOIN t...
3,4,Compute the average salary of the players in t...,SELECT avg(T1.salary) FROM salary AS T1 JOIN t...
4,5,What are first and last names of players parti...,"SELECT name_first , name_last FROM player AS ..."


In [None]:
# --- Semantic table retrieval ---
def retrieve_tables_from_nlq(nlq, embeddings, table_names, embed_fn, client, top_n=5):
    """
    Rank tables by cosine similarity between a question and table embeddings.

    Args:
        nlq: Natural-language question to embed.
        embeddings: 2-D array with one row per entry of `table_names`.
        table_names: Table names aligned with the rows of `embeddings`.
        embed_fn: Callable invoked as `embed_fn([nlq], client=client)` that
            returns one vector per text.
        client: Passed through to `embed_fn` unchanged.
        top_n: Maximum number of tables to return; values <= 0 yield [].

    Returns:
        List of (table_name, similarity) tuples, most similar first.
    """
    # numpy is only imported at module level in a later cell, so import it here.
    import numpy as np

    # `[-0:]` would select the whole array, so handle "no tables wanted" up front
    # (this also avoids a pointless embedding request).
    if top_n <= 0:
        return []

    embeddings = np.asarray(embeddings, dtype=float)
    nlq_emb = np.asarray(embed_fn([nlq], client=client)[0], dtype=float)
    dots = np.dot(embeddings, nlq_emb)
    norms = np.linalg.norm(embeddings, axis=1) * np.linalg.norm(nlq_emb)
    # A zero-norm vector would give NaN, which argsort places last (i.e. it
    # would be ranked as the best match); score such rows 0 instead.
    sims = np.divide(dots, norms, out=np.zeros_like(dots), where=norms > 0)
    top_indices = np.argsort(sims)[::-1][:top_n]
    result = [(table_names[i], sims[i]) for i in top_indices]
    return result

def expand_tables_graph(G, tables, hops=1):
    """
    Grow a set of tables with every table reachable within `hops` edges.

    Edge direction is ignored: the search runs on an undirected copy of G.

    Args:
        G: Graph whose nodes are table names.
        tables: Seed table names; each must be a node of G.
        hops: Maximum path length (number of edges) to follow from each seed.

    Returns:
        Set containing the seed tables plus all tables within `hops` of one.
    """
    undirected = G.to_undirected()
    reachable = set(tables)
    for seed in tables:
        within_hops = nx.single_source_shortest_path_length(undirected, seed, cutoff=hops)
        reachable |= set(within_hops)
    return reachable

In [None]:
import pandas as pd
import numpy as np

# Objects this cell expects from earlier cells:
# - df: DataFrame with at least the columns 'NLQ' and 'GOLD_SQL'
# - meta_dict: schema dict with a 'tables' key
# - G: networkx DiGraph of foreign-key links between tables
# - retrieve_tables_from_nlq: top-N table retrieval by embedding similarity
# - expand_tables_graph: graph-based expansion of a table set

# Table names in schema order; the embedding rows were built by iterating the
# same meta_dict['tables'] mapping, so index i here matches embeddings[i].
schema_tables = list(meta_dict['tables'].keys())

def get_gold_tables_from_sql(sql, schema_tables):
    """
    Return the schema tables that a SQL query reads from.

    Only identifiers that directly follow FROM or JOIN are considered, so a
    table name that merely occurs inside a column name ('player' in
    'player_id') or inside another table name ('player' in 'player_college')
    is not reported. In a comma-separated FROM list only the first table is
    picked up.

    Args:
        sql: SQL text; any non-string value (e.g. NaN from an empty CSV cell)
            yields an empty list.
        schema_tables: Iterable of valid table names.

    Returns:
        Sorted list of names from `schema_tables` (matched case-insensitively)
        that appear after FROM/JOIN in `sql`.
    """
    # `re` is only imported at module level in a later cell, so import it here.
    import re

    if not isinstance(sql, str):
        return []
    by_lower = {t.lower(): t for t in schema_tables}
    referenced = re.findall(r'\b(?:FROM|JOIN)\s+([A-Za-z0-9_]+)', sql, flags=re.IGNORECASE)
    return sorted({by_lower[name.lower()] for name in referenced if name.lower() in by_lower})

# Result columns, filled row by row in the loop below.
df['INITIAL_TABLES'] = ''
df['EXPANDED_TABLES'] = ''
df['GOLD_TABLES'] = ''

# One embedding request is issued per question (see the progress lines in the output).
for idx, row in df.iterrows():
    nlq = row['NLQ']
    gold_sql = row['GOLD_SQL']

    # Retrieve the 5 tables most similar to the question
    top_tables = retrieve_tables_from_nlq(
        nlq,
        embeddings,
        schema_tables,
        embed_fn=get_openai_embeddings,
        client=client,
        top_n=5
    )
    # Keep only the names; drop the similarity scores.
    initial_tables = [t[0] for t in top_tables]
    # Add tables one foreign-key hop away from any retrieved table.
    expanded_tables = sorted(expand_tables_graph(G, initial_tables, hops=1))

    gold_tables = get_gold_tables_from_sql(gold_sql, schema_tables)

    # Store each table set as a comma-separated string.
    df.at[idx, 'INITIAL_TABLES'] = ", ".join(initial_tables)
    df.at[idx, 'EXPANDED_TABLES'] = ", ".join(expanded_tables)
    df.at[idx, 'GOLD_TABLES'] = ", ".join(gold_tables)

# Display only the columns of interest, without truncating long cells
pd.set_option('display.width', 200)
pd.set_option('display.max_colwidth', None)
print(df[['GOLD_TABLES', 'INITIAL_TABLES', 'EXPANDED_TABLES']].to_string(index=False))


✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded

In [None]:
import re
import pandas as pd
import ast

def build_relationships_block(meta_dict):
    """
    Describe every foreign key in the schema as one line of plain text.

    Args:
        meta_dict: Schema dict with a 'tables' key; each table's metadata may
            hold a 'foreign_keys' list of dicts with 'column', 'ref_table'
            and 'ref_column'.

    Returns:
        Newline-joined string with one "'<table>' links to '<ref_table>' via
        ..." line per foreign key, or "" when there are none.
    """
    template = "'{src}' links to '{dst}' via {src}.{col} = {dst}.{ref_col}"
    return "\n".join(
        template.format(
            src=source_table,
            dst=link['ref_table'],
            col=link['column'],
            ref_col=link['ref_column'],
        )
        for source_table, table_meta in meta_dict['tables'].items()
        for link in table_meta.get('foreign_keys', [])
    )

def build_few_shot_examples():
    """
    Return the fixed few-shot block inserted into the table-selection prompt.

    The block holds six worked examples (question, expected table list and a
    short explanation), labelled EXACT, OVER-SELECTION or UNDER-SELECTION.
    """
    return """
Example 1 (EXACT):
User Question: "How many parks are there in the state of NY?"
Tables: ['park']
Explanation: Only the 'park' table is needed because state information is stored there.

Example 2 (OVER-SELECTION):
User Question: "How many players born in USA are right-handed batters?"
Tables: ['player']
Explanation: Although both 'player' and 'batting' mention batting, the 'bats' attribute is in 'player'. Do NOT include 'batting' table.

Example 3 (OVER-SELECTION):
User Question: "Which cities have 2 to 4 parks?"
Tables: ['park']
Explanation: City and park info are both in 'park'. Do NOT select extra tables.

Example 4 (EXACT):
User Question: "Which team offers the lowest average salary?"
Tables: ['salary', 'team']
Explanation: Need 'salary' for pay info and 'team' for name/id. Only these two tables are required.

Example 5 (EXACT):
User Question: "List the names of states that have more than 2 parks."
Tables: ['park']
Explanation: The count and state data both come from the 'park' table.

Example 6 (UNDER-SELECTION):
User Question: "How many players did Boston Red Stockings have in 2000?"
Tables: ['salary', 'team']
Explanation: 'salary' has player and team IDs, but the team name 'Boston Red Stockings' is in 'team', so both are required.

"""


def build_llm_prompt(nlq, prompt_tables_block, relationships_block, schema_tables, few_shot_examples=""):
    """
    Assemble the table-selection prompt sent to the LLM.

    Args:
        nlq: The user's natural-language question.
        prompt_tables_block: Pre-rendered metadata for the candidate tables.
        relationships_block: Pre-rendered foreign-key descriptions.
        schema_tables: All table names of the schema (listed comma-separated).
        few_shot_examples: Optional worked examples placed before the metadata.

    Returns:
        The prompt text: instructions, schema table list, examples, table
        metadata, relationships and the question, ending with "Tables:".
    """
    schema_listing = ', '.join(schema_tables)
    # An empty first and last element reproduce the leading and trailing newline.
    prompt_lines = [
        "",
        "You are a data agent helping to translate user questions into the *smallest* set of required tables from a SQL database.",
        "",
        "Instructions:",
        "- From the tables and their relationships below, select ONLY the minimal set of tables needed to answer the question.",
        "- DO NOT include extra tables unless they are required for joins.",
        "- List only table names present in the schema.",
        "- Output only a valid Python list of table names, nothing else.",
        "- After the list, explain your choice (1-2 sentences max).",
        "",
        f"Schema Tables: {schema_listing}",
        "",
        f"{few_shot_examples}",
        "",
        "Schema Tables and Metadata:",
        f"{prompt_tables_block}",
        "",
        "Relationships (joins, foreign keys, etc.):",
        f"{relationships_block}",
        "",
        f'User Question: "{nlq}"',
        "",
        "Your Answer:",
        "Tables:",
        "",
    ]
    return "\n".join(prompt_lines)

def extract_tables_from_llm_output(text, schema_tables):
    """
    Pull the selected table names out of an LLM answer.

    Parsing is attempted in three passes, stopping at the first that applies:
      1. the first "[...]" span, parsed as a Python list literal of strings;
      2. the text following a "Table:" / "Tables:" label, split on commas;
      3. any word anywhere in the text that is a schema table name.

    Args:
        text: Raw LLM answer; a non-string or empty value yields "".
        schema_tables: Valid table names; matching is case-insensitive and
            results use the spelling from this list.

    Returns:
        Comma-separated table names ("" if none were found). Pass 1 keeps the
        LLM's order with duplicates removed; passes 2 and 3 return the names
        sorted.
    """
    if not isinstance(text, str) or not text:
        return ""

    table_map = {t.lower(): t for t in schema_tables}

    # Pass 1: strict parse of the first bracketed list.
    match = re.search(r"\[.*?\]", text, flags=re.DOTALL)
    if match:
        try:
            parsed = ast.literal_eval(match.group(0))
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            parsed = None  # not a valid literal; fall through to the regex passes
        # Only trust the literal when it is a list of strings; anything else
        # (e.g. "[1]") is handled by the fallback passes.
        if isinstance(parsed, list) and all(isinstance(item, str) for item in parsed):
            keys = [item.strip().lower() for item in parsed]
            # dict.fromkeys removes duplicates while keeping first-seen order.
            unique_keys = dict.fromkeys(k for k in keys if k in table_map)
            return ", ".join(table_map[k] for k in unique_keys)

    # Pass 2: names listed after a "Tables:" label on the same line.
    found_tables = set()
    m = re.search(r'Tables?:\s*\[?([^\]\n]+)\]?', text, flags=re.IGNORECASE)
    if m:
        raw = m.group(1)
        for token in re.split(r"[,\n]", raw):
            t = token.strip().strip("'\"").lower()
            if t in table_map:
                found_tables.add(t)

    # Pass 3: last resort, scan every word of the answer.
    if not found_tables:
        for w in re.findall(r"\b\w+\b", text):
            if w.lower() in table_map:
                found_tables.add(w.lower())

    return ", ".join([table_map[t] for t in sorted(found_tables)]) if found_tables else ""

def build_table_block(table, meta):
    """
    Render one table's metadata as a newline-terminated text block.

    Args:
        table: Table name.
        meta: Metadata dict; 'description', 'synonyms', 'aliases' and 'tags'
            are always rendered (empty when missing), 'instructions' only when
            the key is present, 'foreign_keys' only when non-empty.

    Returns:
        Multi-line string with one "Label: value" line per rendered field.
    """
    rendered = [
        f"Table: {table}",
        f"Description: {meta.get('description', '')}",
        "Synonyms: " + ', '.join(meta.get('synonyms', [])),
        "Aliases: " + ', '.join(meta.get('aliases', [])),
        "Tags: " + ', '.join(meta.get('tags', [])),
    ]
    if 'instructions' in meta:
        rendered.append(f"Instructions: {meta['instructions']}")
    foreign_keys = meta.get('foreign_keys')
    if foreign_keys:
        links = ', '.join(
            f"{fk['column']} → {fk['ref_table']}.{fk['ref_column']}" for fk in foreign_keys
        )
        rendered.append("Foreign Keys: " + links)
    # Every field line, including the last one, ends with a newline.
    return "".join(line + "\n" for line in rendered)

def call_llm_table_selection(prompt, client=None, model="gpt-4o", max_tokens=512):
    """
    Send the prompt to an LLM and return the response text.

    Args:
        prompt: User message content.
        client: Object exposing `chat.completions.create(...)`; when None a
            default `openai.OpenAI()` client is constructed.
        model: Chat model name.
        max_tokens: Upper bound on generated tokens.

    Returns:
        The first choice's message content with surrounding whitespace
        removed, or "" when the response has no choices or no text content.
    """
    if client is None:
        import openai
        client = openai.OpenAI()  # no client supplied: construct a default one
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are an expert in SQL, databases, and schema design. Given a set of tables and a user question, select the minimal relevant tables required to answer the question, and explain your rationale. Output tables as a list."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=max_tokens,
        temperature=0.0
    )
    choices = getattr(response, "choices", None) or []
    if not choices:
        return ""
    # The message content is optional in the API response; calling .strip()
    # on None would raise AttributeError.
    content = choices[0].message.content
    return (content or "").strip()

# === MAIN PIPELINE ===

# Result columns filled by the table-selection loop.
df['LLM_TABLES'] = ''
df['LLM_REASON'] = ''

# Prompt ingredients that are identical for every question, built once here.
schema_tables = list(meta_dict['tables'].keys())
relationships_block = build_relationships_block(meta_dict)
few_shot_examples = build_few_shot_examples()



In [None]:
# === Anti-pattern & schema selection rules ===
# Hand-written rules placed ahead of the per-table metadata in every prompt.
schema_selection_instructions = """
Schema Selection Instructions:
- Do not include generic tables like `player`, `manager`, `appearances`, or `batting` unless their columns are explicitly required.
- Use `salary` to join player-team-year if the query asks about team rosters, counts, or salaries.
- Use `player` only if demographic details are requested (e.g., name, height, country).
- Avoid including `manager` unless querying manager metadata (e.g., birth info).
- Avoid including `batting` unless querying season stats (hits, at-bats, home runs).
- Use `home_game` over `appearances` for attendance or game participation queries.
- Use `team` and `team_franchise` only if franchise-related or name-based logic is involved.
"""

# === Main loop for table selection ===
# Each question costs one embedding request and one chat-completion request.
for idx, row in df.iterrows():
    nlq = row['NLQ']

    # Step 1: Retrieve top semantically relevant tables and expand with graph
    top_tables = retrieve_tables_from_nlq(
        nlq,
        embeddings,
        schema_tables,
        embed_fn=get_openai_embeddings,
        client=client,
        top_n=5
    )
    initial_tables = [t[0] for t in top_tables]
    expanded_tables = sorted(expand_tables_graph(G, initial_tables, hops=1))
    # Union of retrieved and graph-expanded tables, de-duplicated and sorted.
    all_tables = sorted(set(initial_tables + expanded_tables))

    # Step 2: Build metadata block for these tables (with schema instructions)
    prompt_tables_block = schema_selection_instructions.strip() + "\n\n"
    for t in all_tables:
        if t in meta_dict['tables']:
            prompt_tables_block += build_table_block(t, meta_dict['tables'][t]) + "\n"

    # Step 3: Build prompt
    prompt = build_llm_prompt(
        nlq,
        prompt_tables_block,
        relationships_block,
        schema_tables,
        few_shot_examples
    )

    # Step 4: Call LLM for table selection
    llm_output = call_llm_table_selection(prompt, client=client, model="gpt-4o")

    # Step 5: Robustly extract tables
    tables_out = extract_tables_from_llm_output(llm_output, schema_tables)

    # Step 6: Extract explanation (everything from the first "explanation",
    # matched case-insensitively; the whole answer if the word is absent)
    explanation_index = llm_output.lower().find('explanation')
    reason_out = llm_output[explanation_index:].strip() if explanation_index >= 0 else llm_output

    # Step 7: Store results
    df.at[idx, 'LLM_TABLES'] = tables_out
    df.at[idx, 'LLM_REASON'] = reason_out

# Display result without truncating long cells
pd.set_option('display.width', 200)
pd.set_option('display.max_colwidth', None)
print(df[['NLQ', 'GOLD_SQL', 'LLM_TABLES', 'LLM_REASON']].to_string(index=False))


✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded [1-1] of 1
✅ Embedded

In [None]:
import pandas as pd
import re

# Example: Load your dataframe
# df = pd.read_csv("your_file.csv")

def extract_tables_from_sql(sql):
    """
    Extract the table names a SQL query reads from, using a basic regex.

    Every identifier that directly follows the keyword FROM or JOIN is taken
    as a table name (aliases are ignored). Subqueries in parentheses, quoted
    identifiers and the second and later tables of a comma-separated FROM
    list are not recognised.

    Args:
        sql: SQL text; any non-string value (e.g. NaN from an empty CSV cell)
            yields an empty list.

    Returns:
        Sorted list of unique, lower-cased table names.
    """
    if not isinstance(sql, str):
        return []
    # \b keeps FROM/JOIN from matching the tail of an identifier such as
    # "valid_from", which would otherwise turn the next word into a "table".
    pattern = r'\b(?:FROM|JOIN)\s+([a-zA-Z0-9_]+)'
    tables = re.findall(pattern, sql, re.IGNORECASE)
    return sorted({t.lower() for t in tables})

def normalize_tables(tables_str):
    """
    Split a comma-separated table string into a sorted list of names.

    Each piece is stripped of surrounding whitespace and lower-cased; pieces
    that are empty after stripping are dropped. Duplicates are kept.
    Example: 'salary, Team, player' -> ['player', 'salary', 'team'].
    """
    cleaned = (piece.strip().lower() for piece in tables_str.split(','))
    names = [name for name in cleaned if name]
    names.sort()
    return names

def evaluate_table_selection(df, gold_sql_col="GOLD_SQL", llm_tables_col="LLM_TABLES"):
    """
    Compare the LLM-selected tables with the tables used by the gold SQL.

    Each row is classified as:
      EXACT - the two table sets are equal;
      OVER  - the LLM set is a strict superset of the gold set;
      UNDER - at least one gold table is missing from the LLM set (this
              includes rows that also contain extra tables).
    Per-row details and a summary with counts and percentages are printed.

    Args:
        df: DataFrame with an 'NLQ' column plus the two columns named below.
        gold_sql_col: Column holding the gold SQL text.
        llm_tables_col: Column holding comma-separated LLM table names; a
            non-string cell is treated as an empty selection.

    Returns:
        None; results are printed.
    """
    exact = 0
    over = 0
    under = 0
    total = len(df)
    print("\nEvaluation Results:\n" + "-"*70)
    for idx, row in df.iterrows():
        gold_tables = set(extract_tables_from_sql(row[gold_sql_col]))
        llm_cell = row[llm_tables_col]
        llm_tables = set(normalize_tables(llm_cell if isinstance(llm_cell, str) else ""))

        is_exact = gold_tables == llm_tables
        is_over = llm_tables > gold_tables
        # The three outcomes are mutually exclusive and exhaustive: anything
        # that is neither equal nor a strict superset is missing a gold table.
        if is_exact:
            exact += 1
        elif is_over:
            over += 1
        else:
            under += 1

        print(f"\nNLQ: {row['NLQ']}")
        print(f"Gold SQL: {row[gold_sql_col]}")
        print(f"Gold tables: {sorted(gold_tables)}")
        print(f"LLM tables : {sorted(llm_tables)}")
        print(f"Result: {'EXACT' if is_exact else 'OVER' if is_over else 'UNDER'}")

    def _share(count):
        # An empty frame has nothing to divide by; report 0.0% instead of
        # raising ZeroDivisionError.
        return count / total if total else 0.0

    print("\nSummary:")
    print(f"Total:         {total}")
    print(f"Exact match:   {exact} ({_share(exact):.1%})")
    print(f"Over selection:{over} ({_share(over):.1%})")
    print(f"Under selection:{under} ({_share(under):.1%})")

# Score the LLM_TABLES column against the tables parsed from GOLD_SQL.
evaluate_table_selection(df)



Evaluation Results:
----------------------------------------------------------------------

NLQ: what is the full name and id of the college with the largest number of baseball players?
Gold SQL: SELECT T1.name_full ,  T1.college_id FROM college AS T1 JOIN player_college AS T2 ON T1.college_id  =  T2.college_id GROUP BY T1.college_id ORDER BY count(*) DESC LIMIT 1;
Gold tables: ['college', 'player_college']
LLM tables : ['college', 'player_college']
Result: EXACT

NLQ: Find the full name and id of the college that has the most baseball players.
Gold SQL: SELECT T1.name_full ,  T1.college_id FROM college AS T1 JOIN player_college AS T2 ON T1.college_id  =  T2.college_id GROUP BY T1.college_id ORDER BY count(*) DESC LIMIT 1;
Gold tables: ['college', 'player_college']
LLM tables : ['college', 'player_college']
Result: EXACT

NLQ: What is average salary of the players in the team named 'Boston Red Stockings' ?
Gold SQL: SELECT avg(T1.salary) FROM salary AS T1 JOIN team AS T2 ON T1.team_id