In [None]:
!pip install oxenai

Collecting oxenai
  Downloading oxenai-0.39.1-cp312-cp312-manylinux_2_34_x86_64.whl.metadata (2.8 kB)
Collecting maturin>=1.9.3 (from oxenai)
  Downloading maturin-1.10.2-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.musllinux_1_1_x86_64.whl.metadata (16 kB)
Collecting pandas>=2.3.1 (from oxenai)
  Downloading pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting polars>=1.32.0 (from oxenai)
  Downloading polars-1.35.2-py3-none-any.whl.metadata (10 kB)
Collecting pyarrow>=21.0.0 (from oxenai)
  Downloading pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.2 kB)
Collecting pytest-datadir>=1.8.0 (from oxenai)
  Downloading pytest_datadir-1.8.0-py3-none-any.whl.metadata (4.0 kB)
Collecting polars-runtime-32==1.35.2 (from polars>=1.32.0->oxenai)
  Downloading polars_runtime_32-1.35.2-cp39-abi3-manyl

In [None]:
import os
import pandas as pd
from oxen import RemoteRepo
import re

In [None]:
# Read the Oxen API key from Colab's `userdata` store (entry 'oxen_key') and
# hand it to oxen's `config_auth`, so the key is never written in the notebook.
from google.colab import userdata
from oxen.auth import config_auth

oxen_key = userdata.get('oxen_key')
config_auth(oxen_key)

In [None]:
def download_data(repo_name: str, file_name: str) -> bool:
    """Make sure `file_name` from the Oxen repo `repo_name` is available locally.

    A file that already exists at `file_name` is reused as-is, without
    constructing a `RemoteRepo`. Otherwise the file is fetched with
    `RemoteRepo(repo_name).download(file_name)`.

    Args:
        repo_name: Repository identifier passed to `RemoteRepo`.
        file_name: Path of the file inside the repo; also the local path that
            is checked for an existing copy.

    Returns:
        True if the file is already present or the download call succeeded,
        False if creating the repo handle or downloading raised an exception.
    """
    # Check the local copy first so a cached file needs no repo handle at all.
    if os.path.exists(file_name):
        print(f"Using cached file: {file_name}")
        return True

    print(f"Downloading: {file_name}")
    try:
        RemoteRepo(repo_name).download(file_name)
        return True
    except Exception as e:
        # Best-effort by design (callers skip the file on False), but report
        # the cause instead of discarding it.
        print(f"  Download failed for {file_name}: {e}")
        return False

# Calculate Accuracy

In [None]:
def extract_column(df, src_col, extract_col):
    """Extract the text inside `<extract_col>...</extract_col>` from `df[src_col]`.

    For each row, the first matching tag pair is used; the captured text is
    stripped of surrounding whitespace and stored in a column named
    `extract_col`. Rows without a matching tag pair get a missing value.

    Args:
        df: DataFrame containing `src_col` (string values).
        src_col: Name of the column holding the tagged text.
        extract_col: Tag name to look for; also the name of the output column.

    Returns:
        The same `df` object, which is modified in place by adding/overwriting
        the `extract_col` column.
    """
    # Escape the tag name so regex metacharacters in it are matched literally.
    tag = re.escape(extract_col)
    # DOTALL lets the non-greedy capture span line breaks inside the tag.
    extracted = df[src_col].str.extract(f'<{tag}>(.*?)</{tag}>', flags=re.DOTALL)[0]
    df[extract_col] = extracted.str.strip()
    return df

In [None]:
def extract_data(file_name: str):
    """Load a judgement file and add the parsed `answer` column.

    The reader is chosen from the end of `file_name`: names ending in
    "parquet" are read with `pd.read_parquet`, names ending in "jsonl" with
    `pd.read_json(..., lines=True)`. The `<answer>` tag is then extracted from
    the `judgement` column via `extract_column`.

    Args:
        file_name: Local path of the file to read.

    Returns:
        The loaded DataFrame with an additional `answer` column.

    Raises:
        ValueError: If `file_name` ends with neither "parquet" nor "jsonl".
    """
    if file_name.endswith("parquet"):
        df = pd.read_parquet(file_name)
    elif file_name.endswith("jsonl"):
        df = pd.read_json(file_name, lines=True)
    else:
        # Fail with a clear message instead of hitting an unbound `df` below.
        raise ValueError(
            f"Unsupported file type for {file_name!r}: "
            "expected a name ending in 'parquet' or 'jsonl'"
        )
    return extract_column(df, 'judgement', 'answer')

In [None]:
def calculate_accuracy(df):
    """Compute the share of rows whose `answer` value is the string 'true'.

    Every row counts toward the total, including rows where `answer` is
    missing, so such rows lower the accuracy.

    Returns:
        A `(true_count, total_count, accuracy)` tuple; `accuracy` is 0.0 when
        the frame has no rows.
    """
    answers = df['answer']
    total_count = len(answers)
    # value_counts() tallies each distinct answer; default to 0 if 'true' never occurs.
    true_count = answers.value_counts().get('true', 0)

    if total_count > 0:
        accuracy = true_count / total_count
    else:
        accuracy = 0.0
    return true_count, total_count, accuracy

# Download parquet files from the Oxen repo and calculate accuracy for each listed model

In [None]:
# Oxen repository that holds the judgement files
repo_name = "ZiDuck/text2sql"

# One entry per evaluated model: display name plus the path of its judgement
# file inside the repo.
model_files = [
    dict(name='GPT-4o Single-Table',
         file_path='results/GPT-4o-Single-Table-Judgement.parquet'),
    dict(name='Original-Qwen3-0.6B Single-Table',
         file_path='results/Qwen3-0.6B-Single-Table-Predictions.parquet'),
    # NOTE(review): this path has no "parquet"/"jsonl" ending, which
    # extract_data uses to pick a reader — confirm the intended file name.
    dict(name='SFT-Qwen3-0.6B Single-Table',
         file_path='results/SFT-Qwen3-0.6B-Text2SQL-SingleTable'),
    dict(name='GPT-4o BIRD',
         file_path='results/GPT-4o-Judgements_with_question.parquet'),
    dict(name='Original-Qwen3-0.6B BIRD',
         file_path='results/Qwen3-0.6B-Judgements.parquet'),
    dict(name='SFT-Qwen3-0.6B BIRD',
         file_path='results/SFT-Qwen3-0.6B-Judgements.parquet'),
]


In [None]:
# Evaluate every entry of `model_files` and collect one summary row per model
# that could be downloaded, read and scored. `results` is reset here, so
# re-running this cell does not duplicate rows.
results = []

for model in model_files:
    name = model['name']
    file_path = model['file_path']

    print(f"\n=== Evaluating model: {name} ===")

    # Step 1: make sure the judgement file is available locally.
    ok = download_data(repo_name, file_path)
    if not ok:
        print(f"  → Skipped {name} (file not found or download failed).")
        continue

    # Step 2: load the file and extract the `answer` column. Any exception
    # raised while reading/parsing is reported and the model is skipped.
    try:
        df = extract_data(file_path)
        if df is None or len(df) == 0:
            print(f" ERROR: File is empty or unreadable → Skipped.")
            continue
    except Exception as e:
        print(f" ERROR reading {file_path}: {e}")
        continue

    # Step 3: score the judgements; `acc` is a fraction, printed as a percentage.
    try:
        true_count, total_count, acc = calculate_accuracy(df)
        print(f"{name}: {true_count}/{total_count} = {acc*100:.2f}%")
    except Exception as e:
        print(f"ERROR calculating accuracy: {e}")
        continue

    # Only models that passed all three steps reach this point.
    results.append({
        'model': name,
        'true': true_count,
        'total': total_count,
        'accuracy': acc
    })



=== Evaluating model: GPT-4o Single-Table ===
Downloading: results/GPT-4o-Single-Table-Judgement.parquet
GPT-4o Single-Table: 90/200 = 45.00%

=== Evaluating model: Original-Qwen3-0.6B Single-Table ===
Downloading: results/Qwen3-0.6B-Single-Table-Predictions.parquet
Original-Qwen3-0.6B Single-Table: 28/200 = 14.00%

=== Evaluating model: SFT-Qwen3-0.6B Single-Table ===
Downloading: results/Qwen3-0.6B.parquet
SFT-Qwen3-0.6B Single-Table: 98/200 = 49.00%

=== Evaluating model: GPT-4o BIRD ===
Downloading: results/GPT-4o-Judgements_with_question.parquet
GPT-4o BIRD: 311/500 = 62.20%

=== Evaluating model: Original-Qwen3-0.6B BIRD ===
Downloading: results/Qwen3-0.6B-Judgements.parquet
Original-Qwen3-0.6B BIRD: 33/500 = 6.60%

=== Evaluating model: SFT-Qwen3-0.6B BIRD ===
Downloading: results/SFT-Qwen3-0.6B-Judgements.parquet
SFT-Qwen3-0.6B BIRD: 10/500 = 2.00%


In [None]:
# Summary table: one row per model that was evaluated successfully.
pd.DataFrame(results)

Unnamed: 0,model,true,total,accuracy
0,GPT-4o Single-Table,90,200,0.45
1,Original-Qwen3-0.6B Single-Table,28,200,0.14
2,SFT-Qwen3-0.6B Single-Table,98,200,0.49
3,GPT-4o BIRD,311,500,0.622
4,Original-Qwen3-0.6B BIRD,33,500,0.066
5,SFT-Qwen3-0.6B BIRD,10,500,0.02
