In [None]:
# Load variables from setup notebook
# `%store -r` restores the named variables from IPython's persistent store into
# this kernel. Later cells in this notebook read CUSTOM_ENDPOINT_NAME,
# MLFLOW_TRACKING_URI, db_name and s3_output; this cell fails if they were never stored.
%store -r CUSTOM_ENDPOINT_NAME MLFLOW_TRACKING_URI account_id bucket_name region db_name s3_output

# Evaluating Fine-Tuned Qwen3 Model for SQL Generation

This notebook evaluates the performance of the fine-tuned Qwen3 0.6B model on text-to-SQL generation tasks. We'll test the model's ability to generate SQL queries and assess its performance across different difficulty levels.

**Prerequisites**: Run the setup notebook (0-setup.ipynb) and training notebook (2-model-finetuning.ipynb) first.

## 1. Test Model Inference

In [None]:
import sagemaker 

# Requests and responses for the endpoint are both exchanged as JSON.
json_serializer = sagemaker.serializers.JSONSerializer()
json_deserializer = sagemaker.deserializers.JSONDeserializer()

# Predictor bound to the endpoint name restored from the setup notebook.
predictor = sagemaker.Predictor(
    endpoint_name=CUSTOM_ENDPOINT_NAME,
    serializer=json_serializer,
    deserializer=json_deserializer,
)

# Smoke test: send one chat-style message and display the raw response
# (kept as the last expression so the notebook renders it).
smoke_test_payload = {
    "messages": [{"role": "user", "content": "What is the capital of France?"}]
}
predictor.predict(smoke_test_payload)

## 2. SQL Generation Test


In [None]:
from utils.evaluation import (
    execute_athena_query,
    extract_sql,
    collect_athena_metrics,
    generate_sql,
    analyze_qwen_results,
)

In [None]:
# Read the schema description inside a context manager so the file handle is
# closed deterministically instead of being left to the garbage collector.
with open("utils/data_schema.md", "r") as schema_file:
    schema = schema_file.read()

# System prompt: instructions followed by the schema text read above.
system = f"""\
You are an expert SQL developer. Given the provided database schema and the following user question, generate a syntactically correct SQL query. 
Only reply with the SQL query, nothing else. Do NOT use the backticks to identify the code, just reply with the pure SQL query.

{schema}
"""

question = "Calculate the moving average of sales using a 5-row window (2 preceding, 2 following, current row) ordered by row_id"
# " /no_think" is appended to the user turn, matching how the evaluation loop
# later in this notebook builds its questions.
payload = {
    "messages": [
        {"role": "system", "content": system},
        {"role": "user", "content": question+" /no_think"},
    ]
}

# The response is indexed as a chat-completion style dict; extract_sql
# post-processes the first choice's message content before printing.
res = predictor.predict(payload)
output = extract_sql(res["choices"][0]["message"]["content"])
print(output)

In [None]:
# Verify database access: run a row count against each table the evaluation
# relies on. Only the last call's return value is rendered by the notebook.
orders_count_sql = "SELECT COUNT(*) FROM orders"
returns_count_sql = "SELECT COUNT(*) FROM returns"

execute_athena_query(orders_count_sql, db_name, s3_output)
execute_athena_query(returns_count_sql, db_name, s3_output)

## 3. Comprehensive Evaluation


In [None]:
import json

# Load evaluation dataset (JSON Lines: one JSON object per line).
# The encoding is explicit so decoding does not depend on the platform locale,
# and blank lines are skipped so a stray empty line cannot crash json.loads.
with open('eval_sql.jsonl', 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]

print(f"Loaded {len(data)} evaluation queries")

<div class="alert alert-block alert-info">
  <center><b>⚠️ Important ⚠️</b></center>
  The cell below takes <b>~5 minutes to run</b>. We recommend <b>executing the cell below, then proceeding to notebook 4</b>. Come back to this notebook later to view the custom model evaluation results.
</div>

In [None]:
import os

import pandas as pd
from tqdm import tqdm
import boto3

# Create the output directory before the long-running work starts, so a
# missing folder cannot make the cell fail only when results are written.
os.makedirs('results', exist_ok=True)

smr_client = boto3.client("sagemaker-runtime")

# Generate SQL queries for evaluation.
# Each record is updated in place with the model's query under
# 'qwen_ft_sql_query'; re-running the cell overwrites the previous value.
print("Generating SQL queries for evaluation ... ")
for item in tqdm(data, total=len(data)):
    item['qwen_ft_sql_query'] = generate_sql(item['question']+" /no_think", CUSTOM_ENDPOINT_NAME, smr_client)

# Execute queries and collect metrics (one metrics entry per evaluation record,
# in the same order as `data`).
print("Executing queries and collecting metrics ... ")
all_metrics = []
for item in tqdm(data, total=len(data)):
    metrics = collect_athena_metrics(
        sql_query=item["qwen_ft_sql_query"],
        db_name=db_name,
        s3_output=s3_output,
        query_id=item["id"],
    )
    all_metrics.append(metrics)

# Save results: the records (including generated SQL) and the per-query
# metrics go to separate JSON files that the analysis cell reads back.
df = pd.DataFrame(data)
df.to_json('results/eval_sql_qwen_ft.json', orient='records', indent=2)

with open('results/qwen3_ft_results.json', 'w') as f:
    json.dump(all_metrics, f, indent=2)

print(f"Processed {len(all_metrics)} queries")

## 4. Performance Analysis


In [None]:
# Files written by the evaluation cell: per-query metrics, and the evaluation
# records together with the SQL generated by the fine-tuned model.
ft_metrics_path = 'results/qwen3_ft_results.json'
ft_queries_path = 'results/eval_sql_qwen_ft.json'

print("=== FINE-TUNED MODEL PERFORMANCE ===")
# metrics_ft is logged to MLflow in the next cell.
df_ft, summary_ft, metrics_ft = analyze_qwen_results(ft_metrics_path, ft_queries_path)

In [None]:
import mlflow
from utils.mlflow_tracking import get_mlfow_url
from IPython.display import Markdown
import os
from contextlib import redirect_stdout

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment('qwen3-spectrum-experiment')

# Send anything printed while the run is created and metrics are logged to
# os.devnull. The devnull handle is bound in the `with` statement so it is
# closed afterwards instead of being leaked.
with open(os.devnull, 'w') as devnull, redirect_stdout(devnull):
    with mlflow.start_run(run_name='ft-model-evaluation') as run:
        mlflow.log_metrics(metrics_ft)

# Last expression of the cell: render the link returned by the helper as Markdown.
Markdown(get_mlfow_url("aim410-mlflow-server"))


When running the fine-tuning at scale, we obtained the following results:

![fine-tuned-model-performances.png](./images/fine-tuned-model-performances.png)

<div class="alert alert-block alert-info">
Looking for a more in-depth comparison between the two models? Go ahead and run the notebook <code>optional/3-evaluation-with-ragas.ipynb</code>, which leverages <a href="https://ragas.io"><b>ragas</b></a>, an OSS framework for model and agent evaluation. <b>Warning:</b> running that notebook end-to-end takes ~15 minutes.
</div>

----

You're done! 🚀 You can now proceed to notebook `4-agents.ipynb`.