In [None]:
%pip install ragas datacompy

<div class="alert alert-block alert-info">
  <center>⚠️️ Restart the notebook kernel before proceeding!</center>
</div>

In [None]:
%store -r db_name s3_output

In [None]:
import warnings
warnings.filterwarnings(action='ignore')

# Evaluate your SLM using Ragas

## Non-Execution Evaluation: SQL Query Semantic equivalence

**OUTPUT:** 0 if the generated query is semantically different from the reference query, 1 if the two are semantically equivalent

In [None]:
import pandas as pd

# Paths of the stored generations for the base and the fine-tuned model.
base_results, ft_results = (
    "../results/eval_sql_qwen_base.json",
    "../results/eval_sql_qwen_ft.json",
)

# Read both files (base first, then fine-tuned) into DataFrames.
base_df, ft_df = (
    pd.read_json(results_path, orient="columns")
    for results_path in (base_results, ft_results)
)

In [None]:
# Configure Bedrock as the Evaluator LLM
from langchain_aws import ChatBedrock
from ragas.llms import LangchainLLMWrapper
from ragas.dataset_schema import SingleTurnSample
from ragas.metrics import LLMSQLEquivalence
import statistics
from tqdm import tqdm


region_name = "us-east-1"  # AWS region passed to the Bedrock client; change as needed
model_id = "us.amazon.nova-pro-v1:0"  # Bedrock model ID of the evaluator LLM; change as needed

# Build the Bedrock chat model, wrap it for Ragas, and hand the wrapper to the
# SQL-equivalence metric that the evaluation cell uses as `scorer`.
bedrock_llm = ChatBedrock(
    model_id=model_id,
    region_name=region_name,
)
evaluator_llm = LangchainLLMWrapper(bedrock_llm)
scorer = LLMSQLEquivalence(llm=evaluator_llm)

<div class="alert alert-block alert-info">
  <center><b>⚠️️ Important ⚠️️</b> The cell below takes <b>~10 minutes to run</b>!</center>
</div>

In [None]:
schema = open("../utils/data_schema.md").read()

# Evaluate Base Model
base_scores = []
for test in tqdm(base_df.iterrows(), total=len(base_df)):
    sample = SingleTurnSample(
        response=test[1]['qwen_base_sql_query'],
        reference=test[1]['sql_query'],
        reference_contexts=[schema]
    )
    base_scores.append(await scorer.single_turn_ascore(sample))
base_score = statistics.mean(base_scores)

# Evaluate Fine-Tuned Model
ft_scores = []
for test in tqdm(ft_df.iterrows(), total=len(ft_df)):
    sample = SingleTurnSample(
        response=test[1]['qwen_ft_sql_query'],
        reference=test[1]['sql_query'],
        reference_contexts=[schema]
    )
    ft_scores.append(await scorer.single_turn_ascore(sample))
ft_score = statistics.mean(ft_scores)

print(f"Base Model Score: {base_score}")
print(f"Fine-Tuned Model Score: {ft_score}")

## Execution Evaluation: DataCompy Score

**OUTPUT:** F1 score of the row-wise comparison between the two query results (1 if the provided Pandas DataFrames are identical)

In [None]:
from utils.evaluation import collect_athena_metrics
import json

# Load evaluation dataset (JSON Lines: one JSON object per line).
with open('../eval_sql.jsonl', 'r') as f:
    # Skip blank lines (for example a stray empty line at the end of the
    # file); json.loads("") would raise JSONDecodeError and abort the load.
    data = [json.loads(line) for line in f if line.strip()]

print(f"Loaded {len(data)} evaluation queries")

In [None]:
# Run each ground-truth query through collect_athena_metrics and keep the
# returned metrics in dataset order.
print("Executing ground truth queries and collecting metrics ... ")
ground_truth_metrics = []
for record in tqdm(data, total=len(data)):
    ground_truth_metrics.append(
        collect_athena_metrics(
            sql_query=record["sql_query"],
            db_name=db_name,
            s3_output=s3_output,
            query_id=record["id"],
        )
    )

In [None]:
def _load_json(path):
    """Return the parsed content of the JSON file at ``path``.

    The file is opened in a context manager so the handle is closed as soon
    as parsing finishes.
    """
    with open(path) as json_file:
        return json.load(json_file)


# NOTE: this rebinds `ground_truth_metrics`; any list built in memory by the
# query-execution loop is replaced by the results stored on disk.
ground_truth_results = "../results/qwen3_gt_results.json"
ground_truth_metrics = _load_json(ground_truth_results)

base_results = "../results/qwen3_base_results.json"
base_metrics = _load_json(base_results)

ft_results = "../results/qwen3_ft_results.json"
ft_metrics = _load_json(ft_results)

In [None]:
from ragas.metrics import DataCompyScore
from ragas.dataset_schema import SingleTurnSample
import numpy as np


# Rebinds `scorer`: the name previously held the LLMSQLEquivalence metric, so
# re-run the cell that builds that metric before re-running the
# SQL-equivalence evaluation.
scorer = DataCompyScore()

In [None]:
# Evaluate base model
base_scores = []
for response, reference  in tqdm(zip(base_metrics, ground_truth_metrics), total=len(base_metrics)):
    sample = SingleTurnSample(
        response=pd.DataFrame(response['result']).to_string(index=False),
        reference=pd.DataFrame(reference['result']).to_string(index=False)
    )
    base_scores.append(await scorer.single_turn_ascore(sample))
base_score = np.mean(np.nan_to_num(base_scores))

# Evaluate custom model
ft_scores = []
for response, reference  in tqdm(zip(ft_metrics, ground_truth_metrics), total=len(ft_metrics)):
    sample = SingleTurnSample(
        response=pd.DataFrame(response['result']).to_string(index=False),
        reference=pd.DataFrame(reference['result']).to_string(index=False)
    )
    ft_scores.append(await scorer.single_turn_ascore(sample))
ft_score = np.mean(np.nan_to_num(ft_scores))

print(f"Base Model Score: {base_score}")
print(f"Fine-Tuned Model Score: {ft_score}")