# Eval

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
#|default_exp evaluate

In [None]:
#|export
import braintrust
import ast
import fastcore.all as fc
from dotenv import load_dotenv
from tqdm import tqdm
from wattbot import retriever, eda, generator, evaluate, utils

In [None]:
# Load settings from a local `.env` file into the process environment
# (presumably the API credentials used by the clients below -- confirm).
load_dotenv()

True

## Score Answer Value

In [None]:
#|export
def is_numeric(value):
    """Return whether `value` can be converted by `float()`.

    A `ValueError` or `TypeError` from the conversion means "not numeric";
    any other exception propagates to the caller.
    """
    try:
        float(value)
    except (TypeError, ValueError):
        return False
    else:
        return True

In [None]:
# A float-like string counts as numeric.
fc.test_eq(is_numeric("1.0"), True)

# A string that `float()` cannot parse does not.
fc.test_eq(is_numeric("a"), False)

In [None]:
#|export
def _parse_answer_literal(value):
    """Parse a string that looks like a list literal, e.g. `"[1, 2]"`.

    Anything that is not a string starting with `[`, or that is not a valid
    Python literal, is returned unchanged.
    """
    if not (isinstance(value, str) and value.startswith('[')):
        return value
    try:
        return ast.literal_eval(value)
    except (ValueError, SyntaxError, TypeError):
        # Free text such as "[unknown]" is not a literal; compare it as a plain string.
        return value


def score_answer_value(predicted, expected):
    """Score a predicted answer value against the expected one.

    Returns 1.0 for a match and 0.0 otherwise:

    - If `expected` is `"is_blank"`, `predicted` must be exactly `"is_blank"`.
    - If both values convert to `float`, they match when they differ by at most
      0.1% of the expected value.
    - Otherwise the values are compared as stripped, lower-cased strings.

    String values that look like list literals are parsed on *both* sides before
    comparing, so `"[1,2]"`, `"[1, 2]"` and `[1, 2]` are treated as the same
    answer. Strings that start with `[` but are not valid literals are compared
    as plain text instead of raising.
    """
    if expected == "is_blank":
        return 1.0 if predicted == "is_blank" else 0.0

    # Normalize both sides identically; parsing only one side made identical
    # list-literal strings compare as different.
    predicted = _parse_answer_literal(predicted)
    expected = _parse_answer_literal(expected)

    if is_numeric(expected) and is_numeric(predicted):
        pred_num, exp_num = map(float, (predicted, expected))
        # Relative tolerance of 0.1% around the expected value.
        return 1.0 if abs(pred_num - exp_num) <= abs(exp_num * 0.001) else 0.0

    return 1.0 if str(predicted).strip().lower() == str(expected).strip().lower() else 0.0

In [None]:
# Scores are floats; `True`/`False` compare equal to 1.0/0.0 in these checks.
# `is_blank` must be matched by the exact string, not by a list wrapping it.
fc.test_eq(score_answer_value("is_blank", "is_blank"), True)

fc.test_eq(score_answer_value(["is_blank"], "is_blank"), False)

# Text answers must match in full; a partial match scores 0.
fc.test_eq(score_answer_value("ML.ENERGY Benchmark", "The ML.ENERGY Benchmark"), False)

fc.test_eq(score_answer_value("ML.ENERGY Benchmark", "ML.ENERGY Benchmark"), True)

# Numeric answers are compared as numbers with a 0.1% relative tolerance.
fc.test_eq(score_answer_value("4.3", "4.3"), True)

fc.test_eq(score_answer_value("4.3", "4.13"), False)

## Score Ref ID

In [None]:
#|export
def _parse_ref_ids(value):
    """Parse a string that looks like a list literal, e.g. `'["patterson2021"]'`.

    Anything that is not a string starting with `[`, or that is not a valid
    Python literal, is returned unchanged.
    """
    if not (isinstance(value, str) and value.startswith('[')):
        return value
    try:
        return ast.literal_eval(value)
    except (ValueError, SyntaxError, TypeError):
        # Not a literal after all; treat it as a single opaque reference id.
        return value


def score_ref_id(predicted, expected):
    """Score predicted reference ids against the expected ones.

    If `expected` is `"is_blank"`, returns 1.0 only when `predicted` is exactly
    `"is_blank"`. Otherwise returns the Jaccard similarity (intersection size
    divided by union size) of the two id sets, or 0.0 when both sets are empty.

    Each side may be a list, a single id, or a string holding a list literal;
    list-literal strings are parsed on *both* sides so that a stringified
    prediction is scored the same as the list it represents. Strings that start
    with `[` but are not valid literals are kept as a single id instead of
    raising.
    """
    if expected == "is_blank":
        return 1.0 if predicted == "is_blank" else 0.0

    predicted = _parse_ref_ids(predicted)
    expected = _parse_ref_ids(expected)

    pred_set = set(predicted) if isinstance(predicted, list) else {predicted}
    exp_set = set(expected) if isinstance(expected, list) else {expected}

    union = pred_set | exp_set
    return len(pred_set & exp_set) / len(union) if union else 0.0

In [None]:
# Scores are floats; `True`/`False` compare equal to 1.0/0.0 in these checks.
# `is_blank` must be matched by the exact string, not by a list wrapping it.
fc.test_eq(score_ref_id("is_blank", "is_blank"), True)

fc.test_eq(score_ref_id(["is_blank"], "is_blank"), False)

# Identical id lists overlap completely.
fc.test_eq(score_ref_id(["patterson2021"], ["patterson2021"]), True)

# The expected ids may arrive as a stringified list.
fc.test_eq(score_ref_id(["patterson2021"], '["patterson2021"]'), True)

## Score is_na

In [None]:
#|export
def score_is_na(predicted_answer, expected_answer):
    """Score whether the prediction handles an unanswerable ("NA") question correctly.

    Returns 1.0 when neither side has a blank `answer_value`, or when both do
    and the prediction also marks every NA field as `is_blank`. Returns 0.0 in
    every other case.
    """
    blank = 'is_blank'
    exp_blank = expected_answer['answer_value'] == blank
    pred_blank = predicted_answer['answer_value'] == blank

    # Neither side is NA: nothing to penalize.
    if not (exp_blank or pred_blank):
        return 1.0
    # Exactly one side is NA: they disagree.
    if not (exp_blank and pred_blank):
        return 0.0

    # Both are NA: the prediction must blank out every NA field.
    for field in ('answer_value', 'answer_unit', 'ref_id', 'ref_url', 'supporting_materials'):
        if predicted_answer[field] != blank:
            return 0.0
    return 1.0

In [None]:
# Both sides have a blank `answer_value` and the prediction blanks every NA
# field, so the NA score is 1.
predicted_answer = {
  "answer": "False",
  "answer_unit": "is_blank",
  "answer_value": "is_blank",
  "explanation": "Quote",
  "ref_id": "is_blank",
  "ref_url": "is_blank",
  "supporting_materials": "is_blank"
}

expected_answer = {
  "answer": "False",
  "answer_unit": "is_blank",
  "answer_value": "is_blank",
  "explanation": "Quote",
  "ref_id": "is_blank",
  "ref_url": "is_blank",
  "supporting_materials": "is_blank"
}

fc.test_eq(score_is_na(predicted_answer, expected_answer), True)

In [None]:
# The prediction supplies a value while `expected_answer` (reused from the
# previous cell) is blank, so the NA score is 0.
predicted_answer = {
  "answer": "False",
  "answer_unit": "is_blank",
  "answer_value": "0",
  "explanation": "Quote",
  "ref_id": "is_blank",
  "ref_url": "is_blank",
  "supporting_materials": "The limited availability of this data significantly reduces transparency and accountability, thereby weakening the potential for public oversight and market responses."
}

fc.test_eq(score_is_na(predicted_answer, expected_answer), False)

## Wattbot Score

In [None]:
#|export
def calculate_wattbot_score(predicted_answer, expected_row):
    """Combine the three component scores into the weighted WattBot score.

    Weights: 0.75 for the answer value, 0.15 for the reference ids and 0.10 for
    NA handling. Returns an `fc.NS` with `score`, `answer_score`, `ref_score`
    and `na_score` attributes.
    """
    parts = dict(
        answer_score=score_answer_value(predicted_answer['answer_value'], expected_row['answer_value']),
        ref_score=score_ref_id(predicted_answer['ref_id'], expected_row['ref_id']),
        na_score=score_is_na(predicted_answer, expected_row),
    )
    total = 0.75 * parts['answer_score'] + 0.15 * parts['ref_score'] + 0.10 * parts['na_score']
    return fc.NS(score=total, **parts)

In [None]:
# Blank answer value on both sides, but the prediction's `answer_unit` is "MWh"
# rather than `is_blank`: the answer and ref components score 1 while the NA
# component scores 0, giving 0.75 + 0.15 = 0.9.
predicted_answer = {
    "answer": "Unanswerable from the context.",
    "answer_unit": "MWh",
    "answer_value": "is_blank",
    "explanation": "is_blank",
    "ref_id": "is_blank",
    "ref_url": "is_blank",
    "supporting_materials": "is_blank"
}

expected_answer = {
    "answer": "Unable to answer with confidence based on the ...",
    "answer_unit": "MWh",
    "answer_value": "is_blank",
    "explanation": "is_blank",
    "ref_id": "is_blank",
    "ref_url": "is_blank",
    "supporting_materials": "is_blank"
}

ws = calculate_wattbot_score(predicted_answer, expected_answer)
fc.test_eq(ws.na_score, 0.0)
fc.test_eq(ws.answer_score, 1.0)
fc.test_eq(ws.ref_score, 1.0)
fc.test_eq(ws.score, 0.9)

In [None]:
# Prediction identical to the expected answer: every component scores 1, so the
# weighted total is 1.
predicted_answer = {
    "answer": "Local inference was emphasized as a sustainability measure because it reduces both network overhead and carbon footprint when deploying large language models.",
    "answer_unit": "is_blank",
    "answer_value": "1",
    "explanation": "is_blank",
    "ref_id": ["khan2025"],
    "ref_url": ["https://arxiv.org/pdf/2504.06307"],
    "supporting_materials": "The proposed framework tackles energy efficiency in LLM deployment through three interconnected components: local inference optimization, the selection of energy-efficient LLMs, and a comprehensive evaluation methodology."
}

expected_answer = {
    "answer": "Local inference was emphasized as a sustainability measure because it reduces both network overhead and carbon footprint when deploying large language models.",
    "answer_unit": "is_blank",
    "answer_value": "1",
    "explanation": "is_blank",
    "ref_id": ["khan2025"],
    "ref_url": ["https://arxiv.org/pdf/2504.06307"],
    "supporting_materials": "The proposed framework tackles energy efficiency in LLM deployment through three interconnected components: local inference optimization, the selection of energy-efficient LLMs, and a comprehensive evaluation methodology."
}

ws = calculate_wattbot_score(predicted_answer, expected_answer)
fc.test_eq(ws.na_score, 1.0)
fc.test_eq(ws.answer_score, 1.0)
fc.test_eq(ws.ref_score, 1.0)
fc.test_eq(ws.score, 1.0)

## Evaluation

In [None]:
# Build the pipeline under evaluation: chunk the documents, index the chunks
# with lexical search, and wrap the retriever in a RAG generator.
# NOTE(review): `utils.fw()` presumably returns the Fireworks client -- confirm.
all_chunks = retriever.chunk_all(retriever.chunk_doc)
ls = retriever.LexicalSearch(all_chunks)
rag = generator.RAG(ls, utils.fw(), model='accounts/fireworks/models/kimi-k2p5')

In [None]:
#|export
def evaluate_train(rag, experiment_metadata, n_rc=10):
    """Answer every training question with `rag`, log the results and return the summed score.

    For each row of `eda.train()` the question is answered, scored with
    `calculate_wattbot_score` against the expected values from `eda.get_value`,
    and logged to the Braintrust experiment together with the retrieved context
    and the prompt.

    Parameters:
        rag: object exposing `answer(question, n_rc=...)`, `model` and `r.model`.
        experiment_metadata: mapping describing the run; it is copied, not
            modified, and extended with `embedding_model` and `gen_model`.
        n_rc: forwarded to `rag.answer` as its `n_rc` argument.

    Returns:
        The sum (not the mean) of the per-question `score` values.
    """
    # Work on a copy so the caller's dict is not changed as a side effect.
    metadata = {**experiment_metadata, 'embedding_model': rag.r.model, 'gen_model': rag.model}
    experiment = braintrust.init(project="wattbot_v2_evaluate", experiment="evaluation", metadata=metadata)

    df = eda.train()
    total = 0
    for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing Rows"):
        question = row['question']
        response = rag.answer(question, n_rc=n_rc)
        expected = vars(eda.get_value(row))
        answer = response.ans
        scores = vars(calculate_wattbot_score(answer, expected))
        context = [vars(chunk) for chunk in response.rc]
        experiment.log(
            input=question,
            output=answer,
            expected=expected,
            scores=scores,
            metadata={'context': context, 'prompt': response.pm},
        )
        total += scores['score']
    return total

In [None]:
# Description of the pipeline configuration, passed along as experiment metadata.
experiment_metadata = {
    'pdf_extraction': 'pypdf',
    'chunking': 'character_level',
    'chunk_size': 1500,
    'chunk_step': 1400,
    'retrieval': 'lexical_search'
}

# Returns the summed WattBot score over the training rows.
evaluate_train(rag, experiment_metadata)

Processing Rows: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 41/41 [01:35<00:00,  2.33s/it]


32.7

## Test

In [None]:
#|export
def create_submission(rag, experiment_metadata, n_rc=10):
    """Answer every test question with `rag` and write the submission CSV.

    Each question from `eda.test()` is answered and logged to the Braintrust
    experiment with a placeholder score of 0. The answer fields are stored as
    strings in the frame, remaining missing values are filled with `is_blank`,
    and the frame is written to `experiment_metadata['output_path']`.

    Parameters:
        rag: object exposing `answer(question, n_rc=...)`, `model` and `r.model`.
        experiment_metadata: mapping describing the run; must contain
            `output_path`. It is copied, not modified, and extended with
            `embedding_model` and `gen_model`.
        n_rc: forwarded to `rag.answer` as its `n_rc` argument.

    Raises:
        KeyError: if `output_path` is missing from `experiment_metadata`
            (raised before any question is answered).
    """
    # Fail fast: resolve the destination before spending time on generation.
    output_path = experiment_metadata['output_path']
    answer_fields = (
        'answer', 'answer_value', 'answer_unit', 'ref_id', 'ref_url',
        'supporting_materials', 'explanation',
    )

    df = eda.test()
    questions = df['question'].to_list()
    # Work on a copy so the caller's dict is not changed as a side effect.
    metadata = {**experiment_metadata, 'embedding_model': rag.r.model, 'gen_model': rag.model}
    experiment = braintrust.init(project="wattbot_v2_test", experiment="test", metadata=metadata)

    rows = []
    for question in tqdm(questions, desc="Answering question"):
        response = rag.answer(question, n_rc=n_rc)
        answer = response.ans
        # Stringify here so a missing field still fails on the offending question.
        rows.append({field: str(answer[field]) for field in answer_fields})
        context = [vars(chunk) for chunk in response.rc]
        experiment.log(
            input=question,
            output=answer,
            scores={'score': 0},
            metadata={'context': context, 'prompt': response.pm},
        )

    # Replace each answer column wholesale: positional assignment does not depend
    # on the frame's index labels, and it avoids writing strings cell-by-cell into
    # float (all-NaN) columns, which pandas warns about as an incompatible dtype.
    for field in answer_fields:
        df[field] = [row[field] for row in rows]

    df = df.fillna('is_blank')
    df.to_csv(output_path, index=False)

In [None]:
# `create_submission` reads the CSV destination from the metadata dict.
experiment_metadata['output_path'] = 'submission_v1.csv'
create_submission(rag, experiment_metadata)

  df.loc[i, 'answer'] = str(answer['answer'])
  df.loc[i, 'answer_value'] = str(answer['answer_value'])
  df.loc[i, 'ref_id'] = str(answer['ref_id'])
  df.loc[i, 'ref_url'] = str(answer['ref_url'])
  df.loc[i, 'supporting_materials'] = str(answer['supporting_materials'])
  df.loc[i, 'explanation'] = str(answer['explanation'])
Answering question: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 282/282 [10:45<00:00,  2.29s/it]


In [None]:
#| hide
# Export this notebook's `#|export` cells to the module named by `#|default_exp`.
import nbdev; nbdev.nbdev_export()