<div style="background-color: #ADD8E6; border: 1px solid gray; padding: 3px">
    <h3>Code Spec Evaluation</h3>
    The following is an overview of the workflow:
    <ul>
    <li>Uses <b>DeepEval</b> to evaluate the code specification document</li>
    <li>Uses the specification generated by gpt-oss-20b as the reference set</li>
    </ul>
</div>

In [1]:
##############################################
# Imports
##############################################
import os
import traceback
from openai import OpenAI
from deepeval.models.base_model import DeepEvalBaseLLM
from deepeval import assert_test, evaluate
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
from deepeval.metrics import GEval, AnswerRelevancyMetric
from evaluate import load
from deepeval.metrics import ArenaGEval
from deepeval.test_case import ArenaTestCase
from deepeval import compare
import utils
import nltk
import pprint
import pandas as pd
import matplotlib.pyplot as plt
from dotenv import load_dotenv
load_dotenv()
import nest_asyncio
nest_asyncio.apply()

In [2]:
##############################################
# Set Up Metric Instances
##############################################
# Download the NLTK resources first, in the same order as before.
for nltk_resource in ("wordnet", "punkt", "punkt_tab"):
    nltk.download(nltk_resource)

# Load the four reference-based metrics through `evaluate.load`; the generator
# is consumed left to right, so the load order is bleu, rouge, meteor, bertscore.
bleu_metric, rouge_metric, meteor_metric, bertscore_metric = (
    load(metric_id) for metric_id in ("bleu", "rouge", "meteor", "bertscore")
)

[nltk_data] Downloading package wordnet to /opt/app-
[nltk_data]     root/src/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /opt/app-
[nltk_data]     root/src/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /opt/app-
[nltk_data]     root/src/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /opt/app-
[nltk_data]     root/src/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /opt/app-
[nltk_data]     root/src/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /opt/app-
[nltk_data]     root/src/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [3]:
##############################################
# Utilities
##############################################
def get_file_content(filepath: str) -> str:
    """Return the entire text content of the file at *filepath*.

    The file is decoded explicitly as UTF-8 so the result does not depend on
    the locale encoding of the machine running the notebook.

    Args:
        filepath: Path of the text file to read.

    Returns:
        The full contents of the file as one string.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(filepath, "r", encoding="utf-8") as file:
        return file.read()

In [4]:
##############################################
# Evaluator Tools
##############################################
class CustomLLM(DeepEvalBaseLLM):
    """DeepEval evaluation model backed by a chat-completions style client.

    The wrapped ``client`` must expose ``chat.completions.create``; every
    request is sent under ``model_name``.
    """

    def __init__(self, client, model_name):
        """Keep the client and the model identifier used for each request."""
        self.client = client
        self.model_name = model_name

    def load_model(self):
        """Return the wrapped client object."""
        return self.client

    def generate(self, prompt: str) -> str:
        """Send *prompt* as a single user message and return the reply content.

        The request always uses ``temperature=0``.
        """
        user_message = {"role": "user", "content": prompt}
        completion = self.client.chat.completions.create(
            model=self.model_name,
            messages=[user_message],
            temperature=0,
        )
        first_choice = completion.choices[0]
        # NOTE(review): content is returned as-is; confirm it can never be None.
        return first_choice.message.content

    async def a_generate(self, prompt: str) -> str:
        """Coroutine variant of :meth:`generate`.

        It calls the synchronous ``generate`` directly, so the request is not
        awaited and runs inline inside the coroutine.
        """
        reply = self.generate(prompt)
        return reply

    def get_model_name(self):
        """Return the model identifier supplied at construction time."""
        return self.model_name

In [5]:
##############################################
# Reference-Based Evaluation Methods
##############################################
def compute_reference_based_eval_scores(predictions, references):
    """Score *predictions* against *references* with four reference-based metrics.

    Relies on the module-level ``bleu_metric``, ``rouge_metric``,
    ``meteor_metric`` and ``bertscore_metric`` objects.

    Args:
        predictions: Candidate texts, passed unchanged to each metric's ``compute``.
        references: Reference texts, passed unchanged to each metric's ``compute``.

    Returns:
        A dict with the keys ``"bleu4"``, ``"rougel"``, ``"meteor"`` and
        ``"bert"``, holding respectively the ``"bleu"``, ``"rougeL"``,
        ``"meteor"`` and ``"f1"`` entries of the corresponding ``compute`` result.
    """
    shared_kwargs = {"predictions": predictions, "references": references}

    # Dict entries are evaluated top to bottom, so the metrics run in the
    # order bleu -> rouge -> meteor -> bertscore.
    return {
        "bleu4": bleu_metric.compute(**shared_kwargs)["bleu"],
        "rougel": rouge_metric.compute(**shared_kwargs)["rougeL"],
        "meteor": meteor_metric.compute(**shared_kwargs)["meteor"],
        # BERTScore is the only metric here that is also given a language.
        "bert": bertscore_metric.compute(lang="en", **shared_kwargs)["f1"],
    }

In [6]:
##############################################
# Benchmark Evaluation Methods
##############################################
def compute_benchmark_eval_scores(app_name: str):
    """Score every benchmark row of *app_name* with DeepEval answer relevancy.

    Reads ``spec/{app_name}/benchmark.csv``, builds one ``LLMTestCase`` per row
    (``baseline`` column as input, ``candidate`` column as actual output) and
    evaluates them with ``AnswerRelevancyMetric`` (threshold 0.7). The judge
    model is a ``CustomLLM`` configured from the ``REFERENCE_LLM_TOKEN``,
    ``REFERENCE_LLM_API_BASE`` and ``REFERENCE_LLM_ID`` environment variables.

    Args:
        app_name: Name of the directory under ``spec/`` holding the benchmark CSV.

    Returns:
        The loaded DataFrame with ``eval_scores_relevancy`` and
        ``eval_reasons_relevancy`` columns added, or ``None`` when any step
        raised (the traceback is printed instead of propagating).
    """
    try:
        benchmark_path = f"spec/{app_name}/benchmark.csv"
        # Missing cells become empty strings instead of NaN.
        frame = pd.read_csv(benchmark_path).fillna("")
        rows = frame.to_dict(orient='records')

        judge_client = OpenAI(
            api_key=os.getenv('REFERENCE_LLM_TOKEN'),
            base_url=os.getenv('REFERENCE_LLM_API_BASE'),
        )
        judge = CustomLLM(client=judge_client, model_name=os.getenv("REFERENCE_LLM_ID"))

        cases = []
        for row in rows:
            cases.append(
                LLMTestCase(input=str(row["baseline"]), actual_output=str(row["candidate"]))
            )

        relevancy_metrics = [AnswerRelevancyMetric(threshold=0.7, model=judge)]
        outcome = evaluate(test_cases=cases, metrics=relevancy_metrics)

        # Gather the relevancy entries once, then split them into two columns.
        relevancy_entries = [
            entry
            for test_result in outcome.test_results
            for entry in test_result.metrics_data
            if entry.name == "Answer Relevancy"
        ]
        scores = [entry.score for entry in relevancy_entries]
        reasons = [entry.reason for entry in relevancy_entries]

        # NOTE(review): assumes test_results come back in the same order as the
        # CSV rows - confirm against the DeepEval version in use.
        frame["eval_scores_relevancy"] = scores
        frame["eval_reasons_relevancy"] = reasons
        return frame
    except Exception:
        traceback.print_exc()
        return None
    

### Run Evaluations
Run the evaluations pipeline!

In [7]:
def evaluations_pipeline(git_repo: str):
    """Run the benchmark evaluation for *git_repo* and save the scored results.

    Resolves the application name with ``utils.get_unique_app_name_for_repo``,
    scores its benchmark via ``compute_benchmark_eval_scores`` and writes the
    outcome to ``spec/{app_name}/benchmark_results.csv``.

    Args:
        git_repo: Repository URL identifying the application to evaluate.

    Returns:
        The scored results object that was written to disk, or ``None`` when
        scoring produced nothing or any step raised (the traceback is printed
        instead of propagating).
    """
    try:
        app_name = utils.get_unique_app_name_for_repo(git_repo)
        results = compute_benchmark_eval_scores(app_name)

        if results is None:
            # The scorer reports its own failure and returns None; skip the
            # write rather than failing on `None.to_csv`.
            return None

        results.to_csv(f"spec/{app_name}/benchmark_results.csv")
        # Hand the results back so the caller can inspect or display them.
        return results
    except Exception:
        traceback.print_exc()
        return None

In [8]:
import os

# Run parameters come from the environment, with defaults for interactive use.
git_repo_param = os.getenv("PIPELINE_PARAM_GIT_REPO", "https://github.com/holtonma/cf_golfap.git")
enabled_param = os.getenv("PIPELINE_PARAM_ENABLED", "true")

# Bind the name up front so it exists even when the run is disabled.
eval_results = None
# Accept "true" in any letter case and with surrounding whitespace.
if enabled_param.strip().lower() == "true":
    eval_results = evaluations_pipeline(git_repo_param)

# A notebook renders only the last top-level expression of a cell, so the
# value is referenced here rather than inside the `if` body.
eval_results

Output()



Metrics Summary

  - ✅ Answer Relevancy (score: 0.9090909090909091, threshold: 0.7, strict: False, evaluation model: openai/gpt-oss-20b, reason: The score is 0.91 because the output is largely relevant to the description of Golfap, but it includes an unrelated mention of a social score‑keeping and betting feature, which slightly lowers the overall relevancy., error: None)

For test case:

  - input: Golfap is a lightweight **Golf News & Tournament** web application.
* Surfaces a daily feed of golf headlines and related metadata (links, summaries, categories).
* Provides a tournament‑and‑player overview, allowing users to inspect individual player statistics per event.
* Uses ColdFusion Markup Language (CFML) with a **JSON** utility component for serializing query‑result sets to JSON for client‑side consumption.
Altogether the code collects, formats, and presents golf news, tournament leaderboards, and golfer performance data to a web browser.
  - actual output: Golfap is 

Unnamed: 0,code,prompt,baseline,candidate,eval_scores_relevancy,eval_reasons_relevancy
0,,Provide a clear and concise explanation of the...,Golfap is a lightweight **Golf News & Tourname...,Golfap is a social golf score‑keeping web appl...,0.909091,The score is 0.91 because the output is largel...
1,,What are the client-side frameworks used by th...,"Prototype, Scriptaculous and a minimal portal ...","Prototype.js, Effects, Builder, DragDrop, Port...",0.857143,The score is 0.86 because the answer correctly...
2,news/tabs.cfm,What is the responsibility of `news/tabs.cfm` ...,”Navigation bar and header layout for both new...,reusable tab navigation component.,1.0,The score is 1.00 because the response fully a...
3,news/header.cfm,What is the responsibility of `news/header.cfm...,”Includes CSS and `tabs.cfm`; used by most pag...,"reusable header markup, includes `OBJECT PAGE_...",0.0,The score is 0.00 because the response only co...
4,news/index.cfm,What is the responsibility of `news/index.cfm`...,"”Home page for the news site; displays ads, da...",landing page for the news section.,1.0,The score is 1.00 because the output fully add...
5,news/leaderboards.cfm,What is the responsibility of `news/leaderboar...,”Leaderboard table for the tournaments site; d...,enders global leaderboard tables.,1.0,The score is 1.00 because the response fully a...
6,news/page.cfm,What is the responsibility of `news/page.cfm` ...,”Repeated content used across news pages.”,"wrapper page for news content, includes `PAGE_...",0.333333,The score is 0.33 because the answer contains ...
7,news/getNews.cfm,What is the responsibility of `news/getNews.cf...,"”Retrieves real news data from the database, f...",retrieves news items via `QGETNEWS_QUERY`,1.0,The score is 1.00 because the output directly ...
8,news/getNews_mock.cfm,What is the responsibility of `news/getNews_mo...,”Same as `getNews.cfm` but with a static mock ...,provides mock news data for testing.,1.0,The score is 1.00 because the output fully add...
9,news/add_users.cfm,What is the responsibility of `news/add_users....,”Demo forms showing how to collect user logins...,handles user registration and group creation.,1.0,The score is 1.00 because the response fully a...
