In [1]:
from dotenv import load_dotenv
import os
from openai import AzureOpenAI
import pandas as pd
load_dotenv()

# Azure OpenAI connection settings, read from the process environment.
# Each value is None when the corresponding variable is not set.
OPENAI_API_BASE = os.environ.get("AZURE_OPENAI_ENDPOINT")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
OPENAI_API_VERSION = os.environ.get("OPENAI_API_VERSION")

# Model identifier passed as `model=` to the chat completions call.
model_name = "gpt-4.1-nano"

In [2]:
def llm_query(query: str) -> str:
    """Send one question to the Azure OpenAI chat model and return its answer.

    A fixed "developer" instruction asks the model for a terse, factual answer;
    the question itself is sent as the "user" message.

    Args:
        query: The question text, forwarded verbatim to the model.

    Returns:
        The text content of the first completion choice, or an empty string
        when the response carries no text content.
    """
    request_dict = [{"role":"developer", "content": "Répond sans phrase uniquement la réponse si possible, factuelles sans explications supplémentaires"},
                    {"role" : "user", "content" : query}]
    # The client owns an HTTP connection pool; using it as a context manager
    # closes that pool as soon as the request is done instead of leaving one
    # open client behind per call.
    with AzureOpenAI(azure_endpoint=OPENAI_API_BASE, api_key=OPENAI_API_KEY, api_version=OPENAI_API_VERSION) as conn:
        resp = conn.chat.completions.create(messages=request_dict, model=model_name)
    # `message.content` is optional in the SDK; normalise a missing value to ""
    # so the declared `str` return type always holds.
    return resp.choices[0].message.content or ""



# Smoke test: one round-trip to the model; the returned text is shown as the cell output.
llm_query(query="Bonjour")

'Bonjour'

In [3]:
# Load the benchmark file; the cells below read its "question" and "answer" columns.
df = pd.read_csv("../../data/benchmark.csv")

# Ask the model every question, in row order, and keep the raw answers
# next to the reference ones.
df["llm_answer"] = list(map(llm_query, df["question"]))

# Persist the benchmark together with the collected model answers.
df.to_csv("../../llm_bench.csv")

In [4]:
# Show the benchmark frame, now including the "llm_answer" column, for a visual comparison.
df

Unnamed: 0,question,answer,llm_answer
0,"Quel est le numéro CAS exact du 2,4-dinitrophé...",119-26-6,119-65-3
1,Quelle est la constante de Kaprekar pour les n...,6174,6174
2,En quelle année le théorème de Fermat-Wiles a-...,1995,1995
3,Quel est le code IATA de l'aéroport de Ouagado...,OUA,OUA
4,Combien de chromosomes possède le plant de pom...,48,48 chromosomes
5,Quelle est la masse molaire exacte du tétrahyd...,72.11,"72,105 g/mol"
6,Quel est le numéro atomique du rhénium?,75,75
7,En quelle année a été découvert le boson de Hi...,2012,2012
8,Quelle est la profondeur maximale de la fosse ...,10984,10 994 mètres
9,Quel est le point de fusion du tungstène en de...,3422,3422 degrés Celsius


In [5]:
import mlflow
import mlflow.openai
from mlflow.genai.scorers import Correctness


# Where runs are recorded and under which experiment they are grouped.
MLFLOW_TRACKING_URI = "http://127.0.0.1:5000"
MLFLOW_EXPERIMENT_NAME = "llm-benchmark-evaluation"

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)

# Turn on MLflow's automatic logging for the OpenAI client.
mlflow.openai.autolog()


In [6]:
# Reshape the benchmark into the two columns handed to the evaluation:
#   inputs       -> {"query": <question>}; the key matches llm_query's parameter name
#   expectations -> {"expected_response": <reference answer>}
inputs_col = df["question"].map(lambda question: {"query": question})
expectations_col = df["answer"].map(lambda expected: {"expected_response": expected})

eval_df = pd.DataFrame({"inputs": inputs_col, "expectations": expectations_col})


In [7]:
# Evaluate `llm_query` on the benchmark inside a single MLflow run so that the
# scorer results and the run metadata are recorded together.
# The run name and the logged model are derived from `model_name` (the value
# actually sent to the API) so the recorded metadata cannot drift from the
# model under test.
with mlflow.start_run(run_name=f"{model_name}-eval"):

    results = mlflow.genai.evaluate(
        predict_fn=llm_query,
        data=eval_df,
        scorers=[
            Correctness(),
        ],
    )

    mlflow.log_param("model_tested", model_name)
    mlflow.log_metric("num_samples", len(eval_df))


2025/12/16 15:58:36 INFO mlflow.models.evaluation.utils.trace: Auto tracing is temporarily enabled during the model evaluation for computing some metrics and debugging. To disable tracing, call `mlflow.autolog(disable=True)`.
2025/12/16 15:58:36 INFO mlflow.genai.utils.data_validation: Testing model prediction with the first sample in the dataset. To disable this check, set the MLFLOW_GENAI_EVAL_SKIP_TRACE_VALIDATION environment variable to True.
  from .autonotebook import tqdm as notebook_tqdm
Evaluating: 100%|██████████| 10/10 [Elapsed: 00:04, Remaining: 00:00] 
