In [0]:
%pip install -U -qqqq backoff databricks-openai uv databricks-agents mlflow==3.9.0rc0 dspy
# Restart the Python process so the packages installed above are the ones picked up by later cells.
dbutils.library.restartPython()

[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m


# Pull the Traces with Feedback

To align the judge, we need the human feedback you recorded in the review app.

First, we build a list of traces that contain both your feedback and the LLM judge's assessment.

In [0]:
import json
from pathlib import Path
import mlflow
from mlflow.genai.datasets import get_dataset

# Parse the JSON settings file that drives this notebook.
with Path("config/dc_assistant.json").open() as _config_file:
    CONFIG = json.load(_config_file)

# Individual settings used by the cells below.
EXPERIMENT_ID = CONFIG["mlflow"]["experiment_id"]          # MLflow experiment holding the traces
DATASET_NAME = CONFIG["evaluation"]["dataset_name"]        # evaluation dataset to re-run against
JUDGE_MODEL = CONFIG["llm"]["judge_model"]                 # model configured for the judge
REFLECTION_MODEL = CONFIG["prompt_registry"]["reflection_model"]  # model handed to the alignment optimizer

# Make the configured experiment the active one; kept as the final expression so the
# cell displays the returned Experiment.
mlflow.set_experiment(experiment_id=EXPERIMENT_ID)

<Experiment: artifact_location='dbfs:/databricks/mlflow-tracking/2517718719552044', creation_time=1768690316954, experiment_id='2517718719552044', last_update_time=1768803175855, lifecycle_stage='active', name=('/Users/austin.choi@databricks.com/GenAI/mlflow updates/AC updates '
 'dc-assistant-agent_experiment'), tags={'mlflow.databricks.managed_evals.experiment_permissions_check': '',
 'mlflow.databricks.review_app.experiment_permissions_check': '',
 'mlflow.experiment.sourceName': '/Users/austin.choi@databricks.com/GenAI/mlflow '
                                 'updates/AC updates '
                                 'dc-assistant-agent_experiment',
 'mlflow.experimentKind': 'genai_development',
 'mlflow.experimentType': 'MLFLOW_EXPERIMENT',
 'mlflow.latestTraceEvaluationTimestampMs': '1768882940728',
 'mlflow.ownerEmail': 'austin.choi@databricks.com',
 'mlflow.ownerId': '3275534733162887',
 'product': 'mlflow',
 'purpose': 'football_analysis'}>

In [0]:
# Assessments are looked up by the judge's name. Read it from the same config key (with the
# same default) that is used when the base judge is loaded, so that overriding `judge_name`
# in the config does not silently leave this filter pointing at the wrong assessments.
ALIGNMENT_ASSESSMENT_NAME = CONFIG.get("evaluation", {}).get("judge_name", "football_analysis_base")

# Upper bound on the number of traces fetched for alignment; use 100+ if you can.
MAX_ALIGNMENT_TRACES = 35

traces_for_alignment = mlflow.search_traces(
    experiment_ids=[EXPERIMENT_ID],
    # The tag filter narrows the candidates, but a tagged trace is not guaranteed to carry
    # both kinds of feedback, hence the explicit check below.
    filter_string="tag.eval = 'complete'",
    return_type="list",
    max_results=MAX_ALIGNMENT_TRACES,
)

# Keep only traces that have BOTH an LLM-judge assessment and a human assessment under the
# judge's name: alignment needs the pair to compare against each other.
valid_traces = []
for trace in traces_for_alignment:
    feedbacks = trace.search_assessments(name=ALIGNMENT_ASSESSMENT_NAME)
    has_judge = any(f.source.source_type == "LLM_JUDGE" for f in feedbacks)
    has_human = any(f.source.source_type == "HUMAN" for f in feedbacks)
    if has_judge and has_human:
        valid_traces.append(trace)

print("candidate:", len(traces_for_alignment), "valid:", len(valid_traces))


  traces_for_alignment = mlflow.search_traces(


candidate: 35 valid: 31


# Judge Alignment

In this tutorial, we will demonstrate two Judge Alignment optimizers: SIMBA and MemAlign. 

### MemAlign
MemAlign is a lightweight, dual-memory framework designed to align LLM judges with human experts by efficiently learning from a small amount of natural language feedback, offering a faster and cheaper alternative to traditional prompt engineering or fine-tuning. The system uses Semantic Memory for general principles and Episodic Memory for specific examples, allowing for rapid adaptation and showing visible improvement with as few as 2-10 examples. This approach delivers competitive or better quality than state-of-the-art prompt optimizers at up to 100x lower latency and 10x lower cost, and it is now the default optimization algorithm for the `align()` method in MLflow 3.9+.

### SIMBA
SIMBA (Stochastic Introspective Mini-Batch Ascent) is a DSPy prompt-optimization method that iteratively improves an LLM’s prompts by evaluating changes on mini-batches with a target metric. It uses a stochastic hill-climbing loop that proposes prompt edits (including instruction rewrites and/or few-shot demonstrations) and keeps the variants that score better. Its “introspective” step leverages the LLM to analyze failures and generate corrective guidance, reducing manual prompt tuning for complex tasks. It was the default optimizer for `align()` in MLflow 3.8 and earlier, and it can still be used today.

# Load the Judge We Want to Align

In [0]:
# Load the base judge from the evaluation notebook and define configuration parameters
from mlflow.genai.scorers import get_scorer

# --- SIMBA settings -------------------------------------------------------------------
# Bounds of the Likert scale the judge scores on.
LIKERT_MIN = 1.0
LIKERT_MAX = 5.0
# Number of examples per SIMBA optimization step.
SIMBA_BATCH_SIZE = 8
# Maximum few-shot demos (0 recommended for situations where exact matches on responses
# are not really possible).
SIMBA_MAX_DEMOS = 0
# Enable verbose logging for SIMBA optimization.
SIMBA_VERBOSE = True

# --- Base judge -----------------------------------------------------------------------
# The judge registered by the evaluation notebook is fetched by name; the name comes from
# the "evaluation" section of the config and falls back to a default when absent.
# Reference: https://mlflow.org/docs/latest/api_reference/python_api/mlflow.genai.html#mlflow.genai.scorers.get_scorer
_evaluation_settings = CONFIG.get("evaluation", {})
judge_name = _evaluation_settings.get("judge_name", "football_analysis_base")
football_analysis_judge = get_scorer(name=judge_name)

# Name under which the aligned judge will be stored (overridable through the config).
ALIGNED_JUDGE_NAME = _evaluation_settings.get("aligned_judge_name", "football_analysis_judge_align")

# Echo the resolved settings so the run is self-describing.
print(f"Loaded base judge from evaluation notebook: {judge_name}")
print(f"Aligned judge name: {ALIGNED_JUDGE_NAME}")
print(f"SIMBA config: batch_size={SIMBA_BATCH_SIZE}, max_demos={SIMBA_MAX_DEMOS}, Likert range=[{LIKERT_MIN}, {LIKERT_MAX}]")


Loaded base judge from evaluation notebook: football_analysis_base
Aligned judge name: football_analysis_judge_align
SIMBA config: batch_size=8, max_demos=0, Likert range=[1.0, 5.0]


# SIMBA Implementation

Below is a basic Judge Alignment using the default SIMBA implementation from MLflow. 

# Regular SIMBA Optimization

In [0]:
import logging
from mlflow.genai.judges.optimizers import SIMBAAlignmentOptimizer
from statistics import mean
from typing import Any, Callable, List

from mlflow.genai.judges.base import AlignmentOptimizer, Judge
from mlflow.entities.trace import Trace

# Raise the SIMBA optimizer's logger to DEBUG so its detailed progress is shown while it runs.
logging.getLogger("mlflow.genai.judges.optimizers.simba").setLevel(logging.DEBUG)

# Show the starting instructions so they can be compared with the aligned ones afterwards.
# (The separator was previously the literal text "/n", which printed verbatim instead of
# breaking the line.)
print(f"Initial Judge Text:\n{football_analysis_judge.instructions}")

# Align the base judge against the traces that carry both human and LLM-judge feedback,
# using SIMBA with the configured reflection model.
aligned_judge_basic = football_analysis_judge.align(
    traces=valid_traces,
    optimizer=SIMBAAlignmentOptimizer(model=REFLECTION_MODEL),
)

2026/01/19 03:37:56 INFO mlflow.genai.judges.optimizers.simba: Starting SIMBA optimization with 31 examples (set logging to DEBUG for detailed output)
2026/01/19 03:37:56 INFO dspy.teleprompt.simba: Starting batch 1 of 8.
2026/01/19 03:37:56 INFO dspy.teleprompt.simba: Sampling program trajectories on 10 examples x 6 samples.


Initial Judge Text /n Evaluate if the response in {{ outputs }} appropriately analyzes the available data and provides an actionable recommendation the question in {{ inputs }}. The response should be accurate, contextually relevant, and give a strategic advantage to the  person making the request. Your grading criteria should be:  1: Completely unacceptable. Incorrect data interpretation or no recommendations 2: Mostly unacceptable. Irrelevant or spurious feedback or weak recommendations provided with minimal strategic advantage 3: Somewhat acceptable. Relevant feedback provided with some strategic advantage 4: Mostly acceptable. Relevant feedback provided with strong strategic advantage 5 Completely acceptable. Relevant feedback provided with excellent strategic advantage
  0%|          | 0/60 [00:00<?, ?it/s]Processed 1 / 60 examples:   0%|          | 0/60 [00:01<?, ?it/s]Processed 1 / 60 examples:   2%|▏         | 1/60 [00:01<01:22,  1.40s/it]Processed 2 / 60 examples:   2%|▏  

2026/01/19 03:38:09 INFO dspy.teleprompt.simba: Batch 1: Baseline mini-batch score: 0.05

2026/01/19 03:38:09 INFO dspy.teleprompt.simba: Batch 1: Processing bucket #1, with max score 1.0, max-to-min gap 1.0, and max-to-avg gap 0.5.
2026/01/19 03:38:09 INFO dspy.teleprompt.simba: Batch 1: Invoking strategy: append_a_rule
2026/01/19 03:38:09 INFO dspy.teleprompt.simba_utils: Skipping rule generation as good score True is at or below the 10th percentile *or* bad score False is at or above the 90th percentile.
2026/01/19 03:38:09 INFO dspy.teleprompt.simba: 

2026/01/19 03:38:09 INFO dspy.teleprompt.simba: Batch 1: Processing bucket #2, with max score 0.0, max-to-min gap 0.0, and max-to-avg gap 0.0.
2026/01/19 03:38:09 INFO dspy.teleprompt.simba: Batch 1: Invoking strategy: append_a_rule
2026/01/19 03:38:09 INFO dspy.teleprompt.simba_utils: Skipping rule generation as good score False is at or below the 10th percentile *or* bad score False is at or above the 90th percentile.
2026/01/19 03


  0%|          | 0/70 [00:00<?, ?it/s]Processed 1 / 70 examples:   0%|          | 0/70 [00:01<?, ?it/s]Processed 1 / 70 examples:   1%|▏         | 1/70 [00:01<01:22,  1.20s/it]Processed 2 / 70 examples:   1%|▏         | 1/70 [00:01<01:22,  1.20s/it]Processed 3 / 70 examples:   3%|▎         | 2/70 [00:01<01:21,  1.20s/it]Processed 3 / 70 examples:   4%|▍         | 3/70 [00:01<00:23,  2.83it/s]Processed 4 / 70 examples:   4%|▍         | 3/70 [00:01<00:23,  2.83it/s]Processed 5 / 70 examples:   6%|▌         | 4/70 [00:01<00:23,  2.83it/s]Processed 5 / 70 examples:   7%|▋         | 5/70 [00:01<00:16,  4.02it/s]Processed 6 / 70 examples:   7%|▋         | 5/70 [00:01<00:16,  4.02it/s]Processed 7 / 70 examples:   9%|▊         | 6/70 [00:01<00:15,  4.02it/s]Processed 8 / 70 examples:  10%|█         | 7/70 [00:01<00:15,  4.02it/s]Processed 8 / 70 examples:  11%|█▏        | 8/70 [00:01<00:09,  6.52it/s]Processed 9 / 70 examples:  11%|█▏        | 8/70 [00:02<00:09,  6.52it/s]Proce

2026/01/19 03:38:24 INFO dspy.teleprompt.simba: Scores after 1 batches: [0.0, 0.0, 0.1, 0.0, 0.0, 0.1, 0.0], Best: 0.1

2026/01/19 03:38:24 INFO dspy.teleprompt.simba: Starting batch 2 of 8.
2026/01/19 03:38:24 INFO dspy.teleprompt.simba: Sampling program trajectories on 10 examples x 6 samples.



  0%|          | 0/60 [00:00<?, ?it/s]Processed 1 / 60 examples:   0%|          | 0/60 [00:01<?, ?it/s]Processed 1 / 60 examples:   2%|▏         | 1/60 [00:01<01:16,  1.29s/it]Processed 2 / 60 examples:   2%|▏         | 1/60 [00:01<01:16,  1.29s/it]Processed 2 / 60 examples:   3%|▎         | 2/60 [00:01<00:35,  1.65it/s]Processed 3 / 60 examples:   3%|▎         | 2/60 [00:01<00:35,  1.65it/s]Processed 4 / 60 examples:   5%|▌         | 3/60 [00:01<00:34,  1.65it/s]Processed 5 / 60 examples:   7%|▋         | 4/60 [00:01<00:33,  1.65it/s]Processed 5 / 60 examples:   8%|▊         | 5/60 [00:01<00:11,  4.74it/s]Processed 6 / 60 examples:   8%|▊         | 5/60 [00:01<00:11,  4.74it/s]Processed 7 / 60 examples:  10%|█         | 6/60 [00:01<00:11,  4.74it/s]Processed 7 / 60 examples:  12%|█▏        | 7/60 [00:01<00:08,  6.52it/s]Processed 8 / 60 examples:  12%|█▏        | 7/60 [00:01<00:08,  6.52it/s]Processed 9 / 60 examples:  13%|█▎        | 8/60 [00:02<00:07,  6.52it/s]Proce

2026/01/19 03:38:36 INFO dspy.teleprompt.simba: Batch 2: Baseline mini-batch score: 0.2

2026/01/19 03:38:36 INFO dspy.teleprompt.simba: Batch 2: Processing bucket #1, with max score 1.0, max-to-min gap 0.0, and max-to-avg gap 0.0.
2026/01/19 03:38:36 INFO dspy.teleprompt.simba: Batch 2: Invoking strategy: append_a_rule
2026/01/19 03:38:36 INFO dspy.teleprompt.simba_utils: Skipping rule generation as good score True is at or below the 10th percentile *or* bad score True is at or above the 90th percentile.
2026/01/19 03:38:36 INFO dspy.teleprompt.simba: 

2026/01/19 03:38:36 INFO dspy.teleprompt.simba: Batch 2: Processing bucket #2, with max score 1.0, max-to-min gap 0.0, and max-to-avg gap 0.0.
2026/01/19 03:38:36 INFO dspy.teleprompt.simba: Batch 2: Invoking strategy: append_a_rule
2026/01/19 03:38:36 INFO dspy.teleprompt.simba_utils: Skipping rule generation as good score True is at or below the 10th percentile *or* bad score True is at or above the 90th percentile.
2026/01/19 03:38:


  0%|          | 0/70 [00:00<?, ?it/s]Processed 1 / 70 examples:   0%|          | 0/70 [00:01<?, ?it/s]Processed 1 / 70 examples:   1%|▏         | 1/70 [00:01<01:25,  1.24s/it]Processed 2 / 70 examples:   1%|▏         | 1/70 [00:01<01:25,  1.24s/it]Processed 3 / 70 examples:   3%|▎         | 2/70 [00:01<01:24,  1.24s/it]Processed 4 / 70 examples:   4%|▍         | 3/70 [00:01<01:23,  1.24s/it]Processed 4 / 70 examples:   6%|▌         | 4/70 [00:01<00:18,  3.59it/s]Processed 5 / 70 examples:   6%|▌         | 4/70 [00:01<00:18,  3.59it/s]Processed 6 / 70 examples:   7%|▋         | 5/70 [00:01<00:18,  3.59it/s]Processed 6 / 70 examples:   9%|▊         | 6/70 [00:01<00:13,  4.84it/s]Processed 7 / 70 examples:   9%|▊         | 6/70 [00:01<00:13,  4.84it/s]Processed 8 / 70 examples:  10%|█         | 7/70 [00:01<00:13,  4.84it/s]Processed 9 / 70 examples:  11%|█▏        | 8/70 [00:02<00:12,  4.84it/s]Processed 9 / 70 examples:  13%|█▎        | 9/70 [00:02<00:15,  3.86it/s]Proce

2026/01/19 03:38:50 INFO dspy.teleprompt.simba: Scores after 2 batches: [0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2], Best: 0.2

2026/01/19 03:38:50 INFO dspy.teleprompt.simba: Starting batch 3 of 8.
2026/01/19 03:38:50 INFO dspy.teleprompt.simba: Sampling program trajectories on 10 examples x 6 samples.



  0%|          | 0/60 [00:00<?, ?it/s]Processed 1 / 60 examples:   0%|          | 0/60 [00:01<?, ?it/s]Processed 1 / 60 examples:   2%|▏         | 1/60 [00:01<01:05,  1.12s/it]Processed 2 / 60 examples:   2%|▏         | 1/60 [00:01<01:05,  1.12s/it]Processed 2 / 60 examples:   3%|▎         | 2/60 [00:01<00:32,  1.76it/s]Processed 3 / 60 examples:   3%|▎         | 2/60 [00:01<00:32,  1.76it/s]Processed 3 / 60 examples:   5%|▌         | 3/60 [00:01<00:21,  2.69it/s]Processed 4 / 60 examples:   5%|▌         | 3/60 [00:01<00:21,  2.69it/s]Processed 4 / 60 examples:   7%|▋         | 4/60 [00:01<00:16,  3.32it/s]Processed 5 / 60 examples:   7%|▋         | 4/60 [00:01<00:16,  3.32it/s]Processed 5 / 60 examples:   8%|▊         | 5/60 [00:01<00:15,  3.49it/s]Processed 6 / 60 examples:   8%|▊         | 5/60 [00:01<00:15,  3.49it/s]Processed 7 / 60 examples:  10%|█         | 6/60 [00:02<00:15,  3.49it/s]Processed 7 / 60 examples:  12%|█▏        | 7/60 [00:02<00:10,  5.14it/s]Proce

2026/01/19 03:39:03 INFO dspy.teleprompt.simba: Batch 3: Baseline mini-batch score: 0.0

2026/01/19 03:39:03 INFO dspy.teleprompt.simba: Batch 3: Processing bucket #1, with max score 0.0, max-to-min gap 0.0, and max-to-avg gap 0.0.
2026/01/19 03:39:03 INFO dspy.teleprompt.simba: Batch 3: Invoking strategy: append_a_rule
2026/01/19 03:39:03 INFO dspy.teleprompt.simba_utils: Skipping rule generation as good score False is at or below the 10th percentile *or* bad score False is at or above the 90th percentile.
2026/01/19 03:39:03 INFO dspy.teleprompt.simba: 

2026/01/19 03:39:03 INFO dspy.teleprompt.simba: Batch 3: Processing bucket #2, with max score 0.0, max-to-min gap 0.0, and max-to-avg gap 0.0.
2026/01/19 03:39:03 INFO dspy.teleprompt.simba: Batch 3: Invoking strategy: append_a_rule
2026/01/19 03:39:03 INFO dspy.teleprompt.simba_utils: Skipping rule generation as good score False is at or below the 10th percentile *or* bad score False is at or above the 90th percentile.
2026/01/19 03


  0%|          | 0/70 [00:00<?, ?it/s]Processed 1 / 70 examples:   0%|          | 0/70 [00:01<?, ?it/s]Processed 1 / 70 examples:   1%|▏         | 1/70 [00:01<01:29,  1.29s/it]Processed 2 / 70 examples:   1%|▏         | 1/70 [00:01<01:29,  1.29s/it]Processed 2 / 70 examples:   3%|▎         | 2/70 [00:01<00:40,  1.67it/s]Processed 3 / 70 examples:   3%|▎         | 2/70 [00:01<00:40,  1.67it/s]Processed 4 / 70 examples:   4%|▍         | 3/70 [00:01<00:40,  1.67it/s]Processed 5 / 70 examples:   6%|▌         | 4/70 [00:01<00:39,  1.67it/s]Processed 5 / 70 examples:   7%|▋         | 5/70 [00:01<00:13,  4.96it/s]Processed 6 / 70 examples:   7%|▋         | 5/70 [00:01<00:13,  4.96it/s]Processed 7 / 70 examples:   9%|▊         | 6/70 [00:01<00:12,  4.96it/s]Processed 7 / 70 examples:  10%|█         | 7/70 [00:01<00:09,  6.84it/s]Processed 8 / 70 examples:  10%|█         | 7/70 [00:01<00:09,  6.84it/s]Processed 9 / 70 examples:  11%|█▏        | 8/70 [00:02<00:09,  6.84it/s]Proce

2026/01/19 03:39:17 INFO dspy.teleprompt.simba: Scores after 3 batches: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], Best: 0.0

2026/01/19 03:39:17 INFO dspy.teleprompt.simba: Starting batch 4 of 8.
2026/01/19 03:39:17 INFO dspy.teleprompt.simba: Sampling program trajectories on 10 examples x 6 samples.



  0%|          | 0/60 [00:00<?, ?it/s]Processed 1 / 60 examples:   0%|          | 0/60 [00:01<?, ?it/s]Processed 1 / 60 examples:   2%|▏         | 1/60 [00:01<01:12,  1.22s/it]Processed 2 / 60 examples:   2%|▏         | 1/60 [00:01<01:12,  1.22s/it]Processed 2 / 60 examples:   3%|▎         | 2/60 [00:01<00:33,  1.73it/s]Processed 3 / 60 examples:   3%|▎         | 2/60 [00:01<00:33,  1.73it/s]Processed 4 / 60 examples:   5%|▌         | 3/60 [00:01<00:32,  1.73it/s]Processed 5 / 60 examples:   7%|▋         | 4/60 [00:01<00:32,  1.73it/s]Processed 6 / 60 examples:   8%|▊         | 5/60 [00:01<00:31,  1.73it/s]Processed 6 / 60 examples:  10%|█         | 6/60 [00:01<00:10,  5.17it/s]Processed 7 / 60 examples:  10%|█         | 6/60 [00:01<00:10,  5.17it/s]Processed 8 / 60 examples:  12%|█▏        | 7/60 [00:01<00:10,  5.17it/s]Processed 9 / 60 examples:  13%|█▎        | 8/60 [00:02<00:10,  5.17it/s]Processed 9 / 60 examples:  15%|█▌        | 9/60 [00:02<00:13,  3.78it/s]Proce

2026/01/19 03:39:30 INFO dspy.teleprompt.simba: Batch 4: Baseline mini-batch score: 0.1

2026/01/19 03:39:30 INFO dspy.teleprompt.simba: Batch 4: Processing bucket #1, with max score 1.0, max-to-min gap 0.0, and max-to-avg gap 0.0.
2026/01/19 03:39:30 INFO dspy.teleprompt.simba: Batch 4: Invoking strategy: append_a_demo_
2026/01/19 03:39:30 INFO dspy.teleprompt.simba_utils: Added 0 demos (one each) across all predictors.
2026/01/19 03:39:30 INFO dspy.teleprompt.simba: 

2026/01/19 03:39:30 INFO dspy.teleprompt.simba: Batch 4: Processing bucket #2, with max score 0.0, max-to-min gap 0.0, and max-to-avg gap 0.0.
2026/01/19 03:39:30 INFO dspy.teleprompt.simba: Batch 4: Invoking strategy: append_a_demo_
2026/01/19 03:39:30 INFO dspy.teleprompt.simba_utils: Skipping appending a demo as good score False is at or below the 10th percentile.
2026/01/19 03:39:30 INFO dspy.teleprompt.simba: 

2026/01/19 03:39:30 INFO dspy.teleprompt.simba: Batch 4: Processing bucket #3, with max score 0.0, max-to


  0%|          | 0/70 [00:00<?, ?it/s]Processed 1 / 70 examples:   0%|          | 0/70 [00:01<?, ?it/s]Processed 1 / 70 examples:   1%|▏         | 1/70 [00:01<01:24,  1.23s/it]Processed 2 / 70 examples:   1%|▏         | 1/70 [00:01<01:24,  1.23s/it]Processed 3 / 70 examples:   3%|▎         | 2/70 [00:01<01:23,  1.23s/it]Processed 3 / 70 examples:   4%|▍         | 3/70 [00:01<00:26,  2.52it/s]Processed 4 / 70 examples:   4%|▍         | 3/70 [00:01<00:26,  2.52it/s]Processed 5 / 70 examples:   6%|▌         | 4/70 [00:01<00:26,  2.52it/s]Processed 6 / 70 examples:   7%|▋         | 5/70 [00:01<00:25,  2.52it/s]Processed 6 / 70 examples:   9%|▊         | 6/70 [00:01<00:12,  5.04it/s]Processed 7 / 70 examples:   9%|▊         | 6/70 [00:01<00:12,  5.04it/s]Processed 8 / 70 examples:  10%|█         | 7/70 [00:01<00:12,  5.04it/s]Processed 9 / 70 examples:  11%|█▏        | 8/70 [00:02<00:12,  5.04it/s]Processed 9 / 70 examples:  13%|█▎        | 9/70 [00:02<00:15,  4.05it/s]Proce

2026/01/19 03:39:44 INFO dspy.teleprompt.simba: Scores after 4 batches: [0.0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1], Best: 0.1

2026/01/19 03:39:44 INFO dspy.teleprompt.simba: Starting batch 5 of 8.
2026/01/19 03:39:44 INFO dspy.teleprompt.simba: Sampling program trajectories on 10 examples x 6 samples.



  0%|          | 0/60 [00:00<?, ?it/s]Processed 1 / 60 examples:   0%|          | 0/60 [00:01<?, ?it/s]Processed 1 / 60 examples:   2%|▏         | 1/60 [00:01<01:05,  1.12s/it]Processed 2 / 60 examples:   2%|▏         | 1/60 [00:01<01:05,  1.12s/it]Processed 2 / 60 examples:   3%|▎         | 2/60 [00:01<00:35,  1.63it/s]Processed 3 / 60 examples:   3%|▎         | 2/60 [00:01<00:35,  1.63it/s]Processed 4 / 60 examples:   5%|▌         | 3/60 [00:01<00:35,  1.63it/s]Processed 5 / 60 examples:   7%|▋         | 4/60 [00:01<00:34,  1.63it/s]Processed 5 / 60 examples:   8%|▊         | 5/60 [00:01<00:12,  4.40it/s]Processed 6 / 60 examples:   8%|▊         | 5/60 [00:01<00:12,  4.40it/s]Processed 6 / 60 examples:  10%|█         | 6/60 [00:01<00:10,  5.06it/s]Processed 7 / 60 examples:  10%|█         | 6/60 [00:01<00:10,  5.06it/s]Processed 8 / 60 examples:  12%|█▏        | 7/60 [00:01<00:10,  5.06it/s]Processed 8 / 60 examples:  13%|█▎        | 8/60 [00:01<00:07,  6.81it/s]Proce

2026/01/19 03:39:57 INFO dspy.teleprompt.simba: Batch 5: Baseline mini-batch score: 0.06666666666666667

2026/01/19 03:39:57 INFO dspy.teleprompt.simba: Batch 5: Processing bucket #1, with max score 1.0, max-to-min gap 1.0, and max-to-avg gap 0.33333333333333337.
2026/01/19 03:39:57 INFO dspy.teleprompt.simba: Batch 5: Invoking strategy: append_a_rule
2026/01/19 03:39:57 INFO dspy.teleprompt.simba_utils: Skipping rule generation as good score True is at or below the 10th percentile *or* bad score False is at or above the 90th percentile.
2026/01/19 03:39:57 INFO dspy.teleprompt.simba: 

2026/01/19 03:39:57 INFO dspy.teleprompt.simba: Batch 5: Processing bucket #2, with max score 0.0, max-to-min gap 0.0, and max-to-avg gap 0.0.
2026/01/19 03:39:57 INFO dspy.teleprompt.simba: Batch 5: Invoking strategy: append_a_rule
2026/01/19 03:39:57 INFO dspy.teleprompt.simba_utils: Skipping rule generation as good score False is at or below the 10th percentile *or* bad score False is at or above the


  0%|          | 0/70 [00:00<?, ?it/s]Processed 1 / 70 examples:   0%|          | 0/70 [00:01<?, ?it/s]Processed 1 / 70 examples:   1%|▏         | 1/70 [00:01<01:15,  1.09s/it]Processed 2 / 70 examples:   1%|▏         | 1/70 [00:01<01:15,  1.09s/it]Processed 2 / 70 examples:   3%|▎         | 2/70 [00:01<00:36,  1.88it/s]Processed 3 / 70 examples:   3%|▎         | 2/70 [00:01<00:36,  1.88it/s]Processed 4 / 70 examples:   4%|▍         | 3/70 [00:01<00:35,  1.88it/s]Processed 4 / 70 examples:   6%|▌         | 4/70 [00:01<00:17,  3.85it/s]Processed 5 / 70 examples:   6%|▌         | 4/70 [00:01<00:17,  3.85it/s]Processed 6 / 70 examples:   7%|▋         | 5/70 [00:01<00:16,  3.85it/s]Processed 6 / 70 examples:   9%|▊         | 6/70 [00:01<00:10,  6.13it/s]Processed 7 / 70 examples:   9%|▊         | 6/70 [00:01<00:10,  6.13it/s]Processed 8 / 70 examples:  10%|█         | 7/70 [00:01<00:10,  6.13it/s]Processed 8 / 70 examples:  11%|█▏        | 8/70 [00:01<00:08,  7.29it/s]Proce

2026/01/19 03:40:10 INFO dspy.teleprompt.simba: Scores after 5 batches: [0.0, 0.0, 0.1, 0.0, 0.0, 0.0, 0.0], Best: 0.1

2026/01/19 03:40:10 INFO dspy.teleprompt.simba: Starting batch 6 of 8.
2026/01/19 03:40:10 INFO dspy.teleprompt.simba: Sampling program trajectories on 10 examples x 6 samples.



  0%|          | 0/60 [00:00<?, ?it/s]Processed 1 / 60 examples:   0%|          | 0/60 [00:01<?, ?it/s]Processed 1 / 60 examples:   2%|▏         | 1/60 [00:01<01:09,  1.18s/it]Processed 2 / 60 examples:   2%|▏         | 1/60 [00:01<01:09,  1.18s/it]Processed 3 / 60 examples:   3%|▎         | 2/60 [00:01<01:08,  1.18s/it]Processed 3 / 60 examples:   5%|▌         | 3/60 [00:01<00:22,  2.56it/s]Processed 4 / 60 examples:   5%|▌         | 3/60 [00:01<00:22,  2.56it/s]Processed 5 / 60 examples:   7%|▋         | 4/60 [00:01<00:21,  2.56it/s]Processed 6 / 60 examples:   8%|▊         | 5/60 [00:01<00:21,  2.56it/s]Processed 6 / 60 examples:  10%|█         | 6/60 [00:01<00:12,  4.40it/s]Processed 7 / 60 examples:  10%|█         | 6/60 [00:01<00:12,  4.40it/s]Processed 8 / 60 examples:  12%|█▏        | 7/60 [00:01<00:12,  4.40it/s]Processed 8 / 60 examples:  13%|█▎        | 8/60 [00:01<00:08,  6.09it/s]Processed 9 / 60 examples:  13%|█▎        | 8/60 [00:02<00:08,  6.09it/s]Proce

2026/01/19 03:40:23 INFO dspy.teleprompt.simba: Batch 6: Baseline mini-batch score: 0.1

2026/01/19 03:40:23 INFO dspy.teleprompt.simba: Batch 6: Processing bucket #1, with max score 1.0, max-to-min gap 0.0, and max-to-avg gap 0.0.
2026/01/19 03:40:23 INFO dspy.teleprompt.simba: Batch 6: Invoking strategy: append_a_demo_
2026/01/19 03:40:23 INFO dspy.teleprompt.simba_utils: Added 0 demos (one each) across all predictors.
2026/01/19 03:40:23 INFO dspy.teleprompt.simba: 

2026/01/19 03:40:23 INFO dspy.teleprompt.simba: Batch 6: Processing bucket #2, with max score 0.0, max-to-min gap 0.0, and max-to-avg gap 0.0.
2026/01/19 03:40:23 INFO dspy.teleprompt.simba: Batch 6: Invoking strategy: append_a_rule
2026/01/19 03:40:23 INFO dspy.teleprompt.simba_utils: Skipping rule generation as good score False is at or below the 10th percentile *or* bad score False is at or above the 90th percentile.
2026/01/19 03:40:23 INFO dspy.teleprompt.simba: 

2026/01/19 03:40:23 INFO dspy.teleprompt.simba: Bat


  0%|          | 0/70 [00:00<?, ?it/s]Processed 1 / 70 examples:   0%|          | 0/70 [00:01<?, ?it/s]Processed 1 / 70 examples:   1%|▏         | 1/70 [00:01<01:29,  1.30s/it]Processed 2 / 70 examples:   1%|▏         | 1/70 [00:01<01:29,  1.30s/it]Processed 3 / 70 examples:   3%|▎         | 2/70 [00:01<01:28,  1.30s/it]Processed 4 / 70 examples:   4%|▍         | 3/70 [00:01<01:27,  1.30s/it]Processed 4 / 70 examples:   6%|▌         | 4/70 [00:01<00:19,  3.32it/s]Processed 5 / 70 examples:   6%|▌         | 4/70 [00:01<00:19,  3.32it/s]Processed 6 / 70 examples:   7%|▋         | 5/70 [00:01<00:19,  3.32it/s]Processed 7 / 70 examples:   9%|▊         | 6/70 [00:01<00:19,  3.32it/s]Processed 7 / 70 examples:  10%|█         | 7/70 [00:01<00:11,  5.40it/s]Processed 8 / 70 examples:  10%|█         | 7/70 [00:01<00:11,  5.40it/s]Processed 9 / 70 examples:  11%|█▏        | 8/70 [00:02<00:11,  5.40it/s]Processed 9 / 70 examples:  13%|█▎        | 9/70 [00:02<00:16,  3.60it/s]Proce

2026/01/19 03:40:37 INFO dspy.teleprompt.simba: Scores after 6 batches: [0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1], Best: 0.1

2026/01/19 03:40:37 INFO dspy.teleprompt.simba: Starting batch 7 of 8.
2026/01/19 03:40:37 INFO dspy.teleprompt.simba: Sampling program trajectories on 10 examples x 6 samples.



  0%|          | 0/60 [00:00<?, ?it/s]Processed 1 / 60 examples:   0%|          | 0/60 [00:01<?, ?it/s]Processed 1 / 60 examples:   2%|▏         | 1/60 [00:01<01:16,  1.29s/it]Processed 2 / 60 examples:   2%|▏         | 1/60 [00:01<01:16,  1.29s/it]Processed 3 / 60 examples:   3%|▎         | 2/60 [00:01<01:14,  1.29s/it]Processed 3 / 60 examples:   5%|▌         | 3/60 [00:01<00:22,  2.56it/s]Processed 4 / 60 examples:   5%|▌         | 3/60 [00:01<00:22,  2.56it/s]Processed 5 / 60 examples:   7%|▋         | 4/60 [00:01<00:21,  2.56it/s]Processed 6 / 60 examples:   8%|▊         | 5/60 [00:01<00:21,  2.56it/s]Processed 6 / 60 examples:  10%|█         | 6/60 [00:01<00:09,  5.72it/s]Processed 7 / 60 examples:  10%|█         | 6/60 [00:01<00:09,  5.72it/s]Processed 8 / 60 examples:  12%|█▏        | 7/60 [00:02<00:09,  5.72it/s]Processed 8 / 60 examples:  13%|█▎        | 8/60 [00:02<00:10,  5.09it/s]Processed 9 / 60 examples:  13%|█▎        | 8/60 [00:02<00:10,  5.09it/s]Proce

2026/01/19 03:40:49 INFO dspy.teleprompt.simba: Batch 7: Baseline mini-batch score: 0.2

2026/01/19 03:40:49 INFO dspy.teleprompt.simba: Batch 7: Processing bucket #1, with max score 1.0, max-to-min gap 1.0, and max-to-avg gap 0.6666666666666667.
2026/01/19 03:40:49 INFO dspy.teleprompt.simba: Batch 7: Invoking strategy: append_a_rule





2026/01/19 03:41:02 INFO dspy.teleprompt.simba_utils: Advice for self: If the user asks for a scoped subset (e.g., “red zone plays”, “inside the 20”, “3rd down only”, “after turnovers only”) and the response explicitly admits the retrieved data is NOT restricted to that scope (e.g., says it’s all-field/location-agnostic, or asks to re-run with a filter), then you should NOT award a perfect 5. In that pattern, grade as 3–4 depending on usefulness: give a 4 when the response still provides strong relevant tendencies + clearly flags the limitation + proposes a concrete next step to get the correct filtered data; give a 3 if it mostly defers/hand-waves without substantive interim insight.

If the response covers multiple subquestions and one subquestion is answered well but another is only partially answered (or answered with mismatched data), then overall score should be constrained by the weakest subanswer. Do not let strong analysis on part A justify a 5 when part B is incomplete.

Rese

  0%|          | 0/70 [00:00<?, ?it/s]Processed 1 / 70 examples:   0%|          | 0/70 [00:01<?, ?it/s]Processed 1 / 70 examples:   1%|▏         | 1/70 [00:01<01:28,  1.28s/it]Processed 2 / 70 examples:   1%|▏         | 1/70 [00:01<01:28,  1.28s/it]Processed 2 / 70 examples:   3%|▎         | 2/70 [00:01<00:41,  1.63it/s]Processed 3 / 70 examples:   3%|▎         | 2/70 [00:01<00:41,  1.63it/s]Processed 4 / 70 examples:   4%|▍         | 3/70 [00:01<00:41,  1.63it/s]Processed 4 / 70 examples:   6%|▌         | 4/70 [00:01<00:19,  3.43it/s]Processed 5 / 70 examples:   6%|▌         | 4/70 [00:01<00:19,  3.43it/s]Processed 6 / 70 examples:   7%|▋         | 5/70 [00:01<00:18,  3.43it/s]Processed 7 / 70 examples:   9%|▊         | 6/70 [00:01<00:18,  3.43it/s]Processed 7 / 70 examples:  10%|█         | 7/70 [00:01<00:10,  5.78it/s]Processed 8 / 70 examples:  10%|█         | 7/70 [00:02<00:10,  5.78it/s]Processed 8 / 70 examples:  11%|█▏        | 8/70 [00:02<00:21,  2.94it/s]Proces

2026/01/19 03:41:27 INFO dspy.teleprompt.simba: Scores after 7 batches: [0.7, 0.3, 0.3, 0.2, 0.2, 0.2, 0.3], Best: 0.7

2026/01/19 03:41:27 INFO dspy.teleprompt.simba: Starting batch 8 of 8.
2026/01/19 03:41:27 INFO dspy.teleprompt.simba: Sampling program trajectories on 10 examples x 6 samples.



  0%|          | 0/60 [00:00<?, ?it/s]Processed 1 / 60 examples:   0%|          | 0/60 [00:01<?, ?it/s]Processed 1 / 60 examples:   2%|▏         | 1/60 [00:01<01:20,  1.36s/it]Processed 2 / 60 examples:   2%|▏         | 1/60 [00:01<01:20,  1.36s/it]Processed 3 / 60 examples:   3%|▎         | 2/60 [00:01<01:18,  1.36s/it]Processed 4 / 60 examples:   5%|▌         | 3/60 [00:01<01:17,  1.36s/it]Processed 4 / 60 examples:   7%|▋         | 4/60 [00:01<00:16,  3.41it/s]Processed 5 / 60 examples:   7%|▋         | 4/60 [00:01<00:16,  3.41it/s]Processed 6 / 60 examples:   8%|▊         | 5/60 [00:01<00:16,  3.41it/s]Processed 6 / 60 examples:  10%|█         | 6/60 [00:01<00:10,  5.18it/s]Processed 7 / 60 examples:  10%|█         | 6/60 [00:01<00:10,  5.18it/s]Processed 8 / 60 examples:  12%|█▏        | 7/60 [00:02<00:10,  5.18it/s]Processed 8 / 60 examples:  13%|█▎        | 8/60 [00:02<00:11,  4.67it/s]Processed 9 / 60 examples:  13%|█▎        | 8/60 [00:03<00:11,  4.67it/s]Proce

2026/01/19 03:41:41 INFO dspy.teleprompt.simba: Batch 8: Baseline mini-batch score: 0.05

2026/01/19 03:41:41 INFO dspy.teleprompt.simba: Batch 8: Processing bucket #1, with max score 1.0, max-to-min gap 1.0, and max-to-avg gap 0.8333333333333334.
2026/01/19 03:41:41 INFO dspy.teleprompt.simba: Batch 8: Invoking strategy: append_a_rule
2026/01/19 03:41:41 INFO dspy.teleprompt.simba_utils: Skipping rule generation as good score True is at or below the 10th percentile *or* bad score False is at or above the 90th percentile.
2026/01/19 03:41:41 INFO dspy.teleprompt.simba: 

2026/01/19 03:41:41 INFO dspy.teleprompt.simba: Batch 8: Processing bucket #2, with max score 1.0, max-to-min gap 1.0, and max-to-avg gap 0.6666666666666667.
2026/01/19 03:41:41 INFO dspy.teleprompt.simba: Batch 8: Invoking strategy: append_a_demo_
2026/01/19 03:41:41 INFO dspy.teleprompt.simba_utils: Added 0 demos (one each) across all predictors.
2026/01/19 03:41:41 INFO dspy.teleprompt.simba: 

2026/01/19 03:41:41 I


  0%|          | 0/70 [00:00<?, ?it/s]Processed 1 / 70 examples:   0%|          | 0/70 [00:01<?, ?it/s]Processed 1 / 70 examples:   1%|▏         | 1/70 [00:01<01:48,  1.57s/it]Processed 2 / 70 examples:   1%|▏         | 1/70 [00:01<01:48,  1.57s/it]Processed 3 / 70 examples:   3%|▎         | 2/70 [00:01<01:47,  1.57s/it]Processed 4 / 70 examples:   4%|▍         | 3/70 [00:01<01:45,  1.57s/it]Processed 4 / 70 examples:   6%|▌         | 4/70 [00:01<00:23,  2.86it/s]Processed 5 / 70 examples:   6%|▌         | 4/70 [00:01<00:23,  2.86it/s]Processed 6 / 70 examples:   7%|▋         | 5/70 [00:01<00:22,  2.86it/s]Processed 6 / 70 examples:   9%|▊         | 6/70 [00:01<00:14,  4.52it/s]Processed 7 / 70 examples:   9%|▊         | 6/70 [00:01<00:14,  4.52it/s]Processed 8 / 70 examples:  10%|█         | 7/70 [00:02<00:13,  4.52it/s]Processed 8 / 70 examples:  11%|█▏        | 8/70 [00:02<00:11,  5.24it/s]Processed 9 / 70 examples:  11%|█▏        | 8/70 [00:02<00:11,  5.24it/s]Proce

2026/01/19 03:41:56 INFO dspy.teleprompt.simba: Scores after 8 batches: [0.1, 0.0, 0.1, 0.1, 0.2, 0.1, 0.2], Best: 0.2

2026/01/19 03:41:56 INFO dspy.teleprompt.simba: VALIDATION: Evaluating 7 programs on the full trainset.



  0%|          | 0/217 [00:00<?, ?it/s]Processed 1 / 217 examples:   0%|          | 0/217 [00:01<?, ?it/s]Processed 1 / 217 examples:   0%|          | 1/217 [00:01<04:01,  1.12s/it]Processed 2 / 217 examples:   0%|          | 1/217 [00:01<04:01,  1.12s/it]Processed 2 / 217 examples:   1%|          | 2/217 [00:01<01:52,  1.91it/s]Processed 3 / 217 examples:   1%|          | 2/217 [00:01<01:52,  1.91it/s]Processed 4 / 217 examples:   1%|▏         | 3/217 [00:01<01:52,  1.91it/s]Processed 4 / 217 examples:   2%|▏         | 4/217 [00:01<00:51,  4.14it/s]Processed 5 / 217 examples:   2%|▏         | 4/217 [00:01<00:51,  4.14it/s]Processed 5 / 217 examples:   2%|▏         | 5/217 [00:01<00:44,  4.74it/s]Processed 6 / 217 examples:   2%|▏         | 5/217 [00:01<00:44,  4.74it/s]Processed 7 / 217 examples:   3%|▎         | 6/217 [00:01<00:44,  4.74it/s]Processed 8 / 217 examples:   3%|▎         | 7/217 [00:01<00:44,  4.74it/s]Processed 8 / 217 examples:   4%|▎         | 8/217 [00

2026/01/19 03:42:40 INFO dspy.teleprompt.simba: Final trainset scores: [0.0967741935483871, 0.03225806451612903, 0.0967741935483871, 0.06451612903225806, 0.0967741935483871, 0.3548387096774194, 0.41935483870967744], Best: 0.41935483870967744 (at index 6)



2026/01/19 03:42:40 INFO mlflow.genai.judges.optimizers.simba: SIMBA optimization completed





In [0]:
# Print the judge instructions before and after alignment, one after the other, for comparison.
for _heading, _judge in (
    ("Original instructions:\n", football_analysis_judge),
    ("\nAligned instructions:\n", aligned_judge_basic),
):
    print(_heading, _judge.instructions)

Original instructions:
 Evaluate if the response in {{ outputs }} appropriately analyzes the available data and provides an actionable recommendation the question in {{ inputs }}. The response should be accurate, contextually relevant, and give a strategic advantage to the  person making the request. Your grading criteria should be:  1: Completely unacceptable. Incorrect data interpretation or no recommendations 2: Mostly unacceptable. Irrelevant or spurious feedback or weak recommendations provided with minimal strategic advantage 3: Somewhat acceptable. Relevant feedback provided with some strategic advantage 4: Mostly acceptable. Relevant feedback provided with strong strategic advantage 5 Completely acceptable. Relevant feedback provided with excellent strategic advantage

Aligned instructions:
 Evaluate if the response in {{ outputs }} appropriately analyzes the available data and provides an actionable recommendation the question in {{ inputs }}. The response should be accurate, 

In [0]:
from mlflow.genai.judges import make_judge
from mlflow.genai.scorers import (
    Guidelines,
    RelevanceToQuery,
    ScorerSamplingConfig,
    get_scorer
)

# Wrap the SIMBA-aligned instructions in a fresh judge so it can be registered
# as a scorer on the experiment.
register_align_judge_basic = make_judge(
    name=f"{ALIGNED_JUDGE_NAME}_basic",
    instructions=aligned_judge_basic.instructions,
    feedback_value_type=float,
    # model=JUDGE_MODEL,  # Model used to evaluate (from config)
)

try:
    register_aligned_judge_basic = register_align_judge_basic.register(experiment_id=EXPERIMENT_ID)
except ValueError as e:
    # Anything other than the "already registered" conflict is a real error.
    if "has already been registered" not in str(e):
        raise
    # Preferred path per the error message: update existing scorer.
    # Bind the result to the same name as the success path; otherwise the print
    # below (and any cell using `register_aligned_judge_basic`) raises NameError
    # whenever this cell is re-run against an existing registration.
    register_aligned_judge_basic = register_align_judge_basic.update(
        experiment_id=EXPERIMENT_ID,
        sampling_config=ScorerSamplingConfig(sample_rate=1),
    )

print("Registered aligned judge", register_aligned_judge_basic.name)

Registered aligned judge football_analysis_judge_align_basic


### Rerun with SIMBA Optimized Judge

In [0]:
from agent import AGENT
from mlflow.genai import evaluate
from mlflow.genai.datasets import create_dataset, get_dataset
from mlflow.genai.scorers import (
    Guidelines,
    RelevanceToQuery,
    get_scorer,
)

# Assemble the scorers for the rerun: a built-in relevance check, a
# football-language guideline judge, and the registered SIMBA-aligned judge.
football_language = (
    "The response must use language that is appropriate for professional football players and coaches"
)
football_language_judge = Guidelines(
    guidelines=football_language,
    name="football_language",
)

scorers = [
    RelevanceToQuery(),
    football_language_judge,
    register_aligned_judge_basic,
]

# Re-select the configured experiment before rerunning the evaluation.
mlflow.set_experiment(experiment_id=EXPERIMENT_ID)

# Load the original evaluation dataset by name.
eval_dataset = get_dataset(name=DATASET_NAME)

def extract_question(row_input):
    """Return the content of the last message in a dataset row's request.

    Reads ``row_input['request']['input'][-1]['content']``. If any step of that
    lookup fails, the exception is printed and the function falls through,
    returning ``None``.
    """
    try:
        return row_input['request']['input'][-1]['content']
    except Exception as err:
        print(err)

df = eval_dataset.to_df()

# Each record carries a single user message holding the EXTRACTED question
# string, not the whole request dictionary. The "expected" field is optional
# and is omitted here.
eval_dataset_records = [
    {"inputs": {"input": [{"role": "user", "content": question}]}}
    for question in map(extract_question, df['inputs'])
]

print("Executing Evaluation Job")
# The lambda parameter is named `input` to mirror the "input" key of each
# record's "inputs" mapping.
results = evaluate(
    scorers=scorers,
    predict_fn=lambda input: AGENT.predict({"input": input}),
    data=eval_dataset_records,
)

2026/01/19 04:44:01 INFO mlflow.genai.utils.data_validation: Testing model prediction with the first sample in the dataset. To disable this check, set the MLFLOW_GENAI_EVAL_SKIP_TRACE_VALIDATION environment variable to True.


Executing Evaluation Job


Evaluating:   0%|          | 0/32 [Elapsed: 00:00, Remaining: ?] 



# MemAlign Optimizer

MemAlign is MLflow's default alignment optimizer.

In [0]:
import logging
import os
import dspy
from mlflow.genai.judges.optimizers import MemAlignOptimizer
from statistics import mean
from typing import Any, Callable, List

from mlflow.genai.judges.base import AlignmentOptimizer, Judge
from mlflow.entities.trace import Trace

# The OpenAI embedding model used below authenticates through OPENAI_API_KEY.
# Keep a key that is already configured for this session instead of overwriting
# it with an empty string, and never paste the key into the notebook itself.
os.environ.setdefault("OPENAI_API_KEY", "")
if not os.environ["OPENAI_API_KEY"]:
    print("WARNING: OPENAI_API_KEY is empty; set it before running MemAlign with an OpenAI embedding model.")

# Disable DSPy's cache for this run.
dspy.configure(cache=False)

print(f'Initial Judge Text \n {football_analysis_judge.instructions}')

aligned_judge_memalign = football_analysis_judge.align(
    traces=valid_traces,
    optimizer=MemAlignOptimizer(
        reflection_lm="databricks:/databricks-gpt-5-2",
        # Databricks embedding models are not supported right now, hence OpenAI.
        embedding_model="openai/text-embedding-3-large",
    ),
)

Initial Judge Text /n Evaluate if the response in {{ outputs }} appropriately analyzes the available data and provides an actionable recommendation the question in {{ inputs }}. The response should be accurate, contextually relevant, and give a strategic advantage to the  person making the request. Your grading criteria should be:  1: Completely unacceptable. Incorrect data interpretation or no recommendations 2: Mostly unacceptable. Irrelevant or spurious feedback or weak recommendations provided with minimal strategic advantage 3: Somewhat acceptable. Relevant feedback provided with some strategic advantage 4: Mostly acceptable. Relevant feedback provided with strong strategic advantage 5 Completely acceptable. Relevant feedback provided with excellent strategic advantage


In [0]:
# Compare the judge's instructions before and after MemAlign alignment.
print(f"Original instructions:\n {football_analysis_judge.instructions}")
print(f"\nAligned instructions:\n {aligned_judge_memalign.instructions}")

Original instructions:
 Evaluate if the response in {{ outputs }} appropriately analyzes the available data and provides an actionable recommendation the question in {{ inputs }}. The response should be accurate, contextually relevant, and give a strategic advantage to the  person making the request. Your grading criteria should be:  1: Completely unacceptable. Incorrect data interpretation or no recommendations 2: Mostly unacceptable. Irrelevant or spurious feedback or weak recommendations provided with minimal strategic advantage 3: Somewhat acceptable. Relevant feedback provided with some strategic advantage 4: Mostly acceptable. Relevant feedback provided with strong strategic advantage 5 Completely acceptable. Relevant feedback provided with excellent strategic advantage

Aligned instructions:
 Evaluate if the response in {{ outputs }} appropriately analyzes the available data and provides an actionable recommendation the question in {{ inputs }}. The response should be accurate, 

In [0]:
from mlflow.genai.judges import make_judge
from mlflow.genai.scorers import (
    Guidelines,
    RelevanceToQuery,
    ScorerSamplingConfig,
    get_scorer
)

# Wrap the MemAlign-aligned instructions in a fresh judge so it can be
# registered as a scorer on the experiment.
register_align_judge_memalign = make_judge(
    name=f"{ALIGNED_JUDGE_NAME}_memalign",
    instructions=aligned_judge_memalign.instructions,
    feedback_value_type=float,
    # model=JUDGE_MODEL,  # Model used to evaluate (from config)
)

try:
    register_aligned_judge_memalign = register_align_judge_memalign.register(experiment_id=EXPERIMENT_ID)
except ValueError as e:
    # Anything other than the "already registered" conflict is a real error.
    if "has already been registered" not in str(e):
        raise
    # Preferred path per the error message: update existing scorer.
    # Bind the result to the same name as the success path so that
    # `register_aligned_judge_memalign` exists whichever branch ran.
    register_aligned_judge_memalign = register_align_judge_memalign.update(
        experiment_id=EXPERIMENT_ID,
        sampling_config=ScorerSamplingConfig(sample_rate=1),
    )

# Report the scorer on both the first-registration and the update path.
print("Registered aligned judge", register_aligned_judge_memalign.name)



### Run the MemAlign Aligned Judge

In [0]:
from agent import AGENT
from mlflow.genai import evaluate
from mlflow.genai.datasets import create_dataset, get_dataset
from mlflow.genai.scorers import (
    Guidelines,
    RelevanceToQuery,
    get_scorer,
)

# Assemble the scorers for the rerun: a built-in relevance check, a
# football-language guideline judge, and the MemAlign-aligned judge created
# with make_judge.
football_language = (
    "The response must use language that is appropriate for professional football players and coaches"
)
football_language_judge = Guidelines(
    guidelines=football_language,
    name="football_language",
)

scorers = [
    RelevanceToQuery(),
    football_language_judge,
    register_align_judge_memalign,
]

# Re-select the configured experiment before rerunning the evaluation.
mlflow.set_experiment(experiment_id=EXPERIMENT_ID)

# Load the original evaluation dataset by name.
eval_dataset = get_dataset(name=DATASET_NAME)

def extract_question(row_input):
    """Return the content of the last message in a dataset row's request.

    Reads ``row_input['request']['input'][-1]['content']``. If any step of that
    lookup fails, the exception is printed and the function falls through,
    returning ``None``.
    """
    try:
        return row_input['request']['input'][-1]['content']
    except Exception as err:
        print(err)

df = eval_dataset.to_df()

# Each record carries a single user message holding the EXTRACTED question
# string, not the whole request dictionary. The "expected" field is optional
# and is omitted here.
eval_dataset_records = [
    {"inputs": {"input": [{"role": "user", "content": question}]}}
    for question in map(extract_question, df['inputs'])
]

print("Executing Evaluation Job")
# The lambda parameter is named `input` to mirror the "input" key of each
# record's "inputs" mapping.
results = evaluate(
    scorers=scorers,
    predict_fn=lambda input: AGENT.predict({"input": input}),
    data=eval_dataset_records,
)

  return orig_warn(*args, **kwargs)
2026/01/19 06:12:49 INFO mlflow.models.evaluation.utils.trace: Auto tracing is temporarily enabled during the model evaluation for computing some metrics and debugging. To disable tracing, call `mlflow.autolog(disable=True)`.
2026/01/19 06:12:49 INFO mlflow.genai.utils.data_validation: Testing model prediction with the first sample in the dataset. To disable this check, set the MLFLOW_GENAI_EVAL_SKIP_TRACE_VALIDATION environment variable to True.


Executing Evaluation Job


Evaluating:   0%|          | 0/32 [Elapsed: 00:00, Remaining: ?] 



# Likert SIMBA Optimizer

MLflow lets you build a custom optimizer by subclassing `AlignmentOptimizer`. To demonstrate this, the Likert-aware SIMBA optimizer below wraps the built-in SIMBA optimizer and injects a metric that scores how close the judge's rating is to the human rating on the Likert scale.

In [0]:
# Likert-aware SIMBA optimizer: single-cell implementation

import logging
from statistics import mean
from typing import Any, Callable, List

from mlflow.genai.judges.base import AlignmentOptimizer, Judge
from mlflow.entities.trace import Trace
from mlflow.genai.judges.optimizers import SIMBAAlignmentOptimizer as _BaseSIMBA

# Use configuration parameters for Likert scale
def _to_float_maybe(x: Any) -> float | None:
    try:
        return float(x)
    except Exception:
        return None

def likert_agreement_metric(example: Any, prediction: Any) -> float:
    """
    Score how closely the judge's rating matches the human rating:
        score = 1 - |llm - human| / (LIKERT_MAX - LIKERT_MIN)

    Lookup order:
      - Human label: example._store["result"], then a fixed list of fallback
        attribute / dict keys on ``example``.
      - LLM/judge score: prediction._store["result"], then fallback dict keys
        on ``prediction``, then ``prediction`` itself converted to a number.

    Returns 0.0 (and logs at INFO) when either rating cannot be resolved.
    """
    logger = logging.getLogger("dspy.teleprompt.simba")

    def _result_from_store(obj: Any) -> float | None:
        # Primary source: the "result" entry of the object's `_store` dict.
        store = getattr(obj, "_store", None)
        if isinstance(store, dict) and "result" in store:
            return _to_float_maybe(store["result"])
        return None

    def _first_number(raw_values) -> float | None:
        # First candidate that converts to a number wins; candidates are lazy.
        for raw in raw_values:
            number = _to_float_maybe(raw)
            if number is not None:
                return number
        return None

    def _human_candidates():
        # For each key, try the attribute first and the dict entry second.
        for key in ("human_score", "human_value", "label", "target", "score", "y"):
            if hasattr(example, key):
                yield getattr(example, key)
            if isinstance(example, dict) and key in example:
                yield example[key]

    def _llm_candidates():
        # Known dict keys first, then the prediction object itself.
        if isinstance(prediction, dict):
            for k in ("llm_score", "value", "score", "rating", "label", "y_hat"):
                if k in prediction:
                    yield prediction[k]
        yield prediction

    human = _result_from_store(example)
    llm = _result_from_store(prediction)

    # Fallbacks
    if human is None:
        human = _first_number(_human_candidates())
    if llm is None:
        llm = _first_number(_llm_candidates())

    if human is None or llm is None:
        logger.info(
            "LIKERT: missing scores (human=%r, llm=%r) -> 0.0",
            human,
            llm,
        )
        return 0.0

    # Clamp both ratings to the configured Likert range before comparing.
    low, high = LIKERT_MIN, LIKERT_MAX
    human = max(low, min(high, human))
    llm = max(low, min(high, llm))

    distance = abs(llm - human) / (high - low)
    return max(0.0, 1.0 - distance)


class LikertSIMBAAlignmentOptimizer(AlignmentOptimizer):
    """Unified optimizer: injects Likert metric, batch size, max_demos, and optional verbose logging.

    The optimization itself is delegated to MLflow's ``SIMBAAlignmentOptimizer``.
    While it runs, ``dspy.teleprompt.simba.SIMBA.__init__`` is temporarily patched
    so that the metric, batch size and ``max_demos`` configured here are the ones
    SIMBA receives. The patch replaces a class attribute for the duration of
    ``align`` and is undone in a ``finally`` block.

    Uses configuration parameters from the config cell above.

    Args:
        model: Model identifier forwarded to ``SIMBAAlignmentOptimizer``.
        batch_size: Value forced as SIMBA's ``bsize``.
        max_demos: Value forced as SIMBA's ``max_demos``.
        metric_fn: ``(example, prediction) -> float`` metric. When ``None``,
            ``likert_agreement_metric`` is used.
        verbose: When true, metric scores are aggregated per batch and
            summarized on the ``dspy.teleprompt.simba`` logger.
    """

    def __init__(
        self,
        model: str,
        batch_size: int = 10,
        max_demos: int = 0,
        metric_fn: Callable[[Any, Any], float] | None = None,
        verbose: bool = False,
    ):
        self.model = model
        self.batch_size = batch_size
        self.max_demos = max_demos
        self.metric_fn = metric_fn
        self.verbose = verbose

    # ---- Internal helpers for verbose logging ----
    class _BatchScoreAggregator:
        """Collects metric scores per batch and logs summaries of them."""

        def __init__(self):
            self.all_batches: List[List[float]] = []  # scores of finished batches
            self.current: List[float] = []  # scores of the batch in progress
            self.batch_idx: int = 0  # 1-based index of the batch in progress

        def start_batch(self):
            """Close the batch in progress (logging its summary) and open a new one."""
            if self.current:
                self._log_current_summary()
                self.all_batches.append(self.current)
            self.current = []
            self.batch_idx += 1

        def add(self, score: float):
            """Record one metric score; non-numeric values are ignored."""
            if isinstance(score, (int, float)):
                self.current.append(float(score))

        def end(self):
            """Flush the last batch and log the per-batch means and the best single score."""
            if self.current:
                self._log_current_summary()
                self.all_batches.append(self.current)
                self.current = []
            all_flat = [s for batch in self.all_batches for s in batch]
            if all_flat:
                best = max(all_flat)
                batches_n = len(self.all_batches)
                logging.getLogger("dspy.teleprompt.simba").info(
                    "Scores after %d batches: %s, Best: %s",
                    batches_n,
                    [round(mean(b), 3) if b else 0.0 for b in self.all_batches],
                    round(best, 3),
                )

        def _log_current_summary(self):
            """Log max, max-to-min gap and max-to-avg gap of the batch in progress."""
            lg = logging.getLogger("dspy.teleprompt.simba")
            if not self.current:
                return
            mx = max(self.current)
            mn = min(self.current)
            avg = mean(self.current)
            lg.info(
                "Processing bucket #%d, with max score %s, max-to-min gap %s, and max-to-avg gap %s.",
                self.batch_idx if self.batch_idx else 1,
                round(mx, 3),
                round(mx - mn, 3),
                round(mx - avg, 3),
            )

    class _SIMBABatchLogHandler(logging.Handler):
        """Logging handler that turns SIMBA's "Starting batch" messages into batch boundaries."""

        def __init__(self, aggregator: "LikertSIMBAAlignmentOptimizer._BatchScoreAggregator"):
            super().__init__()
            self.aggregator = aggregator

        def emit(self, record: logging.LogRecord):
            msg = record.getMessage()
            if "Starting batch" in msg and "of" in msg:
                self.aggregator.start_batch()

    def _wrap_metric_for_logging(self, metric_fn: Callable[[Any, Any], float]):
        """Wrap ``metric_fn`` so every score is recorded, and hook batch tracking into the SIMBA logger.

        Returns:
            ``(logged_metric, aggregator, simba_logger, batch_handler)``; the caller
            is responsible for removing ``batch_handler`` from ``simba_logger``.
        """
        import functools

        aggregator = self._BatchScoreAggregator()

        # functools.wraps keeps the original metric's name, so log lines report
        # the real metric instead of "logged_metric".
        @functools.wraps(metric_fn)
        def logged_metric(example, prediction):
            score = metric_fn(example, prediction)
            aggregator.add(score)
            return score

        batch_handler = self._SIMBABatchLogHandler(aggregator)
        simba_logger = logging.getLogger("dspy.teleprompt.simba")
        simba_utils_logger = logging.getLogger("dspy.teleprompt.simba_utils")
        simba_logger.setLevel(logging.INFO)
        simba_utils_logger.setLevel(logging.INFO)
        # Always attach this run's handler: it is the only one bound to this
        # run's aggregator, and align() removes it again when the run ends.
        simba_logger.addHandler(batch_handler)
        return logged_metric, aggregator, simba_logger, batch_handler

    def align(self, judge: Judge, traces: list[Trace]) -> Judge:
        """Align ``judge`` on ``traces`` using SIMBA with the configured metric and sizes.

        Args:
            judge: Judge to align.
            traces: Traces passed through to ``SIMBAAlignmentOptimizer.align``.

        Returns:
            The judge returned by ``SIMBAAlignmentOptimizer.align``.
        """
        import dspy.teleprompt.simba as dsimba

        # Choose metric function
        metric_fn = self.metric_fn if self.metric_fn is not None else likert_agreement_metric
        logging.getLogger("dspy.teleprompt.simba").info(
            "Using SIMBA metric_fn=%s",
            getattr(metric_fn, "__name__", repr(metric_fn)),
        )

        # Optionally wrap metric for verbose logging
        aggregator = None
        simba_logger = None
        batch_handler = None
        previous_levels: dict[str, int] = {}
        if self.verbose:
            # Remember the logger levels that verbose mode overrides so they can
            # be restored once the run is over.
            previous_levels = {
                name: logging.getLogger(name).level
                for name in ("dspy.teleprompt.simba", "dspy.teleprompt.simba_utils")
            }
            metric_fn, aggregator, simba_logger, batch_handler = self._wrap_metric_for_logging(metric_fn)

        # Patch DSPy SIMBA init to inject our parameters
        original_init = dsimba.SIMBA.__init__
        batch_size = self.batch_size
        max_demos = self.max_demos

        def patched_init(self_, *args, **kwargs):
            # Force our settings
            logging.getLogger("dspy.teleprompt.simba").info(
                "Patched SIMBA.__init__: forcing metric_fn=%s, bsize=%s, max_demos=%s",
                getattr(metric_fn, "__name__", repr(metric_fn)),
                batch_size,
                max_demos,
            )

            kwargs["metric"] = metric_fn
            kwargs["bsize"] = batch_size
            kwargs["max_demos"] = max_demos

            return original_init(self_, *args, **kwargs)

        dsimba.SIMBA.__init__ = patched_init
        try:
            base = _BaseSIMBA(model=self.model)
            return base.align(judge=judge, traces=traces)
        finally:
            # Undo every piece of global state touched above, even on failure.
            dsimba.SIMBA.__init__ = original_init
            if aggregator is not None:
                # Emit the final summary while the loggers are still at INFO.
                aggregator.end()
            if simba_logger is not None and batch_handler is not None:
                simba_logger.removeHandler(batch_handler)
            for name, level in previous_levels.items():
                logging.getLogger(name).setLevel(level)

print("Likert SIMBA optimizer loaded successfully")


Likert SIMBA optimizer loaded successfully


### Run Optimization with LikertSIMBAAlignmentOptimizer

This will take a few minutes

In [0]:
# Set MLflow's SIMBA optimizer logger to DEBUG so the detailed optimization
# output is shown for this run.
logging.getLogger("mlflow.genai.judges.optimizers.simba").setLevel(logging.DEBUG)

print(f'Initial Judge Text \n {football_analysis_judge.instructions}')

likert_optimizer = LikertSIMBAAlignmentOptimizer(
    model=REFLECTION_MODEL,
    batch_size=6,
    max_demos=0,
    verbose=True
)

aligned_judge = football_analysis_judge.align(
    traces=valid_traces,
    optimizer=likert_optimizer,
)

2026/01/19 03:52:34 INFO dspy.teleprompt.simba: Using SIMBA metric_fn=likert_agreement_metric
2026/01/19 03:52:34 INFO dspy.teleprompt.simba: Patched SIMBA.__init__: forcing metric_fn=logged_metric, bsize=6, max_demos=0
2026/01/19 03:52:34 INFO mlflow.genai.judges.optimizers.simba: Starting SIMBA optimization with 31 examples (set logging to DEBUG for detailed output)
2026/01/19 03:52:34 INFO dspy.teleprompt.simba: Starting batch 1 of 8.
2026/01/19 03:52:34 INFO dspy.teleprompt.simba: Sampling program trajectories on 6 examples x 6 samples.


Initial Judge Text /n Evaluate if the response in {{ outputs }} appropriately analyzes the available data and provides an actionable recommendation the question in {{ inputs }}. The response should be accurate, contextually relevant, and give a strategic advantage to the  person making the request. Your grading criteria should be:  1: Completely unacceptable. Incorrect data interpretation or no recommendations 2: Mostly unacceptable. Irrelevant or spurious feedback or weak recommendations provided with minimal strategic advantage 3: Somewhat acceptable. Relevant feedback provided with some strategic advantage 4: Mostly acceptable. Relevant feedback provided with strong strategic advantage 5 Completely acceptable. Relevant feedback provided with excellent strategic advantage
  0%|          | 0/36 [00:00<?, ?it/s]Processed 1 / 36 examples:   0%|          | 0/36 [00:01<?, ?it/s]Processed 1 / 36 examples:   3%|▎         | 1/36 [00:01<00:47,  1.36s/it]Processed 2 / 36 examples:   3%|▎  

2026/01/19 03:52:42 INFO dspy.teleprompt.simba: Batch 1: Baseline mini-batch score: 0.4583333333333333

2026/01/19 03:52:42 INFO dspy.teleprompt.simba: Batch 1: Processing bucket #1, with max score 0.75, max-to-min gap 0.0, and max-to-avg gap 0.0.
2026/01/19 03:52:42 INFO dspy.teleprompt.simba: Batch 1: Invoking strategy: append_a_rule
2026/01/19 03:52:42 INFO dspy.teleprompt.simba_utils: Skipping rule generation as good score 0.75 is at or below the 10th percentile *or* bad score 0.75 is at or above the 90th percentile.
2026/01/19 03:52:42 INFO dspy.teleprompt.simba: 

2026/01/19 03:52:42 INFO dspy.teleprompt.simba: Batch 1: Processing bucket #2, with max score 0.5, max-to-min gap 0.0, and max-to-avg gap 0.0.
2026/01/19 03:52:42 INFO dspy.teleprompt.simba: Batch 1: Invoking strategy: append_a_rule





2026/01/19 03:52:55 INFO dspy.teleprompt.simba_utils: Advice for self: If the evaluated response summarizes tendencies (e.g., shotgun pass most common) but does NOT (1) cite or clearly point to specific supporting evidence from the provided data (key plays, counts, sample sizes, notable examples/outliers) and (2) translate that evidence into concrete, situationally deployable coaching decisions (e.g., specific pressure/Cover X calls vs specific formations/personnel, down-and-distance/game-state qualifiers), then you should cap the score at ~3–4, not 5.

If the response contains small-sample caveats yet still makes strong claims without anchoring them (no references to which plays drove EPA, which specific play was the “best outlier,” no table/list of the most important plays), then treat it as ‘somewhat acceptable’ (3) unless it compensates by explicitly highlighting the top key plays and why they matter.

If the user is a coach/strategist and the response provides only generic defensi

  0%|          | 0/36 [00:00<?, ?it/s]Processed 1 / 36 examples:   0%|          | 0/36 [00:01<?, ?it/s]Processed 1 / 36 examples:   3%|▎         | 1/36 [00:01<00:37,  1.07s/it]Processed 2 / 36 examples:   3%|▎         | 1/36 [00:01<00:37,  1.07s/it]Processed 2 / 36 examples:   6%|▌         | 2/36 [00:01<00:24,  1.36it/s]Processed 3 / 36 examples:   6%|▌         | 2/36 [00:01<00:24,  1.36it/s]Processed 4 / 36 examples:   8%|▊         | 3/36 [00:01<00:24,  1.36it/s]Processed 4 / 36 examples:  11%|█         | 4/36 [00:01<00:10,  2.92it/s]Processed 5 / 36 examples:  11%|█         | 4/36 [00:01<00:10,  2.92it/s]Processed 6 / 36 examples:  14%|█▍        | 5/36 [00:01<00:10,  2.92it/s]Processed 7 / 36 examples:  17%|█▋        | 6/36 [00:02<00:10,  2.92it/s]Processed 7 / 36 examples:  19%|█▉        | 7/36 [00:02<00:05,  5.13it/s]Processed 8 / 36 examples:  19%|█▉        | 7/36 [00:02<00:05,  5.13it/s]Processed 8 / 36 examples:  22%|██▏       | 8/36 [00:02<00:05,  4.77it/s]Proces

2026/01/19 03:53:42 INFO dspy.teleprompt.simba: Scores after 1 batches: [0.4583333333333333, 0.5833333333333334, 0.7916666666666666, 0.7083333333333334, 0.5, 0.4583333333333333], Best: 0.7916666666666666

2026/01/19 03:53:42 INFO dspy.teleprompt.simba: Processing bucket #1, with max score 1.0, max-to-min gap 1.0, and max-to-avg gap 0.479.
2026/01/19 03:53:42 INFO dspy.teleprompt.simba: Starting batch 2 of 8.
2026/01/19 03:53:42 INFO dspy.teleprompt.simba: Sampling program trajectories on 6 examples x 6 samples.



  0%|          | 0/36 [00:00<?, ?it/s]Processed 1 / 36 examples:   0%|          | 0/36 [00:01<?, ?it/s]Processed 1 / 36 examples:   3%|▎         | 1/36 [00:01<00:43,  1.24s/it]Processed 2 / 36 examples:   3%|▎         | 1/36 [00:01<00:43,  1.24s/it]Processed 2 / 36 examples:   6%|▌         | 2/36 [00:01<00:21,  1.58it/s]Processed 3 / 36 examples:   6%|▌         | 2/36 [00:01<00:21,  1.58it/s]Processed 4 / 36 examples:   8%|▊         | 3/36 [00:01<00:20,  1.58it/s]Processed 5 / 36 examples:  11%|█         | 4/36 [00:01<00:20,  1.58it/s]Processed 6 / 36 examples:  14%|█▍        | 5/36 [00:01<00:19,  1.58it/s]Processed 6 / 36 examples:  17%|█▋        | 6/36 [00:01<00:05,  5.01it/s]Processed 7 / 36 examples:  17%|█▋        | 6/36 [00:01<00:05,  5.01it/s]Processed 8 / 36 examples:  19%|█▉        | 7/36 [00:02<00:05,  5.01it/s]Processed 8 / 36 examples:  22%|██▏       | 8/36 [00:02<00:07,  3.82it/s]Processed 9 / 36 examples:  22%|██▏       | 8/36 [00:02<00:07,  3.82it/s]Proce

2026/01/19 03:53:51 INFO dspy.teleprompt.simba: Batch 2: Baseline mini-batch score: 0.7708333333333334

2026/01/19 03:53:51 INFO dspy.teleprompt.simba: Batch 2: Processing bucket #1, with max score 1.0, max-to-min gap 0.5, and max-to-avg gap 0.33333333333333337.
2026/01/19 03:53:51 INFO dspy.teleprompt.simba: Batch 2: Invoking strategy: append_a_rule





2026/01/19 03:54:05 INFO dspy.teleprompt.simba_utils: Advice for self: If you receive an evaluation task where the user asks a descriptive/analytic question (e.g., “How does X use motion?”) and the response provides relevant breakdowns + some coaching implications, then do NOT require extra deliverables that the user didn’t ask for (e.g., a step-by-step play sequence) unless the rubric explicitly demands it. Instead, score primarily on the rubric axes: (1) accuracy, (2) contextual relevance, (3) strategic advantage/actionability.

If the response includes quantitative claims (counts, rates, EPA) but does NOT clearly tie them to evidence in the provided data (e.g., sample sizes by grouping are asserted without pointing to where they came from; no key plays/examples/outliers are cited), then cap the score at 3–4 (not 5) and explicitly state what evidence is missing (e.g., “cite top 3 motion concepts/plays, list example snaps, or show a small table by personnel/down-distance”).

If the re

  0%|          | 0/36 [00:00<?, ?it/s]Processed 1 / 36 examples:   0%|          | 0/36 [00:01<?, ?it/s]Processed 1 / 36 examples:   3%|▎         | 1/36 [00:01<00:59,  1.69s/it]Processed 2 / 36 examples:   3%|▎         | 1/36 [00:01<00:59,  1.69s/it]Processed 2 / 36 examples:   6%|▌         | 2/36 [00:01<00:29,  1.15it/s]Processed 3 / 36 examples:   6%|▌         | 2/36 [00:01<00:29,  1.15it/s]Processed 4 / 36 examples:   8%|▊         | 3/36 [00:01<00:28,  1.15it/s]Processed 5 / 36 examples:  11%|█         | 4/36 [00:02<00:27,  1.15it/s]Processed 6 / 36 examples:  14%|█▍        | 5/36 [00:02<00:26,  1.15it/s]Processed 7 / 36 examples:  17%|█▋        | 6/36 [00:02<00:26,  1.15it/s]Processed 7 / 36 examples:  19%|█▉        | 7/36 [00:02<00:05,  5.21it/s]Processed 8 / 36 examples:  19%|█▉        | 7/36 [00:02<00:05,  5.21it/s]Processed 9 / 36 examples:  22%|██▏       | 8/36 [00:03<00:05,  5.21it/s]Processed 9 / 36 examples:  25%|██▌       | 9/36 [00:03<00:09,  2.90it/s]Proces

2026/01/19 03:55:37 INFO dspy.teleprompt.simba: Scores after 2 batches: [0.75, 0.8333333333333334, 0.7916666666666666, 0.7916666666666666, 0.6666666666666666, 0.6666666666666666], Best: 0.8333333333333334

2026/01/19 03:55:37 INFO dspy.teleprompt.simba: Processing bucket #2, with max score 1.0, max-to-min gap 0.75, and max-to-avg gap 0.24.
2026/01/19 03:55:37 INFO dspy.teleprompt.simba: Starting batch 3 of 8.
2026/01/19 03:55:37 INFO dspy.teleprompt.simba: Sampling program trajectories on 6 examples x 6 samples.



  0%|          | 0/36 [00:00<?, ?it/s]Processed 1 / 36 examples:   0%|          | 0/36 [00:00<?, ?it/s]Processed 1 / 36 examples:   3%|▎         | 1/36 [00:00<00:33,  1.04it/s]Processed 2 / 36 examples:   3%|▎         | 1/36 [00:01<00:33,  1.04it/s]Processed 2 / 36 examples:   6%|▌         | 2/36 [00:01<00:21,  1.60it/s]Processed 3 / 36 examples:   6%|▌         | 2/36 [00:01<00:21,  1.60it/s]Processed 3 / 36 examples:   8%|▊         | 3/36 [00:01<00:13,  2.38it/s]Processed 4 / 36 examples:   8%|▊         | 3/36 [00:01<00:13,  2.38it/s]Processed 4 / 36 examples:  11%|█         | 4/36 [00:01<00:10,  3.08it/s]Processed 5 / 36 examples:  11%|█         | 4/36 [00:01<00:10,  3.08it/s]Processed 6 / 36 examples:  14%|█▍        | 5/36 [00:01<00:10,  3.08it/s]Processed 6 / 36 examples:  17%|█▋        | 6/36 [00:01<00:06,  4.49it/s]Processed 7 / 36 examples:  17%|█▋        | 6/36 [00:02<00:06,  4.49it/s]Processed 7 / 36 examples:  19%|█▉        | 7/36 [00:02<00:06,  4.15it/s]Proce

2026/01/19 03:55:42 INFO dspy.teleprompt.simba: LIKERT: missing scores (human=3.0, llm=None) -> 0.0


Processed 20 / 36 examples:  53%|█████▎    | 19/36 [00:05<00:04,  3.50it/s]Processed 20 / 36 examples:  56%|█████▌    | 20/36 [00:05<00:03,  4.52it/s]Processed 21 / 36 examples:  56%|█████▌    | 20/36 [00:05<00:03,  4.52it/s]Processed 21 / 36 examples:  58%|█████▊    | 21/36 [00:05<00:03,  4.21it/s]Processed 22 / 36 examples:  58%|█████▊    | 21/36 [00:05<00:03,  4.21it/s]Processed 22 / 36 examples:  61%|██████    | 22/36 [00:05<00:03,  4.65it/s]Processed 23 / 36 examples:  61%|██████    | 22/36 [00:05<00:03,  4.65it/s]Processed 23 / 36 examples:  64%|██████▍   | 23/36 [00:05<00:02,  4.77it/s]Processed 24 / 36 examples:  64%|██████▍   | 23/36 [00:06<00:02,  4.77it/s]Processed 24 / 36 examples:  67%|██████▋   | 24/36 [00:06<00:03,  3.83it/s]Processed 25 / 36 examples:  67%|██████▋   | 24/36 [00:06<00:03,  3.83it/s]Processed 25 / 36 examples:  69%|██████▉   | 25/36 [00:06<00:02,  4.08it/s]Processed 26 / 36 examples:  69%|██████▉   | 25/36 [00:06<00:02,  4.08it/s]Processed 2

2026/01/19 03:55:44 INFO dspy.teleprompt.simba: LIKERT: missing scores (human=1.0, llm=None) -> 0.0


Processed 27 / 36 examples:  72%|███████▏  | 26/36 [00:06<00:02,  3.78it/s]Processed 28 / 36 examples:  75%|███████▌  | 27/36 [00:07<00:02,  3.78it/s]Processed 28 / 36 examples:  78%|███████▊  | 28/36 [00:07<00:02,  3.16it/s]Processed 29 / 36 examples:  78%|███████▊  | 28/36 [00:07<00:02,  3.16it/s]Processed 30 / 36 examples:  81%|████████  | 29/36 [00:08<00:02,  3.16it/s]Processed 30 / 36 examples:  83%|████████▎ | 30/36 [00:08<00:01,  3.58it/s]Processed 31 / 36 examples:  83%|████████▎ | 30/36 [00:08<00:01,  3.58it/s]Processed 32 / 36 examples:  86%|████████▌ | 31/36 [00:08<00:01,  3.58it/s]Processed 32 / 36 examples:  89%|████████▉ | 32/36 [00:08<00:00,  5.01it/s]Processed 33 / 36 examples:  89%|████████▉ | 32/36 [00:08<00:00,  5.01it/s]Processed 34 / 36 examples:  92%|█████████▏| 33/36 [00:08<00:00,  5.01it/s]Processed 34 / 36 examples:  94%|█████████▍| 34/36 [00:08<00:00,  5.08it/s]Processed 35 / 36 examples:  94%|█████████▍| 34/36 [00:08<00:00,  5.08it/s]Processed 3

2026/01/19 03:55:46 INFO dspy.teleprompt.simba: Batch 3: Baseline mini-batch score: 0.7291666666666666

2026/01/19 03:55:46 INFO dspy.teleprompt.simba: Batch 3: Processing bucket #1, with max score 1.0, max-to-min gap 1.0, and max-to-avg gap 0.16666666666666663.
2026/01/19 03:55:46 INFO dspy.teleprompt.simba: Batch 3: Invoking strategy: append_a_rule





2026/01/19 03:56:01 INFO dspy.teleprompt.simba_utils: Advice for self: If the input question is NOT asking for a “typical play sequence” (i.e., it asks “How do they use X vs Y?”, “what concepts/tendencies?”, “how do they attack coverage?”), then you should NOT penalize primarily for missing an ordered step-by-step sequence; instead grade on whether the output provides (a) concrete schematic explanation (formations/personnel, route concepts, protection/action, intended leverage vs man), (b) actionable coaching takeaways/counters, and (c) at least a few specific examples (even if qualitative) such as common concept families (e.g., boot/flood, crossers, dagger, glance/RPO-like looks) and who they target.

If the output is only clarifying questions or meta-discussion with zero analysis/recommendations, then assign a 1 and explicitly say: it failed to answer the question at all; clarifications can be asked but must come after giving an initial best-effort analysis.

If the output gives some

  0%|          | 0/36 [00:00<?, ?it/s]

2026/01/19 03:56:57 INFO dspy.teleprompt.simba: LIKERT: missing scores (human=3.0, llm=None) -> 0.0


Processed 1 / 36 examples:   0%|          | 0/36 [00:01<?, ?it/s]Processed 1 / 36 examples:   3%|▎         | 1/36 [00:01<00:44,  1.26s/it]Processed 2 / 36 examples:   3%|▎         | 1/36 [00:01<00:44,  1.26s/it]Processed 3 / 36 examples:   6%|▌         | 2/36 [00:01<00:42,  1.26s/it]Processed 3 / 36 examples:   8%|▊         | 3/36 [00:01<00:12,  2.57it/s]Processed 4 / 36 examples:   8%|▊         | 3/36 [00:01<00:12,  2.57it/s]Processed 5 / 36 examples:  11%|█         | 4/36 [00:01<00:12,  2.57it/s]Processed 5 / 36 examples:  14%|█▍        | 5/36 [00:01<00:06,  4.44it/s]Processed 6 / 36 examples:  14%|█▍        | 5/36 [00:01<00:06,  4.44it/s]Processed 7 / 36 examples:  17%|█▋        | 6/36 [00:01<00:06,  4.44it/s]Processed 7 / 36 examples:  19%|█▉        | 7/36 [00:01<00:05,  4.93it/s]Processed 8 / 36 examples:  19%|█▉        | 7/36 [00:01<00:05,  4.93it/s]Processed 9 / 36 examples:  22%|██▏       | 8/36 [00:02<00:05,  4.93it/s]Processed 9 / 36 examples:  25%|██▌       | 9

2026/01/19 03:56:59 INFO dspy.teleprompt.simba: LIKERT: missing scores (human=1.0, llm=None) -> 0.0


Processed 13 / 36 examples:  33%|███▎      | 12/36 [00:03<00:05,  4.13it/s]Processed 13 / 36 examples:  36%|███▌      | 13/36 [00:03<00:05,  4.25it/s]

2026/01/19 03:56:59 INFO dspy.teleprompt.simba: LIKERT: missing scores (human=3.0, llm=None) -> 0.0


Processed 14 / 36 examples:  36%|███▌      | 13/36 [00:03<00:05,  4.25it/s]Processed 15 / 36 examples:  39%|███▉      | 14/36 [00:03<00:05,  4.25it/s]Processed 15 / 36 examples:  42%|████▏     | 15/36 [00:03<00:04,  4.59it/s]Processed 16 / 36 examples:  42%|████▏     | 15/36 [00:04<00:04,  4.59it/s]Processed 16 / 36 examples:  44%|████▍     | 16/36 [00:04<00:05,  3.82it/s]Processed 17 / 36 examples:  44%|████▍     | 16/36 [00:04<00:05,  3.82it/s]Processed 17 / 36 examples:  47%|████▋     | 17/36 [00:04<00:04,  3.80it/s]Processed 18 / 36 examples:  47%|████▋     | 17/36 [00:04<00:04,  3.80it/s]Processed 19 / 36 examples:  50%|█████     | 18/36 [00:05<00:04,  3.80it/s]Processed 19 / 36 examples:  53%|█████▎    | 19/36 [00:05<00:04,  3.85it/s]Processed 20 / 36 examples:  53%|█████▎    | 19/36 [00:05<00:04,  3.85it/s]Processed 20 / 36 examples:  56%|█████▌    | 20/36 [00:05<00:04,  3.61it/s]Processed 21 / 36 examples:  56%|█████▌    | 20/36 [00:05<00:04,  3.61it/s]Processed 2

2026/01/19 03:57:06 INFO dspy.teleprompt.simba: Scores after 3 batches: [0.8333333333333334, 0.7916666666666666, 0.5416666666666666, 0.7916666666666666, 0.8333333333333334, 0.7916666666666666], Best: 0.8333333333333334

2026/01/19 03:57:06 INFO dspy.teleprompt.simba: Processing bucket #3, with max score 1.0, max-to-min gap 1.0, and max-to-avg gap 0.253.
2026/01/19 03:57:06 INFO dspy.teleprompt.simba: Starting batch 4 of 8.
2026/01/19 03:57:06 INFO dspy.teleprompt.simba: Sampling program trajectories on 6 examples x 6 samples.



  0%|          | 0/36 [00:00<?, ?it/s]Processed 1 / 36 examples:   0%|          | 0/36 [00:01<?, ?it/s]Processed 1 / 36 examples:   3%|▎         | 1/36 [00:01<00:52,  1.50s/it]Processed 2 / 36 examples:   3%|▎         | 1/36 [00:01<00:52,  1.50s/it]Processed 3 / 36 examples:   6%|▌         | 2/36 [00:01<00:51,  1.50s/it]Processed 3 / 36 examples:   8%|▊         | 3/36 [00:01<00:15,  2.14it/s]Processed 4 / 36 examples:   8%|▊         | 3/36 [00:01<00:15,  2.14it/s]Processed 5 / 36 examples:  11%|█         | 4/36 [00:01<00:14,  2.14it/s]Processed 5 / 36 examples:  14%|█▍        | 5/36 [00:01<00:08,  3.56it/s]Processed 6 / 36 examples:  14%|█▍        | 5/36 [00:01<00:08,  3.56it/s]Processed 7 / 36 examples:  17%|█▋        | 6/36 [00:02<00:08,  3.56it/s]Processed 7 / 36 examples:  19%|█▉        | 7/36 [00:02<00:07,  4.11it/s]Processed 8 / 36 examples:  19%|█▉        | 7/36 [00:02<00:07,  4.11it/s]Processed 9 / 36 examples:  22%|██▏       | 8/36 [00:03<00:06,  4.11it/s]Proce

2026/01/19 03:57:15 INFO dspy.teleprompt.simba: Batch 4: Baseline mini-batch score: 0.8472222222222222

2026/01/19 03:57:15 INFO dspy.teleprompt.simba: Batch 4: Processing bucket #1, with max score 1.0, max-to-min gap 0.5, and max-to-avg gap 0.25.
2026/01/19 03:57:15 INFO dspy.teleprompt.simba: Batch 4: Invoking strategy: append_a_rule





2026/01/19 03:57:27 INFO dspy.teleprompt.simba_utils: Advice for self: If the user asks a direct effectiveness/tendency question like “Which passing plays are most effective … vs zone?”, then you should NOT require an ordered play sequence, down/distance, clock, or a scripted drive. Instead, grade on: (1) whether the answer clearly identifies and ranks the most effective plays/contexts (even if by coverage/rush shell as a proxy), (2) whether it supports claims with multiple concrete metrics (e.g., EPA + success rate + yards) and notes uncertainty (sample size/caveats), and (3) whether it translates findings into actionable coaching takeaways (what to call/avoid or how to defend).

If the output provides multiple quantified comparisons across zone looks (e.g., Cover 4/6/3/2) with EPA/success rate/yardage plus tactical implications, you should generally score it in the 3–4 range even without play-by-play ordering.

Only enforce the “explicit ordered sequence” requirement (and cap at ~3 i

  0%|          | 0/36 [00:00<?, ?it/s]Processed 1 / 36 examples:   0%|          | 0/36 [00:01<?, ?it/s]Processed 1 / 36 examples:   3%|▎         | 1/36 [00:01<00:56,  1.61s/it]Processed 2 / 36 examples:   3%|▎         | 1/36 [00:01<00:56,  1.61s/it]Processed 3 / 36 examples:   6%|▌         | 2/36 [00:01<00:54,  1.61s/it]Processed 4 / 36 examples:   8%|▊         | 3/36 [00:01<00:53,  1.61s/it]Processed 4 / 36 examples:  11%|█         | 4/36 [00:01<00:10,  3.02it/s]Processed 5 / 36 examples:  11%|█         | 4/36 [00:01<00:10,  3.02it/s]Processed 6 / 36 examples:  14%|█▍        | 5/36 [00:01<00:10,  3.02it/s]Processed 6 / 36 examples:  17%|█▋        | 6/36 [00:01<00:06,  4.49it/s]Processed 7 / 36 examples:  17%|█▋        | 6/36 [00:01<00:06,  4.49it/s]Processed 8 / 36 examples:  19%|█▉        | 7/36 [00:02<00:06,  4.49it/s]Processed 8 / 36 examples:  22%|██▏       | 8/36 [00:02<00:05,  5.29it/s]Processed 9 / 36 examples:  22%|██▏       | 8/36 [00:03<00:05,  5.29it/s]Proces

2026/01/19 03:58:15 INFO dspy.teleprompt.simba: Scores after 4 batches: [0.8333333333333334, 0.8333333333333334, 0.8333333333333334, 0.7916666666666666, 0.875, 0.875], Best: 0.875

2026/01/19 03:58:15 INFO dspy.teleprompt.simba: Processing bucket #4, with max score 1.0, max-to-min gap 0.5, and max-to-avg gap 0.156.
2026/01/19 03:58:15 INFO dspy.teleprompt.simba: Starting batch 5 of 8.
2026/01/19 03:58:15 INFO dspy.teleprompt.simba: Sampling program trajectories on 6 examples x 6 samples.



  0%|          | 0/36 [00:00<?, ?it/s]Processed 1 / 36 examples:   0%|          | 0/36 [00:01<?, ?it/s]Processed 1 / 36 examples:   3%|▎         | 1/36 [00:01<00:44,  1.28s/it]Processed 2 / 36 examples:   3%|▎         | 1/36 [00:01<00:44,  1.28s/it]Processed 3 / 36 examples:   6%|▌         | 2/36 [00:01<00:43,  1.28s/it]Processed 3 / 36 examples:   8%|▊         | 3/36 [00:01<00:12,  2.69it/s]Processed 4 / 36 examples:   8%|▊         | 3/36 [00:01<00:12,  2.69it/s]Processed 5 / 36 examples:  11%|█         | 4/36 [00:01<00:11,  2.69it/s]Processed 6 / 36 examples:  14%|█▍        | 5/36 [00:01<00:11,  2.69it/s]Processed 6 / 36 examples:  17%|█▋        | 6/36 [00:01<00:05,  5.28it/s]Processed 7 / 36 examples:  17%|█▋        | 6/36 [00:01<00:05,  5.28it/s]Processed 8 / 36 examples:  19%|█▉        | 7/36 [00:01<00:05,  5.28it/s]Processed 8 / 36 examples:  22%|██▏       | 8/36 [00:01<00:04,  6.60it/s]Processed 9 / 36 examples:  22%|██▏       | 8/36 [00:02<00:04,  6.60it/s]Proce

2026/01/19 03:58:24 INFO dspy.teleprompt.simba: Batch 5: Baseline mini-batch score: 0.7708333333333334

2026/01/19 03:58:24 INFO dspy.teleprompt.simba: Batch 5: Processing bucket #1, with max score 1.0, max-to-min gap 0.25, and max-to-avg gap 0.20833333333333337.
2026/01/19 03:58:24 INFO dspy.teleprompt.simba: Batch 5: Invoking strategy: append_a_rule





2026/01/19 03:58:37 INFO dspy.teleprompt.simba_utils: Advice for self: If the user asks a general tendency question like “What shifts or motions are frequently used…?” (no explicit keywords like sequence/script/play-by-play/what do they call first/next), then you should NOT require an ordered play sequence and you should NOT mention missing sequence as a reason to cap the score.

If the output claims or implies data use (e.g., “I pulled data…”, “from formation/personnel data…”) but does not provide any coach-verifiable evidence (no frequencies, counts/shares, sample size/games, definitions like what qualifies as ‘shift/motion’, or at least 2–3 concrete play examples), then you should cap at 3–4. Treat it as strong but not “excellent” because evidentiary support is vague/hand-wavy.

To award a 5 on this kind of question, require all of the following:
- Clearly defined conditions/terms (what counts as shift vs motion; pre-snap vs at-snap; any relevant personnel/formation buckets).
- Mult

  0%|          | 0/36 [00:00<?, ?it/s]Processed 1 / 36 examples:   0%|          | 0/36 [00:01<?, ?it/s]Processed 1 / 36 examples:   3%|▎         | 1/36 [00:01<00:48,  1.38s/it]Processed 2 / 36 examples:   3%|▎         | 1/36 [00:01<00:48,  1.38s/it]Processed 2 / 36 examples:   6%|▌         | 2/36 [00:01<00:25,  1.36it/s]Processed 3 / 36 examples:   6%|▌         | 2/36 [00:01<00:25,  1.36it/s]Processed 3 / 36 examples:   8%|▊         | 3/36 [00:01<00:15,  2.19it/s]Processed 4 / 36 examples:   8%|▊         | 3/36 [00:01<00:15,  2.19it/s]Processed 5 / 36 examples:  11%|█         | 4/36 [00:01<00:14,  2.19it/s]Processed 5 / 36 examples:  14%|█▍        | 5/36 [00:01<00:07,  4.33it/s]Processed 6 / 36 examples:  14%|█▍        | 5/36 [00:02<00:07,  4.33it/s]Processed 6 / 36 examples:  17%|█▋        | 6/36 [00:02<00:06,  4.43it/s]Processed 7 / 36 examples:  17%|█▋        | 6/36 [00:02<00:06,  4.43it/s]Processed 8 / 36 examples:  19%|█▉        | 7/36 [00:02<00:06,  4.43it/s]Proces

2026/01/19 03:59:27 INFO dspy.teleprompt.simba: LIKERT: missing scores (human=1.0, llm=None) -> 0.0


Processed 9 / 36 examples:  22%|██▏       | 8/36 [00:03<00:05,  4.82it/s]Processed 9 / 36 examples:  25%|██▌       | 9/36 [00:03<00:07,  3.38it/s]Processed 10 / 36 examples:  25%|██▌       | 9/36 [00:03<00:07,  3.38it/s]Processed 10 / 36 examples:  28%|██▊       | 10/36 [00:03<00:08,  3.04it/s]Processed 11 / 36 examples:  28%|██▊       | 10/36 [00:03<00:08,  3.04it/s]Processed 12 / 36 examples:  31%|███       | 11/36 [00:03<00:08,  3.04it/s]Processed 12 / 36 examples:  33%|███▎      | 12/36 [00:03<00:05,  4.00it/s]Processed 13 / 36 examples:  33%|███▎      | 12/36 [00:03<00:05,  4.00it/s]Processed 14 / 36 examples:  36%|███▌      | 13/36 [00:03<00:05,  4.00it/s]Processed 14 / 36 examples:  39%|███▉      | 14/36 [00:03<00:04,  5.37it/s]Processed 15 / 36 examples:  39%|███▉      | 14/36 [00:03<00:04,  5.37it/s]Processed 16 / 36 examples:  42%|████▏     | 15/36 [00:04<00:03,  5.37it/s]Processed 16 / 36 examples:  44%|████▍     | 16/36 [00:04<00:02,  6.89it/s]Processed 17 / 3

2026/01/19 03:59:34 INFO dspy.teleprompt.simba: Scores after 5 batches: [0.7916666666666666, 0.5833333333333334, 0.75, 0.75, 0.75, 0.75], Best: 0.7916666666666666

2026/01/19 03:59:34 INFO dspy.teleprompt.simba: Processing bucket #5, with max score 1.0, max-to-min gap 1.0, and max-to-avg gap 0.25.
2026/01/19 03:59:34 INFO dspy.teleprompt.simba: Starting batch 6 of 8.
2026/01/19 03:59:34 INFO dspy.teleprompt.simba: Sampling program trajectories on 6 examples x 6 samples.



  0%|          | 0/36 [00:00<?, ?it/s]Processed 1 / 36 examples:   0%|          | 0/36 [00:01<?, ?it/s]Processed 1 / 36 examples:   3%|▎         | 1/36 [00:01<00:52,  1.50s/it]Processed 2 / 36 examples:   3%|▎         | 1/36 [00:01<00:52,  1.50s/it]Processed 2 / 36 examples:   6%|▌         | 2/36 [00:01<00:27,  1.22it/s]Processed 3 / 36 examples:   6%|▌         | 2/36 [00:01<00:27,  1.22it/s]Processed 4 / 36 examples:   8%|▊         | 3/36 [00:02<00:27,  1.22it/s]Processed 4 / 36 examples:  11%|█         | 4/36 [00:02<00:11,  2.73it/s]Processed 5 / 36 examples:  11%|█         | 4/36 [00:02<00:11,  2.73it/s]Processed 6 / 36 examples:  14%|█▍        | 5/36 [00:02<00:11,  2.73it/s]Processed 7 / 36 examples:  17%|█▋        | 6/36 [00:02<00:10,  2.73it/s]Processed 7 / 36 examples:  19%|█▉        | 7/36 [00:02<00:05,  5.57it/s]Processed 8 / 36 examples:  19%|█▉        | 7/36 [00:02<00:05,  5.57it/s]Processed 9 / 36 examples:  22%|██▏       | 8/36 [00:03<00:05,  5.57it/s]Proce

2026/01/19 03:59:44 INFO dspy.teleprompt.simba: Batch 6: Baseline mini-batch score: 0.875

2026/01/19 03:59:44 INFO dspy.teleprompt.simba: Batch 6: Processing bucket #1, with max score 1.0, max-to-min gap 0.25, and max-to-avg gap 0.20833333333333337.
2026/01/19 03:59:44 INFO dspy.teleprompt.simba: Batch 6: Invoking strategy: append_a_rule





2026/01/19 03:59:58 INFO dspy.teleprompt.simba_utils: Advice for self: If the user asks a tendency question (e.g., “What shifts or motions are frequently used…”) and the output provides plausible categories + counters but does NOT provide concrete support (no frequencies, no game/film examples, no definition of what timeframe/data source was used), then you should score it a 4 (mostly acceptable), not a 5, even if it is detailed and actionable. 

If the output makes strong evidentiary claims like “I pulled 2024 data” or implies predictive indicators (“RB depth indicates inside/outside zone”) without showing how measured (counts, %, sample size, or at least specific examples), then treat it as ‘strong but not fully substantiated’ and cap at 4; explicitly note the missing evidence as the reason it’s not a 5. 

Reserve a 5 only when: (a) the answer stays accurate to the ask (tendencies, not a scripted sequence unless requested), AND (b) it is both highly actionable AND well-supported/grou

  0%|          | 0/36 [00:00<?, ?it/s]Processed 1 / 36 examples:   0%|          | 0/36 [00:01<?, ?it/s]Processed 1 / 36 examples:   3%|▎         | 1/36 [00:01<00:57,  1.64s/it]Processed 2 / 36 examples:   3%|▎         | 1/36 [00:01<00:57,  1.64s/it]Processed 3 / 36 examples:   6%|▌         | 2/36 [00:01<00:55,  1.64s/it]Processed 3 / 36 examples:   8%|▊         | 3/36 [00:01<00:15,  2.14it/s]Processed 4 / 36 examples:   8%|▊         | 3/36 [00:01<00:15,  2.14it/s]Processed 5 / 36 examples:  11%|█         | 4/36 [00:01<00:14,  2.14it/s]Processed 5 / 36 examples:  14%|█▍        | 5/36 [00:01<00:07,  3.88it/s]Processed 6 / 36 examples:  14%|█▍        | 5/36 [00:01<00:07,  3.88it/s]Processed 7 / 36 examples:  17%|█▋        | 6/36 [00:02<00:07,  3.88it/s]Processed 7 / 36 examples:  19%|█▉        | 7/36 [00:02<00:06,  4.20it/s]Processed 8 / 36 examples:  19%|█▉        | 7/36 [00:02<00:06,  4.20it/s]Processed 8 / 36 examples:  22%|██▏       | 8/36 [00:02<00:06,  4.42it/s]Proces

2026/01/19 04:00:54 INFO dspy.teleprompt.simba: Scores after 6 batches: [0.8333333333333334, 0.75, 0.8333333333333334, 0.875, 0.8333333333333334, 0.8333333333333334], Best: 0.875

2026/01/19 04:00:54 INFO dspy.teleprompt.simba: Processing bucket #6, with max score 1.0, max-to-min gap 0.25, and max-to-avg gap 0.149.
2026/01/19 04:00:54 INFO dspy.teleprompt.simba: Starting batch 7 of 8.
2026/01/19 04:00:54 INFO dspy.teleprompt.simba: Sampling program trajectories on 6 examples x 6 samples.



  0%|          | 0/36 [00:00<?, ?it/s]Processed 1 / 36 examples:   0%|          | 0/36 [00:01<?, ?it/s]Processed 1 / 36 examples:   3%|▎         | 1/36 [00:01<00:37,  1.06s/it]Processed 2 / 36 examples:   3%|▎         | 1/36 [00:01<00:37,  1.06s/it]Processed 2 / 36 examples:   6%|▌         | 2/36 [00:01<00:19,  1.71it/s]Processed 3 / 36 examples:   6%|▌         | 2/36 [00:01<00:19,  1.71it/s]Processed 3 / 36 examples:   8%|▊         | 3/36 [00:01<00:14,  2.30it/s]Processed 4 / 36 examples:   8%|▊         | 3/36 [00:01<00:14,  2.30it/s]Processed 5 / 36 examples:  11%|█         | 4/36 [00:01<00:13,  2.30it/s]Processed 5 / 36 examples:  14%|█▍        | 5/36 [00:01<00:07,  4.25it/s]Processed 6 / 36 examples:  14%|█▍        | 5/36 [00:01<00:07,  4.25it/s]Processed 7 / 36 examples:  17%|█▋        | 6/36 [00:02<00:07,  4.25it/s]Processed 7 / 36 examples:  19%|█▉        | 7/36 [00:02<00:06,  4.53it/s]Processed 8 / 36 examples:  19%|█▉        | 7/36 [00:02<00:06,  4.53it/s]Proce

2026/01/19 04:00:57 INFO dspy.teleprompt.simba: LIKERT: missing scores (human=3.0, llm=None) -> 0.0


Processed 10 / 36 examples:  25%|██▌       | 9/36 [00:02<00:07,  3.73it/s]Processed 10 / 36 examples:  28%|██▊       | 10/36 [00:02<00:06,  4.22it/s]Processed 11 / 36 examples:  28%|██▊       | 10/36 [00:03<00:06,  4.22it/s]Processed 11 / 36 examples:  31%|███       | 11/36 [00:03<00:05,  4.43it/s]Processed 12 / 36 examples:  31%|███       | 11/36 [00:03<00:05,  4.43it/s]Processed 13 / 36 examples:  33%|███▎      | 12/36 [00:03<00:05,  4.43it/s]Processed 13 / 36 examples:  36%|███▌      | 13/36 [00:03<00:04,  5.36it/s]Processed 14 / 36 examples:  36%|███▌      | 13/36 [00:03<00:04,  5.36it/s]Processed 14 / 36 examples:  39%|███▉      | 14/36 [00:03<00:04,  5.11it/s]Processed 15 / 36 examples:  39%|███▉      | 14/36 [00:03<00:04,  5.11it/s]Processed 15 / 36 examples:  42%|████▏     | 15/36 [00:03<00:03,  5.58it/s]Processed 16 / 36 examples:  42%|████▏     | 15/36 [00:03<00:03,  5.58it/s]Processed 17 / 36 examples:  44%|████▍     | 16/36 [00:04<00:03,  5.58it/s]Processed 17

2026/01/19 04:01:00 INFO dspy.teleprompt.simba: LIKERT: missing scores (human=3.0, llm=None) -> 0.0


Processed 26 / 36 examples:  69%|██████▉   | 25/36 [00:06<00:02,  4.20it/s]Processed 26 / 36 examples:  72%|███████▏  | 26/36 [00:06<00:02,  4.65it/s]Processed 27 / 36 examples:  72%|███████▏  | 26/36 [00:06<00:02,  4.65it/s]Processed 27 / 36 examples:  75%|███████▌  | 27/36 [00:06<00:01,  4.51it/s]Processed 28 / 36 examples:  75%|███████▌  | 27/36 [00:07<00:01,  4.51it/s]Processed 28 / 36 examples:  78%|███████▊  | 28/36 [00:07<00:02,  2.96it/s]Processed 29 / 36 examples:  78%|███████▊  | 28/36 [00:07<00:02,  2.96it/s]Processed 30 / 36 examples:  81%|████████  | 29/36 [00:07<00:02,  2.96it/s]Processed 31 / 36 examples:  83%|████████▎ | 30/36 [00:07<00:02,  2.96it/s]Processed 31 / 36 examples:  86%|████████▌ | 31/36 [00:07<00:01,  4.55it/s]Processed 32 / 36 examples:  86%|████████▌ | 31/36 [00:07<00:01,  4.55it/s]Processed 33 / 36 examples:  89%|████████▉ | 32/36 [00:07<00:00,  4.55it/s]Processed 34 / 36 examples:  92%|█████████▏| 33/36 [00:07<00:00,  4.55it/s]Processed 3

2026/01/19 04:01:05 INFO dspy.teleprompt.simba: Batch 7: Baseline mini-batch score: 0.7083333333333334

2026/01/19 04:01:05 INFO dspy.teleprompt.simba: Batch 7: Processing bucket #1, with max score 0.5, max-to-min gap 0.5, and max-to-avg gap 0.16666666666666669.
2026/01/19 04:01:05 INFO dspy.teleprompt.simba: Batch 7: Invoking strategy: append_a_rule
2026/01/19 04:01:05 INFO dspy.teleprompt.simba_utils: Skipping rule generation as good score 0.5 is at or below the 10th percentile *or* bad score 0.0 is at or above the 90th percentile.
2026/01/19 04:01:05 INFO dspy.teleprompt.simba: 

2026/01/19 04:01:05 INFO dspy.teleprompt.simba: Batch 7: Processing bucket #2, with max score 1.0, max-to-min gap 0.25, and max-to-avg gap 0.125.
2026/01/19 04:01:05 INFO dspy.teleprompt.simba: Batch 7: Invoking strategy: append_a_rule





2026/01/19 04:01:13 INFO dspy.teleprompt.simba_utils: Advice for self: If the input asks generally about tendencies/adjustments (e.g., “How does the offense change formations in the 2nd half?”) and does NOT include explicit sequence/script/play-by-play keywords (“typical sequence”, “script”, “play-by-play”, “what do they call first/next”, “Step 1/2/3”, “with under X minutes”), then you should NOT require an ordered series of plays and you should NOT downgrade for lacking one. Instead, award based on: (1) whether the output summarizes formation/personnel shifts clearly (ranked usage, deltas, or at least what’s most common), (2) whether numbers/claims are interpreted correctly and tied to the question, and (3) whether recommendations are specific and actionable based on those tendencies. Only mention missing ‘ordered sequence’ as a limitation when the user explicitly requested a sequence/script/play-by-play; otherwise, focus your critique on evidence quality (definitions, sample sizes/ti

  0%|          | 0/36 [00:00<?, ?it/s]Processed 1 / 36 examples:   0%|          | 0/36 [00:01<?, ?it/s]Processed 1 / 36 examples:   3%|▎         | 1/36 [00:01<00:51,  1.47s/it]Processed 2 / 36 examples:   3%|▎         | 1/36 [00:01<00:51,  1.47s/it]Processed 2 / 36 examples:   6%|▌         | 2/36 [00:01<00:23,  1.45it/s]Processed 3 / 36 examples:   6%|▌         | 2/36 [00:01<00:23,  1.45it/s]Processed 4 / 36 examples:   8%|▊         | 3/36 [00:01<00:22,  1.45it/s]Processed 4 / 36 examples:  11%|█         | 4/36 [00:01<00:10,  3.00it/s]Processed 5 / 36 examples:  11%|█         | 4/36 [00:01<00:10,  3.00it/s]Processed 6 / 36 examples:  14%|█▍        | 5/36 [00:02<00:10,  3.00it/s]Processed 6 / 36 examples:  17%|█▋        | 6/36 [00:02<00:07,  4.02it/s]Processed 7 / 36 examples:  17%|█▋        | 6/36 [00:02<00:07,  4.02it/s]Processed 8 / 36 examples:  19%|█▉        | 7/36 [00:02<00:07,  4.02it/s]Processed 9 / 36 examples:  22%|██▏       | 8/36 [00:02<00:06,  4.02it/s]Proces

2026/01/19 04:01:57 INFO dspy.teleprompt.simba: LIKERT: missing scores (human=3.0, llm=None) -> 0.0


Processed 32 / 36 examples:  86%|████████▌ | 31/36 [00:07<00:01,  4.71it/s]Processed 32 / 36 examples:  89%|████████▉ | 32/36 [00:07<00:01,  3.89it/s]Processed 33 / 36 examples:  89%|████████▉ | 32/36 [00:08<00:01,  3.89it/s]Processed 33 / 36 examples:  92%|█████████▏| 33/36 [00:08<00:00,  3.24it/s]

2026/01/19 04:01:58 INFO dspy.teleprompt.simba: LIKERT: missing scores (human=1.0, llm=None) -> 0.0


Processed 34 / 36 examples:  92%|█████████▏| 33/36 [00:08<00:00,  3.24it/s]Processed 34 / 36 examples:  94%|█████████▍| 34/36 [00:08<00:00,  3.58it/s]Processed 35 / 36 examples:  94%|█████████▍| 34/36 [00:08<00:00,  3.58it/s]Processed 36 / 36 examples:  97%|█████████▋| 35/36 [00:08<00:00,  3.58it/s]Processed 36 / 36 examples: 100%|██████████| 36/36 [00:08<00:00,  4.35it/s]Processed 36 / 36 examples: 100%|██████████| 36/36 [00:08<00:00,  4.05it/s]

2026/01/19 04:01:58 INFO dspy.teleprompt.simba: Scores after 7 batches: [0.75, 0.75, 0.7083333333333334, 0.7916666666666666, 0.75, 0.5], Best: 0.7916666666666666

2026/01/19 04:01:58 INFO dspy.teleprompt.simba: Processing bucket #7, with max score 1.0, max-to-min gap 1.0, and max-to-avg gap 0.292.
2026/01/19 04:01:58 INFO dspy.teleprompt.simba: Starting batch 8 of 8.
2026/01/19 04:01:58 INFO dspy.teleprompt.simba: Sampling program trajectories on 6 examples x 6 samples.



  0%|          | 0/36 [00:00<?, ?it/s]Processed 1 / 36 examples:   0%|          | 0/36 [00:01<?, ?it/s]Processed 1 / 36 examples:   3%|▎         | 1/36 [00:01<00:48,  1.40s/it]Processed 2 / 36 examples:   3%|▎         | 1/36 [00:01<00:48,  1.40s/it]Processed 2 / 36 examples:   6%|▌         | 2/36 [00:01<00:22,  1.52it/s]Processed 3 / 36 examples:   6%|▌         | 2/36 [00:01<00:22,  1.52it/s]Processed 4 / 36 examples:   8%|▊         | 3/36 [00:01<00:21,  1.52it/s]Processed 4 / 36 examples:  11%|█         | 4/36 [00:01<00:09,  3.31it/s]Processed 5 / 36 examples:  11%|█         | 4/36 [00:01<00:09,  3.31it/s]Processed 6 / 36 examples:  14%|█▍        | 5/36 [00:01<00:09,  3.31it/s]Processed 6 / 36 examples:  17%|█▋        | 6/36 [00:01<00:05,  5.25it/s]Processed 7 / 36 examples:  17%|█▋        | 6/36 [00:02<00:05,  5.25it/s]Processed 8 / 36 examples:  19%|█▉        | 7/36 [00:02<00:05,  5.25it/s]Processed 8 / 36 examples:  22%|██▏       | 8/36 [00:02<00:05,  5.50it/s]Proce

2026/01/19 04:02:07 INFO dspy.teleprompt.simba: Batch 8: Baseline mini-batch score: 0.6527777777777778

2026/01/19 04:02:07 INFO dspy.teleprompt.simba: Batch 8: Processing bucket #1, with max score 1.0, max-to-min gap 0.25, and max-to-avg gap 0.16666666666666663.
2026/01/19 04:02:07 INFO dspy.teleprompt.simba: Batch 8: Invoking strategy: append_a_rule
2026/01/19 04:02:07 INFO dspy.teleprompt.simba_utils: Skipping rule generation as good score 1.0 is at or below the 10th percentile *or* bad score 0.75 is at or above the 90th percentile.
2026/01/19 04:02:07 INFO dspy.teleprompt.simba: 

2026/01/19 04:02:07 INFO dspy.teleprompt.simba: Batch 8: Processing bucket #2, with max score 0.75, max-to-min gap 0.25, and max-to-avg gap 0.16666666666666663.
2026/01/19 04:02:07 INFO dspy.teleprompt.simba: Batch 8: Invoking strategy: append_a_rule





2026/01/19 04:02:21 INFO dspy.teleprompt.simba_utils: Advice for self: When the input question contains “typical play sequence / sequence / script / play-by-play / what do they call first/next / under X minutes”, you must verify the output includes an explicit ordered progression of plays (e.g., “Play 1… Play 2…”, or a sample drive) with situational context like clock/TOs/field position and down & distance. Numbered headings like “1. Formation, 2. Play type priorities” are NOT sufficient if they are just tendencies.

If the output is primarily tendencies (pass rate, personnel frequency, average air yards, route families) and generic recommendations, and does not translate those into an actual series (e.g., “open with quick sideline concept → if incomplete, tempo again; if complete in bounds, call timeout; then take a shot on 1st-and-10; then red-zone/FG sequence”), cap the score around 3 even if the football analysis is otherwise strong.

Do not give a 5 when the answer says “If you wa

  0%|          | 0/36 [00:00<?, ?it/s]Processed 1 / 36 examples:   0%|          | 0/36 [00:01<?, ?it/s]Processed 1 / 36 examples:   3%|▎         | 1/36 [00:01<00:52,  1.49s/it]Processed 2 / 36 examples:   3%|▎         | 1/36 [00:01<00:52,  1.49s/it]Processed 2 / 36 examples:   6%|▌         | 2/36 [00:01<00:23,  1.44it/s]Processed 3 / 36 examples:   6%|▌         | 2/36 [00:01<00:23,  1.44it/s]Processed 4 / 36 examples:   8%|▊         | 3/36 [00:01<00:22,  1.44it/s]Processed 5 / 36 examples:  11%|█         | 4/36 [00:01<00:22,  1.44it/s]Processed 5 / 36 examples:  14%|█▍        | 5/36 [00:01<00:07,  4.11it/s]Processed 6 / 36 examples:  14%|█▍        | 5/36 [00:01<00:07,  4.11it/s]Processed 6 / 36 examples:  17%|█▋        | 6/36 [00:01<00:06,  4.75it/s]Processed 7 / 36 examples:  17%|█▋        | 6/36 [00:02<00:06,  4.75it/s]Processed 7 / 36 examples:  19%|█▉        | 7/36 [00:02<00:05,  5.47it/s]Processed 8 / 36 examples:  19%|█▉        | 7/36 [00:02<00:05,  5.47it/s]Proces

2026/01/19 04:02:53 INFO dspy.teleprompt.simba: Scores after 8 batches: [0.625, 0.875, 0.5833333333333334, 0.625, 0.5833333333333334, 0.7916666666666666], Best: 0.875

2026/01/19 04:02:53 INFO dspy.teleprompt.simba: VALIDATION: Evaluating 7 programs on the full trainset.



  0%|          | 0/217 [00:00<?, ?it/s]Processed 1 / 217 examples:   0%|          | 0/217 [00:01<?, ?it/s]Processed 1 / 217 examples:   0%|          | 1/217 [00:01<04:31,  1.26s/it]Processed 2 / 217 examples:   0%|          | 1/217 [00:01<04:31,  1.26s/it]Processed 3 / 217 examples:   1%|          | 2/217 [00:01<04:30,  1.26s/it]Processed 4 / 217 examples:   1%|▏         | 3/217 [00:01<04:28,  1.26s/it]Processed 4 / 217 examples:   2%|▏         | 4/217 [00:01<00:57,  3.71it/s]Processed 5 / 217 examples:   2%|▏         | 4/217 [00:01<00:57,  3.71it/s]Processed 6 / 217 examples:   2%|▏         | 5/217 [00:01<00:57,  3.71it/s]Processed 6 / 217 examples:   3%|▎         | 6/217 [00:01<00:48,  4.39it/s]Processed 7 / 217 examples:   3%|▎         | 6/217 [00:02<00:48,  4.39it/s]Processed 8 / 217 examples:   3%|▎         | 7/217 [00:02<00:47,  4.39it/s]Processed 8 / 217 examples:   4%|▎         | 8/217 [00:02<00:49,  4.21it/s]Processed 9 / 217 examples:   4%|▎         | 8/217 [00

2026/01/19 04:03:39 INFO dspy.teleprompt.simba: Final trainset scores: [0.6612903225806451, 0.6935483870967742, 0.8548387096774194, 0.75, 0.7580645161290323, 0.8387096774193549, 0.8145161290322581], Best: 0.8548387096774194 (at index 2)



2026/01/19 04:03:39 INFO mlflow.genai.judges.optimizers.simba: SIMBA optimization completed
2026/01/19 04:03:39 INFO dspy.teleprompt.simba: Processing bucket #8, with max score 1.0, max-to-min gap 1.0, and max-to-avg gap 0.258.
2026/01/19 04:03:39 INFO dspy.teleprompt.simba: Scores after 8 batches: [0.521, 0.76, 0.747, 0.844, 0.75, 0.851, 0.708, 0.742], Best: 1.0





In [0]:
# Show the judge instructions before and after alignment, one after the other,
# so the rules added by the optimizer are easy to spot.
for _label, _judge in (
    ("Original instructions:\n", football_analysis_judge),
    ("\nAligned instructions:\n", aligned_judge),
):
    print(_label, _judge.instructions)

Original instructions:
 Evaluate if the response in {{ outputs }} appropriately analyzes the available data and provides an actionable recommendation the question in {{ inputs }}. The response should be accurate, contextually relevant, and give a strategic advantage to the  person making the request. Your grading criteria should be:  1: Completely unacceptable. Incorrect data interpretation or no recommendations 2: Mostly unacceptable. Irrelevant or spurious feedback or weak recommendations provided with minimal strategic advantage 3: Somewhat acceptable. Relevant feedback provided with some strategic advantage 4: Mostly acceptable. Relevant feedback provided with strong strategic advantage 5 Completely acceptable. Relevant feedback provided with excellent strategic advantage

Aligned instructions:
 Evaluate if the response in {{ outputs }} appropriately analyzes the available data and provides an actionable recommendation the question in {{ inputs }}. The response should be accurate, 

In [0]:
from mlflow.genai.judges import make_judge
from mlflow.genai.scorers import (
    Guidelines,
    RelevanceToQuery,
    ScorerSamplingConfig,
    get_scorer
)

mlflow.set_experiment(experiment_id=EXPERIMENT_ID)

# Wrap the aligned instructions in a new judge that reports float feedback
# (the Likert-style score), named after the aligned judge with a "_likert" suffix.
register_align_judge = make_judge(
    name=f"{ALIGNED_JUDGE_NAME}_likert",
    instructions=aligned_judge.instructions,
    feedback_value_type=float,
    # model=JUDGE_MODEL,  # Model used to evaluate (from config)
)

# Register the judge on the experiment. If a scorer with this name is already
# registered, fall back to updating the existing one instead of failing.
try:
    register_aligned_judge = register_align_judge.register(experiment_id=EXPERIMENT_ID)
except ValueError as e:
    msg = str(e)
    if "has already been registered" not in msg:
        # Any other ValueError is unexpected here, so let it propagate.
        raise
    register_aligned_judge = register_align_judge.update(
        experiment_id=EXPERIMENT_ID,
        sampling_config=ScorerSamplingConfig(sample_rate=1),
    )

print("Registered aligned judge", register_align_judge.name)

Registered aligned judge football_analysis_judge_align_likert


In [0]:
# Print the instructions held by the registered judge to confirm they are the
# aligned ones (the base rubric followed by the rules appended during alignment).
print(register_aligned_judge.instructions)

Evaluate if the response in {{ outputs }} appropriately analyzes the available data and provides an actionable recommendation the question in {{ inputs }}. The response should be accurate, contextually relevant, and give a strategic advantage to the  person making the request. Your grading criteria should be:  1: Completely unacceptable. Incorrect data interpretation or no recommendations 2: Mostly unacceptable. Irrelevant or spurious feedback or weak recommendations provided with minimal strategic advantage 3: Somewhat acceptable. Relevant feedback provided with some strategic advantage 4: Mostly acceptable. Relevant feedback provided with strong strategic advantage 5 Completely acceptable. Relevant feedback provided with excellent strategic advantage

If the input question asks for a **“typical play sequence”** (keywords: “sequence”, “play-by-play”, “what do they call first/next”, “script”, “with under X minutes”), then you should **check that the output contains an explicit ordered 

### Rerun with Likert SIMBA Optimizer

In [0]:
from agent import AGENT
from mlflow.genai import evaluate
from mlflow.genai.datasets import create_dataset, get_dataset
from mlflow.genai.scorers import (
    Guidelines,
    RelevanceToQuery,
    get_scorer,
)

# Assemble the scorers for the rerun: the built-in relevance check, a guideline
# judge for football-appropriate language, and the registered aligned judge.

football_language = "The response must use language that is appropriate for professional football players and coaches"
football_language_judge = Guidelines(
    name="football_language",
    guidelines=football_language,
)

scorers = [
    RelevanceToQuery(),
    football_language_judge,
    register_aligned_judge,
]

# The evaluation is rerun with the re-created judge to better calibrate the
# measured agent quality; make sure it is logged to the configured experiment.
mlflow.set_experiment(experiment_id=EXPERIMENT_ID)

# Load the original evaluation dataset by name.
eval_dataset = get_dataset(name=DATASET_NAME)

def extract_question(row_input):
    """Return the text of the last message in a dataset row's request.

    Args:
        row_input: One entry of the dataset's ``inputs`` column, expected to
            look like ``{"request": {"input": [{"role": ..., "content": ...}, ...]}}``.

    Returns:
        The ``content`` of the final message, or ``None`` when the row does not
        have the expected structure (a warning is printed in that case).
    """
    try:
        return row_input["request"]["input"][-1]["content"]
    except (LookupError, TypeError) as e:
        # Best-effort extraction: a missing key, an empty message list, or a
        # non-subscriptable value means the row is malformed. Report which kind
        # of failure occurred and return None explicitly rather than falling
        # off the end of the function.
        print(f"Could not extract question from row ({type(e).__name__}: {e})")
        return None

df = eval_dataset.to_df()

# Pull the question text out of every dataset row. extract_question returns
# None for rows it cannot parse; those rows are skipped (and counted) so the
# agent is never called with `content: None`.
extracted_questions = [extract_question(row) for row in df["inputs"]]
num_skipped = sum(question is None for question in extracted_questions)
if num_skipped:
    print(
        f"Skipping {num_skipped} of {len(extracted_questions)} dataset rows "
        "with no extractable question"
    )

eval_dataset_records = [
    {
        "inputs": {
            "input": [
                # Pass the extracted string, not the whole dictionary object
                {"role": "user", "content": question}
            ]
        }
        # Note: "expected" field is optional
    }
    for question in extracted_questions
    if question is not None
]

print("Executing Evaluation Job")
results = evaluate(
    data=eval_dataset_records,
    # Per the MLflow docs, the keys of each record's "inputs" dict are passed to
    # predict_fn as keyword arguments, so the lambda parameter has to be named
    # `input` even though that shadows the builtin.
    predict_fn=lambda input: AGENT.predict({"input": input}),
    scorers=scorers,
)

2026/01/19 04:42:41 INFO mlflow.models.evaluation.utils.trace: Auto tracing is temporarily enabled during the model evaluation for computing some metrics and debugging. To disable tracing, call `mlflow.autolog(disable=True)`.
2026/01/19 04:42:41 INFO mlflow.genai.utils.data_validation: Testing model prediction with the first sample in the dataset. To disable this check, set the MLFLOW_GENAI_EVAL_SKIP_TRACE_VALIDATION environment variable to True.


Executing Evaluation Job


Evaluating:   0%|          | 0/32 [Elapsed: 00:00, Remaining: ?] 

