In [0]:
# Install the packages this notebook relies on: Databricks Agents (synthetic evals, datasets, review app),
# the Unity Catalog AI and LangChain integrations, Vector Search, LangChain/LangGraph, pydantic and MLflow.
# NOTE(review): only some of the packages are version-pinned; the others float with --upgrade.

%pip install --upgrade databricks-agents unitycatalog-ai[databricks] unitycatalog-langchain[databricks] databricks-langchain databricks-vectorsearch==0.56 langchain==0.3.20 langgraph==0.3.4 pydantic==2.11.7 mlflow[databricks]
# Restart the Python process so the packages installed above are the ones imported by later cells.
dbutils.library.restartPython()

In [0]:
# Unity Catalog location used throughout the notebook.
CATALOG, SCHEMA = 'media_advertising', 'contextual_advertising'

# Switch the Spark session to that catalog, then to that schema.
for namespace_kind, namespace_name in (("CATALOG", CATALOG), ("SCHEMA", SCHEMA)):
    spark.sql(f"USE {namespace_kind} {namespace_name}")

%md
# Evaluation Process
 
1. Create a synthetic dataset of possible requests using `generate_evals_df()`
2. Create a custom metric that we can leverage in the evaluation process
3. Pass the generated requests to the Agent Evaluation LLM-as-a-judge, using built-in metrics and the custom metric defined in this notebook
4. Spin up human evaluations and have "experts" evaluate
5. Walk through an example human review via the review app, and show how it can be synced to a dataset. (not included in this notebook)



# Generate Synthetic Data

First we'll grab some scripts from our scripts database, and then create a custom synthetic dataset based on our problem

In [0]:
from databricks.agents.evals import generate_evals_df
from pyspark.sql.functions import concat, lit, col

# Reuse the notebook-level Unity Catalog settings under the names the later cells expect.
catalog_name, schema_name = CATALOG, SCHEMA

# Load in scripts and get a random (seeded) sample to generate mock examples.
volume_path = f'/Volumes/{catalog_name}/{schema_name}/scripts'
movie_scripts_df = spark.read.format('delta').load(volume_path)
# Raise or lower `fraction` to increase/decrease sample size.
random_movies_scripts = movie_scripts_df.sample(withReplacement=False, fraction=0.03, seed=42)

# Shape the sample into the two columns handed to generate_evals_df: `content` and `doc_uri`.
uri_constant = f'{catalog_name}.{schema_name}.raw_movie_scripts'
content_col = col("script").alias("content")
doc_uri_col = concat(lit(uri_constant), lit("_"), col("unique_movie_id")).alias("doc_uri")
sample_df = random_movies_scripts.select(content_col, doc_uri_col)

display(sample_df.limit(10))

# generate_evals_df

**Purpose**: Generate a synthetic dataset of requests to use to evaluate our agent. 

- `question_guidelines`: Gives instructions how to generate the required input data to bootstrap an eval dataset
- `agent_description`: Gives an overview of the purpose of the agent to help guide the questions
- `docs`: Documents that are used for the sample request generation. 
- `num_evals`: The number of observations to create (total, not per document)

We're using a random sample of scripts (about 3% of the scripts table) as the source documents to generate 50 requests, and are asking the model to create generic user requests (rather than movie specific). 

In [0]:
# Define the synthetic data generation

# Guidance for the request generator: who is asking, sample questions, and hard rules.
question_guidelines = """
# User personas
- An account executive who is responsible for contextual ad placement within shows
- An enterprise executive who is responsible for the P&L and wants to optimize ad placement in shows

# Example questions
- When could I insert a commercial for a light hearted comedy movie we want to promote for next summer?
- When could I insert a commercial for a boys and girls club non-profit ad campaign?

# Additional Guidelines
- Question should be succinct with the goal of optimizing the relevance of advertising within a script.
- The question should be generic, use the documents as a generalized framework to ask questions about movies.
- NEVER reference specific scenes or characters. The full application will be asking questions across multiple scripts at once, not a specific show.
"""

# Overview of the agent's purpose, used to keep the generated questions on topic.
agent_description = """
The Agent is a RAG chatbot that aims to recommend the optimal placement for advertising within scripts. The scripts are movies, but they are still intending to air commercials even though that is traditionally associated with TV. The Agent has access to a movie metadata and genre, and its task is to answer the user's questions by retrieving the relevant script chunks from the corpus and synthesizing a helpful, accurate response of where it makes sense to insert the ad placement. End users will be using this agent across many scripts at once, so questions will be generic across the full script database, rather than about specific shows.
"""

# Generate 50 synthetic requests in total from the sampled scripts.
eval_df = generate_evals_df(
    docs=sample_df,
    num_evals=50,
    agent_description=agent_description,
    question_guidelines=question_guidelines,
)

# For Responses Agent: re-key each row's chat messages under 'input'.
responses_agent_inputs = []
for generated_inputs in eval_df['inputs']:
    responses_agent_inputs.append({'input': generated_inputs['messages']})
eval_df['inputs'] = responses_agent_inputs

display(eval_df)

**Key outputs of the synthetic data generated**

- `request`: This is the mock user request that we will use with our agent
- `expected_retrieved_content`: The context the evaluation has been sourced from. This _can_ be useful, but it's restricted to a single document, so might not be as useful depending on the application. We will not be using that for this exercise.
- `expected_facts`: Expected facts that should be returned in the response. Similar to `expected_retrieved_content`, this _can_ be useful, but it's restricted to a single document, so might not be as useful depending on the application. We will not be using that for this exercise.
- `source_id`: The document used to produce the observation

More detail can be found here: https://docs.databricks.com/aws/en/generative-ai/agent-evaluation/synthesize-evaluation-set

# Generate evaluations

`evaluate` is the backbone of the evaluation process, and you can generate evaluations in multiple ways. In this case, we're going to be pre-computing request-response pairs, and pass those to the LLM as a judge. As an alternative, you can pass requests only and have your agent compute responses on the fly, which are then evaluated. This approach allows a bit more flexibility where you can customize what is showing up in the request and response pairs, as well as pull in observations that are potentially happening outside of a Databricks-built Agent/Endpoint. In this case, we're just going to pass the full input/output returned by the model to be evaluated.

Additionally we've provided some guidelines for the judge to leverage when evaluating each observation.

So in the cell below, we'll be calling our endpoint with requests we generated in the synthetic data generation process, and computing responses from our agent. It should take roughly 20 minutes to generate 50 examples on Serverless compute.

For more background on passing in evaluations, see this documentation: https://docs.databricks.com/aws/en/generative-ai/agent-evaluation/evaluation-schema

In [0]:
import mlflow
from mlflow.deployments import get_deploy_client
from databricks.agents import datasets
# Leverage the created mock dataset for evaluation:
#   1. send every synthetic request to the deployed agent endpoint and keep the request/response pairs,
#   2. build judge-ready records from those pairs,
#   3. register a small subset of the requests as the Review App dataset.

# Add in a guideline about genre etc...?

agent_endpoint = 'agents_media_advertising-contextual_advertising-movie_scripts_c'

# Guidelines attached to every observation, both for the LLM judge and for the Review App records.
guidelines = ['The retrieved content from the script database must be contextually relevant to the user request.',
              'The retreived content must be relevant to making ad placement decisions.']

# The synthetic-data cell re-keys each `inputs` entry to {'input': <messages>} (Responses Agent shape),
# so the chat messages live under 'input'. Fall back to 'messages' in case that re-keying was not
# applied, instead of failing with a KeyError.
request_list = [
  (cont['input'] if 'input' in cont else cont['messages'])[0]['content']
  for cont in eval_df["inputs"].tolist()
]
client = get_deploy_client()
endpoint = agent_endpoint
agent_output_list = []
for request in request_list: # Call our endpoint and compile the request response pairs
  output = client.predict(endpoint=endpoint, inputs={"messages": [{"role": "user", "content": request}]})
  agent_output_list.append({"request": request, "response": output['choices'][0]['message']['content']})

# Create evaluation dataset
# Could also add in the expected retrieved content and expected facts, but per the commentary above they don't fit well because they are generated from one document.

evals = [{
  "inputs": {'query': output['request']},
  "response": output['response'],
  "guidelines": guidelines
} for output in agent_output_list]

# Below is the dataset that is used for the Review App

uc_eval_dataset = f'{catalog_name}.{schema_name}.review_app_dataset'
# The dataset is dropped and recreated on every run so that reruns start from a clean dataset.
# NOTE(review): on the very first run the dataset presumably does not exist yet, so the delete may
# fail — comment it out for that run (confirm against the databricks.agents.datasets API).
datasets.delete_dataset(uc_eval_dataset)
dataset = datasets.create_dataset(uc_eval_dataset)
# Review App records carry only the request; the response is generated live during review.
evals_app = [{
  "request": {'messages': [{"role": "user", "content": output['request']}]},
  "guidelines": guidelines
} for output in agent_output_list[10:15]] # Pick specific observations if interested - this is an arbitrary slice of the list
dataset.merge_records(evals_app)

# Execute Evaluation

We're passing in the dataset we generated of request-response pairs, and including the custom metric defined in the next cell (`script_fit_custom_metric`), as well as some of the built-in metrics that are useful for us in this context. 

When the execution job completes, click on the generated link to view the results in the Tracing tab, and dig into any problematic observations.

In [0]:
import mlflow
import json
from mlflow.genai.scorers import scorer
from mlflow.entities import AssessmentSource, AssessmentSourceType, Feedback
from typing import Any, Optional
from databricks.sdk import WorkspaceClient

# OpenAI-compatible client obtained from the workspace's serving endpoints; used to call the judge model.
# NOTE: this rebinds `client`, which an earlier cell bound to the MLflow deployments client.
w = WorkspaceClient()
client = w.serving_endpoints.get_open_ai_client()

# Define the prompts for the Judge LLM.
# System prompt: sets the 1-5 scoring rubric and requires a JSON object with "score" and "rationale".
judge_system_prompt = """
You are an impartial AI assistant responsible for evaluating the quality of a response generated by another AI model.
Your evaluation should be based on the original user query and the AI's response.
The context of the conversations is an user looking to find the most relevant script for their advertising scenario
Provide a quality score as an integer from 1 to 5 (1=Poor, 2=Fair, 3=Good, 4=Very Good, 5=Excellent) that reflects the scene relevance.
A 5 should reflect a perfect fit, a 1 should reflect a very weak or completely incorrect fit.
Also, provide a brief rationale for your score in under 200 tokens. Be a very harsh critic and give out 5s sparingly. 
ONLY evaluate the final answer given, do not evaluate the options provided from the vector search tool call process

Your output MUST ONLY be a single valid JSON object with two keys: "score" (an integer) and "rationale" (a string).
The format must conform to this format
Example:
{"score": 4, 
 "rationale": "The scene returned fit the user request well but not perfectly and the format was correct."}
"""
# User prompt template: `{user_query}` and `{llm_response_from_app}` are filled in with str.format by the scorer.
judge_user_prompt = """
Please evaluate the AI's Response below based on the Original User Query.

Original User Query:
```{user_query}```

AI's Response:
```{llm_response_from_app}```

Provide your evaluation strictly as a JSON object with "score" and "rationale" keys.
"""

# Serving endpoint used as the judge; also recorded as the source of every assessment.
_JUDGE_MODEL = "databricks-meta-llama-3-3-70b-instruct"


def _parse_judge_output(raw_text: Optional[str]) -> tuple[int, str]:
    """Extract ``(score, rationale)`` from the judge's reply.

    The judge is told to answer with a bare JSON object, but a model may still
    wrap it in Markdown fences or surround it with a sentence, so only the span
    from the first ``{`` to the last ``}`` is parsed.

    Raises:
        ValueError: if the reply contains no JSON object or the object is not
            valid JSON (``json.JSONDecodeError`` is a ``ValueError`` subclass).
        KeyError: if the ``score`` or ``rationale`` key is missing.
    """
    text = raw_text or ""
    start, end = text.find("{"), text.rfind("}")
    if start == -1 or end < start:
        raise ValueError(f"Judge did not return a JSON object: {text!r}")
    payload = json.loads(text[start:end + 1])
    return int(payload["score"]), payload["rationale"]


@scorer
def script_fit_custom_metric(inputs: dict[str, Any], outputs: str) -> Feedback:
    """Score how well the agent's answer fits the user's ad-placement request.

    An LLM judge rates the response using the notebook-level
    ``judge_system_prompt`` and ``judge_user_prompt`` templates.

    Args:
        inputs: Inputs of the evaluation row; the user query is read from the
            last message in ``inputs["input"]``.
        outputs: The agent's response, interpolated into the judge prompt.

    Returns:
        A ``Feedback`` whose ``value`` is the judge's integer score and whose
        ``rationale`` is the judge's explanation, attributed to the judge model.

    Raises:
        ValueError, KeyError: if the judge's reply cannot be parsed.
    """
    user_query = inputs["input"][-1]["content"]
    # Call the Judge LLM using the OpenAI SDK client.
    judge_llm_response_obj = client.chat.completions.create(
        model=_JUDGE_MODEL,  # Databricks-hosted Llama 3.3 70B. If you provide your own OpenAI credentials, replace with a valid OpenAI model e.g., gpt-4o-mini, etc.
        messages=[
            {"role": "system", "content": judge_system_prompt},
            {"role": "user", "content": judge_user_prompt.format(user_query=user_query, llm_response_from_app=outputs)},
        ],
        max_tokens=200,  # Max tokens for the judge's whole reply (score + rationale)
        temperature=0.0, # For more deterministic judging
    )

    # Parse the Judge LLM's JSON output once, tolerating fences/prose around the object.
    parsed_score, parsed_rationale = _parse_judge_output(
        judge_llm_response_obj.choices[0].message.content
    )

    return Feedback(
        value=parsed_score,
        rationale=parsed_rationale,
        # Set the source of the assessment to indicate the LLM judge used to generate the feedback
        source=AssessmentSource(
            source_type=AssessmentSourceType.LLM_JUDGE,
            source_id=_JUDGE_MODEL,
        )
    )

In [None]:
from agent import AGENT
from mlflow.genai.scorers import (
    Guidelines,
    RelevanceToQuery,
    Safety,
)
import pandas as pd
from typing import Any

# Turn on MLflow autologging for LangChain before the agent is invoked.
mlflow.langchain.autolog()

with mlflow.start_run(run_name="Movie-Eval-LLM"):
    # Built-in judges plus the custom script-fit judge defined earlier in the notebook.
    eval_scorers = [
        RelevanceToQuery(),
        Safety(),
        # You can have any number of guidelines.
        Guidelines(
            name="ReturnFormat",
            guidelines="Assess whether output contains Movie, Scene Number, Scene Description, Scene Justification",
        ),
        script_fit_custom_metric,
    ]
    eval_results = mlflow.genai.evaluate(
        data=eval_df,
        # NOTE(review): the lambda's parameter is named `input` (shadowing the builtin); presumably it
        # has to match the key of each row's `inputs` dict — confirm before renaming.
        predict_fn=lambda input: AGENT.predict({"input": input}),
        scorers=eval_scorers,
    )

# Human Evaluation

Now we've got a robust LLM-as-a-Judge process, but for some problems (particularly high-value or high-risk problems, or problems that require significant contextual and subject matter understanding), human evaluation is necessary to feel confident in the solution.

The **Review App** comes by default when you deploy a model. It can be used for custom labeling sessions, or just interacting directly with the agent. In this case, it is just enabled for free-form conversation with the agent.

The Review App can be found at this location: https://e2-demo-west.cloud.databricks.com/ml/review-v2/chat?endpoint=agents_media_advertising-contextual_advertising-movie_scripts_c

# END OF DEMO in E2-DEMO-WEST-WS

# The Code Below will not work in this Environment

Due to lack of permissions. But it will work in other environments (including e2-field-demo-west). Taking this repo and leveraging in other more permissive environments will enable the functionality described below (labeling sessions for SMEs)

More detail can be found at this link: https://docs.databricks.com/aws/en/generative-ai/agent-evaluation/review-app

In [0]:
from databricks import agents
import mlflow
from databricks.agents import review_app

# Grant reviewers query access to the registered agent model.
catalog_name = CATALOG
schema_name = SCHEMA
model_name = "movie_scripts_chatbot_agent" # Change to a different model name if desired
UC_MODEL_NAME = f"{catalog_name}.{schema_name}.{model_name}"

# `uc_registered_model_info` is not defined anywhere in this notebook (presumably it comes from the
# notebook that registers the model). Guard the lookup so a fresh kernel does not fail with a
# NameError before permissions are set; the version is not needed by set_permissions below.
try:
  uc_model_version = uc_registered_model_info.version # Update to a different version if the endpoint iterates
except NameError:
  uc_model_version = None
  print("uc_registered_model_info is not defined in this session; set uc_model_version manually if a specific version is needed.")
user_list = ["INSERT USERS"] # Update with users that need permissions

# Set a list of users to review the app
# Note that <user_list> can specify individual users or groups.
agents.set_permissions(model_name=UC_MODEL_NAME, users=user_list, permission_level=agents.PermissionLevel.CAN_QUERY)


In [0]:
from databricks.agents import datasets, review_app

# Look up the Review App for the experiment and attach the deployed agent endpoint to it.

# Environment-specific IDs: replace both when running against another workspace/experiment.
exp_id = 'b822d31cbc8e4124913eabd45ad580bf'  # Experiment ID associated with the deployed model / endpoint
run_id = 'a4e2e4d6b4004be390ffddfe44b6825c'  # Run ID whose traces are read by later cells

script_app = review_app.get_review_app(exp_id).add_agent(
    agent_name="script_agent",
    model_serving_endpoint=agent_endpoint,
    overwrite=True,
)

# Labeling Sessions

Below is a custom labeling session we're asking our subject matter experts to go through.

`create_label_schema` defines the label details - in this case we've defined two different metrics, one is an **expectation**, which gets integrated into guidelines for the LLM as a Judge in the future, and the other as **feedback** which provides detail into the quality of the output for evaluation purposes. Our dataset contains 5 observations, and we're asking our reviewers to provide feedback on both of these metrics for each observation.

In [0]:
# Create label schemas for the review app including instructions
# Customize these as needed

# Traces of the evaluation run; only needed if the add_traces alternative at the bottom is used.
traces = mlflow.search_traces(run_id=run_id) # Need to get run_id

# Free-text expectation collected from reviewers.
quality_exp_label = script_app.create_label_schema(
    name='quality_expectation',
    type='expectation',
    title="Movie Quality Appropriateness for Ads",
    input=review_app.label_schemas.InputText(),
    instruction="Evaluate whether the quality of the movie meets the requirement for the ad placement",
    enable_comment=True,
    overwrite=True,
)

# Numeric 1-5 feedback on the quality of the retrieved content.
quality_label = script_app.create_label_schema(
    name='quality_feedback',
    type='feedback',
    title="Quality of the retrieved content for Ad Placement",
    input=review_app.label_schemas.InputNumeric(min_value=1.0, max_value=5.0),
    instruction="""Determine the quality of the retrieved content for ad placement decisions.
                                          1=The retreived content is not relevant AND could not be used for ad placement decisions
                                          2=The retreived content is not relevant OR could not be used for ad placement decisions
                                          3=The retreived content is relevant and could be used for some ad placement decisions
                                          4=The retreived content is relevant and could be used for most ad placement decisions
                                          5=The retreived content is relevant and could be used for all ad placement decisions""",
    enable_comment=True,
    overwrite=True,
)

# Create a labeling session to evaluate that specific dataset generated
script_labeling_session = script_app.create_labeling_session(
    name="script_demo_session",
    agent='script_agent',
    assigned_users=["INSERT USERS"],
    label_schemas=['quality_expectation', 'quality_feedback'],  # Change to names of any metrics created
)

# Add the records from the dataset to the labeling session.
script_labeling_session.add_dataset(uc_eval_dataset)
# Alternative: label existing traces instead of dataset records.
# NOTE: This copies the traces into this labeling session so that labels do not modify the original traces.
#script_labeling_session.add_traces(traces)
   

The dataset below is what is being updated by the user feedback. The guidelines in there include generic guidelines on each observation, as well as some feedback specific to the observation. In the second example, the specific feedback is not to return violent UFC or boxing movies.

Now let's launch the review app and review an example, then reload the dataset to see it updated - see the link produced after running the cell above

Note that this dataset is a different paradigm than the request-response approach we went through before, in this case just providing a request, and the response is generated in real time (which we will see in the UI!)

In [0]:
import mlflow
from databricks.agents import datasets, review_app

# Re-fetch the Review App and pick its first labeling session.
script_app = review_app.get_review_app()
labeling_sessions = script_app.get_labeling_sessions()
script_labeling_session = labeling_sessions[0]

# Sync the session's expectations into the evaluation dataset, then display the table.
script_labeling_session.sync_expectations(uc_eval_dataset)
synced_dataset_df = spark.read.table(uc_eval_dataset)
display(synced_dataset_df)



The cell below shows some example output generated by our reviewers in the form of feedback

In [0]:
import mlflow
# Check the progress of reviews: print each trace's request next to its assessments.
trace_records = mlflow.search_traces(run_id=run_id).to_dict(orient='records')
for trace_record in trace_records:
  request_text = trace_record["request"]
  assessments = trace_record["assessments"]
  print(f'{request_text}: {assessments}\n')

In [0]:
import base64
import os
from databricks.sdk import WorkspaceClient
from IPython.display import Image

# Ad creative to describe; "catfood.jpg" is the other sample image.
image_name = "dogfood.jpeg"
display(Image(image_name))

# OpenAI-compatible client obtained from the workspace's serving endpoints.
w = WorkspaceClient()
client = w.serving_endpoints.get_open_ai_client()

# Embed the image in the request as a base64 data URI.
with open(image_name, "rb") as image_file:
  image_bytes = image_file.read()
base64_image = base64.b64encode(image_bytes).decode('utf-8')
image_data_uri = f"data:image/jpeg;base64,{base64_image}"

# One user message with two parts: the instruction text and the image itself.
describe_ad_part = {
  "type": "text",
  "text": "Describe this advertisement concisely in one sentence with a description of the brand and what is happening in the image",
}
ad_image_part = {"type": "image_url", "image_url": {"url": image_data_uri}}

response = client.chat.completions.create(
  model="databricks-llama-4-maverick",
  messages=[{"role": "user", "content": [describe_ad_part, ad_image_part]}],
)
print(response.choices[0].message.content)

In [0]:
from mlflow.entities import SpanType

# Point MLflow at the Databricks tracking server.
mlflow.set_tracking_uri("databricks")

# Ask the agent where to place the ad described by the model response from the previous cell.
agent_endpoint = 'agents_media_advertising-contextual_advertising-movie_scripts_p'
agent_input = f"Where should I place the following advertisement: {response.choices[0].message.content}"


@mlflow.trace(span_type=SpanType.LLM)
def invoke(ag_input):
  """Send one user message to the agent endpoint and return the reply text.

  The call is traced as an LLM span, and the notebook-level ``agent_endpoint``
  (read at call time) is recorded on that span under the ``model`` attribute.
  """
  deploy_client = get_deploy_client()
  active_span = mlflow.get_current_active_span()
  # Set the attribute to the span
  active_span.set_attributes({"model": agent_endpoint})
  chat_payload = {"messages": [{"role": "user", "content": ag_input}]}
  prediction = deploy_client.predict(endpoint=agent_endpoint, inputs=chat_payload)
  return prediction['choices'][0]['message']['content']


model_output = invoke(agent_input)

In [0]:
# Show the agent's reply to the ad-placement question built from the image description.
print(model_output)