In [3]:
import os

import openai
import pandas as pd

import mlflow
from mlflow.metrics import make_metric
from mlflow.metrics.genai import EvaluationExample, answer_similarity

assert "OPENAI_API_KEY" in os.environ, "Please set the OPENAI_API_KEY environment variable."


def custom_metric(targets, predictions, answer_similarity):
    print("answer sim", answer_similarity)
    return 8  # Some dummy value


# testing with OpenAI gpt-3.5-turbo
example = EvaluationExample(
    input="What is MLflow?",
    output="MLflow is an open-source platform for managing machine "
    "learning workflows, including experiment tracking, model packaging, "
    "versioning, and deployment, simplifying the ML lifecycle.",
    score=4,
    justification="The definition effectively explains what MLflow is "
    "its purpose, and its developer. It could be more concise for a 5-score.",
    grading_context={
        "targets": "MLflow is an open-source platform for managing "
        "the end-to-end machine learning (ML) lifecycle. It was developed by Databricks, "
        "a company that specializes in big data and machine learning solutions. MLflow is "
        "designed to address the challenges that data scientists and machine learning "
        "engineers face when developing, training, and deploying machine learning models."
    },
)

answer_similarity_metric = answer_similarity(examples=[example])

eval_df = pd.DataFrame(
    {
        "inputs": [
            "What is MLflow?",
            "What is Spark?",
            "What is Python?",
        ],
        "ground_truth": [
            "MLflow is an open-source platform for managing the end-to-end machine learning (ML) lifecycle. It was developed by Databricks, a company that specializes in big data and machine learning solutions. MLflow is designed to address the challenges that data scientists and machine learning engineers face when developing, training, and deploying machine learning models.",
            "Apache Spark is an open-source, distributed computing system designed for big data processing and analytics. It was developed in response to limitations of the Hadoop MapReduce computing model, offering improvements in speed and ease of use. Spark provides libraries for various tasks such as data ingestion, processing, and analysis through its components like Spark SQL for structured data, Spark Streaming for real-time data processing, and MLlib for machine learning tasks",
            "Python is a high-level programming language that was created by Guido van Rossum and released in 1991. It emphasizes code readability and allows developers to express concepts in fewer lines of code than languages like C++ or Java. Python is used in various domains, including web development, scientific computing, data analysis, and machine learning.",
        ],
    }
)

with mlflow.start_run() as run:
    system_prompt = "Answer the following question in two sentences"
    logged_model = mlflow.openai.log_model(
        model="gpt-3.5-turbo",
        task=openai.ChatCompletion,
        artifact_path="model",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": "{question}"},
        ],
    )

    results = mlflow.evaluate(
        logged_model.model_uri,
        eval_df,
        evaluators="default",
        targets="ground_truth",
        model_type="question-answering",
        extra_metrics=[
            answer_similarity_metric,
            make_metric(
                eval_fn=custom_metric,
                greater_is_better=False,
            ),
        ],
    )
    print(results)

    eval_table = results.tables["eval_results_table"]
    print(eval_table)

 - mlflow (current: 2.6.1.dev0, required: mlflow==2.9.2)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
2024/01/31 11:33:04 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2024/01/31 11:33:04 INFO mlflow.models.evaluation.default_evaluator: Computing model predictions.
2024/01/31 11:33:06 INFO mlflow.models.evaluation.default_evaluator: column name predictions
2024/01/31 11:33:06 INFO mlflow.models.evaluation.default_evaluator: input df columns Index(['inputs'], dtype='object')
2024/01/31 11:33:06 INFO mlflow.models.evaluation.default_evaluator: column name metrics
2024/01/31 11:33:06 INFO mlflow.models.evaluation.default_evaluator: input df columns Index(['inputs'], dtype='object')
2024/01/31 11:33:06 INFO mlflow.models.evaluation.default_evaluator: column name inputs
2024/01/31 11:33:06 INFO mlflow.models.evaluation.defaul

  0%|          | 0/1 [00:00<?, ?it/s]

2024/01/31 11:33:19 INFO mlflow.models.evaluation.default_evaluator: column name targets
2024/01/31 11:33:19 INFO mlflow.models.evaluation.default_evaluator: input df columns Index(['inputs'], dtype='object')
2024/01/31 11:33:19 INFO mlflow.models.evaluation.default_evaluator: column name predictions
2024/01/31 11:33:19 INFO mlflow.models.evaluation.default_evaluator: input df columns Index(['inputs'], dtype='object')
2024/01/31 11:33:19 INFO mlflow.models.evaluation.default_evaluator: column name answer_similarity
2024/01/31 11:33:19 INFO mlflow.models.evaluation.default_evaluator: input df columns Index(['inputs'], dtype='object')
2024/01/31 11:33:19 INFO mlflow.models.evaluation.default_evaluator: column name predictions
2024/01/31 11:33:19 INFO mlflow.models.evaluation.default_evaluator: input df columns Index(['inputs'], dtype='object')
2024/01/31 11:33:19 INFO mlflow.models.evaluation.default_evaluator: column name targets
2024/01/31 11:33:19 INFO mlflow.models.evaluation.default

answer sim MetricValue(scores=[4, None, None], justifications=["The model's output accurately describes MLflow as an open-source platform for managing the machine learning lifecycle, including deployment, tracking, and reproducibility of models. However, it does not mention that it was developed by Databricks or the challenges it addresses for data scientists and machine learning engineers, which are included in the target information.", None, None], aggregate_results={'mean': 4.0, 'variance': 0.0, 'p90': 4.0})


2024/01/31 11:33:19 INFO mlflow.models.evaluation.default_evaluator: column name predictions
2024/01/31 11:33:19 INFO mlflow.models.evaluation.default_evaluator: input df columns Index(['inputs'], dtype='object')
2024/01/31 11:33:19 INFO mlflow.models.evaluation.default_evaluator: column name targets
2024/01/31 11:33:19 INFO mlflow.models.evaluation.default_evaluator: input df columns Index(['inputs'], dtype='object')
2024/01/31 11:33:19 INFO mlflow.models.evaluation.default_evaluator: column name metrics
2024/01/31 11:33:19 INFO mlflow.models.evaluation.default_evaluator: input df columns Index(['inputs'], dtype='object')
2024/01/31 11:33:19 INFO mlflow.models.evaluation.default_evaluator: column name predictions
2024/01/31 11:33:19 INFO mlflow.models.evaluation.default_evaluator: input df columns Index(['inputs'], dtype='object')
2024/01/31 11:33:19 INFO mlflow.models.evaluation.default_evaluator: column name targets
2024/01/31 11:33:19 INFO mlflow.models.evaluation.default_evaluator

  0%|          | 0/3 [00:00<?, ?it/s]

2024/01/31 11:33:26 INFO mlflow.models.evaluation.default_evaluator: column name targets
2024/01/31 11:33:26 INFO mlflow.models.evaluation.default_evaluator: input df columns Index(['inputs'], dtype='object')
2024/01/31 11:33:26 INFO mlflow.models.evaluation.default_evaluator: column name predictions
2024/01/31 11:33:26 INFO mlflow.models.evaluation.default_evaluator: input df columns Index(['inputs'], dtype='object')
2024/01/31 11:33:26 INFO mlflow.models.evaluation.default_evaluator: column name answer_similarity
2024/01/31 11:33:26 INFO mlflow.models.evaluation.default_evaluator: input df columns Index(['inputs'], dtype='object')


answer sim MetricValue(scores=[4, 4, 4], justifications=['The output effectively explains what MLflow is and its purpose, aligning with the provided targets in most aspects. However, it does not mention that MLflow was developed by Databricks, which is included in the target information.', "The model's output accurately describes Apache Spark as an open-source distributed computing system used for big data processing and analytics, which aligns well with the provided targets. However, it lacks details about Spark's components and its development history, which prevents it from achieving a perfect score.", 'The output effectively explains what Python is, its uses, and its features, aligning closely with the provided targets. However, it does not mention the creator and the year of release, which are included in the targets, hence it does not fully align in all significant aspects.'], aggregate_results={'mean': 4.0, 'variance': 0.0, 'p90': 4.0})
<mlflow.models.evaluation.base.EvaluationR

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

            inputs                                       ground_truth  \
0  What is MLflow?  MLflow is an open-source platform for managing...   
1   What is Spark?  Apache Spark is an open-source, distributed co...   
2  What is Python?  Python is a high-level programming language th...   

                                             outputs  token_count  \
0  MLflow is an open-source platform for the comp...           48   
1  Spark is an open-source distributed computing ...           35   
2  Python is a high-level, interpreted programmin...           53   

   toxicity/v1/score  flesch_kincaid_grade_level/v1/score  \
0           0.000138                                 13.1   
1           0.000139                                 12.9   
2           0.000139                                 16.8   

   ari_grade_level/v1/score  answer_similarity/v1/score  \
0                      17.6                           4   
1                      14.1                           4   
2       

In [2]:
import os

os.environ["OPENAI_API_KEY"] = "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"