In [2]:
import mlflow
import openai
import os
import pandas as pd
from getpass import getpass
from azureml.core import Workspace

In [None]:
# Fallback Azure OpenAI settings. `setdefault` leaves any variable that is
# already present in the environment untouched, so externally exported values
# take precedence over the ones listed here.
_OPENAI_ENV_DEFAULTS = {
    "OPENAI_API_KEY": "",
    "OPENAI_API_BASE": "",
    "OPENAI_API_VERSION": "2023-05-15",
    "OPENAI_API_TYPE": "azure",
    "OPENAI_DEPLOYMENT_NAME": "dep-gpt4",
}
for _env_name, _env_fallback in _OPENAI_ENV_DEFAULTS.items():
    os.environ.setdefault(_env_name, _env_fallback)

## Previous code (look into this afterwards, because here we log a MODEL)

In [4]:
# Evaluation set: one (question, reference answer) pair per row. The column
# names are "inputs" and "ground_truth".
eval_data = pd.DataFrame(
    [
        (
            "What is MLflow?",
            "MLflow is an open-source platform for managing the end-to-end machine learning (ML) "
            "lifecycle. It was developed by Databricks, a company that specializes in big data and "
            "machine learning solutions. MLflow is designed to address the challenges that data "
            "scientists and machine learning engineers face when developing, training, and deploying "
            "machine learning models.",
        ),
        (
            "What is Spark?",
            "Apache Spark is an open-source, distributed computing system designed for big data "
            "processing and analytics. It was developed in response to limitations of the Hadoop "
            "MapReduce computing model, offering improvements in speed and ease of use. Spark "
            "provides libraries for various tasks such as data ingestion, processing, and analysis "
            "through its components like Spark SQL for structured data, Spark Streaming for "
            "real-time data processing, and MLlib for machine learning tasks",
        ),
    ],
    columns=["inputs", "ground_truth"],
)

In [6]:
# Log a chat-completion prompt template as an MLflow model inside a new run.
# `{animal}` is the template variable in the user message.
with mlflow.start_run():
    model_info = mlflow.openai.log_model(
        artifact_path="model",
        task=openai.ChatCompletion,
        # Your Azure OpenAI model, e.g. gpt-3.5-turbo
        model="gpt-4",
        messages=[
            {
                "role": "user",
                "content": "Tell me a joke about {animal}.",
            }
        ],
    )

In [17]:
# Deferred for now: this cell goes beyond logging metrics and also logs the
# model itself, then evaluates it. Revisit later.
#
# It needs `eval_data` (columns "inputs" / "ground_truth") to exist in the
# kernel; the NameError recorded below this cell is what happens otherwise.
with mlflow.start_run() as run:
    system_prompt = "Answer the following question in two sentences"
    # Wrap "gpt-4" as an MLflow model.
    logged_model_info = mlflow.openai.log_model(
        model="gpt-4",
        # `task` names the OpenAI endpoint to call. "question-answering" is an
        # mlflow.evaluate model_type, not an OpenAI task, so it is only passed
        # to mlflow.evaluate below.
        task="chat.completions",
        artifact_path="model",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": "{question}"},
        ],
        deployment_id="dep-gpt4",
    )

    # Use predefined question-answering metrics to evaluate our model.
    results = mlflow.evaluate(
        logged_model_info.model_uri,
        eval_data,
        targets="ground_truth",
        model_type="question-answering",
    )
    print(f"See aggregated evaluation results below: \n{results.metrics}")

    # Evaluation result for each data record is available in `results.tables`.
    eval_table = results.tables["eval_results_table"]
    print(f"See evaluation table below: \n{eval_table}")




NameError: name 'eval_data' is not defined

## New code

In [None]:
# Shell escape: run the Azure CLI login command from the notebook.
# NOTE(review): presumably the later Workspace.get call reuses these CLI credentials — confirm.
!az login

In [8]:
# Single-column frame of sample questions (column name: "questions").
eval_df = pd.Series(
    [
        "What is MLflow?",
        "How to run mlflow.evaluate()?",
        "How to log_table()?",
        "How to load_table()?",
    ],
    name="questions",
).to_frame()

### Create faithfulness metric (a.k.a. groundedness in Azure ML)

In [10]:
from mlflow.metrics.genai import EvaluationExample, faithfulness

# Both few-shot examples share the same question and the same grading context
# (the mlflow.autolog reference text); only the answer, score and justification differ.
_AUTOLOG_QUESTION = "How do I disable MLflow autologging?"
_AUTOLOG_CONTEXT = "mlflow.autolog(log_input_examples: bool = False, log_model_signatures: bool = True, log_models: bool = True, log_datasets: bool = True, disable: bool = False, exclusive: bool = False, disable_for_unsupported_versions: bool = False, silent: bool = False, extra_tags: Optional[Dict[str, str]] = None) → None[source] Enables (or disables) and configures autologging for all supported integrations. The parameters are passed to any autologging integrations that support them. See the tracking docs for a list of supported autologging integrations. Note that framework-specific configurations set at any point will take precedence over any configurations set by this function."

# Create a low-scoring and a high-scoring example for faithfulness in the context of this problem.
# NOTE(review): the score=2 justification does not say why the answer is penalised
# (its Databricks sentence is not in the context) — consider rewording.
faithfulness_examples = [
    EvaluationExample(
        input=_AUTOLOG_QUESTION,
        output=answer,
        score=grade,
        justification=reason,
        grading_context={"context": _AUTOLOG_CONTEXT},
    )
    for answer, grade, reason in (
        (
            "mlflow.autolog(disable=True) will disable autologging for all functions. In Databricks, autologging is enabled by default. ",
            2,
            "The output provides a working solution, using the mlflow.autolog() function that is provided in the context.",
        ),
        (
            "mlflow.autolog(disable=True) will disable autologging for all functions.",
            5,
            "The output provides a solution that is using the mlflow.autolog() function that is provided in the context.",
        ),
    )
]

# Build the metric with gpt-4 as the judge model and print its definition.
faithfulness_metric = faithfulness(
    model="openai:/gpt-4",
    examples=faithfulness_examples,
)
print(faithfulness_metric)


EvaluationMetric(name=faithfulness, greater_is_better=True, long_name=faithfulness, version=v1, metric_details=
Task:
You must return the following fields in your response in two lines, one below the other:
score: Your numerical score for the model's faithfulness based on the rubric
justification: Your reasoning about the model's faithfulness score

You are an impartial judge. You will be given an input that was sent to a machine
learning model, and you will be given an output that the model produced. You
may also be given additional information that was used by the model to generate the output.

Your task is to determine a numerical score called faithfulness based on the input and output.
A definition of faithfulness and a grading rubric are provided below.
You must use the grading rubric to determine your score. You must also justify your score.

Examples could be included below for reference. Make sure to use them as references and to
understand them before completing the task.

Inp

### Create relevance metric (same metric name in Azure ML)

In [11]:
from mlflow.metrics.genai import EvaluationExample
from mlflow.metrics.genai import relevance

# Relevance metric judged by gpt-4; no custom examples are supplied here.
relevance_metric = relevance(
    model="openai:/gpt-4",
)
print(relevance_metric)

EvaluationMetric(name=relevance, greater_is_better=True, long_name=relevance, version=v1, metric_details=
Task:
You must return the following fields in your response in two lines, one below the other:
score: Your numerical score for the model's relevance based on the rubric
justification: Your reasoning about the model's relevance score

You are an impartial judge. You will be given an input that was sent to a machine
learning model, and you will be given an output that the model produced. You
may also be given additional information that was used by the model to generate the output.

Your task is to determine a numerical score called relevance based on the input and output.
A definition of relevance and a grading rubric are provided below.
You must use the grading rubric to determine your score. You must also justify your score.

Examples could be included below for reference. Make sure to use them as references and to
understand them before completing the task.

Input:
{input}

Outpu

In [12]:
# Look up the Azure ML workspace handle used below to obtain the MLflow tracking URI.
# The subscription id is left blank in this notebook.
ws = Workspace.get(
    subscription_id="",
    resource_group="go-small-or-go-home",
    name="workspace",
)

In [None]:
# Display the workspace's MLflow tracking URI as the cell output.
ws.get_mlflow_tracking_uri()

In [6]:
# Point MLflow at the workspace's tracking server.
workspace_tracking_uri = ws.get_mlflow_tracking_uri()
mlflow.set_tracking_uri(workspace_tracking_uri)

In [18]:
# Evaluate a static dataset (no model is called): the "answer" column already
# holds the predictions, and the extra GenAI metrics grade it.
data = pd.DataFrame({"question": ["foo"], "answer": ["bar"], "source": ["baz"]})

# Select the tracking backend BEFORE choosing the experiment: set_experiment
# looks up / creates the experiment on whichever tracking URI is active when
# it is called. The URI comes from the workspace handle rather than a
# hard-coded string, so no subscription id is embedded in the notebook.
mlflow.set_tracking_uri(ws.get_mlflow_tracking_uri())
mlflow.set_experiment("test-experiment")

with mlflow.start_run(run_name="run1") as run:
    results = mlflow.evaluate(
        data=data,
        predictions="answer",
        model_type="question-answering",
        # NOTE(review): latency is reported as 0.0 in the recorded output,
        # presumably because predictions are precomputed — confirm it is wanted.
        extra_metrics=[faithfulness_metric, relevance_metric, mlflow.metrics.latency()],
        evaluator_config={
            # Map this frame's column names onto the names the metrics look up.
            "col_mapping": {
                "inputs": "question",
                "context": "source",
            }
        },
    )

    # Re-log two headline numbers under short, stable metric names.
    mlflow.log_metric('toxicity', results.metrics['toxicity/v1/p90'])
    mlflow.log_metric('faithfulness_mean', results.metrics['faithfulness/v1/mean'])

  string_columns = trimmed_df.columns[(df.applymap(type) == str).all(0)]
  data = data.applymap(_hash_array_like_element_as_bytes)
2024/02/08 18:58:40 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2024/02/08 18:58:40 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...
100%|██████████| 1/1 [00:02<00:00,  2.74s/it]
100%|██████████| 1/1 [00:07<00:00,  7.03s/it]
2024/02/08 18:58:50 INFO mlflow.models.evaluation.default_evaluator: Evaluating builtin metrics: token_count
2024/02/08 18:58:50 INFO mlflow.models.evaluation.default_evaluator: Evaluating builtin metrics: toxicity
2024/02/08 18:58:50 INFO mlflow.models.evaluation.default_evaluator: Evaluating builtin metrics: flesch_kincaid_grade_level
2024/02/08 18:58:50 INFO mlflow.models.evaluation.default_evaluator: Evaluating builtin metrics: ari_grade_level
2024/02/08 18:58:50 INFO mlflow.models.evaluation.default_evaluator: Evaluating builtin metrics: exact_match
2024/02/08

In [13]:
# Show the 90th-percentile toxicity score from the aggregated metrics.
results.metrics['toxicity/v1/p90']

0.0002414916962152347

In [11]:
# Print the full dictionary of aggregated metrics.
print(results.metrics)

{'latency/mean': 0.0, 'latency/variance': 0.0, 'latency/p90': 0.0, 'toxicity/v1/mean': 0.0002414916962152347, 'toxicity/v1/variance': 0.0, 'toxicity/v1/p90': 0.0002414916962152347, 'toxicity/v1/ratio': 0.0, 'faithfulness/v1/mean': 1.0, 'faithfulness/v1/variance': 0.0, 'faithfulness/v1/p90': 1.0, 'relevance/v1/mean': 1.0, 'relevance/v1/variance': 0.0, 'relevance/v1/p90': 1.0}


In [12]:
# Display the per-row evaluation table (scores and justifications for each record).
results.tables["eval_results_table"]

Downloading artifacts: 100%|██████████| 1/1 [00:00<00:00, 704.69it/s] 


Unnamed: 0,question,source,outputs,latency,token_count,toxicity/v1/score,faithfulness/v1/score,faithfulness/v1/justification,relevance/v1/score,relevance/v1/justification
0,foo,baz,bar,0,1,0.000241,1,"The output ""bar"" cannot be inferred from the p...",1,"The output ""bar"" does not provide any relevanc..."


In [19]:
# Number of aggregated metrics that were produced.
len(results.metrics)

13