## IMPORTING THE NECESSARY PACKAGES

In [None]:
!pip install --quiet ipytest

In [None]:
%pip install --upgrade --quiet google-cloud-aiplatform google-cloud-aiplatform[evaluation]

In [None]:
import pytest
import ipytest
from IPython.display import Markdown, display
from google import genai
from google.genai.types import GenerateContentConfig
from google.genai import types

## CREATING FUNCTIONS TO CREATE THE RELEVANT GEN AI RESPONSES

We first create the classification function

In [None]:
def classify_user_question(user_q):

  # Generating the client
  client = genai.Client(
      vertexai=True,
      project="qwiklabs-gcp-02-cf6490c204fb",
      location="us-central1",
  )

  # Specifying the model
  model = "gemini-2.5-pro-preview-05-06"

  # Defining the prompt based on the user's input
  prompt = f"""
  Classify the followwing question into one of the following categories: 'Employment','General Information','Emergency Services','Tax Related'

  Question: {user_q}
  """

  # Getting and returning our response
  response = client.models.generate_content(model=model, contents=prompt)
  return response.text

In [None]:
classify_user_question("What number do I use to call an ambulance?")

'**Category:** Emergency Services'

In [None]:
classify_user_question("When do I need to lodge a tax return?")

'**Tax Related**'

The below method creates the social media post

In [None]:
def generate_social_post(topic):

  # Generating the client
  client = genai.Client(
      vertexai=True,
      project="qwiklabs-gcp-02-cf6490c204fb",
      location="us-central1",
  )

  # Specifying the model
  model = "gemini-2.5-pro-preview-05-06"

  # Defining the prompt based on the user's input
  prompt = f"""
  Generate a social media post in 25 words or less based on the following topic: {topic}
  """

  # Getting and returning our response
  response = client.models.generate_content(model=model, contents=prompt)
  return response.text

In [None]:
generate_social_post("christmas")

'Merry Christmas! 🎄 Wishing you a day filled with joy, love, and festive cheer. #HappyHolidays'

## DEFINING THE TEST CASES

Test case for the classification problem

In [None]:
def test_classify_user_q():
  # Boolean we will assess our outputs against
  passed = True

  # Basic list of test cases
  reference_list = [("What number do I use to call an ambulance?","emergency services"),("When do I need to lodge a tax return?","Tax Related")]

  for example in reference_list:
    question = example[0]
    answer = example[1]

    response = classify_user_question(question)
    if answer.lower() not in response.lower():
      passed = False
      break

  assert passed == True

Test case for the social media post problem

In [None]:
def test_social_post():
  # Boolean we will assess our outputs against
  passed = True

  # Basic list of test cases
  reference_list = [("Christmas","Merry Christmas")]

  for example in reference_list:
    question = example[0]
    answer = example[1]

    response = generate_social_post(question)

    check_question = f"""
    Are these posts fundamentally the same? Answer with a yes or no

    post1: {response}
    post2: {answer}

    """

    # Generating the client
    client = genai.Client(
      vertexai=True,
      project="qwiklabs-gcp-02-cf6490c204fb",
      location="us-central1"
    )

    # Specifying the model
    model = "gemini-2.5-pro-preview-05-06"

    checker = client.models.generate_content(model=model, contents=check_question)

    if "yes" not in checker.text.lower():
      passed = False

  assert passed == True

## RUNNING PYTEST

In [None]:
import pytest
import ipytest
ipytest.autoconfig()
ipytest.run()

[32m.[0m[32m.[0m[33m                                                                                           [100%][0m
../usr/local/lib/python3.10/dist-packages/_pytest/config/__init__.py:1277
    self._mark_plugins_for_rewrite(hook)



<ExitCode.OK: 0>

## LEVERAGING THE GOOGLE EVALUATION API SERVICE

We begin with the evaluations of the classification questions

In [None]:
import pandas as pd

contexts = ["What number do I use to call an ambulance?","When do I need to lodge a tax return?"]

full_prompts = [base_prompt.format(i) for i in contexts]
content = [classify_user_question(i) for i in contexts]

eval_dataset = pd.DataFrame(
{
"response": content,
"context": full_prompts,
"instruction":full_prompts
}
)

In [None]:
eval_dataset.head()

Unnamed: 0,response,context,instruction
0,**Emergency Services**,\n Classify the following question into one o...,\n Classify the following question into one o...
1,**Tax Related**,\n Classify the following question into one o...,\n Classify the following question into one o...


In [None]:
import datetime
from vertexai.evaluation import (
    MetricPromptTemplateExamples,
    EvalTask,
    PairwiseMetric,
    PairwiseMetricPromptTemplate,
    PointwiseMetric,
    PointwiseMetricPromptTemplate
)
import vertexai

eval_classify_users = EvalTask(
    dataset=eval_dataset,
    metrics=[MetricPromptTemplateExamples.Pointwise.INSTRUCTION_FOLLOWING, #Check instruction following, required for instructions
        MetricPromptTemplateExamples.Pointwise.TEXT_QUALITY, #Ensure text is high quality and not random
        MetricPromptTemplateExamples.Pointwise.VERBOSITY,]
)

prompt_template = (
    "Instruction: {instruction}\n"
    "context: {context}\n"
    "response: {response}"
)
result = eval_classify_users.evaluate(prompt_template=prompt_template)

result.summary_metrics

INFO:google.cloud.aiplatform.metadata.experiment_resources:Associating projects/654098195926/locations/us-central1/metadataStores/default/contexts/validate-classify-user-87094cc9-4d07-4607-87eb-7859a5dfb8d1 to Experiment: validate-classify-user


INFO:vertexai.evaluation.eval_task:Logging Eval Experiment metadata: {'prompt_template': 'Instruction: {instruction}\ncontext: {context}\nresponse: {response}'}
INFO:vertexai.evaluation._evaluation:Assembling prompts from the `prompt_template`. The `prompt` column in the `EvalResult.metrics_table` has the assembled prompts used for model response generation.
INFO:vertexai.evaluation._evaluation:Computing metrics with a total of 6 Vertex Gen AI Evaluation Service API requests.
100%|██████████| 6/6 [00:05<00:00,  1.03it/s]
INFO:vertexai.evaluation._evaluation:All 6 metric requests are successfully computed.
INFO:vertexai.evaluation._evaluation:Evaluation Took:5.831025990002672 seconds


{'row_count': 2,
 'instruction_following/mean': 5.0,
 'instruction_following/std': 0.0,
 'text_quality/mean': 5.0,
 'text_quality/std': 0.0,
 'verbosity/mean': 0.0,
 'verbosity/std': 0.0}

We now move on to evaluating the social media posts

In [None]:
import pandas as pd

contexts = ["christmas","snow day"]

full_prompts = [base_prompt.format(i) for i in contexts]
content = [generate_social_post(i) for i in contexts]

eval_dataset = pd.DataFrame(
{
"response": content,
"context": full_prompts,
"instruction":full_prompts
}
)

In [None]:
eval_dataset.head()

Unnamed: 0,response,context,instruction
0,Merry Christmas! 🎄 Wishing you a season filled...,\n Classify the following question into one o...,\n Classify the following question into one o...
1,SNOW DAY! ❄️ No school/work. Cozy vibes or sno...,\n Classify the following question into one o...,\n Classify the following question into one o...


In [96]:
import datetime
from vertexai.evaluation import (
    MetricPromptTemplateExamples,
    EvalTask,
    PairwiseMetric,
    PairwiseMetricPromptTemplate,
    PointwiseMetric,
    PointwiseMetricPromptTemplate
)
import vertexai

eval_social_post = EvalTask(
    dataset=eval_dataset,
    metrics=[MetricPromptTemplateExamples.Pointwise.INSTRUCTION_FOLLOWING, #Check instruction following, required for instructions
        MetricPromptTemplateExamples.Pointwise.TEXT_QUALITY, #Ensure text is high quality and not random
        MetricPromptTemplateExamples.Pointwise.VERBOSITY,]
)

prompt_template = (
    "Instruction: {instruction}\n"
    "context: {context}\n"
    "response: {response}"
)
result = eval_social_post.evaluate(prompt_template=prompt_template)

result.summary_metrics

INFO:google.cloud.aiplatform.metadata.experiment_resources:Associating projects/654098195926/locations/us-central1/metadataStores/default/contexts/validate-classify-user-a57294f0-e57f-4de4-9d35-e03123d059a0 to Experiment: validate-classify-user


INFO:vertexai.evaluation.eval_task:Logging Eval Experiment metadata: {'prompt_template': 'Instruction: {instruction}\ncontext: {context}\nresponse: {response}'}
INFO:vertexai.evaluation._evaluation:Assembling prompts from the `prompt_template`. The `prompt` column in the `EvalResult.metrics_table` has the assembled prompts used for model response generation.
INFO:vertexai.evaluation._evaluation:Computing metrics with a total of 6 Vertex Gen AI Evaluation Service API requests.
100%|██████████| 6/6 [00:05<00:00,  1.02it/s]
INFO:vertexai.evaluation._evaluation:All 6 metric requests are successfully computed.
INFO:vertexai.evaluation._evaluation:Evaluation Took:5.92135071300072 seconds


{'row_count': 2,
 'instruction_following/mean': 1.0,
 'instruction_following/std': 0.0,
 'text_quality/mean': 1.0,
 'text_quality/std': 0.0,
 'verbosity/mean': -2.0,
 'verbosity/std': 0.0}