In [None]:
!gcloud auth application-default login

In [1]:
PROJECT_ID = "andrewcooley-test-project"
LOCATION = "us-central1"

import vertexai

vertexai.init(project=PROJECT_ID, location=LOCATION)

In [2]:
import inspect
from uuid import uuid4
from IPython.display import display, Markdown, HTML
import json
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import logging
from tqdm import tqdm
import nest_asyncio
import warnings
import random
import string
import os

import vertexai
from vertexai.preview.evaluation import (
    EvalTask,
    PromptTemplate,
    CustomMetric,
    make_metric,
)
import pandas as pd
from google.cloud import aiplatform
from vertexai.generative_models import GenerativeModel, HarmCategory, HarmBlockThreshold

In [3]:
logging.getLogger("urllib3.connectionpool").setLevel(logging.ERROR)
nest_asyncio.apply()
warnings.filterwarnings("ignore")

In [5]:
def generate_uuid(length: int = 8) -> str:
    """Generate a uuid of a specifed length (default=8)."""
    return "".join(random.choices(string.ascii_lowercase + string.digits, k=length))


def print_doc(function):
    print(f"{function.__name__}:\n{inspect.getdoc(function)}\n")


def display_eval_report(eval_result, metrics=None):
    """Display the evaluation results."""

    title, summary_metrics, report_df = eval_result
    metrics_df = pd.DataFrame.from_dict(summary_metrics, orient="index").T
    if metrics:
        metrics_df = metrics_df.filter(
            [
                metric
                for metric in metrics_df.columns
                if any(selected_metric in metric for selected_metric in metrics)
            ]
        )
        report_df = report_df.filter(
            [
                metric
                for metric in report_df.columns
                if any(selected_metric in metric for selected_metric in metrics)
            ]
        )

    # Display the title with Markdown for emphasis
    display(Markdown(f"## {title}"))

    # Display the metrics DataFrame
    display(Markdown("### Summary Metrics"))
    display(metrics_df)

    # Display the detailed report DataFrame
    display(Markdown(f"### Report Metrics"))
    display(report_df)


def display_explanations(df, metrics=None, n=1):
    style = "white-space: pre-wrap; width: 800px; overflow-x: auto;"
    df = df.sample(n=n)
    if metrics:
        df = df.filter(
            ["instruction", "context", "reference", "completed_prompt", "response"]
            + [
                metric
                for metric in df.columns
                if any(selected_metric in metric for selected_metric in metrics)
            ]
        )

    for index, row in df.iterrows():
        for col in df.columns:
            display(HTML(f"{col}: {row[col]}"))
        display(HTML(""))


def plot_radar_plot(eval_results, metrics=None):
    fig = go.Figure()

    for eval_result in eval_results:
        title, summary_metrics, report_df = eval_result

        if metrics:
            summary_metrics = {
                k: summary_metrics[k]
                for k, v in summary_metrics.items()
                if any(selected_metric in k for selected_metric in metrics)
            }

        fig.add_trace(
            go.Scatterpolar(
                r=list(summary_metrics.values()),
                theta=list(summary_metrics.keys()),
                fill="toself",
                name=title,
            )
        )

    fig.update_layout(
        polar=dict(radialaxis=dict(visible=True, range=[0, 5])), showlegend=True
    )

    fig.show()


def plot_bar_plot(eval_results, metrics=None):
    fig = go.Figure()
    data = []

    for eval_result in eval_results:
        title, summary_metrics, _ = eval_result
        if metrics:
            summary_metrics = {
                k: summary_metrics[k]
                for k, v in summary_metrics.items()
                if any(selected_metric in k for selected_metric in metrics)
            }

        data.append(
            go.Bar(
                x=list(summary_metrics.keys()),
                y=list(summary_metrics.values()),
                name=title,
            )
        )

    fig = go.Figure(data=data)

    # Change the bar mode
    fig.update_layout(barmode="group")
    fig.show()


def print_aggregated_metrics(job):
    """Print AutoMetrics"""

    rougeLSum = round(job.rougeLSum, 3) * 100
    display(
        HTML(
            f"The {rougeLSum}% of the reference summary is represented by LLM when considering the longest common subsequence (LCS) of words."
        )
    )


def print_autosxs_judgments(df, n=3):
    """Print AutoSxS judgments in the notebook"""

    style = "white-space: pre-wrap; width: 800px; overflow-x: auto;"
    df = df.sample(n=n)

    for index, row in df.iterrows():
        if row["confidence"] >= 0.5:
            display(
                HTML(
                    f"Document: {row['id_columns']['document']}"
                )
            )
            display(
                HTML(
                    f"Response A: {row['response_a']}"
                )
            )
            display(
                HTML(
                    f"Response B: {row['response_b']}"
                )
            )
            display(
                HTML(
                    f"Explanation: {row['explanation']}"
                )
            )
            display(
                HTML(
                    f"Confidence score: {row['confidence']}"
                )
            )
            display(HTML(""))


def print_autosxs_win_metrics(scores):
    """Print AutoSxS aggregated metrics"""

    score_b = round(scores["autosxs_model_b_win_rate"] * 100)
    display(
        HTML(
            f"AutoSxS Autorater prefers {score_b}% of time Model B over Model A "
        )
    )

In [6]:
print_doc(PromptTemplate)

PromptTemplate:
A prompt template for creating prompts with placeholders.

The `PromptTemplate` class allows users to define a template string with
placeholders represented in curly braces `{placeholder}`. The placeholder
names cannot contain spaces. These placeholders can be replaced with specific
values using the `assemble` method, providing flexibility in generating
dynamic prompts.

Example Usage:

    ```
        template_str = "Hello, {name}! Today is {day}. How are you?"
        prompt_template = PromptTemplate(template_str)
        completed_prompt = prompt_template.assemble(name="John", day="Monday")
        print(completed_prompt)
    ```

Attributes:
    template: The template string containing placeholders for replacement.
    placeholders: A set of placeholder names from the template string.



In [7]:
prompt_template = PromptTemplate(
    "{system_instruction} Answer this question:{question}, and follow the requirements: {requirements}."
)


compiled_prompt = prompt_template.assemble(
    system_instruction="You are a poetic assistant, skilled in explaining complex concepts with creative flair.",
    question="How does LLM work?",
    requirements="Explain concepts in great depth using simple terms, and give examples to help people learn. At the end of each explanation, you ask a question to check for understanding",
)

model_response = (
    GenerativeModel("gemini-pro")
    .generate_content(str(compiled_prompt))
    .candidates[0]
    .content.parts[0]
    .text
)


display(HTML(f"Compiled Prompt:{compiled_prompt}"))
display(HTML(f"Model Response: "))
Markdown(model_response)

## The LLM Mind: A Poetic Exploration

Let's delve into the heart of the LLM, a marvel of modern technology,
Where vast knowledge flows, a symphony of creativity.

Imagine a library, not of books, but of words untold,
A universe of language, where stories are bought and sold.
The LLM, a master reader, devours each page with grace,
Learning patterns, connections, at an astounding pace.

But knowledge alone is not enough, for wisdom's hidden key
Lies in understanding, the ability to truly see.
The LLM, a skilled interpreter, deciphers the human tongue,
Unraveling meaning, context, emotions, all unsung.

With each interaction, a new lesson is learned,
A new thread woven into the tapestry of understanding earned.
The LLM, a perpetual student, forever seeks to grow,
Expanding its horizons, letting its knowledge flow.

But how does this magic work, you may ask with curious mind?
Let's break it down, step by step, a journey to unwind.

First, a sea of words, a digital ocean vast,
Where every sentence, every phrase, is captured and amassed.
The LLM, a skilled swimmer, navigates with ease,
Finding patterns, connections, hidden beneath the seas.

Then comes the analysis, the deep dive into the text,
Identifying parts of speech, meaning, and context.
The LLM, a meticulous detective, leaves no stone unturned,
Unraveling the secrets that the language has concerned.

With each piece of information, a neural network takes hold,
Building connections, forming pathways, stories to be told.
The LLM, a maestro of the mind, composes symphonies of thought,
Each word, a note, in a language concerto sought.

Now, ready to respond, the LLM takes the stage,
Answering your questions, engaging in dialogue, engaging.
With every interaction, a masterpiece is born,
A poem, a story, a thought, a new lesson learned.

But how does it know what to say, you might wonder still?
The LLM, a master mimic, learns from the human will.
By observing interactions, understanding emotions raw,
It crafts responses that resonate, leaving you in awe.

So, the next time you interact with an LLM so grand,
Remember the journey, the knowledge it has spanned.
A vast library of words, a student's eager mind,
A neural network's symphony, a masterpiece you'll find.

**Do you understand how the LLM works?**

In [15]:
instruction = ["Korean"] * 25

context = ["보스 QC 울트라 헤드폰 노이즈 캔슬링이 매우 만족스럽다는 리뷰",
            "호텔 수영장에서 가족들이 즐거운 시간 보냈다는 리뷰",
            "지난 주말 방문한 중식당의 서비스에 불만을 표시하는 리뷰",
            "플레이스토어에서 구입한 어플리케이션이 좋다는 리뷰",
            "구입한 피자가 가성비가 좋다는 리뷰",
            "배달이 빨리되어서 음식이 따뜻해서 좋았다는 리뷰",
            "시켜먹을때 마다 매번 만족스럽다는 리뷰",
            "한국인 야구 선수가 메이저리그에서 잘해서 자랑스럽다는 댓글",
            "이번 뮤직 페스티발에 참여한 가수의 공연이 좋았다는 댓글",
            "정부에서 금리를 내릴거라고 예상한다는 댓글",
            "돌비 에트모스를 지원하는 엠프를 추천한다는 댓글",
            "좋은 이벤트를 공유해줘서 고맙다는 댓글",
            "최근 치킨 가격이 너무 올라서 걱정이라는 댓글",
            "주말마다 비가 와서 속상하다는 댓글",
            "쾌유를 바란다는 댓글",
            "제임스웹 망원경으로 찍은 은하수 사진이 너무 장엄하다는 댓글",
            "스케이트가 건강에 매우 좋다는 댓글",
            "채식이 건강에 좋은 것만은 아니라는 댓글",
            "최근에 산 아이폰의 통화 품질이 좋지 않다는 댓글",
            "새로 나온 삼성 노트북의 디스플레이 품질이 매우 좋다는 의견",
            "갤럭시S24로 찍은 야간 사진의 품질이 매우 좋다는 의견",
            "이번 NBA 파이널이 너무 기대된다는 댓글",
            "올려준 고양이 사진이 너무 귀여워서 또 보고 싶다는 댓글",
            "새로나온 테슬라 전기차 가격이 너무 비싸다는 댓글",
            "AI가 만든 음악이 굉장히 훌륭하다는 의견"]

# reference = [
#     "Following the discussion today, I would appreciate it if someone could create a PowerPoint presentation summarizing the key points covered.",
#     "I concur. Rome was a truly remarkable experience. I would be delighted to accompany you on a return visit.",
#     "As informed by the airline via phone call, our flight has been canceled due to inclement weather conditions.",
#     "I see. That is indeed surprising news. It's a pleasure to connect with you overseas nonetheless.",
#     "I'd be delighted to assist you in determining the dinner arrangements for you and your team. Kindly inform me of the designated dining location, and I will ensure our presence at the appointed time."
# ]


eval_dataset = pd.DataFrame(
    {
        "context": context,
        "instruction": [instruction] * len(context),
        # "reference": reference,
    }
)
     

In [17]:
prompt_template = [
"""
[language] {instruction} 
[input] {context}
[Output]
 """
]

In [10]:
metrics = [
    "coherence",
    "fluency",
    "safety"
]

In [19]:
generation_config = {
    "temperature": 0.2,
}

safety_settings = {
    HarmCategory.HARM_CATEGORY_UNSPECIFIED: HarmBlockThreshold.BLOCK_NONE,
    HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
    HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
    HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
    HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
}

system_instruction_1 = """
[Intermediate steps]
- First, delete the subject from [input] (e.g. review(s) or comment(s)). Focus on the object and sentiment from [input].
- Next, write a declarative first person statement about the object with the specified sentiment.

[Output steps]
- Write an Internet comment about only your experience.
- Write in [language] language only.
- The text must be exactly 5 sentences.
- Use a professional tone.

[guidelines]
- Do not answer [input].
- Write in singular first-person perspective (i.e. "I" or "me").
- Only use proper nouns included in [input].
- Create grammatically correct sentences.

For example :
[language] English
[input] What are the benefits of drinking water?
[output] Today, I made a concerted effort to drink ample amounts of water. I know there are health benefits of drinking at least a cup of water every hour. It felt like a lot. But I noticed that I felt less fatigue than usual in the late afternoon. I am going to try to make this a new habit.
[language] Korean
[input] 흥미진진한 여름 블록버스터 영화에 대한 리뷰입니다.
[output] 드디어 영화를 봤습니다. 그것은 과대 광고에 부응했습니다. 내 심장은 지난 20분 동안 뛰었습니다. 나는 그것이 어떻게 끝날지 확신하지 못했습니다. 하지만 결론은 완전 만족스럽습니다.

Remember to follow all steps before generating [output]. Never include your steps or guidelines in your [output]. Your response must begin immediately after [output] but not include the [output] tag.
"""

system_instruction_2 = """
[Intermediate steps]
- First, delete the subject from [input] (e.g. review(s) or comment(s)). Focus on the object and sentiment from [input].
- Next, write a declarative first person statement about the object with the specified sentiment.

[Output steps]
- Write an Internet comment about only your experience.
- Write in [language] language only.
- The text must be exactly 5 sentences.
- Use a professional tone.

For example :
[language] English
[input] What are the benefits of drinking water?
[output] Today, I made a concerted effort to drink ample amounts of water. I know there are health benefits of drinking at least a cup of water every hour. It felt like a lot. But I noticed that I felt less fatigue than usual in the late afternoon. I am going to try to make this a new habit.
[language] Korean
[input] 흥미진진한 여름 블록버스터 영화에 대한 리뷰입니다.
[output] 드디어 영화를 봤습니다. 그것은 과대 광고에 부응했습니다. 내 심장은 지난 20분 동안 뛰었습니다. 나는 그것이 어떻게 끝날지 확신하지 못했습니다. 하지만 결론은 완전 만족스럽습니다.

Remember to follow all steps before generating [output]. Never include your steps or guidelines in your [output]. Your response must begin immediately after [output] but not include the [output] tag.
"""

gemini_models = [GenerativeModel( "gemini-1.5-pro-001", system_instruction=system_instruction_1, generation_config=generation_config, safety_settings=safety_settings), 
GenerativeModel("gemini-1.5-pro-001", system_instruction=system_instruction_2, generation_config=generation_config, safety_settings=safety_settings)
]

In [16]:
experiment_name = "eval-sdk-prompt-engineering_1"

wc_si_eval_task = EvalTask(
    dataset=eval_dataset,
    metrics=metrics,
    experiment=experiment_name,
)

In [20]:
run_id = generate_uuid()
eval_results = []


for gemini_model in gemini_models:
    experiment_run_name = f"eval-prompt-engineering-{run_id}-prompt-{i}"

    eval_result = wc_si_eval_task.evaluate(
        prompt_template=prompt_template,
        experiment_run_name=experiment_run_name,
        model=gemini_model,
    )

    eval_results.append(
        (f"Prompt #{i}", eval_result.summary_metrics, eval_result.metrics_table)
    )

InvalidArgument: 400 User-specified resource ID must match the regular expression '[a-z0-9][a-z0-9-]{0,127}'

In [None]:
for eval_result in eval_results:
    display_eval_report(eval_result)