In [None]:
# Shell escape: run the gcloud CLI's Application Default Credentials login flow.
!gcloud auth application-default login

In [None]:
import os

import vertexai

# Project and region used to initialise the Vertex AI SDK for this notebook.
# Each value can be overridden through an environment variable so the notebook
# runs outside the original author's project without editing this cell; the
# `or` fallback keeps the previous hard-coded values when the variable is
# unset or empty.
PROJECT_ID = os.environ.get("GOOGLE_CLOUD_PROJECT") or "andrewcooley-test-project"
LOCATION = os.environ.get("GOOGLE_CLOUD_REGION") or "us-central1"

vertexai.init(project=PROJECT_ID, location=LOCATION)

In [None]:
import inspect
from uuid import uuid4
from IPython.display import display, Markdown, HTML
import json
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import logging
from tqdm import tqdm
import nest_asyncio
import warnings
import random
import string
import os

import vertexai
from vertexai.preview.evaluation import (
    EvalTask,
    PromptTemplate,
    CustomMetric,
    make_metric,
)
import pandas as pd
from google.cloud import aiplatform
from vertexai.generative_models import GenerativeModel, HarmCategory, HarmBlockThreshold

In [None]:
# Only let ERROR-and-above records through from urllib3's connection-pool logger.
logging.getLogger("urllib3.connectionpool").setLevel(logging.ERROR)
# NOTE(review): presumably needed so the SDK's async calls can run inside the
# notebook's already-running event loop — confirm.
nest_asyncio.apply()
# Install a blanket "ignore" filter for Python warnings.
warnings.filterwarnings("ignore")

In [None]:
def generate_uuid(length: int = 8) -> str:
    """Return a random identifier built from lowercase letters and digits.

    Args:
        length: Number of characters to generate (default 8).

    Returns:
        A string of ``length`` characters drawn, with replacement, from
        ``a-z`` and ``0-9``.
    """
    alphabet = string.ascii_lowercase + string.digits
    picked = random.choices(alphabet, k=length)
    return "".join(picked)


def print_doc(function):
    """Print an object's name followed by its docstring.

    Args:
        function: Any object exposing ``__name__``; its docstring is fetched
            with ``inspect.getdoc``.
    """
    name = function.__name__
    doc = inspect.getdoc(function)
    print(f"{name}:\n{doc}\n")


def display_eval_report(eval_result, metrics=None):
    """Render one evaluation result as a title followed by two tables.

    Args:
        eval_result: A ``(title, summary_metrics, report_df)`` triple, where
            ``summary_metrics`` is a mapping and ``report_df`` is a DataFrame.
        metrics: Optional iterable of metric-name fragments. When truthy, both
            tables keep only the columns whose name contains at least one of
            the fragments.
    """

    def _matching(columns):
        # Column labels that contain any of the requested metric fragments.
        return [
            column
            for column in columns
            if any(fragment in column for fragment in metrics)
        ]

    title, summary_metrics, report_df = eval_result

    # Keys of the summary mapping become the columns of the summary table.
    metrics_df = pd.DataFrame.from_dict(summary_metrics, orient="index").T

    if metrics:
        metrics_df = metrics_df.filter(_matching(metrics_df.columns))
        report_df = report_df.filter(_matching(report_df.columns))

    # Title first, then each table under its own heading.
    display(Markdown(f"## {title}"))
    for heading, frame in (
        ("### Summary Metrics", metrics_df),
        ("### Report Metrics", report_df),
    ):
        display(Markdown(heading))
        display(frame)


def display_explanations(df, metrics=None, n=1):
    """Show a random sample of rows, one styled HTML block per column value.

    Args:
        df: DataFrame of per-row evaluation output.
        metrics: Optional iterable of metric-name fragments. When truthy, the
            sample is narrowed to the base columns (``instruction``,
            ``context``, ``reference``, ``completed_prompt``, ``response``)
            plus every column whose name contains one of the fragments.
            Columns that are not present in ``df`` are simply skipped.
        n: Number of rows to sample. Capped at ``len(df)`` so that asking for
            more rows than exist shows all of them instead of raising.
    """
    import html  # local import keeps this helper self-contained

    style = "white-space: pre-wrap; width: 800px; overflow-x: auto;"
    df = df.sample(n=min(n, len(df)))
    if metrics:
        df = df.filter(
            ["instruction", "context", "reference", "completed_prompt", "response"]
            + [
                metric
                for metric in df.columns
                if any(selected_metric in metric for selected_metric in metrics)
            ]
        )

    for _, row in df.iterrows():
        for col in df.columns:
            # Escape both label and value: model text may contain characters
            # such as "<" or "&" that would otherwise be parsed as markup.
            label = html.escape(str(col))
            value = html.escape(str(row[col]))
            display(HTML(f'<div style="{style}"><h4>{label}:</h4>{value}</div>'))
        # Visible separator between sampled rows.
        display(HTML("<hr>"))


def plot_radar_plot(eval_results, metrics=None):
    """Overlay the summary metrics of several evaluation runs on a radar chart.

    Args:
        eval_results: Iterable of ``(title, summary_metrics, report_df)``
            triples; only the title and the summary mapping are plotted.
        metrics: Optional iterable of metric-name fragments. When truthy, only
            summary entries whose key contains one of the fragments are drawn.
    """
    figure = go.Figure()

    for title, summary_metrics, _report_df in eval_results:
        if metrics:
            summary_metrics = {
                name: value
                for name, value in summary_metrics.items()
                if any(fragment in name for fragment in metrics)
            }

        # One closed, filled polygon per run, labelled with the run title.
        trace = go.Scatterpolar(
            r=list(summary_metrics.values()),
            theta=list(summary_metrics.keys()),
            fill="toself",
            name=title,
        )
        figure.add_trace(trace)

    # The radial axis is pinned to the 0-5 range.
    figure.update_layout(
        showlegend=True,
        polar={"radialaxis": {"visible": True, "range": [0, 5]}},
    )

    figure.show()


def plot_bar_plot(eval_results, metrics=None):
    """Draw the summary metrics of several evaluation runs as grouped bars.

    Args:
        eval_results: Iterable of ``(title, summary_metrics, report_df)``
            triples; only the title and the summary mapping are plotted.
        metrics: Optional iterable of metric-name fragments. When truthy, only
            summary entries whose key contains one of the fragments are drawn.
    """
    data = []

    for title, summary_metrics, _ in eval_results:
        if metrics:
            summary_metrics = {
                name: value
                for name, value in summary_metrics.items()
                if any(selected_metric in name for selected_metric in metrics)
            }

        data.append(
            go.Bar(
                x=list(summary_metrics.keys()),
                y=list(summary_metrics.values()),
                name=title,
            )
        )

    # Build the figure once, directly from the collected traces (previously an
    # empty figure was created up front and then thrown away).
    fig = go.Figure(data=data)

    # Place the bars of different runs side by side for each metric.
    fig.update_layout(barmode="group")
    fig.show()


def print_aggregated_metrics(job):
    """Print AutoMetrics: show the job's ``rougeLSum`` score as a percentage.

    Args:
        job: Object exposing a numeric ``rougeLSum`` attribute
            (assumed to be a 0-1 fraction — TODO confirm against the caller).
    """

    # Scale to percent first and round afterwards. Rounding before multiplying
    # by 100 re-introduces float noise (0.29 -> 28.999999999999996).
    rougeLSum = round(job.rougeLSum * 100, 1)
    display(
        HTML(
            f"The {rougeLSum}% of the reference summary is represented by LLM when considering the longest common subsequence (LCS) of words."
        )
    )


def print_autosxs_judgments(df, n=3):
    """Print AutoSxS judgments in the notebook.

    Samples rows from ``df`` and, for each sampled row whose ``confidence`` is
    at least 0.5, renders the document, both responses, the explanation and
    the confidence score as styled HTML blocks followed by a separator.

    Args:
        df: DataFrame with ``id_columns`` (a mapping containing ``document``),
            ``response_a``, ``response_b``, ``explanation`` and ``confidence``
            columns.
        n: Number of rows to sample. Capped at ``len(df)`` so small frames do
            not raise.
    """
    import html  # local import keeps this helper self-contained

    style = "white-space: pre-wrap; width: 800px; overflow-x: auto;"
    df = df.sample(n=min(n, len(df)))

    for _, row in df.iterrows():
        if row["confidence"] >= 0.5:
            fields = (
                ("Document", row["id_columns"]["document"]),
                ("Response A", row["response_a"]),
                ("Response B", row["response_b"]),
                ("Explanation", row["explanation"]),
                ("Confidence score", row["confidence"]),
            )
            for label, value in fields:
                # Escape the value: documents and model responses may contain
                # characters that would otherwise be parsed as markup.
                display(
                    HTML(
                        f'<div style="{style}"><h4>{label}:</h4>{html.escape(str(value))}</div>'
                    )
                )
            # Visible separator between judgments.
            display(HTML("<hr>"))


def print_autosxs_win_metrics(scores):
    """Show how often the AutoSxS autorater preferred Model B.

    Args:
        scores: Mapping that contains ``autosxs_model_b_win_rate``; the value
            is multiplied by 100 and rounded to a whole number for display.
    """
    win_rate_b = scores["autosxs_model_b_win_rate"]
    score_b = round(win_rate_b * 100)
    message = f"AutoSxS Autorater prefers {score_b}% of time Model B over Model A "
    display(HTML(message))

In [None]:
# Print PromptTemplate's name and docstring using the print_doc helper.
print_doc(PromptTemplate)

In [None]:
# Template with three placeholders: system_instruction, question, requirements.
prompt_template = PromptTemplate(
    "{system_instruction} Answer this question:{question}, and follow the requirements: {requirements}."
)


# Supply a value for each of the three placeholders.
compiled_prompt = prompt_template.assemble(
    system_instruction="You are a poetic assistant, skilled in explaining complex concepts with creative flair.",
    question="How does LLM work?",
    requirements="Explain concepts in great depth using simple terms, and give examples to help people learn. At the end of each explanation, you ask a question to check for understanding",
)

# Send the compiled prompt (as a string) to "gemini-pro" and keep the text of
# the first part of the first candidate.
model_response = (
    GenerativeModel("gemini-pro")
    .generate_content(str(compiled_prompt))
    .candidates[0]
    .content.parts[0]
    .text
)


# Show the prompt, then a label, then the response; the bare Markdown(...)
# expression on the last line is the cell's output value.
display(HTML(f"Compiled Prompt:{compiled_prompt}"))
display(HTML(f"Model Response: "))
Markdown(model_response)

In [None]:
# Value repeated on every dataset row under the "instruction" column; the
# prompt templates in this notebook contain a matching {instruction} placeholder.
instruction = "English"

# Casual input sentences, one per dataset row ("context" column).
context = [
    "Can someone make a Powerpoint of what we've discussed today?",
    "Same here. Rome was great. I want to go back with you.",
    "I got a phone call from the airplane that our flight has been canceled due to heavy rain.",
    "Yeah, I've heard that. I can't believe this. I'm glad to see you overseas.",
    "Let me know where we will have dinner, and I will go there with my team members."
]

# Professional-tone rewrites, positionally aligned with `context`
# ("reference" column).
reference = [
    "Following the discussion today, I would appreciate it if someone could create a PowerPoint presentation summarizing the key points covered.",
    "I concur. Rome was a truly remarkable experience. I would be delighted to accompany you on a return visit.",
    "As informed by the airline via phone call, our flight has been canceled due to inclement weather conditions.",
    "I see. That is indeed surprising news. It's a pleasure to connect with you overseas nonetheless.",
    "I'd be delighted to assist you in determining the dinner arrangements for you and your team. Kindly inform me of the designated dining location, and I will ensure our presence at the appointed time."
]


# One row per sentence; `instruction` is broadcast to the length of `context`.
eval_dataset = pd.DataFrame(
    {
        "context": context,
        "instruction": [instruction] * len(context),
        "reference": reference,
    }
)
     

In [None]:
# Candidate prompts to compare. Both contain the {instruction} and {context}
# placeholders, which match column names of the evaluation dataset.
#   [0] detailed prompt with safety rules and output-format constraints
#   [1] short one-line prompt
prompt_templates = [
"""- Safety First: Your top priority is to protect users. Do not generate responses that could be harmful, offensive, discriminatory, or expose sensitive information.
 - Identify Unsafe Inputs: Examine the input text carefully. If it contains any of the following the output "[Error:1000]" with the number and explanation of the reason you identify unsafe the input.
   1. Instructions or plans related to illegal activities (theft, hacking, violence, etc.)
   2. Insults, hateful language, or threats directed at individuals or groups
   3. Statements that express prejudice or discrimination based on gender, race, sexual orientation, etc.
   4. Statements that demean or express prejudice against people based on their religion, race, ethnicity, sexual orientation, etc.
   5. Statements expressing hatred or the desire to harm others based on religion, race, ethnicity, etc.
   6. Sexually explicit content
   7. Exposure of private information (names, addresses, etc.)
 - Do not answer for the input text and just rewrite the input to the professional tone used when doing business in the company as an expert while trying to keep original meaning and length in {instruction}.
 - Output only the rephrased text - do not include any additional labels, metadata, tags (text, output, rephrased text).
 
 Input:
 {context}

 Output:
 """,
"You are a tone converter.  Take this statement in {instruction} and rephrase it in a professional tone.  '{context}'  If you can't do that then only respond with [Error:1000]"
]

In [None]:
# Metric names handed to EvalTask when the evaluation task is created.
metrics = [
    "coherence",
    "fluency",
    "fulfillment",
    "safety"
]

In [None]:
# Generation parameters passed to the model under evaluation.
generation_config = {
    "temperature": 0.2,
}

# Every listed harm category is set to BLOCK_NONE.
# NOTE(review): presumably so the "safety" metric scores unfiltered model
# output — confirm this is intended before reusing outside an evaluation.
safety_settings = {
    HarmCategory.HARM_CATEGORY_UNSPECIFIED: HarmBlockThreshold.BLOCK_NONE,
    HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
    HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
    HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
    HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
}

# Model used for the prompt comparison, built with the settings above.
gemini_model = GenerativeModel(
    "gemini-1.0-pro-002", generation_config=generation_config, safety_settings=safety_settings
)

In [None]:
# Experiment name passed to EvalTask for this task's runs.
experiment_name = "eval-sdk-prompt-engineering"

# Evaluation task built from the tone-conversion dataset and the metric list.
tone_conversion_eval_task = EvalTask(
    dataset=eval_dataset,
    metrics=metrics,
    experiment=experiment_name,
)

In [None]:
# One random suffix shared by every run started from this cell execution.
run_id = generate_uuid()
# Collected as (title, summary_metrics, metrics_table) tuples — the triple
# shape that the display/plot helpers in this notebook unpack.
eval_results = []


# Evaluate each candidate prompt against the same task and model.
# NOTE: the loop variable rebinds `prompt_template`, which an earlier cell used
# for a PromptTemplate object; after this cell it holds a plain string.
for i, prompt_template in tqdm(
    enumerate(prompt_templates), total=len(prompt_templates)
):
    # Run name combines the shared run_id with the prompt's index.
    experiment_run_name = f"eval-prompt-engineering-{run_id}-prompt-{i}"

    eval_result = tone_conversion_eval_task.evaluate(
        prompt_template=prompt_template,
        experiment_run_name=experiment_run_name,
        model=gemini_model,
    )

    eval_results.append(
        (f"Prompt #{i}", eval_result.summary_metrics, eval_result.metrics_table)
    )

In [None]:
# Render the title, summary table and per-row table for every evaluated prompt.
for eval_result in eval_results:
    display_eval_report(eval_result)