In [None]:
#pip install openai

In [1]:
#pip install wandb

# LLM Evaluation and Tracing with W&B

<!--- @wandbcode{dlai_04} -->

## 1. Using Tables for Evaluation

In this section, we will call an OpenAI LLM to generate names of kingdoms. We will use W&B Tables to evaluate the generations.

In [6]:
import os
import random
import time
import datetime

import openai

from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential, # for exponential backoff
)
import wandb
from wandb.sdk.data_types.trace_tree import Trace

In [None]:
#pip install python-dotenv

#### Get the OpenAI API key


In [2]:
# Read the OpenAI API key from the OPENAI_API_KEY environment variable so the
# secret never appears in the notebook; os.getenv returns None when it is unset.
import os
import openai
openai.api_key = os.getenv("OPENAI_API_KEY")

In [4]:
# W&B project name passed to every wandb.init call in this notebook.
PROJECT = "llm_evaluation and debugging"
# OpenAI chat model requested by generate_and_print.
MODEL_NAME = "gpt-3.5-turbo"

In [7]:
# Log in to W&B. Credentials must come from the interactive prompt (or the
# environment), never from the notebook source: an API key pasted into a
# comment is leaked to everyone who can read the file.
wandb.login(anonymous="allow", relogin=True)

[34m[1mwandb[0m: (1) Private W&B dashboard, no account required
[34m[1mwandb[0m: (2) Use an existing W&B account


wandb: Enter your choice: 1


[34m[1mwandb[0m: You chose 'Private W&B dashboard, no account required'
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: C:\Users\siddhanna.janai/.netrc


True

In [None]:
# Start the W&B run that will hold the generations table; the handle is kept in
# `run` so it can be closed explicitly with run.finish() once the table is logged.
run = wandb.init(project=PROJECT, job_type="generation",reinit=True)

### Simple generations
Let's start by generating names for a king's kingdom using OpenAI `ChatCompletion`, and saving the resulting generations in W&B Tables.


In [None]:
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(2))
def completion_with_backoff(**kwargs):
    """Forward ``kwargs`` to ``openai.ChatCompletion.create`` and return its response.

    The ``retry`` decorator is configured with a randomized exponential wait
    (``min=1``, ``max=60``) between attempts and stops after two attempts.
    """
    chat_response = openai.ChatCompletion.create(**kwargs)
    return chat_response

In [None]:
def generate_and_print(system_prompt, user_prompt, table, n=1):
    """Request ``n`` chat completions, print each one and append a row to ``table``.

    Args:
        system_prompt: Content of the ``system`` message.
        user_prompt: Content of the ``user`` message.
        table: Object exposing ``add_data``; receives a single row per call
            holding the prompts, the list of generations, the elapsed time,
            the creation timestamp, the model name and the token counts.
        n: Number of completions requested from the API.
    """
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    # Time the whole call, including any retry performed by the wrapper.
    started_at = time.time()
    completion = completion_with_backoff(model=MODEL_NAME, messages=conversation, n=n)
    elapsed_time = time.time() - started_at

    # Collect the generated texts once; they are both printed and logged.
    generations = [choice.message.content for choice in completion.choices]
    for text in generations:
        print(text)

    table.add_data(
        system_prompt,
        user_prompt,
        generations,
        elapsed_time,
        datetime.datetime.fromtimestamp(completion.created),
        completion.model,
        completion.usage.prompt_tokens,
        completion.usage.completion_tokens,
        completion.usage.total_tokens,
    )

In [None]:
# System prompt for the simple generations. A trailing backslash joins the next
# source line without inserting anything, so the space placed before each
# backslash is what keeps the joined sentences apart in the final string.
system_prompt = """You are a creative copywriter on Indian Epics.
You're given with the name of king, \
and your goal is to design a name for the kingdom. \
Provide the 3 names of the kingdom only, no additional details required."""

In [None]:
# W&B Table that stores the generations, one column per logged field
columns = [
    "system_prompt",
    "user_prompt",
    "generations",
    "elapsed_time",
    "timestamp",
    "model",
    "prompt_tokens",
    "completion_tokens",
    "total_tokens",
]
table = wandb.Table(columns=columns)

In [None]:
user_prompt = "Shri Krishna Devaraya"
generate_and_print(system_prompt, user_prompt, table)

1. KrishnaSagar
2. Devaranya
3. ShriRajya


In [None]:
user_prompt = " Krishna Raja Wadeyar IV "
generate_and_print(system_prompt, user_prompt, table)

1. Indrapuri
2. Amaravati
3. Mithilapura


In [None]:
wandb.log({"simple_generations": table})
run.finish()

VBox(children=(Label(value='0.003 MB of 0.004 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.775818…

## 2. Using Tracer to log more complex chains

How can we get more creative outputs? Let's design an LLM chain that will first randomly pick a historic/imaginary world, and then generate kingdom names. We will demonstrate how to use Tracer in such a scenario. We will log the inputs and outputs, start and end times, whether the OpenAI call was successful, the token usage, and additional metadata.

In [None]:
# Candidate world descriptions; run_creative_chain picks one at random per query.
worlds = [
    "an ancient city of India",
    "a historical village of Egypt civilization",
    "a modern city with AI humanoids "
]

In [None]:
# define your config: these module-level values are read by run_creative_chain
# Chat model requested from OpenAI (also recorded in the LLM span metadata).
model_name = "gpt-3.5-turbo"
# Sampling temperature passed to the completion call.
temperature = 0.7
# System message sent with every request made by the chain.
system_message = """You are a creative copywriter.
You're given a king's name and historic/imaginary world he is from.
Your goal is design the kingdom's name.
Provide the resulting name only, no additional description.
Single name, max 3 words output, remember!"""

In [None]:
def run_creative_chain(query):
    """Run the two-step creative chain for ``query`` and log it as a W&B trace.

    The root span ``MyCreativeChain`` gets two children:

    * ``WorldPicker`` (tool span): picks a random entry of the module-level
      ``worlds`` list and builds the expanded prompt.
    * ``OpenAI`` (llm span): sends ``system_message`` plus the expanded prompt
      through ``completion_with_backoff``.

    The LLM span records ``status_code="success"`` or ``"error"`` depending on
    whether the call (and the parsing of its response) succeeded, so failed
    calls are visible in the trace as well. After a failure the trace is logged
    first and the original exception is then re-raised.

    Args:
        query: Name of the king used to build the prompt.

    Raises:
        Exception: Whatever the completion call or the response parsing raised,
            re-raised after the error trace has been logged.
    """

    def _now_ms():
        # Span timestamps are expressed in milliseconds.
        return round(datetime.datetime.now().timestamp() * 1000)

    # part 1 - a chain is started...
    start_time_ms = _now_ms()

    root_span = Trace(
          name="MyCreativeChain",
          kind="chain",
          start_time_ms=start_time_ms,
          metadata={"user": "student_1"},
          model_dict={"_kind": "CreativeChain"}
          )

    # part 2 - the chain picks a fantasy world
    # (the sleep presumably simulates a slow tool so the span has a visible duration)
    time.sleep(3)
    world = random.choice(worlds)
    expanded_prompt = f'Name of the King: {query}; historic/imaginary world description: {world}'
    tool_end_time_ms = _now_ms()

    # create a Tool span
    tool_span = Trace(
          name="WorldPicker",
          kind="tool",
          status_code="success",
          start_time_ms=start_time_ms,
          end_time_ms=tool_end_time_ms,
          inputs={"input": query},
          outputs={"result": expanded_prompt},
          model_dict={"_kind": "tool", "num_worlds": len(worlds)}
          )

    # add the TOOL span as a child of the root
    root_span.add_child(tool_span)

    # part 3 - the LLMChain calls an OpenAI LLM...
    messages=[
      {"role": "system", "content": system_message},
      {"role": "user", "content": expanded_prompt}
    ]

    # Keep the exception (if any) so the trace can be logged before re-raising.
    failure = None
    try:
        response = completion_with_backoff(model=model_name,
                                           messages=messages,
                                           max_tokens=12,
                                           temperature=temperature)
        llm_end_time_ms = _now_ms()
        response_text = response["choices"][0]["message"]["content"]
        token_usage = response["usage"].to_dict()
        llm_model_dict = {"_kind": "Openai", "engine": response["model"], "model": response["object"]}
        status_code, status_message = "success", None
    except Exception as exc:
        failure = exc
        llm_end_time_ms = _now_ms()
        response_text = ""
        token_usage = {}
        # No response is available, so fall back to the requested model name.
        llm_model_dict = {"_kind": "Openai", "engine": model_name}
        status_code, status_message = "error", str(exc)

    llm_span = Trace(
          name="OpenAI",
          kind="llm",
          status_code=status_code,
          status_message=status_message,
          metadata={"temperature":temperature,
                    "token_usage": token_usage,
                    "model_name":model_name},
          start_time_ms=tool_end_time_ms,
          end_time_ms=llm_end_time_ms,
          inputs={"system_prompt":system_message, "query":expanded_prompt},
          outputs={"response": response_text},
          model_dict=llm_model_dict
          )

    # add the LLM span as a child of the Chain span...
    root_span.add_child(llm_span)

    # record the Chain span's overall inputs and outputs
    root_span.add_inputs_and_outputs(
          inputs={"query":query},
          outputs={"response": response_text})

    # update the Chain span's end time
    root_span.end_time_ms = llm_end_time_ms

    # part 4 - log all spans to W&B by logging the root span
    root_span.log(name="creative_trace")

    if failure is not None:
        # The error trace is logged; surface the original failure to the caller.
        raise failure
    print(f"Result: {response_text}")

In [None]:
# Let's start a new wandb run
wandb.init(project=PROJECT, job_type="generation")

In [None]:
run_creative_chain("Vikramadithya-1")

Result: Nilecrest


In [None]:
run_creative_chain("Dhuryodhana")

Result: Dhuryodhania


In [None]:
run_creative_chain("Raja Raja Chola")

Result: Cholavara


In [None]:
wandb.finish()

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

## 3. LangChain Agent

**LangChain agent**

In the third scenario, we'll introduce an agent that will use tools such as WorldPicker and NameValidator to come up with the ultimate name. We will also use LangChain here and demonstrate its W&B integration.

In [2]:
#pip install langchain

In [9]:
# Import things that are needed generically

from langchain.agents import AgentType, initialize_agent
from langchain.chat_models import ChatOpenAI
from langchain.tools import BaseTool

from typing import Optional

from langchain.callbacks.manager import (
    AsyncCallbackManagerForToolRun,
    CallbackManagerForToolRun,
)

In [None]:
wandb.init(project=PROJECT, job_type="generation")

In [None]:
os.environ["LANGCHAIN_WANDB_TRACING"] = "true"

In [None]:
class WorldPickerTool(BaseTool):
    # Tool that returns one entry of `worlds`, chosen at random.
    # `name` and `description` are the identifier and description declared for the tool.
    name = "pick_world"
    description = "pick a historic/imaginary world for the king / item naming"
    # Pool of worlds that _run samples from.
    worlds = [
                "Ancient India",
                "Egypt civilization",
                "A world full of AI humanoids "
            ]

    def _run(
        self, query: str, run_manager: Optional[CallbackManagerForToolRun] = None
    ) -> str:
        """Sleep for one second, then return a random entry of ``self.worlds``.

        ``query`` and ``run_manager`` are accepted but not used.
        """
        time.sleep(1)
        return random.choice(self.worlds)

    async def _arun(
        self, query: str, run_manager: Optional[AsyncCallbackManagerForToolRun] = None
    ) -> str:
        """Async variant; not supported, always raises ``NotImplementedError``."""
        raise NotImplementedError("pick_world does not support async")

class NameValidatorTool(BaseTool):
    # Tool that checks a proposed name: it must be non-empty and shorter than
    # 20 characters once surrounding whitespace and quotes are removed.
    name = "validate_name"
    description = "validate if the name is properly generated"

    def _run(
        self, query: str, run_manager: Optional[CallbackManagerForToolRun] = None
    ) -> str:
        """Validate the name contained in ``query``.

        Surrounding whitespace and quote characters are not part of the name,
        so they are stripped before the length check.

        Returns:
            A message stating that the name is correct, that it is empty, or
            that it is too long.
        """
        max_length = 20  # exclusive upper bound, also quoted in the message
        time.sleep(1)
        candidate = (query or "").strip().strip("\"'").strip()
        if not candidate:
            # An empty input previously passed the length check as "correct".
            return "This name is empty. Please provide a name."
        if len(candidate) < max_length:
            return f"This is a correct name: {candidate}"
        return f"This name is too long. It should be shorter than {max_length} characters."

    async def _arun(
        self, query: str, run_manager: Optional[AsyncCallbackManagerForToolRun] = None
    ) -> str:
        """Async variant; not supported, always raises ``NotImplementedError``."""
        raise NotImplementedError("validate_name does not support async")

In [10]:
from langchain.chat_models import ChatOpenAI
llm = ChatOpenAI(model="gpt-3.5-turbo-0613")

In [None]:
# Build an agent of type ZERO_SHOT_REACT_DESCRIPTION over the two custom tools.
# handle_parsing_errors=True and verbose=True are forwarded to initialize_agent;
# verbose is presumably what makes the runs below print each
# Thought/Action/Observation step.
tools = [WorldPickerTool(), NameValidatorTool()]
agent = initialize_agent(
    tools,
    llm,
    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    handle_parsing_errors=True,
    verbose=True
)

In [None]:
agent.run(
    "Find a historic/imaginary world for me and imagine the name of a king in that world"
)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI need to pick a historic/imaginary world first and then generate the name of a king in that world.
Action: pick_world
Action Input: None[0m
Observation: [36;1m[1;3mAncient India[0m
Thought:[32;1m[1;3mNow I need to generate the name of a king in Ancient India.
Action: validate_name
Action Input: "Rajendra"[0m
Observation: [33;1m[1;3mThis is a correct name: Rajendra[0m
Thought:[32;1m[1;3mI now know the final answer
Final Answer: The name of the king in the historic/imaginary world of Ancient India is Rajendra.[0m

[1m> Finished chain.[0m


'The name of the king in the historic/imaginary world of Ancient India is Rajendra.'

In [None]:
agent.run(
    "Find a historic/imaginary world for me and imagine the name of a kingdom in that world"
)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI need to pick a historic/imaginary world and then generate a name for a kingdom in that world.
Action: pick_world
Action Input: -[0m
Observation: [36;1m[1;3mA world full of AI humanoids [0m
Thought:[32;1m[1;3mI need to validate the generated kingdom name.
Action: validate_name
Action Input: Kingdom name: "Alpha Prime"[0m
Observation: [33;1m[1;3mThis name is too long. It should be shorter than 20 characters.[0m
Thought:[32;1m[1;3mI need to generate a shorter kingdom name.
Action: validate_name
Action Input: Kingdom name: "Prime"[0m
Observation: [33;1m[1;3mThis name is too long. It should be shorter than 20 characters.[0m
Thought:[32;1m[1;3mI need to generate a shorter kingdom name.
Action: validate_name
Action Input: Kingdom name: "Avalon"[0m
Observation: [33;1m[1;3mThis name is too long. It should be shorter than 20 characters.[0m
Thought:[32;1m[1;3mI need to generate a shorter kingdom name.
Action: v

"I couldn't generate a valid name for the kingdom in the world of AI humanoids."

In [None]:
agent.run("Find a historic/ imaginary world for me and imagine the name of a food recipe in that world")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI need to pick a historic/imaginary world first and then come up with a name for a food recipe in that world.
Action: pick_world
Action Input: None[0m
Observation: [36;1m[1;3mEgypt civilization[0m
Thought:[32;1m[1;3mNow I need to come up with a name for a food recipe in Egypt civilization.
Action: validate_name
Action Input: "Pyramid Delight"[0m
Observation: [33;1m[1;3mThis is a correct name: Pyramid Delight[0m
Thought:[32;1m[1;3mI now know the final answer
Final Answer: The name of a food recipe in the Egypt civilization is "Pyramid Delight"[0m

[1m> Finished chain.[0m


'The name of a food recipe in the Egypt civilization is "Pyramid Delight"'

In [None]:
agent.run("Find a historic/ imaginary world for me and imagine the name of a sport in that world")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI need to pick a historic/imaginary world first and then come up with a sport name for that world.
Action: pick_world
Action Input: None[0m
Observation: [36;1m[1;3mA world full of AI humanoids [0m
Thought:[32;1m[1;3mNow I need to come up with a sport name for this AI humanoid world.
Action: validate_name
Action Input: "RoboBall"[0m
Observation: [33;1m[1;3mThis is a correct name: RoboBall[0m
Thought:[32;1m[1;3mI now know the final answer.
Final Answer: In the AI humanoid world, there is a sport called RoboBall.[0m

[1m> Finished chain.[0m


'In the AI humanoid world, there is a sport called RoboBall.'

In [None]:
wandb.finish()

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…