In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell executes, so edits to imported source files take
# effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import asyncio
import csv
from enum import Enum

from loguru import logger
from pydantic import BaseModel

from taskllm.optimizer.data import DataSet, Row
from taskllm.optimizer.methods import BanditTrainer, BayesianTrainer
from taskllm.optimizer.prompt.meta import PromptMode

# Uncomment the two lines below to see TRACE-level logs (also requires `import sys`).
# logger.remove()  # remove the existing handler; otherwise it keeps printing DEBUG logs alongside the new handler added below
# logger.add(sys.stdout, level="TRACE")  # add a new handler that emits TRACE-level messages and above


# Closed set of rating labels for a review. Each member's value is the string
# form used for lookup by value (load_file_as_dataset builds members with
# Ratings(row["Rating"])), so the values must match the CSV's "Rating" column
# text exactly.
class Ratings(Enum):
    # Star ratings, stored as strings rather than integers.
    ONE = "1"
    TWO = "2"
    THREE = "3"
    FOUR = "4"
    FIVE = "5"
    # Presumably marks reviews that carry no star rating — TODO confirm against the CSV.
    NA = "N/A"


# Structured result for a single review: one `rating` field constrained to the
# members of Ratings. Used both as the labelled output attached to each dataset
# row and as the expected output type handed to the trainer.
class StarbucksReviewRating(BaseModel):
    rating: Ratings


def sentiment_scoring_function(
    row: Row[StarbucksReviewRating], output: StarbucksReviewRating | None
) -> float:
    """Score one model output against the labelled rating of its row.

    Args:
        row: Dataset row whose ``expected_output`` holds the labelled rating
            (may be empty/absent).
        output: The rating produced for this row, or ``None`` when no output
            is available.

    Returns:
        ``-10`` when ``output`` is ``None``; ``1`` when the produced rating
        equals the labelled rating; ``0`` when the ratings differ or the row
        has no expected output to compare against.
    """
    if output is None:
        # A missing output is penalised far more heavily than a wrong rating.
        return -10

    if row.expected_output:
        logger.trace(f"Expected: {row.expected_output.rating}, Output: {output.rating}")
        return 1 if row.expected_output.rating == output.rating else 0

    # Nothing to compare against: neutral score.
    return 0


def load_file_as_dataset(path: str, max_rows: int | None = 50) -> DataSet:
    """Read a Starbucks-review CSV and wrap its leading rows in a ``DataSet``.

    The file must start with a header row that includes the columns
    ``Review``, ``name``, ``location``, ``Date`` and ``Rating``. Each data row
    becomes one ``Row`` whose input dictionary has the keys ``review``,
    ``name``, ``location`` and ``date``, and whose output is a
    ``StarbucksReviewRating`` built from the ``Rating`` column.

    Reading stops as soon as ``max_rows`` rows have been collected, so rows
    beyond the cap are neither parsed nor validated.

    Args:
        path: Location of the CSV file; it is read as UTF-8.
        max_rows: Upper bound on the number of rows loaded, counted from the
            top of the file. Defaults to 50. Pass ``None`` to load every row;
            zero or a negative value yields no rows.

    Returns:
        A ``DataSet`` named ``"starbucks_reviews"`` holding the loaded rows in
        file order.

    Raises:
        OSError: If the file cannot be opened.
        KeyError: If a loaded row lacks one of the expected columns.
        ValueError: If a loaded row's ``Rating`` is not a value of ``Ratings``.
    """
    rows = []
    # newline="" is what the csv module expects of file objects: it lets the
    # reader handle line endings itself, so quoted review text containing
    # embedded newlines is parsed as a single field.
    with open(path, "r", encoding="utf-8", newline="") as f:
        for record in csv.DictReader(f):
            if max_rows is not None and len(rows) >= max_rows:
                break
            rating = StarbucksReviewRating(rating=Ratings(record["Rating"]))
            rows.append(
                Row.create(
                    input_dictionary={
                        "review": record["Review"],
                        "name": record["name"],
                        "location": record["location"],
                        "date": record["Date"],
                    },
                    output=rating,
                )
            )
    return DataSet(rows=rows, name="starbucks_reviews")



  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the labelled reviews that candidate prompts are scored against.
csv_path = "./starbucks_reviews.csv"
dataset = load_file_as_dataset(csv_path)

# Row fields passed to the trainer as `keys`; they mirror the keys of the input
# dictionary built in load_file_as_dataset.
input_keys = ["review", "name", "location", "date"]

# Model identifiers passed to the trainer as `models`.
candidate_models = [
    "anthropic/claude-3-haiku-20240307",
    "openai/gpt-4.1-nano-2025-04-14",
    "openai/gpt-4.1-mini-2025-04-14",
    "groq/gemma2-9b-it",
    "groq/qwen-qwq-32b",
]

trainer = BayesianTrainer(
    # What to optimise and how to judge it.
    all_rows=dataset,
    task_guidance="determine the rating of this review",
    keys=input_keys,
    expected_output_type=StarbucksReviewRating,
    scoring_function=sentiment_scoring_function,
    prompt_mode=PromptMode.SIMPLE,
    models=candidate_models,
    # Deliberately small search budget while testing; raise both for a real run.
    num_iterations=2,
    candidates_per_iteration=2,
)

[32m2025-05-12 20:57:11.870[0m | [1mINFO    [0m | [36mtaskllm.optimizer.methods.bayesian[0m:[36m__init__[0m:[36m208[0m - [1mUsing CPU for Pyro/Torch computations[0m
[32m2025-05-12 20:57:11.871[0m | [1mINFO    [0m | [36mtaskllm.optimizer.methods.base[0m:[36m__init__[0m:[36m242[0m - [1mAll rows: 50[0m
[32m2025-05-12 20:57:11.871[0m | [34m[1mDEBUG   [0m | [36mtaskllm.optimizer.methods.bayesian[0m:[36m__init__[0m:[36m721[0m - [34m[1mBayesianTrainer initialized[0m


In [4]:
# Run the optimisation configured on `trainer`. The bare top-level `await`
# relies on the notebook kernel's autoawait support; in a plain script this
# would need something like asyncio.run(trainer.train()). Whatever train()
# returns is not bound to a name here.
await trainer.train()

[32m2025-05-12 20:57:11.899[0m | [1mINFO    [0m | [36mtaskllm.optimizer.methods.bayesian[0m:[36mtrain[0m:[36m725[0m - [1mStarting Bayesian optimization with Pyro: 2 iterations, 2 candidates/iter.[0m
[32m2025-05-12 20:57:11.899[0m | [1mINFO    [0m | [36mtaskllm.optimizer.methods.bayesian[0m:[36mtrain[0m:[36m731[0m - [1mPhase 1: Generating and evaluating initial candidates...[0m
[32m2025-05-12 20:57:11.900[0m | [1mINFO    [0m | [36mtaskllm.optimizer.prompt.meta[0m:[36mgenerate_spec[0m:[36m139[0m - [1mGenerating simple prompt content for: determine the rating of this review

Use plain language in the prompt you write...[0m
[32m2025-05-12 20:57:14.950[0m | [1mINFO    [0m | [36mtaskllm.optimizer.prompt.meta[0m:[36mgenerate_spec[0m:[36m139[0m - [1mGenerating simple prompt content for: determine the rating of this review

Make the prompt you write as simple as possible...[0m


You are a review evaluator. Your task is to analyze the review provided and assign a star rating based on its content. Take into account the tone, details, and overall sentiment expressed in the review.

Here are the details of the review:
- Review: {{ review }}
- Name of the reviewer: {{ name }}
- Location of the reviewer: {{ location }}
- Date of the review: {{ date }}

After your analysis, please clearly state the star rating from 1 to 5, with 1 being poor and 5 being excellent.


[32m2025-05-12 20:57:16.321[0m | [1mINFO    [0m | [36mtaskllm.optimizer.methods.bayesian[0m:[36mtrain[0m:[36m754[0m - [1mGenerated 2 initial candidate prompts.[0m
[32m2025-05-12 20:57:16.323[0m | [1mINFO    [0m | [36mtaskllm.optimizer.methods.base[0m:[36mrun_for_prompt[0m:[36m280[0m - [1mRunning for prompt You are a review evaluator. Your task is to analyze the review provided and assign a star rating bas... against 40 rows[0m
[32m2025-05-12 20:57:16.325[0m | [34m[1mDEBUG   [0m | [36mtaskllm.optimizer.methods.base[0m:[36mget_outputs[0m:[36m55[0m - [34m[1mGetting outputs for 40 rows using openai/gpt-4.1-nano-2025-04-14[0m
[32m2025-05-12 20:57:16.326[0m | [1mINFO    [0m | [36mtaskllm.optimizer.methods.base[0m:[36mrun_for_prompt[0m:[36m280[0m - [1mRunning for prompt You are a review rating expert. Analyze the review provided by the user and assign it a rating betwe... against 40 rows[0m
[32m2025-05-12 20:57:16.326[0m | [34m[1mDEBUG   

You are a review rating expert. Analyze the review provided by the user and assign it a rating between 1 and 5 based on its content. Consider the tone, content quality, and overall impression. The review details are as follows: Review: {{ review }}; Name: {{ name }}; Location: {{ location }}; Date: {{ date }}.


[32m2025-05-12 20:57:56.674[0m | [1mINFO    [0m | [36mtaskllm.optimizer.methods.bayesian[0m:[36mtrain[0m:[36m766[0m - [1mFitting initial surrogate model[0m
[32m2025-05-12 20:57:56.675[0m | [32m[1mSUCCESS [0m | [36mtaskllm.optimizer.methods.bayesian[0m:[36mfit_surrogate_model[0m:[36m267[0m - [32m[1mExtracted features for 2 prompts[0m
[32m2025-05-12 20:57:56.676[0m | [34m[1mDEBUG   [0m | [36mtaskllm.optimizer.methods.base[0m:[36mget_outputs[0m:[36m55[0m - [34m[1mGetting outputs for 40 rows using openai/gpt-4.1-nano-2025-04-14[0m
[32m2025-05-12 20:57:56.677[0m | [34m[1mDEBUG   [0m | [36mtaskllm.optimizer.methods.base[0m:[36mget_outputs[0m:[36m55[0m - [34m[1mGetting outputs for 40 rows using openai/gpt-4.1-nano-2025-04-14[0m
[32m2025-05-12 20:57:56.714[0m | [34m[1mDEBUG   [0m | [36mtaskllm.optimizer.methods.base[0m:[36mget_scores[0m:[36m72[0m - [34m[1mCalculated 40 scores[0m
[32m2025-05-12 20:57:56.715[0m | [1mINFO    

You are a review analysis expert. Read the review provided below and determine its rating on a scale from 1 to 5. 

Review: {{ review }}
Reviewer Name: {{ name }}
Location: {{ location }}
Date: {{ date }}

Provide a clear rating based on the content and tone of the review.


[32m2025-05-12 20:58:01.322[0m | [1mINFO    [0m | [36mtaskllm.optimizer.methods.bayesian[0m:[36mtrain[0m:[36m802[0m - [1mGenerated 2 new candidates for evaluation.[0m
[32m2025-05-12 20:58:01.323[0m | [1mINFO    [0m | [36mtaskllm.optimizer.methods.base[0m:[36mrun_for_prompt[0m:[36m280[0m - [1mRunning for prompt You are a review analysis expert. Read the review provided below and determine its rating on a scale... against 40 rows[0m
[32m2025-05-12 20:58:01.324[0m | [34m[1mDEBUG   [0m | [36mtaskllm.optimizer.methods.base[0m:[36mget_outputs[0m:[36m55[0m - [34m[1mGetting outputs for 40 rows using openai/gpt-4o-mini[0m
[32m2025-05-12 20:58:01.326[0m | [1mINFO    [0m | [36mtaskllm.optimizer.methods.base[0m:[36mrun_for_prompt[0m:[36m280[0m - [1mRunning for prompt As a rating assessor, read the review provided and evaluate its sentiment. Based on the content, det... against 40 rows[0m
[32m2025-05-12 20:58:01.326[0m | [34m[1mDEBUG   [0m | [

As a rating assessor, read the review provided and evaluate its sentiment. Based on the content, determine a rating from 1 to 5, where 1 is the lowest and 5 is the highest. Provide your rating along with a brief explanation of your reasoning.

Review: {{ review }}
Reviewer Name: {{ name }}
Location: {{ location }}
Date of Review: {{ date }}


[32m2025-05-12 20:58:36.318[0m | [1mINFO    [0m | [36mtaskllm.optimizer.methods.bayesian[0m:[36mtrain[0m:[36m813[0m - [1mPrompt content: You are a review analysis expert. Read the review provided below and determine its rating on a scale...[0m
[32m2025-05-12 20:58:36.319[0m | [1mINFO    [0m | [36mtaskllm.optimizer.methods.bayesian[0m:[36mtrain[0m:[36m813[0m - [1mPrompt content: As a rating assessor, read the review provided and evaluate its sentiment. Based on the content, det...[0m
[32m2025-05-12 20:58:36.320[0m | [1mINFO    [0m | [36mtaskllm.optimizer.methods.bayesian[0m:[36mtrain[0m:[36m818[0m - [1mUpdating surrogate model with new data[0m
[32m2025-05-12 20:58:36.321[0m | [32m[1mSUCCESS [0m | [36mtaskllm.optimizer.methods.bayesian[0m:[36mfit_surrogate_model[0m:[36m267[0m - [32m[1mExtracted features for 4 prompts[0m
[32m2025-05-12 20:58:36.322[0m | [34m[1mDEBUG   [0m | [36mtaskllm.optimizer.methods.base[0m:[36mget_outputs[0m: