In [1]:
# Enable IPython's autoreload extension. Mode 2 reloads imported modules before
# each cell executes, so edits to the local package are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import csv
from enum import Enum
from itertools import islice

from loguru import logger
from pydantic import BaseModel

from taskllm.optimizer.data import DataSet, Row
from taskllm.optimizer.methods import BayesianTrainer
from taskllm.optimizer.prompt.meta import PromptMode

# Uncomment the two lines below (and add `import sys`) to see TRACE-level messages:
# logger.remove()  # remove the old handler; otherwise it keeps working (and keeps printing DEBUG logs) alongside the new handler added below
# logger.add(sys.stdout, level="TRACE")  # add a new handler that emits everything from TRACE level upwards


# Label set for a review: the star ratings "1" to "5" plus "N/A".
# Values are strings because load_file_as_dataset looks members up by value
# from the CSV "Rating" column, so each value must match the cell text exactly.
class Ratings(Enum):
    ONE = "1"
    TWO = "2"
    THREE = "3"
    FOUR = "4"
    FIVE = "5"
    NA = "N/A"  # NOTE(review): presumably marks reviews without a star rating; confirm against the CSV


# Output schema for one review: a single `rating` field holding a Ratings member.
# It is passed to BayesianTrainer as `expected_output_type` and is also the type
# of the expected output attached to every dataset row.
class StarbucksReviewRating(BaseModel):
    rating: Ratings


def sentiment_scoring_function(
    row: Row[StarbucksReviewRating], output: StarbucksReviewRating | None
) -> float:
    """Score one model output against the labelled rating of its row.

    Args:
        row: Dataset row being evaluated; only ``row.expected_output`` is read.
        output: The model's parsed output, or ``None`` when there is none.

    Returns:
        ``-10.0`` when ``output`` is ``None``, ``1.0`` when the predicted
        rating equals the expected rating, and ``0.0`` otherwise (including
        when the row carries no expected output to compare against).
    """
    # A missing output is penalised far more heavily than a wrong rating.
    if output is None:
        return -10.0

    expected = row.expected_output
    # Explicit None check: only an absent label means "nothing to compare",
    # rather than relying on the truthiness of the model object.
    if expected is None:
        return 0.0

    logger.trace(f"Expected: {expected.rating}, Output: {output.rating}")
    # Exact match on the enum member; there is no partial credit for being close.
    return 1.0 if expected.rating == output.rating else 0.0


def load_file_as_dataset(path: str, max_rows: int | None = 50) -> DataSet:
    """Build the ``starbucks_reviews`` dataset from a CSV file of reviews.

    Each CSV record becomes one ``Row`` whose inputs are the review text, the
    reviewer name, the location and the date, and whose expected output is the
    record's rating wrapped in ``StarbucksReviewRating``.

    Args:
        path: Location of the CSV file. Its header row must provide the
            columns ``Review``, ``name``, ``location``, ``Date`` and ``Rating``.
        max_rows: Maximum number of records to load, counted from the top of
            the file. Defaults to 50; pass ``None`` to load every record.

    Returns:
        A ``DataSet`` named ``"starbucks_reviews"`` containing at most
        ``max_rows`` rows, in file order.

    Raises:
        OSError: If the file cannot be opened.
        KeyError: If the header lacks one of the expected columns.
        ValueError: If a loaded ``Rating`` cell is not one of the ``Ratings``
            values, or if ``max_rows`` is negative.
    """
    # newline="" hands newline handling to the csv module (as its documentation
    # requires), so line breaks inside quoted review text are read correctly.
    with open(path, "r", encoding="utf-8", newline="") as f:
        reader = csv.DictReader(f)
        # islice stops reading once the cap is reached, so records beyond it
        # are never parsed or validated. A stop of None means "no cap".
        rows = [
            Row.create(
                input_dictionary={
                    "review": record["Review"],
                    "name": record["name"],
                    "location": record["location"],
                    "date": record["Date"],
                },
                output=StarbucksReviewRating(rating=Ratings(record["Rating"])),
            )
            for record in islice(reader, max_rows)
        ]
    return DataSet(rows=rows, name="starbucks_reviews")



  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Load the labelled reviews that the trainer will evaluate prompts against.
csv_path = "./starbucks_reviews.csv"
dataset = load_file_as_dataset(csv_path)

# Provider-prefixed model identifiers handed to the trainer.
# NOTE(review): in the recorded run below, groq/gemma2-9b-it repeatedly hit its
# tokens-per-minute rate limit; presumably those failed calls are scored as
# missing outputs, which would skew results for that model. Confirm.
candidate_models = [
    "anthropic/claude-3-haiku-20240307",
    "openai/gpt-4.1-nano-2025-04-14",
    "openai/gpt-4.1-mini-2025-04-14",
    "groq/gemma2-9b-it",
    "groq/qwen-qwq-32b",
]

trainer = BayesianTrainer(
    all_rows=dataset,
    task_guidance="determine the rating of this review",
    # Input fields of each row, matching the keys built in load_file_as_dataset.
    keys=["review", "name", "location", "date"],
    expected_output_type=StarbucksReviewRating,
    scoring_function=sentiment_scoring_function,
    # Both counts are kept deliberately small for a quick trial run.
    num_iterations=4,
    candidates_per_iteration=4,
    prompt_mode=PromptMode.SIMPLE,
    models=candidate_models,
    failure_analysis_enabled=True,
)

[32m2025-05-12 21:36:41.894[0m | [1mINFO    [0m | [36mtaskllm.optimizer.methods.bayesian[0m:[36m__init__[0m:[36m208[0m - [1mUsing CPU for Pyro/Torch computations[0m
[32m2025-05-12 21:36:41.894[0m | [1mINFO    [0m | [36mtaskllm.optimizer.methods.base[0m:[36m__init__[0m:[36m263[0m - [1mAll rows: 50[0m
[32m2025-05-12 21:36:41.895[0m | [34m[1mDEBUG   [0m | [36mtaskllm.optimizer.methods.bayesian[0m:[36m__init__[0m:[36m727[0m - [34m[1mBayesianTrainer initialized[0m


In [8]:
# Run the optimisation loop. Top-level `await` works here because IPython runs
# notebook cells inside an event loop. This makes real provider API calls; the
# recorded output below spans roughly ten minutes of log timestamps.
await trainer.train()

[32m2025-05-12 21:36:43.601[0m | [1mINFO    [0m | [36mtaskllm.optimizer.methods.bayesian[0m:[36mtrain[0m:[36m4[0m - [1mStarting Bayesian optimization with Pyro: 4 iterations, 4 candidates/iter.[0m
[32m2025-05-12 21:36:43.602[0m | [1mINFO    [0m | [36mtaskllm.optimizer.methods.bayesian[0m:[36mtrain[0m:[36m5[0m - [1mPhase 1: Generating and evaluating initial candidates...[0m
[32m2025-05-12 21:36:43.602[0m | [1mINFO    [0m | [36mtaskllm.optimizer.prompt.meta[0m:[36mgenerate_spec[0m:[36m139[0m - [1mGenerating simple prompt content for: determine the rating of this review

Use plain language in the prompt you write...[0m
[32m2025-05-12 21:36:45.793[0m | [1mINFO    [0m | [36mtaskllm.optimizer.prompt.meta[0m:[36mgenerate_spec[0m:[36m139[0m - [1mGenerating simple prompt content for: determine the rating of this review

Make the prompt you write as simple as possible...[0m


As a review analysis expert, your role is to determine the overall rating of a customer review. Carefully read the review provided and assess the sentiments expressed. Consider the key points mentioned by the reviewer to decide how positive or negative the review is. Based on your assessment, assign a rating from 1 to 5, where 1 is very negative and 5 is very positive. Here's the specific context for your analysis:

- Review: {{ review }}
- Name of reviewer: {{ name }}
- Location of reviewer: {{ location }}
- Date of review: {{ date }}


[32m2025-05-12 21:36:47.339[0m | [1mINFO    [0m | [36mtaskllm.optimizer.prompt.meta[0m:[36mgenerate_spec[0m:[36m139[0m - [1mGenerating simple prompt content for: determine the rating of this review

Make the prompt you write as simple as possible...[0m


You are a review analyst. Determine the rating for the following review based on its sentiment. Consider the tone, keywords, and overall context to assign a rating from 1 to 5. Review details:

Review: {{ review }}
Reviewer Name: {{ name }}
Location: {{ location }}
Date: {{ date }}


[32m2025-05-12 21:36:48.360[0m | [1mINFO    [0m | [36mtaskllm.optimizer.prompt.meta[0m:[36mgenerate_spec[0m:[36m139[0m - [1mGenerating simple prompt content for: determine the rating of this review

Use plain language in the prompt you write...[0m


As a review evaluator, analyze the provided review from {{ name }} located in {{ location }} dated {{ date }}. Determine the appropriate rating based on the content of the review.


[32m2025-05-12 21:36:49.917[0m | [1mINFO    [0m | [36mtaskllm.optimizer.methods.bayesian[0m:[36mtrain[0m:[36m17[0m - [1mGenerated 4 initial candidate prompts.[0m
[32m2025-05-12 21:36:49.919[0m | [1mINFO    [0m | [36mtaskllm.optimizer.methods.base[0m:[36mrun_for_prompt[0m:[36m6[0m - [1mRunning for prompt As a review analysis expert, your role is to determine the overall rating of a customer review. Care... against 40 rows[0m
[32m2025-05-12 21:36:49.919[0m | [34m[1mDEBUG   [0m | [36mtaskllm.optimizer.methods.base[0m:[36mget_outputs[0m:[36m74[0m - [34m[1mGetting outputs for 40 rows using anthropic/claude-3-haiku-20240307[0m
[32m2025-05-12 21:36:49.920[0m | [1mINFO    [0m | [36mtaskllm.optimizer.methods.base[0m:[36mrun_for_prompt[0m:[36m6[0m - [1mRunning for prompt You are a review analyst. Determine the rating for the following review based on its sentiment. Cons... against 40 rows[0m
[32m2025-05-12 21:36:49.921[0m | [34m[1mDEBUG   [0

As a reviewer analyzer, read the review provided below and determine its overall rating based on the sentiment and specific feedback mentioned. Pay attention to the content of the review, the name of the reviewer {{ name }}, their location {{ location }}, and the date of the review {{ date }}. Provide a rating from 1 to 5, where 1 indicates a very negative review and 5 indicates a very positive review.

[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, u

[32m2025-05-12 21:40:10.945[0m | [1mINFO    [0m | [36mtaskllm.optimizer.methods.base[0m:[36mrun_for_prompt[0m:[36m10[0m - [1mRecording failures[0m
[32m2025-05-12 21:40:10.947[0m | [1mINFO    [0m | [36mtaskllm.optimizer.methods.base[0m:[36mrun_for_prompt[0m:[36m10[0m - [1mRecording failures[0m
[32m2025-05-12 21:40:10.950[0m | [1mINFO    [0m | [36mtaskllm.optimizer.methods.base[0m:[36mrun_for_prompt[0m:[36m10[0m - [1mRecording failures[0m
[32m2025-05-12 21:40:10.951[0m | [1mINFO    [0m | [36mtaskllm.optimizer.methods.base[0m:[36mrun_for_prompt[0m:[36m10[0m - [1mRecording failures[0m
[32m2025-05-12 21:40:10.956[0m | [34m[1mDEBUG   [0m | [36mtaskllm.optimizer.methods.base[0m:[36mget_outputs[0m:[36m74[0m - [34m[1mGetting outputs for 40 rows using anthropic/claude-3-haiku-20240307[0m
[32m2025-05-12 21:40:10.988[0m | [34m[1mDEBUG   [0m | [36mtaskllm.optimizer.methods.base[0m:[36mget_scores[0m:[36m91[0m - [34m[1mCalcula

You are an expert sentiment analysis system tasked with determining the appropriate rating for customer reviews based on their content. Review the submitted review for its emotional tone, context, and specific complaints. Consider the overall sentiment as well as any explicit dissatisfaction expressed by the reviewer, especially when it concerns service or product integrity. Given the review: '{{ review }}', written by '{{ name }}' from '{{ location }}' on '{{ date }}', evaluate the review and assign a rating accordingly. Note that severe complaints or expressions of disappointment should be treated with higher severity, potentially resulting in lower ratings.


[32m2025-05-12 21:40:17.774[0m | [1mINFO    [0m | [36mtaskllm.optimizer.prompt.meta[0m:[36mgenerate_spec[0m:[36m139[0m - [1mGenerating simple prompt content for: determine the rating of this review

The prompt should specifically handle these challenging cases:
...[0m


As a sentiment analysis expert, your task is to evaluate the tone and context of the given review and determine a rating based on the provided criteria. Please read the review carefully and assess the overall sentiment, focusing specifically on negativity or dissatisfaction levels. Use the following review details:
- Review: {{ review }}
- Reviewer Name: {{ name }}
- Location: {{ location }}
- Date of Review: {{ date }}

Based on the sentiment conveyed in the review, assign a rating from the following options:
- Ratings.ONE: '1'
- Ratings.TWO: '2'
- Ratings.THREE: '3'
- Ratings.FOUR: '4'
- Ratings.FIVE: '5'

Be sure to consider past experiences and sentiments shared within the review to accurately reflect the reviewer's feelings in your assigned rating.


[32m2025-05-12 21:40:20.330[0m | [1mINFO    [0m | [36mtaskllm.optimizer.prompt.meta[0m:[36mgenerate_spec[0m:[36m139[0m - [1mGenerating simple prompt content for: determine the rating of this review

The prompt should specifically handle these challenging cases:
...[0m


You are an expert sentiment analyzer. Your task is to determine the rating of a customer review based on its content. Please closely read the review provided, paying special attention to any expressions of dissatisfaction or negative sentiment. Assign a rating based on the following scale: 1 (very dissatisfied) to 5 (very satisfied). Use the review, customer’s name, location, and the date of the review as context to inform your analysis. Here is the review data: Review: '{{ review }}', Name: '{{ name }}', Location: '{{ location }}', Date: '{{ date }}'.


[32m2025-05-12 21:40:23.383[0m | [1mINFO    [0m | [36mtaskllm.optimizer.methods.bayesian[0m:[36mtrain[0m:[36m50[0m - [1mGenerated 4 new candidates for evaluation.[0m
[32m2025-05-12 21:40:23.384[0m | [1mINFO    [0m | [36mtaskllm.optimizer.methods.base[0m:[36mrun_for_prompt[0m:[36m6[0m - [1mRunning for prompt You are an expert sentiment analysis system tasked with determining the appropriate rating for custo... against 40 rows[0m
[32m2025-05-12 21:40:23.385[0m | [34m[1mDEBUG   [0m | [36mtaskllm.optimizer.methods.base[0m:[36mget_outputs[0m:[36m74[0m - [34m[1mGetting outputs for 40 rows using groq/qwen-qwq-32b[0m
[32m2025-05-12 21:40:23.387[0m | [1mINFO    [0m | [36mtaskllm.optimizer.methods.base[0m:[36mrun_for_prompt[0m:[36m6[0m - [1mRunning for prompt As a sentiment analysis expert, your task is to evaluate the tone and context of the given review an... against 40 rows[0m
[32m2025-05-12 21:40:23.388[0m | [34m[1mDEBUG   [0m | [36mtas

You are a sentiment analysis expert tasked with determining the appropriate rating for customer reviews of Starbucks based on the provided text of the review. Analyze the review content to identify the overall sentiment expressed, taking particular care to consider aggressive language or strong dissatisfaction that indicates a low rating. Consider the review provided by {{ name }} from {{ location }} on {{ date }}: '{{ review }}'. Based on your analysis, classify the sentiment into a rating from one (1) to five (5), with one being the most negative and five being the most positive. Return the determined rating in the specified format.


[32m2025-05-12 21:42:44.145[0m | [1mINFO    [0m | [36mtaskllm.optimizer.methods.base[0m:[36mrun_for_prompt[0m:[36m10[0m - [1mRecording failures[0m
[32m2025-05-12 21:42:44.148[0m | [1mINFO    [0m | [36mtaskllm.optimizer.methods.base[0m:[36mrun_for_prompt[0m:[36m10[0m - [1mRecording failures[0m
[32m2025-05-12 21:42:44.150[0m | [1mINFO    [0m | [36mtaskllm.optimizer.methods.base[0m:[36mrun_for_prompt[0m:[36m10[0m - [1mRecording failures[0m
[32m2025-05-12 21:42:44.152[0m | [1mINFO    [0m | [36mtaskllm.optimizer.methods.base[0m:[36mrun_for_prompt[0m:[36m10[0m - [1mRecording failures[0m
[32m2025-05-12 21:42:44.155[0m | [34m[1mDEBUG   [0m | [36mtaskllm.optimizer.methods.base[0m:[36mget_outputs[0m:[36m74[0m - [34m[1mGetting outputs for 40 rows using groq/qwen-qwq-32b[0m
[32m2025-05-12 21:42:44.179[0m | [34m[1mDEBUG   [0m | [36mtaskllm.optimizer.methods.base[0m:[36mget_scores[0m:[36m91[0m - [34m[1mCalculated 40 scores[0

You are an advanced sentiment analysis model tasked with determining the rating of a customer review. Your output must represent the overall satisfaction level of the reviewer based on the content of the review. The review may encompass various aspects of the customer experience, including service quality, product satisfaction, and specific incidents that highlight positive or negative feelings. 

Follow these steps to deliver your output:
1. Analyze the provided review text for emotional cues, customer satisfaction levels, and the context of their experience.
2. Classify the review into a rating system where:
   - Ratings.ONE corresponds to very negative sentiment
   - Ratings.TWO corresponds to negative sentiment
   - Ratings.THREE corresponds to neutral sentiment
   - Ratings.FOUR corresponds to positive sentiment
   - Ratings.FIVE corresponds to very positive sentiment
3. Pay special attention to key phrases that indicate dissatisfaction or frustration, especially in cases where di

[32m2025-05-12 21:42:54.280[0m | [1mINFO    [0m | [36mtaskllm.optimizer.prompt.meta[0m:[36mgenerate_spec[0m:[36m139[0m - [1mGenerating simple prompt content for: determine the rating of this review

The prompt should specifically handle these challenging cases:
...[0m


You are a sentiment analysis model tasked with determining the rating for customer reviews of Starbucks. Analyze the following review to assign an appropriate rating based on the expressed sentiments. Specifically, identify the tone, severity of the complaints, and overall satisfaction reflected in the text. Consider technical indicators like negative phrases, customer frustration, and instances of unprofessional service to guide your evaluation. Review Input: {{ review }}. Customer Name: {{ name }}. Location: {{ location }}. Review Date: {{ date }}.


[32m2025-05-12 21:42:57.119[0m | [1mINFO    [0m | [36mtaskllm.optimizer.prompt.meta[0m:[36mgenerate_spec[0m:[36m139[0m - [1mGenerating simple prompt content for: determine the rating of this review

The prompt should specifically handle these challenging cases:
...[0m


You are a critical sentiment analysis engine tasked with determining the customer rating from a detailed review. Analyze the review content for emotional tone and customer satisfaction levels. Consider negative experiences that suggest frustration, disappointment, or poor service, particularly focusing on how the reviewer's complaints about service and management reflect their overall dissatisfaction. 

For the provided context, process the following input: 
- Review: {{ review }} 
- Customer Name: {{ name }} 
- Location: {{ location }} 
- Date of Review: {{ date }} 

Your output should categorize the sentiment into ratings, specifically returning a rating of '1' if the review conveys a significant negative experience. Be attentive to details highlighting dissatisfaction and poor service, and assign a rating based on the severity of the issues expressed.


[32m2025-05-12 21:42:59.454[0m | [1mINFO    [0m | [36mtaskllm.optimizer.methods.bayesian[0m:[36mtrain[0m:[36m50[0m - [1mGenerated 4 new candidates for evaluation.[0m
[32m2025-05-12 21:42:59.456[0m | [1mINFO    [0m | [36mtaskllm.optimizer.methods.base[0m:[36mrun_for_prompt[0m:[36m6[0m - [1mRunning for prompt You are an advanced sentiment analysis model tasked with determining the rating of a customer review... against 40 rows[0m
[32m2025-05-12 21:42:59.456[0m | [34m[1mDEBUG   [0m | [36mtaskllm.optimizer.methods.base[0m:[36mget_outputs[0m:[36m74[0m - [34m[1mGetting outputs for 40 rows using openai/gpt-4.1-mini-2025-04-14[0m
[32m2025-05-12 21:42:59.458[0m | [1mINFO    [0m | [36mtaskllm.optimizer.methods.base[0m:[36mrun_for_prompt[0m:[36m6[0m - [1mRunning for prompt You are a sentiment analysis model tasked with determining the rating for customer reviews of Starbu... against 40 rows[0m
[32m2025-05-12 21:42:59.459[0m | [34m[1mDEBUG   [

You are a review rating expert. Analyze the following customer review and determine its appropriate rating based on sentiment and overall customer experience. Pay special attention to negative experiences related to customer service and unresolved issues. Use the context provided to enhance your analysis. 

Review: {{ review }}
Customer Name: {{ name }}
Location: {{ location }}
Date of Review: {{ date }}

Ensure to give a rating of 1 to 5, where 1 indicates a very poor experience and 5 indicates an excellent experience.


[32m2025-05-12 21:45:05.005[0m | [1mINFO    [0m | [36mtaskllm.optimizer.methods.base[0m:[36mrun_for_prompt[0m:[36m10[0m - [1mRecording failures[0m
[32m2025-05-12 21:45:05.007[0m | [1mINFO    [0m | [36mtaskllm.optimizer.methods.base[0m:[36mrun_for_prompt[0m:[36m10[0m - [1mRecording failures[0m
[32m2025-05-12 21:45:05.009[0m | [1mINFO    [0m | [36mtaskllm.optimizer.methods.base[0m:[36mrun_for_prompt[0m:[36m10[0m - [1mRecording failures[0m
[32m2025-05-12 21:45:05.011[0m | [1mINFO    [0m | [36mtaskllm.optimizer.methods.base[0m:[36mrun_for_prompt[0m:[36m10[0m - [1mRecording failures[0m
[32m2025-05-12 21:45:05.013[0m | [34m[1mDEBUG   [0m | [36mtaskllm.optimizer.methods.base[0m:[36mget_outputs[0m:[36m74[0m - [34m[1mGetting outputs for 40 rows using openai/gpt-4.1-mini-2025-04-14[0m
[32m2025-05-12 21:45:05.039[0m | [34m[1mDEBUG   [0m | [36mtaskllm.optimizer.methods.base[0m:[36mget_scores[0m:[36m91[0m - [34m[1mCalculated

You are a sentiment analysis expert. Your task is to determine the appropriate rating for customer reviews of Starbucks based on the content provided. Analyze the review text carefully and consider the customer's sentiments and experiences they describe. 

Instructions:
1. Read the review text provided. 
2. Identify the overall sentiment expressed in the review. Consider both positive and negative statements and weigh them accordingly.
3. Use the following rating scale: 
   - FIVE (5): for reviews that express strong satisfaction and positive sentiments.
   - FOUR (4): for reviews that express general satisfaction but may have minor complaints.
   - THREE (3): for neutral reviews with mixed sentiments.
   - TWO (2): for reviews that express dissatisfaction but have some redeeming qualities.
   - ONE (1): for reviews that express strong dissatisfaction and negative sentiments without any redeeming qualities.
4. Assign the appropriate rating based on your analysis of the review. 

Review

[32m2025-05-12 21:45:13.160[0m | [1mINFO    [0m | [36mtaskllm.optimizer.prompt.meta[0m:[36mgenerate_spec[0m:[36m139[0m - [1mGenerating simple prompt content for: determine the rating of this review

The prompt should specifically handle these challenging cases:
...[0m


You are an expert sentiment analysis model. Your task is to evaluate and determine the rating of the customer review provided. Carefully analyze the content for indicators of customer satisfaction or dissatisfaction, and pay special attention to nuanced phrases and overall sentiment context. Use the following inputs to inform your analysis: review text is {{ review }}, name of the reviewer is {{ name }}, location is {{ location }}, and the review date is {{ date }}. Assign a rating between 1 to 5 stars, where 5 represents extreme satisfaction and 1 represents extreme dissatisfaction. Be sure to use the specific phrases indicative of each rating level for accuracy.


[32m2025-05-12 21:45:17.061[0m | [1mINFO    [0m | [36mtaskllm.optimizer.prompt.meta[0m:[36mgenerate_spec[0m:[36m139[0m - [1mGenerating simple prompt content for: determine the rating of this review

The prompt should specifically handle these challenging cases:
...[0m


As a rating evaluator, analyze the following customer review and assign a rating based on the content provided. Consider the overall sentiment of the review, any specific mentions of service quality, product satisfaction, and any complaints or praise. Use the information from the customer’s name, location, and review date to inform your decision but focus primarily on the review's sentiment to determine the appropriate rating. 

Review details:
- Review: {{ review }}
- Customer Name: {{ name }}
- Location: {{ location }}
- Date of Review: {{ date }}

Respond with a rating that fits the content of the review, where possible ratings range from 1 to 5, with 1 being very negative and 5 being very positive.


[32m2025-05-12 21:45:19.850[0m | [1mINFO    [0m | [36mtaskllm.optimizer.methods.bayesian[0m:[36mtrain[0m:[36m50[0m - [1mGenerated 4 new candidates for evaluation.[0m
[32m2025-05-12 21:45:19.857[0m | [1mINFO    [0m | [36mtaskllm.optimizer.methods.base[0m:[36mrun_for_prompt[0m:[36m6[0m - [1mRunning for prompt You are a sentiment analysis expert. Your task is to determine the appropriate rating for customer r... against 40 rows[0m
[32m2025-05-12 21:45:19.858[0m | [34m[1mDEBUG   [0m | [36mtaskllm.optimizer.methods.base[0m:[36mget_outputs[0m:[36m74[0m - [34m[1mGetting outputs for 40 rows using anthropic/claude-3-haiku-20240307[0m
[32m2025-05-12 21:45:19.859[0m | [1mINFO    [0m | [36mtaskllm.optimizer.methods.base[0m:[36mrun_for_prompt[0m:[36m6[0m - [1mRunning for prompt You are an expert sentiment analysis model. Your task is to evaluate and determine the rating of the... against 40 rows[0m
[32m2025-05-12 21:45:19.861[0m | [34m[1mDEBUG  

You are a ratings evaluator skilled in analyzing customer reviews. Your task is to determine the overall rating for the review provided. Carefully read the text and assess the sentiments expressed within it. Use the following scale to assign a rating: 1 star for very negative feedback, 2 stars for negative but somewhat constructive feedback, 3 stars for neutral reviews, 4 stars for positive feedback, and 5 stars for overwhelmingly positive reviews. 

Here is the review to evaluate: {{ review }}. The review was written by {{ name }} from {{ location }} on {{ date }}. After analyzing the review, provide the appropriate rating based on the sentiments expressed.


[32m2025-05-12 21:46:25.073[0m | [31m[1mERROR   [0m | [36mtaskllm.optimizer.methods.base[0m:[36mget_output[0m:[36m69[0m - [31m[1mError executing prompt: litellm.RateLimitError: RateLimitError: GroqException - {"error":{"message":"Rate limit reached for model `gemma2-9b-it` in organization `org_01jt6t4sn9em7s4mzybzfdxjzh` service tier `on_demand` on tokens per minute (TPM): Limit 30000, Used 30258, Requested 253. Please try again in 1.022999999s. Need more tokens? Visit https://groq.com/self-serve-support/ to request higher limits.","type":"tokens","code":"rate_limit_exceeded"}}
[0m



[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m



[32m2025-05-12 21:46:25.286[0m | [31m[1mERROR   [0m | [36mtaskllm.optimizer.methods.base[0m:[36mget_output[0m:[36m69[0m - [31m[1mError executing prompt: litellm.RateLimitError: RateLimitError: GroqException - {"error":{"message":"Rate limit reached for model `gemma2-9b-it` in organization `org_01jt6t4sn9em7s4mzybzfdxjzh` service tier `on_demand` on tokens per minute (TPM): Limit 30000, Used 30151, Requested 354. Please try again in 1.010999999s. Need more tokens? Visit https://groq.com/self-serve-support/ to request higher limits.","type":"tokens","code":"rate_limit_exceeded"}}
[0m



[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m



[32m2025-05-12 21:46:25.536[0m | [31m[1mERROR   [0m | [36mtaskllm.optimizer.methods.base[0m:[36mget_output[0m:[36m69[0m - [31m[1mError executing prompt: litellm.RateLimitError: RateLimitError: GroqException - {"error":{"message":"Rate limit reached for model `gemma2-9b-it` in organization `org_01jt6t4sn9em7s4mzybzfdxjzh` service tier `on_demand` on tokens per minute (TPM): Limit 30000, Used 30031, Requested 272. Please try again in 607ms. Need more tokens? Visit https://groq.com/self-serve-support/ to request higher limits.","type":"tokens","code":"rate_limit_exceeded"}}
[0m



[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m



[32m2025-05-12 21:46:25.773[0m | [31m[1mERROR   [0m | [36mtaskllm.optimizer.methods.base[0m:[36mget_output[0m:[36m69[0m - [31m[1mError executing prompt: litellm.RateLimitError: RateLimitError: GroqException - {"error":{"message":"Rate limit reached for model `gemma2-9b-it` in organization `org_01jt6t4sn9em7s4mzybzfdxjzh` service tier `on_demand` on tokens per minute (TPM): Limit 30000, Used 29915, Requested 419. Please try again in 668ms. Need more tokens? Visit https://groq.com/self-serve-support/ to request higher limits.","type":"tokens","code":"rate_limit_exceeded"}}
[0m



[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m



[32m2025-05-12 21:46:26.000[0m | [31m[1mERROR   [0m | [36mtaskllm.optimizer.methods.base[0m:[36mget_output[0m:[36m69[0m - [31m[1mError executing prompt: litellm.RateLimitError: RateLimitError: GroqException - {"error":{"message":"Rate limit reached for model `gemma2-9b-it` in organization `org_01jt6t4sn9em7s4mzybzfdxjzh` service tier `on_demand` on tokens per minute (TPM): Limit 30000, Used 29798, Requested 233. Please try again in 61ms. Need more tokens? Visit https://groq.com/self-serve-support/ to request higher limits.","type":"tokens","code":"rate_limit_exceeded"}}
[0m



[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m



[32m2025-05-12 21:46:26.613[0m | [31m[1mERROR   [0m | [36mtaskllm.optimizer.methods.base[0m:[36mget_output[0m:[36m69[0m - [31m[1mError executing prompt: litellm.RateLimitError: RateLimitError: GroqException - {"error":{"message":"Rate limit reached for model `gemma2-9b-it` in organization `org_01jt6t4sn9em7s4mzybzfdxjzh` service tier `on_demand` on tokens per minute (TPM): Limit 30000, Used 30733, Requested 326. Please try again in 2.118s. Need more tokens? Visit https://groq.com/self-serve-support/ to request higher limits.","type":"tokens","code":"rate_limit_exceeded"}}
[0m



[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m



[32m2025-05-12 21:46:26.821[0m | [31m[1mERROR   [0m | [36mtaskllm.optimizer.methods.base[0m:[36mget_output[0m:[36m69[0m - [31m[1mError executing prompt: litellm.RateLimitError: RateLimitError: GroqException - {"error":{"message":"Rate limit reached for model `gemma2-9b-it` in organization `org_01jt6t4sn9em7s4mzybzfdxjzh` service tier `on_demand` on tokens per minute (TPM): Limit 30000, Used 30634, Requested 369. Please try again in 2.007s. Need more tokens? Visit https://groq.com/self-serve-support/ to request higher limits.","type":"tokens","code":"rate_limit_exceeded"}}
[0m



[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m



[32m2025-05-12 21:46:27.064[0m | [31m[1mERROR   [0m | [36mtaskllm.optimizer.methods.base[0m:[36mget_output[0m:[36m69[0m - [31m[1mError executing prompt: litellm.RateLimitError: RateLimitError: GroqException - {"error":{"message":"Rate limit reached for model `gemma2-9b-it` in organization `org_01jt6t4sn9em7s4mzybzfdxjzh` service tier `on_demand` on tokens per minute (TPM): Limit 30000, Used 30514, Requested 250. Please try again in 1.529s. Need more tokens? Visit https://groq.com/self-serve-support/ to request higher limits.","type":"tokens","code":"rate_limit_exceeded"}}
[0m



[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m



[32m2025-05-12 21:46:27.277[0m | [31m[1mERROR   [0m | [36mtaskllm.optimizer.methods.base[0m:[36mget_output[0m:[36m69[0m - [31m[1mError executing prompt: litellm.RateLimitError: RateLimitError: GroqException - {"error":{"message":"Rate limit reached for model `gemma2-9b-it` in organization `org_01jt6t4sn9em7s4mzybzfdxjzh` service tier `on_demand` on tokens per minute (TPM): Limit 30000, Used 30404, Requested 265. Please try again in 1.338s. Need more tokens? Visit https://groq.com/self-serve-support/ to request higher limits.","type":"tokens","code":"rate_limit_exceeded"}}
[0m



[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m



[32m2025-05-12 21:46:27.487[0m | [31m[1mERROR   [0m | [36mtaskllm.optimizer.methods.base[0m:[36mget_output[0m:[36m69[0m - [31m[1mError executing prompt: litellm.RateLimitError: RateLimitError: GroqException - {"error":{"message":"Rate limit reached for model `gemma2-9b-it` in organization `org_01jt6t4sn9em7s4mzybzfdxjzh` service tier `on_demand` on tokens per minute (TPM): Limit 30000, Used 30301, Requested 335. Please try again in 1.273s. Need more tokens? Visit https://groq.com/self-serve-support/ to request higher limits.","type":"tokens","code":"rate_limit_exceeded"}}
[0m



[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m



[32m2025-05-12 21:46:27.690[0m | [31m[1mERROR   [0m | [36mtaskllm.optimizer.methods.base[0m:[36mget_output[0m:[36m69[0m - [31m[1mError executing prompt: litellm.RateLimitError: RateLimitError: GroqException - {"error":{"message":"Rate limit reached for model `gemma2-9b-it` in organization `org_01jt6t4sn9em7s4mzybzfdxjzh` service tier `on_demand` on tokens per minute (TPM): Limit 30000, Used 30198, Requested 359. Please try again in 1.115s. Need more tokens? Visit https://groq.com/self-serve-support/ to request higher limits.","type":"tokens","code":"rate_limit_exceeded"}}
[0m



[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m



[32m2025-05-12 21:46:27.940[0m | [31m[1mERROR   [0m | [36mtaskllm.optimizer.methods.base[0m:[36mget_output[0m:[36m69[0m - [31m[1mError executing prompt: litellm.RateLimitError: RateLimitError: GroqException - {"error":{"message":"Rate limit reached for model `gemma2-9b-it` in organization `org_01jt6t4sn9em7s4mzybzfdxjzh` service tier `on_demand` on tokens per minute (TPM): Limit 30000, Used 30074, Requested 274. Please try again in 696ms. Need more tokens? Visit https://groq.com/self-serve-support/ to request higher limits.","type":"tokens","code":"rate_limit_exceeded"}}
[0m



[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m



[32m2025-05-12 21:46:48.221[0m | [1mINFO    [0m | [36mtaskllm.optimizer.methods.base[0m:[36mrun_for_prompt[0m:[36m10[0m - [1mRecording failures[0m
[32m2025-05-12 21:46:48.224[0m | [1mINFO    [0m | [36mtaskllm.optimizer.methods.base[0m:[36mrun_for_prompt[0m:[36m10[0m - [1mRecording failures[0m
[32m2025-05-12 21:46:48.227[0m | [1mINFO    [0m | [36mtaskllm.optimizer.methods.base[0m:[36mrun_for_prompt[0m:[36m10[0m - [1mRecording failures[0m
[32m2025-05-12 21:46:48.228[0m | [1mINFO    [0m | [36mtaskllm.optimizer.methods.base[0m:[36mrun_for_prompt[0m:[36m10[0m - [1mRecording failures[0m
[32m2025-05-12 21:46:48.230[0m | [34m[1mDEBUG   [0m | [36mtaskllm.optimizer.methods.base[0m:[36mget_outputs[0m:[36m74[0m - [34m[1mGetting outputs for 40 rows using anthropic/claude-3-haiku-20240307[0m
[32m2025-05-12 21:46:48.254[0m | [34m[1mDEBUG   [0m | [36mtaskllm.optimizer.methods.base[0m:[36mget_scores[0m:[36m91[0m - [34m[1mCalcula


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m



[32m2025-05-12 21:46:52.570[0m | [31m[1mERROR   [0m | [36mtaskllm.optimizer.methods.base[0m:[36mget_output[0m:[36m69[0m - [31m[1mError executing prompt: litellm.RateLimitError: RateLimitError: GroqException - {"error":{"message":"Rate limit reached for model `gemma2-9b-it` in organization `org_01jt6t4sn9em7s4mzybzfdxjzh` service tier `on_demand` on tokens per minute (TPM): Limit 30000, Used 30452, Requested 274. Please try again in 1.452s. Need more tokens? Visit https://groq.com/self-serve-support/ to request higher limits.","type":"tokens","code":"rate_limit_exceeded"}}
[0m
[32m2025-05-12 21:46:52.571[0m | [34m[1mDEBUG   [0m | [36mtaskllm.optimizer.methods.base[0m:[36mget_scores[0m:[36m91[0m - [34m[1mCalculated 40 scores[0m
[32m2025-05-12 21:46:52.572[0m | [1mINFO    [0m | [36mtaskllm.optimizer.methods.base[0m:[36mcalculate_scores[0m:[36m116[0m - [1mModel groq/gemma2-9b-it with prompt achieved score: 12.0000, Correct: 32, Incorrect: 8, Unlabelle


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m



[32m2025-05-12 21:46:52.958[0m | [31m[1mERROR   [0m | [36mtaskllm.optimizer.methods.base[0m:[36mget_output[0m:[36m69[0m - [31m[1mError executing prompt: litellm.RateLimitError: RateLimitError: GroqException - {"error":{"message":"Rate limit reached for model `gemma2-9b-it` in organization `org_01jt6t4sn9em7s4mzybzfdxjzh` service tier `on_demand` on tokens per minute (TPM): Limit 30000, Used 30254, Requested 359. Please try again in 1.226s. Need more tokens? Visit https://groq.com/self-serve-support/ to request higher limits.","type":"tokens","code":"rate_limit_exceeded"}}
[0m



[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m



[32m2025-05-12 21:46:53.162[0m | [31m[1mERROR   [0m | [36mtaskllm.optimizer.methods.base[0m:[36mget_output[0m:[36m69[0m - [31m[1mError executing prompt: litellm.RateLimitError: RateLimitError: GroqException - {"error":{"message":"Rate limit reached for model `gemma2-9b-it` in organization `org_01jt6t4sn9em7s4mzybzfdxjzh` service tier `on_demand` on tokens per minute (TPM): Limit 30000, Used 30166, Requested 274. Please try again in 880ms. Need more tokens? Visit https://groq.com/self-serve-support/ to request higher limits.","type":"tokens","code":"rate_limit_exceeded"}}
[0m
[32m2025-05-12 21:46:53.184[0m | [34m[1mDEBUG   [0m | [36mtaskllm.optimizer.methods.base[0m:[36mget_scores[0m:[36m91[0m - [34m[1mCalculated 40 scores[0m
[32m2025-05-12 21:46:53.184[0m | [1mINFO    [0m | [36mtaskllm.optimizer.methods.base[0m:[36mcalculate_scores[0m:[36m116[0m - [1mModel anthropic/claude-3-haiku-20240307 with prompt achieved score: 7.0000, Correct: 7, Incorrect:


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m



[32m2025-05-12 21:46:53.283[0m | [34m[1mDEBUG   [0m | [36mtaskllm.optimizer.methods.bayesian[0m:[36mfit_surrogate_model[0m:[36m312[0m - [34m[1mSVI iteration 100/1000, Loss: 22.3861[0m
[32m2025-05-12 21:46:53.350[0m | [34m[1mDEBUG   [0m | [36mtaskllm.optimizer.methods.bayesian[0m:[36mfit_surrogate_model[0m:[36m312[0m - [34m[1mSVI iteration 200/1000, Loss: 22.3857[0m
[32m2025-05-12 21:46:53.416[0m | [34m[1mDEBUG   [0m | [36mtaskllm.optimizer.methods.bayesian[0m:[36mfit_surrogate_model[0m:[36m312[0m - [34m[1mSVI iteration 300/1000, Loss: 22.3855[0m
[32m2025-05-12 21:46:53.480[0m | [34m[1mDEBUG   [0m | [36mtaskllm.optimizer.methods.bayesian[0m:[36mfit_surrogate_model[0m:[36m312[0m - [34m[1mSVI iteration 400/1000, Loss: 22.3854[0m
[32m2025-05-12 21:46:53.543[0m | [34m[1mDEBUG   [0m | [36mtaskllm.optimizer.methods.bayesian[0m:[36mfit_surrogate_model[0m:[36m312[0m - [34m[1mSVI iteration 500/1000, Loss: 22.3853[0m
[32m2025-


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m



[32m2025-05-12 21:46:54.747[0m | [34m[1mDEBUG   [0m | [36mtaskllm.optimizer.methods.base[0m:[36mget_scores[0m:[36m91[0m - [34m[1mCalculated 40 scores[0m
[32m2025-05-12 21:46:54.747[0m | [1mINFO    [0m | [36mtaskllm.optimizer.methods.base[0m:[36mcalculate_scores[0m:[36m116[0m - [1mModel openai/gpt-4.1-nano-2025-04-14 with prompt achieved score: 35.0000, Correct: 35, Incorrect: 5, Unlabelled: 0 out of 40[0m
[32m2025-05-12 21:46:54.747[0m | [1mINFO    [0m | [36mtaskllm.optimizer.methods.bayesian[0m:[36mtrain[0m:[36m66[0m - [1mIteration 4: No improvement found. Best score remains 35.0000[0m
[32m2025-05-12 21:46:54.748[0m | [1mINFO    [0m | [36mtaskllm.optimizer.methods.bayesian[0m:[36mtrain[0m:[36m69[0m - [1mPhase 3: Final evaluation on test set...[0m
[32m2025-05-12 21:46:54.748[0m | [1mINFO    [0m | [36mtaskllm.optimizer.methods.bayesian[0m:[36mtrain[0m:[36m75[0m - [1mAdded Pyro dependencies to pyproject.toml (commented)[0m
[3


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m



[32m2025-05-12 21:48:43.863[0m | [34m[1mDEBUG   [0m | [36mtaskllm.optimizer.methods.base[0m:[36mget_scores[0m:[36m91[0m - [34m[1mCalculated 10 scores[0m
[32m2025-05-12 21:48:43.865[0m | [1mINFO    [0m | [36mtaskllm.optimizer.methods.base[0m:[36mcalculate_scores[0m:[36m116[0m - [1mModel groq/qwen-qwq-32b with prompt achieved score: 8.0000, Correct: 8, Incorrect: 2, Unlabelled: 0 out of 10[0m
[32m2025-05-12 21:48:43.866[0m | [34m[1mDEBUG   [0m | [36mtaskllm.optimizer.methods.base[0m:[36mget_scores[0m:[36m91[0m - [34m[1mCalculated 10 scores[0m
[32m2025-05-12 21:48:43.867[0m | [1mINFO    [0m | [36mtaskllm.optimizer.methods.base[0m:[36mcalculate_scores[0m:[36m116[0m - [1mModel groq/qwen-qwq-32b with prompt achieved score: 0.0000, Correct: 0, Incorrect: 10, Unlabelled: 0 out of 10[0m
[32m2025-05-12 21:48:43.867[0m | [34m[1mDEBUG   [0m | [36mtaskllm.optimizer.methods.base[0m:[36mget_scores[0m:[36m91[0m - [34m[1mCalculated 10 sco