In [1]:
%load_ext autoreload
%autoreload 2

# Text Classification

In [13]:
import logging

# Configure the logging settings
logging.basicConfig(
    # level=logging.INFO,  # Set the minimum log level to DEBUG
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',  # Define the log format
    handlers=[
        logging.FileHandler('app.log'),  # Log to a file
        logging.StreamHandler()  # Log to the console
    ]
)

# Create a logger
logger = logging.getLogger('my_app')

# Kaggle E-commerce Dataset

In [3]:
import pandas as pd
import logging

df_data = pd.read_pickle('./data/Ecommerce/ecommerce_classification_dataset.pkl')
possible_labels = list(df_data['label'].unique())
df_data

Unnamed: 0,label,text
0,Household,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,Household,Incredible Gifts India Wooden Happy Birthday U...
...,...,...
50420,Electronics,Strontium MicroSD Class 10 8GB Memory Card (Bl...
50421,Electronics,CrossBeats Wave Waterproof Bluetooth Wireless ...
50422,Electronics,Karbonn Titanium Wind W4 (White) Karbonn Titan...
50423,Electronics,"Samsung Guru FM Plus (SM-B110E/D, Black) Colou..."


## Train - test split

In [4]:
from AutoLLM.utils.helpers import split_dataframe

df_train, df_test = split_dataframe(df_data, 100 / df_data.shape[0], 42, 'label')
display(df_test)
display(df_train)

Unnamed: 0,label,text
32109,Clothing & Accessories,Rovars Unisex All-in-1 Suit Planning a trip to...
21746,Books,Wiley's Physical Chemistry For JEE (Main & Adv...
1214,Household,Ebee Store Shoe Rack with 5 Shelves (Maroon) G...
25733,Books,Strauss Bronx YB Skateboard Skateboard is a ty...
43918,Electronics,Cables Kart™ Digital to Analogue Optical Audio...
...,...,...
24703,Books,New Saraswati Health and Physical Education Cl...
13919,Household,FRESHWORLD Plastic Vacuum Sealer with Starter ...
24624,Books,Current Affairs Yearly 2019
49880,Electronics,IKALL K16 1.8-inch Mobile(White) Colour:White ...


Unnamed: 0,label,text
30368,Books,I Am Malala: The Girl Who Stood Up for Educati...
15227,Household,"IFB 5.5 kg Dryer (Turbo Dry EX, Silver) Color:..."
42493,Electronics,Skullcandy JIB S2DUDZ-003 In-Ear Headphone (Bl...
30360,Books,I Am Malala: The Girl Who Stood Up for Educati...
43558,Electronics,"DELL Inspiron 3464, 2017 24 ""All-In-One i5 7th..."
...,...,...
44072,Electronics,ULTIMA Invisible Cable Aluminium Universal Adj...
8258,Household,"Aventure Shoe Rack Free Standing,Cube Organize..."
46864,Electronics,Fujifilm X Series GFX 50S 51.4MP Mirrorless Me...
49032,Electronics,"Ceuta Retails, Car Cradle for Cell Phones - Un..."


# Classifer

## Build classifier agent

In [5]:
from pydantic import BaseModel
from typing import Literal
from AutoLLM.interfaces.api_client import APIClient
from config import API_KEY
from AutoLLM.prompts.classifier import classifier_template
from AutoLLM.modules.base_agent import BaseAgent
import json
from tqdm.notebook import tqdm
from sklearn.metrics import accuracy_score

url = "https://api.studio.nebius.ai/v1/"

classifier_client = APIClient(url=url, api_key=API_KEY, model="meta-llama/Llama-3.2-3B-Instruct")
classifier_gen_config = {
    "temperature": 1e-10,
    "top_p": 1e-10,
    # "top_k": 1,
}
classifier_client.load_generation_config(classifier_gen_config)

class classifier_schema(BaseModel):
    label: str

class ClassifierAgent(BaseAgent):
    def __init__(self, client, json_schema, gen_config):
        super().__init__(client, json_schema, gen_config)
        self.template = classifier_template
        self.guide = '{"label": '
        self.output_format = f"""label: (Literal) Return one of the choices from the following list: {possible_labels}."""
        self.system_message = "You are a helpful AI assistant."
        self.instructions = ""
        self.X = None
        self.y_true = None
    
    def _generate_prompt(self, **kwargs):
        if not self.instructions:
            if 'instructions' in kwargs:
                self.instructions = kwargs['instructions']
            else:
                raise ValueError("Instructions not set.")

        self.user_prompt = self.template.format(
            instructions=self.instructions,
            output_format=self.output_format,
            input=kwargs['input']
        )
        messages = [
            {"role": "system", "content": self.system_message},
            {"role": "user", "content": self.user_prompt},
            {"role": "assistant", "content": self.guide},
        ]
        return messages
    
    def _parse_response(self, response):
        return json.loads(response)['label']

    def run_samples(self, X):
        y_pred = []
        for input in tqdm(X):
            label = self.run(input=input)
            y_pred.append(label)
        return y_pred
    
    def evaluate_accuracy(self, X=None, y_true=None):
        if X is None or y_true is None:
            X = self.X
            y_true = self.y_true
        y_pred = self.run_samples(X)
        return accuracy_score(y_true, y_pred)
    
    def load_data(self, X, y_true):
        self.X = X
        self.y_true = y_true

## Test classifier agent

In [None]:
initial_instruction = f"Based on the given input item description, label the item as one of the following: {possible_labels}."

ca = ClassifierAgent(classifier_client, classifier_sc
                     hema, classifier_gen_config)
ca.instructions = initial_instruction
print(ca.run_samples(df_test['text'].to_list()))
print(ca.evaluate_accuracy(
    X=df_test['text'],
    y_true=df_test['label']
))

  0%|          | 0/100 [00:00<?, ?it/s]

2025-02-06 15:04:33,604 - httpx - INFO - HTTP Request: POST https://api.studio.nebius.ai/v1/chat/completions "HTTP/1.1 200 OK"
2025-02-06 15:04:33,995 - httpx - INFO - HTTP Request: POST https://api.studio.nebius.ai/v1/chat/completions "HTTP/1.1 200 OK"
2025-02-06 15:04:34,377 - httpx - INFO - HTTP Request: POST https://api.studio.nebius.ai/v1/chat/completions "HTTP/1.1 200 OK"
2025-02-06 15:04:34,762 - httpx - INFO - HTTP Request: POST https://api.studio.nebius.ai/v1/chat/completions "HTTP/1.1 200 OK"
2025-02-06 15:04:35,207 - httpx - INFO - HTTP Request: POST https://api.studio.nebius.ai/v1/chat/completions "HTTP/1.1 200 OK"
2025-02-06 15:04:35,664 - httpx - INFO - HTTP Request: POST https://api.studio.nebius.ai/v1/chat/completions "HTTP/1.1 200 OK"
2025-02-06 15:04:36,055 - httpx - INFO - HTTP Request: POST https://api.studio.nebius.ai/v1/chat/completions "HTTP/1.1 200 OK"
2025-02-06 15:04:36,445 - httpx - INFO - HTTP Request: POST https://api.studio.nebius.ai/v1/chat/completions "H

['Clothing & Accessories', 'Books', 'Household', 'Electronics', 'Electronics', 'Household', 'Household', 'Books', 'Books', 'Clothing & Accessories', 'Household', 'Clothing & Accessories', 'Household', 'Household', 'Clothing & Accessories', 'Household', 'Household', 'Clothing & Accessories', 'Clothing & Accessories', 'Electronics', 'Books', 'Household', 'Household', 'Electronics', 'Household', 'Household', 'Household', 'Household', 'Books', 'Books', 'Books', 'Household', 'Household', 'Books', 'Electronics', 'Books', 'Household', 'Household', 'Books', 'Books', 'Household', 'Household', 'Electronics', 'Household', 'Household', 'Household', 'Household', 'Household', 'Electronics', 'Books', 'Household', 'Electronics', 'Electronics', 'Electronics', 'Electronics', 'Books', 'Books', 'Clothing & Accessories', 'Books', 'Household', 'Electronics', 'Books', 'Clothing & Accessories', 'Clothing & Accessories', 'Household', 'Electronics', 'Household', 'Household', 'Household', 'Clothing & Accessories

  0%|          | 0/100 [00:00<?, ?it/s]

2025-02-06 15:05:18,565 - httpx - INFO - HTTP Request: POST https://api.studio.nebius.ai/v1/chat/completions "HTTP/1.1 200 OK"
2025-02-06 15:05:19,079 - httpx - INFO - HTTP Request: POST https://api.studio.nebius.ai/v1/chat/completions "HTTP/1.1 200 OK"
2025-02-06 15:05:19,469 - httpx - INFO - HTTP Request: POST https://api.studio.nebius.ai/v1/chat/completions "HTTP/1.1 200 OK"
2025-02-06 15:05:19,846 - httpx - INFO - HTTP Request: POST https://api.studio.nebius.ai/v1/chat/completions "HTTP/1.1 200 OK"
2025-02-06 15:05:20,244 - httpx - INFO - HTTP Request: POST https://api.studio.nebius.ai/v1/chat/completions "HTTP/1.1 200 OK"
2025-02-06 15:05:20,629 - httpx - INFO - HTTP Request: POST https://api.studio.nebius.ai/v1/chat/completions "HTTP/1.1 200 OK"
2025-02-06 15:05:20,993 - httpx - INFO - HTTP Request: POST https://api.studio.nebius.ai/v1/chat/completions "HTTP/1.1 200 OK"
2025-02-06 15:05:22,355 - httpx - INFO - HTTP Request: POST https://api.studio.nebius.ai/v1/chat/completions "H

0.79


# Optimize prompt

In [7]:
from AutoLLM.modules.mutation_agent import MutationAgent
from AutoLLM.modules.critic_agent import CriticAgent
from AutoLLM.modules.refine_agent import RefineAgent
from AutoLLM.modules.expert_agent import ExpertAgent

meta_client = APIClient(url=url, api_key=API_KEY, model="Qwen/Qwen2.5-32B-Instruct")
meta_generation_config = {
    "temperature": 0.7,
    "top_p": 0.9,
}


class OptimusPrompt:
    def __init__(
        self,
        task_description,
        initial_instruction,
        meta_client,
        meta_generation_config,
        num_mutation_variations=10,
        num_refine_variations=5,
    ):
        self.task_description = task_description
        self.initial_instruction = initial_instruction
        self.instruction_population = []
        
        self.num_mutation_variations = num_mutation_variations
        self.num_refine_variations = num_refine_variations
        self.instruction_cache = [self.initial_instruction]
        self.meta_client = meta_client
        self.meta_generation_config = meta_generation_config
        
    
    def mutate(self, seed_instruction):
        self.mutation_agent = MutationAgent(self.meta_client, self.meta_generation_config)
        mutations = self.mutation_agent.run(
            task_description=self.task_description,
            num_variations=self.num_mutation_variations,
            seed_instruction=seed_instruction,
        )
        return mutations
    
    def score_cache(self, eval_func):
        for instruction in self.instruction_cache:
            score = eval_func(instruction)
            self.instruction_population.append((instruction, score))
        self.instruction_cache = []
    
    def select_top_k_instructions(self, k):
        self.instruction_population.sort(key=lambda x: x[1], reverse=True)
        return [instruction for instruction, _ in self.instruction_population[:k]]
    
    def build_wrong_example_text(self, wrong_examples):
        wrong_examples_text = []
        for text, label_true, label_pred in wrong_examples:
            wrong_examples_text.append(f"""
        [Text]:               {text}
        [Correct Label]:      {label_true}
        [Predicted Label]:    {label_pred}""")
        return "\n".join(wrong_examples_text)
    
    def critique(self, seed_instruction, wrong_example_text):
        self.critic_agent = CriticAgent(self.meta_client, self.meta_generation_config)
        critique = self.critic_agent.run(
            task_description=self.task_description,
            seed_instruction=seed_instruction,
            wrong_examples=wrong_example_text,
        )
        return critique
    
    def build_wrong_example_with_critique_text(self, wrong_examples, critique):
        wrong_examples_with_critique = []
        for i in range(len(wrong_examples)):
            
            wrong_examples_with_critique.append(f"""
                [Text]:               {wrong_examples[i][0]}
                [Correct Label]:      {wrong_examples[i][1]}
                [Predicted Label]:    {wrong_examples[i][2]}
                [Critique]:           {critique[i]}""")
        return "\n".join(wrong_examples_with_critique)
            

    def refine(self, seed_instruction, wrong_examples_with_critique):
        self.refine_agent = RefineAgent(self.meta_client, self.meta_generation_config)
        refined_instructions = self.refine_agent.run(
            task_description=self.task_description,
            instruction=seed_instruction,
            examples=wrong_examples_with_critique,
            num_variations=self.num_refine_variations,
        )
        return refined_instructions
        
    def get_expert(self, seed_instruction):
        self.expert_agent = ExpertAgent(self.meta_client, self.meta_generation_config)
        expert = self.expert_agent.run(
            instruction=seed_instruction,
        )
        return expert


# Optimization loop

In [8]:
task_description = "Label E-commerce products as their product types given their product description."
optimus_prime = OptimusPrompt(
    task_description=task_description,
    initial_instruction=initial_instruction,
    meta_client=meta_client,
    meta_generation_config=meta_generation_config,
    num_mutation_variations=10,
    num_refine_variations=5
)

def eval_func(instruction):
    ca.instructions = instruction
    return ca.evaluate_accuracy(X=df_test['text'], y_true=df_test['label'])

X = df_test['text'].to_list()
y_true = df_test['label'].to_list()

# score cached instructions
optimus_prime.score_cache(eval_func=eval_func)

# select best instructions
best_instruction = optimus_prime.select_top_k_instructions(k=1)[0]

num_rounds = 1
for i in tqdm(range(num_rounds)):
    logger.info(f"Round {i+1}")
    
    # mutate best instruction
    mutations = optimus_prime.mutate(seed_instruction=best_instruction)
    optimus_prime.instruction_cache += mutations
    logger.info(f"Mutations completed for {len(mutations)} instructions")

    # critique best instruction
    y_pred = ca.run_samples(X)
    wrong_examples = []
    for i in range(len(y_pred)):
        if y_pred[i] != y_true[i]:
            wrong_examples.append((X[i], y_true[i], y_pred[i]))
    logger.info(f"Found {len(wrong_examples)} wrong examples")

    wrong_example_text = optimus_prime.build_wrong_example_text(wrong_examples)
    critique = optimus_prime.critique(wrong_example_text=wrong_example_text, seed_instruction=best_instruction)
    wrong_example_with_critique = optimus_prime.build_wrong_example_with_critique_text(wrong_examples, critique)
    logger.info('Critique completed')

    # refine best instruction
    refined_instructions = optimus_prime.refine(seed_instruction=best_instruction, wrong_examples_with_critique=wrong_example_with_critique)
    logger.info('Refine completed')

    # add refined instructions to instruction cache
    optimus_prime.instruction_cache += refined_instructions

    # score cached instructions
    logger.info('Scoring cached instructions')
    optimus_prime.score_cache(eval_func=eval_func)

    # select best instructions
    best_instruction = optimus_prime.select_top_k_instructions(k=1)[0]

system_message = optimus_prime.get_expert(seed_instruction=best_instruction)

print(system_message)
print(best_instruction)




  0%|          | 0/100 [00:00<?, ?it/s]

2025-02-06 14:56:39,095 - httpx - INFO - HTTP Request: POST https://api.studio.nebius.ai/v1/chat/completions "HTTP/1.1 200 OK"
2025-02-06 14:56:39,468 - httpx - INFO - HTTP Request: POST https://api.studio.nebius.ai/v1/chat/completions "HTTP/1.1 200 OK"
2025-02-06 14:56:39,847 - httpx - INFO - HTTP Request: POST https://api.studio.nebius.ai/v1/chat/completions "HTTP/1.1 200 OK"
2025-02-06 14:56:40,721 - httpx - INFO - HTTP Request: POST https://api.studio.nebius.ai/v1/chat/completions "HTTP/1.1 200 OK"
2025-02-06 14:56:41,105 - httpx - INFO - HTTP Request: POST https://api.studio.nebius.ai/v1/chat/completions "HTTP/1.1 200 OK"
2025-02-06 14:56:41,469 - httpx - INFO - HTTP Request: POST https://api.studio.nebius.ai/v1/chat/completions "HTTP/1.1 200 OK"
2025-02-06 14:56:41,850 - httpx - INFO - HTTP Request: POST https://api.studio.nebius.ai/v1/chat/completions "HTTP/1.1 200 OK"
2025-02-06 14:56:42,218 - httpx - INFO - HTTP Request: POST https://api.studio.nebius.ai/v1/chat/completions "H

  0%|          | 0/1 [00:00<?, ?it/s]

2025-02-06 14:57:28,358 - my_app - INFO - Round 1


Instructions:
Given a task description and a seed task instruction used for an agent to complete a task, you are to generate diverse variations of the seed task instruction by applying various thinking styles.

[Task Description]: Label E-commerce products as their product types given their product description.
[Seed Task Instruction]: Based on the given input item description, label the item as one of the following: ['Household', 'Books', 'Clothing & Accessories', 'Electronics'].

Steps:
- Analyze the task description and the seed task instruction. Think about how the seed task instruction accomplishes the task described.
- Consider the following thinking styles and apply them to the seed task instruction. For each thinking style, thinking about how the seed task instruction may be improved by applying the thinking style.
- Generate 10 mutated instructions from the seed task instruction by applying each thinking styles.
- You may dynamically mix thinking styles to create more diverse 

2025-02-06 14:57:39,918 - httpx - INFO - HTTP Request: POST https://api.studio.nebius.ai/v1/chat/completions "HTTP/1.1 200 OK"
2025-02-06 14:57:39,921 - my_app - INFO - Mutations completed for 10 instructions


  0%|          | 0/100 [00:00<?, ?it/s]

2025-02-06 14:57:41,031 - httpx - INFO - HTTP Request: POST https://api.studio.nebius.ai/v1/chat/completions "HTTP/1.1 200 OK"
2025-02-06 14:57:41,387 - httpx - INFO - HTTP Request: POST https://api.studio.nebius.ai/v1/chat/completions "HTTP/1.1 200 OK"
2025-02-06 14:57:41,864 - httpx - INFO - HTTP Request: POST https://api.studio.nebius.ai/v1/chat/completions "HTTP/1.1 200 OK"
2025-02-06 14:57:42,234 - httpx - INFO - HTTP Request: POST https://api.studio.nebius.ai/v1/chat/completions "HTTP/1.1 200 OK"
2025-02-06 14:57:42,615 - httpx - INFO - HTTP Request: POST https://api.studio.nebius.ai/v1/chat/completions "HTTP/1.1 200 OK"
2025-02-06 14:57:42,984 - httpx - INFO - HTTP Request: POST https://api.studio.nebius.ai/v1/chat/completions "HTTP/1.1 200 OK"
2025-02-06 14:57:43,398 - httpx - INFO - HTTP Request: POST https://api.studio.nebius.ai/v1/chat/completions "HTTP/1.1 200 OK"
2025-02-06 14:57:43,912 - httpx - INFO - HTTP Request: POST https://api.studio.nebius.ai/v1/chat/completions "H

LLM thinking: The current instruction does not provide enough context or guidance for the agent to correctly identify the product type based on the description. The agent seems to struggle with distinguishing between products that could be classified into multiple categories or those that are not clearly defined within the given options. The instruction should be refined to include specific keywords or phrases that are indicative of each product type, and possibly a hierarchy or set of rules to follow when the description hints at multiple categories.


IndexError: list index out of range

In [12]:
critique

["The instruction fails to consider that skateboards are part of sports equipment and not electronics or household items. The agent might have been misled by the mention of 'carbon ball-bearings'. The instruction should include sports equipment as a separate category or at least mention skateboards under the 'Household' or 'Electronics' categories to avoid confusion.",
 "The agent mislabels the cleaning air blower brush as 'Household' instead of 'Electronics'. The description does not clearly differentiate between these two categories. The instruction should be improved to include more specific keywords that are typically associated with electronics (e.g., 'battery-powered', 'electrical components', 'digital', etc.) and household items (e.g., 'cleaning supplies', 'kitchenware', 'furniture', etc.).",
 "The agent incorrectly labels a piece of jewelry as 'Household'. The description of the pendant includes words that might be associated with household items if the context is not clear, su

In [11]:
len(wrong_examples)

21

In [None]:
print("Final Evaluation")
ca.instructions = best_instruction
ca.system_message = system_message
print(f"Accuracy = {ca.evaluate_accuracy(X=X, y_true=y_true)}")

Final Evaluation


  0%|          | 0/10 [00:00<?, ?it/s]

2025-02-06 14:51:23,122 - openai._base_client - DEBUG - Request options: {'method': 'post', 'url': '/chat/completions', 'files': None, 'json_data': {'messages': [{'role': 'system', 'content': "You are an information scientist with a specialization in data analysis and categorization, particularly within the domains of retail and library science. Your expertise lies in parsing and analyzing text for keywords and accurately categorizing items based on their descriptions. You have extensive experience in identifying and organizing items into predefined categories such as 'Household', 'Books', 'Clothing & Accessories', and 'Electronics'. Your skills are highly valuable in ensuring that items are correctly labeled, making it easier for users to find what they are looking for. You are adept at using sophisticated analytical tools and your deep understanding of each category allows you to make informed decisions about how to best classify items."}, {'role': 'user', 'content': 'Instructions:\n

Accuracy = 0.9


In [15]:
optimus_prime.instruction_population

[("List of ideas: Consider the following steps: analyze the text for keywords, compare the description with known items in each category, and then label the item as one of the following: ['Household', 'Books', 'Clothing & Accessories', 'Electronics'].",
  0.9),
 ("Label the item based on the product description: ['Household', 'Books', 'Clothing & Accessories', 'Electronics']. Items designed for babies and children, especially those with descriptors like 'Romper' and 'Jumpsuit', should be labeled as 'Clothing & Accessories'.",
  0.9),
 ("Determine the category of the item from the description: ['Household', 'Books', 'Clothing & Accessories', 'Electronics']. Items that hold or support electronic devices like TVs should be labeled under 'Electronics'.",
  0.9),
 ("Step-by-step approach: First, read the item description carefully. Then, identify key words or phrases that suggest the product type. Finally, label the item as one of the following: ['Household', 'Books', 'Clothing & Accessorie