In [1]:
%load_ext autoreload
%autoreload 2

# Text Classification Task

E-commerce text classification using data from kaggle

# Dataset

In [2]:
import pandas as pd

df_data = pd.read_pickle('./data/Ecommerce/ecommerce_classification_dataset.pkl')
possible_labels = list(df_data['label'].unique())
df_data

Unnamed: 0,label,text
0,Household,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,Household,Incredible Gifts India Wooden Happy Birthday U...
...,...,...
50420,Electronics,Strontium MicroSD Class 10 8GB Memory Card (Bl...
50421,Electronics,CrossBeats Wave Waterproof Bluetooth Wireless ...
50422,Electronics,Karbonn Titanium Wind W4 (White) Karbonn Titan...
50423,Electronics,"Samsung Guru FM Plus (SM-B110E/D, Black) Colou..."


## Train-test-split

In [3]:
from AutoLLM.utils.helpers import split_dataframe

df_train, df_test = split_dataframe(df_data, 10 / df_data.shape[0], 42, 'label')
display(df_test)
display(df_train)

Unnamed: 0,label,text
13603,Household,"Pigeon Popcorn Maker, 1200watts, Yellow No mov..."
44708,Electronics,boAt BassHeads 225 in-Ear Super Extra Bass Hea...
13340,Household,Prestige PIC 3.1 V3 2000-Watt Induction Cookto...
26583,Books,Think & Grow Rich - Lectures by Napoleon Hill ...
44130,Electronics,"DeckUp Meritus-S Wall TV Unit (Dark Wenge, Mat..."
18908,Household,"Black+Decker Hand Tool Kit (108-Piece), Orange..."
20173,Books,Alibaba: The House that Jack Ma Built Review “...
31485,Clothing & Accessories,Baby Boy/Girl Romper Newborn Jumpsuit Blue Hoo...
34088,Clothing & Accessories,S4S Men's 100% Cotton Premium Collection Handk...
11222,Household,Kraft Seeds Multipurpose Kitchen Household and...


Unnamed: 0,label,text
24728,Books,UGC Net Education About the Author An editoria...
29641,Books,Oswaal Karnataka SSLC Question Bank Class 10 S...
9408,Household,Chef Direct Stainless Steel Dome Lid with Knob...
6530,Household,"JDX Reliance Fiber Filler Cushion, 16X16 Inch,..."
5667,Household,Magideal 2x Silky Soft Satin Standard Pillow C...
...,...,...
46826,Electronics,Canon EOS 1500D Digital SLR Camera (Black) wit...
15524,Household,Philips GC504/35 1600-Watt Garment Steamer (Pi...
48524,Electronics,LONPOO Compact HD DVD Player All Region Free (...
49605,Electronics,Intex IT-PB12.5K 12500 mAH Power Bank (Black-R...


# Simple classification

## Classification agent

In [8]:
from pydantic import BaseModel
from typing import Literal
from AutoLLM.interfaces.api_client import APIClient
from config import API_KEY
from AutoLLM.prompts.classifier import classifier_template
from AutoLLM.modules.base_agent import BaseAgent
import json
from tqdm.notebook import tqdm
from sklearn.metrics import accuracy_score

url = "https://api.studio.nebius.ai/v1/"

classifier_client = APIClient(url=url, api_key=API_KEY, model="meta-llama/Llama-3.2-3B-Instruct")
classifier_gen_config = {
    "temperature": 1e-10,
    "top_p": 1e-10,
    # "top_k": 1,
}
classifier_client.load_generation_config(classifier_gen_config)

class classifier_schema(BaseModel):
    label: str

class ClassifierAgent(BaseAgent):
    def __init__(self, client, json_schema, gen_config):
        super().__init__(client, json_schema, gen_config)
        self.template = classifier_template
        self.guide = '{"label": '
        self.output_format = f"""label: (Literal) Return one of the choices from the following list: {possible_labels}."""
        self.system_message = "You are a helpful AI assistant."
        self.instructions = ""
    
    def _generate_prompt(self, **kwargs):
        if not self.instructions:
            if 'instructions' in kwargs:
                self.instructions = kwargs['instructions']
            else:
                raise ValueError("Instructions not set.")

        self.user_prompt = self.template.format(
            instructions=self.instructions,
            output_format=self.output_format,
            input=kwargs['input']
        )
        messages = [
            {"role": "system", "content": self.system_message},
            {"role": "user", "content": self.user_prompt},
            {"role": "assistant", "content": self.guide},
        ]
        return messages
    
    def _parse_response(self, response):
        return json.loads(response)['label']

    def run_samples(self, X):
        y_pred = []
        for input in tqdm(X):
            label = self.run(input=input)
            y_pred.append(label)
        return y_pred
    
    def evaluate_accuracy(self, X, y_true):
        y_pred = self.run_samples(X)
        return accuracy_score(y_true, y_pred)
    




ca = ClassifierAgent(classifier_client, classifier_schema, classifier_gen_config)
ca.instructions = f"Based on the given input item description, label the item as one of the following: {possible_labels}."
print(ca.run_samples(df_test['text'].to_list()))
print(ca.evaluate_accuracy(
    X=df_test['text'],
    y_true=df_test['label']
))

  0%|          | 0/10 [00:00<?, ?it/s]

['Household', 'Electronics', 'Electronics', 'Books', 'Household', 'Household', 'Household', 'Household', 'Clothing & Accessories', 'Household']


  0%|          | 0/10 [00:00<?, ?it/s]

0.6


# Mutation

In [9]:
population = []

initial_instruction = f"Based on the given input item description, label the item as one of the following: {possible_labels}."
ca.instructions = initial_instruction
acc = ca.evaluate_accuracy(
    X=df_test['text'],
    y_true=df_test['label']
)

population.append([acc, initial_instruction])
print(population)

  0%|          | 0/10 [00:00<?, ?it/s]

[[0.6, "Based on the given input item description, label the item as one of the following: ['Household', 'Books', 'Clothing & Accessories', 'Electronics']."]]


## Mutation agent

In [11]:
from AutoLLM.modules.prompt_mutator import PromptMutator, PromptMutatorSchema
from AutoLLM.modules.mutation_agent import MutationAgent

mutation_client = APIClient(url=url, api_key=API_KEY, model="Qwen/Qwen2.5-32B-Instruct")
mutation_gen_config = {
    "temperature": 0.7,
    "top_p": 0.9,
    # "top_k": 1,
}

ma = MutationAgent(mutation_client, mutation_gen_config)
task_description = "Label E-commerce products as their product types given their product description."
num_variations = 5
mutations = ma.run(
    task_description=task_description,
    num_variations=num_variations,
    seed_instruction=initial_instruction,
)


# pm = PromptMutator(mutation_client)
# resp = pm.mutate_prompt(
#     seed_prompt=initial_instruction,
#     task_description="Label E-commerce products as their product types given their product description.",
#     num_variations=5,
# )

for instruction in tqdm(mutations):
    ca.instructions = instruction
    acc = ca.evaluate_accuracy(X=df_test['text'], y_true=df_test['label'])
    population.append([acc, instruction])

print(population)

Instructions:
Given a task description and a seed task instruction used for an agent to complete a task, you are to generate diverse variations of the seed task instruction by applying various thinking styles.

[Task Description]: Label E-commerce products as their product types given their product description.
[Seed Task Instruction]: Based on the given input item description, label the item as one of the following: ['Household', 'Books', 'Clothing & Accessories', 'Electronics'].

Steps:
- Analyze the task description and the seed task instruction. Think about how the seed task instruction accomplishes the task described.
- Consider the following thinking styles and apply them to the seed task instruction. For each thinking style, thinking about how the seed task instruction may be improved by applying the thinking style.
- Generate 5 mutated instructions from the seed task instruction by applying each thinking styles.
- You may dynamically mix thinking styles to create more diverse v

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

[[0.6, "Based on the given input item description, label the item as one of the following: ['Household', 'Books', 'Clothing & Accessories', 'Electronics']."], [0.7, "Innovatively categorize the item described into one of these types: ['Household', 'Books', 'Clothing & Accessories', 'Electronics'], considering unique features that might redefine traditional categories."], [0.8, "Based on the cultural significance and social impact implied by the product description, assign the item to one of these categories: ['Household', 'Books', 'Clothing & Accessories', 'Electronics']."], [0.7, "Given the constraints of limited storage space in a warehouse, label the item described as one of the following: ['Household', 'Books', 'Clothing & Accessories', 'Electronics'], prioritizing space-efficient categorization."], [0.8, "Collaborate with team members to determine the most accurate category for the item described among: ['Household', 'Books', 'Clothing & Accessories', 'Electronics']. Discuss and l

In [12]:
for s, i in population:
    print(s, i)

0.6 Based on the given input item description, label the item as one of the following: ['Household', 'Books', 'Clothing & Accessories', 'Electronics'].
0.7 Innovatively categorize the item described into one of these types: ['Household', 'Books', 'Clothing & Accessories', 'Electronics'], considering unique features that might redefine traditional categories.
0.8 Based on the cultural significance and social impact implied by the product description, assign the item to one of these categories: ['Household', 'Books', 'Clothing & Accessories', 'Electronics'].
0.7 Given the constraints of limited storage space in a warehouse, label the item described as one of the following: ['Household', 'Books', 'Clothing & Accessories', 'Electronics'], prioritizing space-efficient categorization.
0.8 Collaborate with team members to determine the most accurate category for the item described among: ['Household', 'Books', 'Clothing & Accessories', 'Electronics']. Discuss and leverage diverse perspectives

# Critique and refine

## Critique

### Evaluate initial prompt and save wrong examples

In [15]:
ca.instructions = initial_instruction
X = df_test['text'].to_list()
y_true = df_test['label'].to_list()
y_pred = ca.run_samples(X)

wrong_examples = []
for i in range(len(y_pred)):
    if y_pred[i] != y_true[i]:
        wrong_examples.append((X[i], y_true[i], y_pred[i]))

print(wrong_examples)

  0%|          | 0/10 [00:00<?, ?it/s]

[('Prestige PIC 3.1 V3 2000-Watt Induction Cooktop with Touch Panel (Black) Style:PIC 3.1 V3 2000-Watt Induction Cooktop with Touch panel   The innovative Prestige Induction Cook-Top offers quick and consistent heating at highest efficiency. Add to this its anti-magnetic wall, soft touch buttons, durability, safety and elegance makes cooking a pleasing and enjoyable experience. The Prestige Induction Cook-Top comes with the unique whistle counter, power saver technology and keep warm function wherein the Induction Cook-Top dynamically monitors the temperature of the vessel and adjusts the power level based on the size (base diameter) of the vessel. The dual heat sensor provided in the Induction Cook-Top automatically senses the temperature of the vessel and prevents over heating thereby saving power. With latest technology and features, this cooktop will be the best addition to your kitchen!', 'Household', 'Electronics'), ('DeckUp Meritus-S Wall TV Unit (Dark Wenge, Matte Finish) Color

In [18]:
from AutoLLM.modules.critic_agent import CriticAgent

critique_client = APIClient(url=url, api_key=API_KEY, model="Qwen/Qwen2.5-32B-Instruct")
critique_gen_config = {
    "temperature": 0.7,
    "top_p": 0.9,
}

wrong_examples_text = []
for text, label_true, label_pred in wrong_examples:
    wrong_examples_text.append(f"""
[Text]:               {text}
[Correct Label]:      {label_true}
[Predicted Label]:    {label_pred}""")

critic_agent = CriticAgent(critique_client, critique_gen_config)
critique = critic_agent.run(
    seed_instruction=initial_instruction,
    task_description=task_description,
    wrong_examples='\n'.join(wrong_examples_text),
)

print(critique)


LLM thinking: The current instruction is somewhat vague and does not provide enough context or specific criteria for distinguishing between product types, especially for items that could fall into multiple categories based on their descriptions. The agent might be classifying products based on keywords that are not sufficiently indicative of the product type or are too broad, leading to incorrect categorization. For example, 'Cooktop' could be associated with 'Electronics' due to its technological features, while 'TV Unit' might be seen as 'Household' because it is furniture. Similarly, a book review might be misclassified as 'Household' due to the mention of 'product description' in the text, and a 'Romper' might be incorrectly labeled as 'Household' due to its use in a domestic setting.
["The instruction lacks specific criteria for distinguishing between 'Household' and 'Electronics' products. Keywords like 'induction', 'touch panel', and 'technology' might lead the agent to classify

# Refine

In [26]:
from AutoLLM.modules.refine_agent import RefineAgent
wrong_examples_with_critique = []
for i in range(len(wrong_examples_text)):
    wrong_examples_with_critique.append(wrong_examples_text[i] + f"\n[Critique]: {critique[i]}")

# print('\n'.join(wrong_examples_with_critique))

refine_client = APIClient(url=url, api_key=API_KEY, model="Qwen/Qwen2.5-32B-Instruct")
refine_gen_config = {
    "temperature": 0.7,
    "top_p": 0.9,
}
refine_agent = RefineAgent(refine_client, refine_gen_config)
refined_prompts = refine_agent.run(
    task_description=task_description,
    instruction=initial_instruction,
    examples=wrong_examples_with_critique,
    num_variations=5,
)

for rp in refined_prompts:
    print(rp)

[{'role': 'system', 'content': 'You are a helpful assistant.'}, {'role': 'user', 'content': 'Instruction:\nI\'m trying to write a zero-shot instruction that will help the most capable and suitable agent to solve the following task. You are to refine the current instruction based on the critique provided.\n\n[Task Description]: Label E-commerce products as their product types given their product description.\n\nMy current instruction is: Based on the given input item description, label the item as one of the following: [\'Household\', \'Books\', \'Clothing & Accessories\', \'Electronics\'].\n\n-------------------\nExamples:\nThe following are examples where the current instruction has gone wrong and corresponding critiques of the current instruction for each example.\n\n["\\n[Text]:               Prestige PIC 3.1 V3 2000-Watt Induction Cooktop with Touch Panel (Black) Style:PIC 3.1 V3 2000-Watt Induction Cooktop with Touch panel   The innovative Prestige Induction Cook-Top offers quick 

## Test refined prompts

In [27]:
for rp in refined_prompts:
    ca.instructions=rp
    acc = ca.evaluate_accuracy(
        X=df_test['text'],
        y_true=df_test['label']
    )
    print(acc)

  0%|          | 0/10 [00:00<?, ?it/s]

0.6


  0%|          | 0/10 [00:00<?, ?it/s]

0.8


  0%|          | 0/10 [00:00<?, ?it/s]

0.8


  0%|          | 0/10 [00:00<?, ?it/s]

0.8


  0%|          | 0/10 [00:00<?, ?it/s]

0.8


# Expert Generation

In [29]:
from AutoLLM.modules.expert_agent import ExpertAgent
expert_client = APIClient(url=url, api_key=API_KEY, model="Qwen/Qwen2.5-32B-Instruct")
expert_gen_config = {
    "temperature": 0.7,
    "top_p": 0.9,
}

expert_agent = ExpertAgent(client=expert_client, gen_config=expert_gen_config)
agent_description = expert_agent.run(instruction=initial_instruction)
print(agent_description)

LLM thinking: The task requires categorizing an item based on its description into predefined categories. This suggests that the most suitable agent would have expertise in product classification and organization, likely someone who works in retail or inventory management and is familiar with common product categories.
You are a retail product manager with extensive experience in organizing and categorizing items for efficient inventory management and customer navigation. Your role requires a keen understanding of various product types and their classification into specific categories such as Household, Books, Clothing & Accessories, and Electronics. You have a thorough knowledge of product attributes and how they align with different categories, making you highly capable of accurately labeling items based on their descriptions. Your experience in retail and inventory systems ensures that you can quickly identify key characteristics of an item and assign it to the correct category, the

### Test new expert

In [31]:
ca.system_message = agent_description
ca.instructions = initial_instruction

ca.evaluate_accuracy(X=df_test['text'], y_true=df_test['label'])

  0%|          | 0/10 [00:00<?, ?it/s]

0.8

In [32]:
ca.system_message = "You are a helpful AI assistant."
ca.evaluate_accuracy(X=df_test['text'], y_true=df_test['label'])

  0%|          | 0/10 [00:00<?, ?it/s]

0.7