In [1]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell executes so that edits to imported source files (e.g. the
# AutoLLM helpers used below) take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Text Classification Task

E-commerce text classification using data from Kaggle.

# Dataset

In [2]:
import pandas as pd

# Load the pre-built e-commerce dataset from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code - only load this file if
# it comes from a trusted source.
df_data = pd.read_pickle('./data/Ecommerce/ecommerce_classification_dataset.pkl')
# Distinct values of the `label` column; reused later when building prompts.
possible_labels = list(df_data['label'].unique())
# Bare last expression so the notebook renders the frame.
df_data

Unnamed: 0,label,text
0,Household,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,Household,Incredible Gifts India Wooden Happy Birthday U...
...,...,...
50420,Electronics,Strontium MicroSD Class 10 8GB Memory Card (Bl...
50421,Electronics,CrossBeats Wave Waterproof Bluetooth Wireless ...
50422,Electronics,Karbonn Titanium Wind W4 (White) Karbonn Titan...
50423,Electronics,"Samsung Guru FM Plus (SM-B110E/D, Black) Colou..."


## Train-test-split

In [3]:
from AutoLLM.utils.helpers import split_dataframe

# Express "10 rows" as a fraction of the full dataset for the split helper.
# NOTE(review): 42 is presumably the random seed and 'label' the column used
# to stratify - confirm against `split_dataframe`.
held_out_fraction = 10 / df_data.shape[0]
df_train, df_test = split_dataframe(df_data, held_out_fraction, 42, 'label')

# Show the small held-out frame first, then the remaining training frame.
display(df_test, df_train)

Unnamed: 0,label,text
13603,Household,"Pigeon Popcorn Maker, 1200watts, Yellow No mov..."
44708,Electronics,boAt BassHeads 225 in-Ear Super Extra Bass Hea...
13340,Household,Prestige PIC 3.1 V3 2000-Watt Induction Cookto...
26583,Books,Think & Grow Rich - Lectures by Napoleon Hill ...
44130,Electronics,"DeckUp Meritus-S Wall TV Unit (Dark Wenge, Mat..."
18908,Household,"Black+Decker Hand Tool Kit (108-Piece), Orange..."
20173,Books,Alibaba: The House that Jack Ma Built Review “...
31485,Clothing & Accessories,Baby Boy/Girl Romper Newborn Jumpsuit Blue Hoo...
34088,Clothing & Accessories,S4S Men's 100% Cotton Premium Collection Handk...
11222,Household,Kraft Seeds Multipurpose Kitchen Household and...


Unnamed: 0,label,text
24728,Books,UGC Net Education About the Author An editoria...
29641,Books,Oswaal Karnataka SSLC Question Bank Class 10 S...
9408,Household,Chef Direct Stainless Steel Dome Lid with Knob...
6530,Household,"JDX Reliance Fiber Filler Cushion, 16X16 Inch,..."
5667,Household,Magideal 2x Silky Soft Satin Standard Pillow C...
...,...,...
46826,Electronics,Canon EOS 1500D Digital SLR Camera (Black) wit...
15524,Household,Philips GC504/35 1600-Watt Garment Steamer (Pi...
48524,Electronics,LONPOO Compact HD DVD Player All Region Free (...
49605,Electronics,Intex IT-PB12.5K 12500 mAH Power Bank (Black-R...


# Simple classification

## Classification agent

In [4]:
from pydantic import BaseModel
from typing import Literal
from AutoLLM.interfaces.api_client import APIClient
from config import API_KEY
from AutoLLM.prompts.classifier import classifier_template
import json
from tqdm.notebook import tqdm
from sklearn.metrics import accuracy_score

# Base URL of the inference API used by the clients in this notebook.
url = "https://api.studio.nebius.ai/v1/"

# Sampling settings for the classifier: temperature and top_p are both set to
# a value just above zero.
# NOTE(review): presumably chosen to make predictions as repeatable as
# possible - confirm how APIClient forwards these values.
classifier_gen_config = dict(temperature=1e-10, top_p=1e-10)

classifier_client = APIClient(
    url=url,
    api_key=API_KEY,
    model="meta-llama/Llama-3.2-3B-Instruct",
)
classifier_client.load_generation_config(classifier_gen_config)

# Response schema for the classifier: an object with a single string field
# named `label`. Documented with comments rather than a docstring on purpose,
# since pydantic copies a model's docstring into the generated JSON schema.
class classifier_schema(BaseModel):
    # Disabled stricter variant, kept for reference. With it switched off, any
    # string passes validation here.
    # label: Literal[possible_labels]
    label: str

class ClassifierAgent:
    """Prompt-driven single-label classifier built on a chat-completion client.

    The agent fills `classifier_template` with three pieces of text (the task
    instructions, a description of the expected output format and the item to
    classify), sends the result to `client` and reads the `label` field out of
    the JSON reply. `instructions` starts empty and must be assigned by the
    caller before any sample is run.
    """

    def __init__(self, client, json_schema, labels=None):
        """Bind the client and prepare the static parts of the prompt.

        Args:
            client: Object exposing `load_json_schema(schema)` and
                `chat_completion(messages)`; the latter must return a JSON
                string containing a `label` key.
            json_schema: Schema passed straight to `client.load_json_schema`.
            labels: Optional iterable of allowed label names to list in the
                output-format hint. When omitted, the notebook-level
                `possible_labels` is used, so existing two-argument calls
                behave exactly as before.
        """
        self.client = client
        self.json_schema = json_schema
        self.client.load_json_schema(self.json_schema)
        self.template = classifier_template
        # Only fall back to the global when the caller did not supply labels,
        # so the agent can be reused for other label sets.
        self.labels = list(possible_labels) if labels is None else list(labels)
        # Sent as a trailing assistant message in `run_sample`.
        # NOTE(review): presumably acts as a response prefix - confirm how
        # APIClient treats a final assistant message.
        self.guide = '{"label": '
        self.output_format = f"""label: (Literal) Return one of the choices from the following list: {self.labels}."""
        self.instructions = ""
        self.input = ""
        self.system_message = "You are a helpful AI assistant."

    def _build_prompt(self):
        """Render the user prompt from the template into `self.user_prompt`.

        Raises:
            ValueError: If `self.instructions` or `self.input` is empty.
        """
        if not self.instructions or not self.input:
            raise ValueError("Instructions or input not found.")
        self.user_prompt = self.template.format(
            instructions=self.instructions,
            output_format=self.output_format,
            input=self.input
        )

    def run_sample(self, input):
        """Classify one text and return the predicted label.

        Args:
            input: The text to classify. It is stored on `self.input`.

        Returns:
            The value of the `label` key in the client's JSON reply.

        Raises:
            ValueError: If the instructions or the input text are empty.
            json.JSONDecodeError: If the client's reply is not valid JSON.
            KeyError: If the decoded reply has no `label` key.
        """
        self.input = input
        self._build_prompt()
        messages = [
            {"role": "system", "content": self.system_message},
            {"role": "user", "content": self.user_prompt},
            {"role": "assistant", "content": self.guide},
        ]
        reply = json.loads(self.client.chat_completion(messages))
        return reply['label']

    def run_samples(self, X):
        """Classify every text in `X` in order, showing a progress bar.

        Args:
            X: Iterable of texts.

        Returns:
            A list with one predicted label per element of `X`.
        """
        # `text` rather than `input` so the builtin is not shadowed in the loop.
        return [self.run_sample(text) for text in tqdm(X)]

    def evaluate_accuracy(self, X, y_true):
        """Predict labels for `X` and score them against `y_true`.

        Args:
            X: Iterable of texts.
            y_true: Ground-truth labels aligned with `X`.

        Returns:
            The result of `accuracy_score(y_true, y_pred)`.
        """
        y_pred = self.run_samples(X)
        return accuracy_score(y_true, y_pred)



ca = ClassifierAgent(classifier_client, classifier_schema)
ca.instructions = f"Based on the given input item description, label the item as one of the following: {possible_labels}."

# Run the model over the held-out texts once and reuse the predictions for both
# the printout and the accuracy. Calling `evaluate_accuracy` afterwards would
# send every sample through the API a second time.
test_predictions = ca.run_samples(df_test['text'].to_list())
print(test_predictions)
print(accuracy_score(df_test['label'], test_predictions))

  0%|          | 0/10 [00:00<?, ?it/s]

['Household', 'Electronics', 'Electronics', 'Books', 'Household', 'Household', 'Household', 'Household', 'Clothing & Accessories', 'Household']


  0%|          | 0/10 [00:00<?, ?it/s]

0.6


# Mutation

In [5]:
# `population` collects [accuracy, instruction] pairs; start it empty and seed
# it with the score of the hand-written baseline instruction.
population = []

initial_instruction = f"Based on the given input item description, label the item as one of the following: {possible_labels}."
ca.instructions = initial_instruction

acc = ca.evaluate_accuracy(X=df_test['text'], y_true=df_test['label'])
population += [[acc, initial_instruction]]
print(population)

  0%|          | 0/10 [00:00<?, ?it/s]

[[0.6, "Based on the given input item description, label the item as one of the following: ['Household', 'Books', 'Clothing & Accessories', 'Electronics']."]]


## Mutation agent

In [6]:
from AutoLLM.modules.prompt_mutator import PromptMutator, PromptMutatorSchema

# Client used for rewriting instructions; a different model from the
# classifier, sampled with temperature 0.7 / top_p 0.9.
mutation_gen_config = dict(temperature=0.7, top_p=0.9)
mutation_client = APIClient(
    url=url,
    api_key=API_KEY,
    model="Qwen/Qwen2.5-32B-Instruct",
)
mutation_client.load_generation_config(mutation_gen_config)

# Ask the mutator for five variations of the seed instruction.
pm = PromptMutator(mutation_client)
resp = pm.mutate_prompt(
    seed_prompt=initial_instruction,
    task_description="Label E-commerce products as their product types given their product description.",
    num_variations=5,
)

# Score every returned instruction on the held-out set and record it.
# Note: this appends to the existing `population`, so re-running the cell adds
# further entries rather than replacing the previous ones.
test_texts, test_labels = df_test['text'], df_test['label']
for instruction in tqdm(resp):
    ca.instructions = instruction
    acc = ca.evaluate_accuracy(X=test_texts, y_true=test_labels)
    population.append([acc, instruction])

print(population)

Instructions:
Given a task description and a seed task instruction used for an agent to complete a task, you are to generate diverse variations of the seed task instruction by applying various thinking styles.

[Task Description]: Label E-commerce products as their product types given their product description.
[Seed Task Instruction]: Based on the given input item description, label the item as one of the following: ['Household', 'Books', 'Clothing & Accessories', 'Electronics'].

Steps:
- Analyze the task description and the seed task instruction. Think about how the seed task instruction accomplishes the task described.
- Consider the following thinking styles and apply them to the seed task instruction. For each thinking style, thinking about how the seed task instruction may be improved by applying the thinking style.
- Generate 5 mutated instructions from the seed task instruction by applying each thinking styles.
- You may dynamically mix thinking styles to create more diverse v

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

[[0.6, "Based on the given input item description, label the item as one of the following: ['Household', 'Books', 'Clothing & Accessories', 'Electronics']."], [0.8, "Design an experiment to categorize the input item descriptions into product types: 'Household', 'Books', 'Clothing & Accessories', 'Electronics'. Use a set of predefined criteria and test different approaches to see which yields the highest accuracy."], [0.9, "List and implement various methods to classify the item descriptions into one of these categories: 'Household', 'Books', 'Clothing & Accessories', 'Electronics'. Evaluate each method's effectiveness and choose the most reliable one."], [0.7, "Identify and analyze the key factors in the product descriptions that contribute to mislabeling items into categories such as 'Household', 'Books', 'Clothing & Accessories', and 'Electronics'. Adjust the instruction to account for these factors."], [0.5, "Question the assumption that the provided categories ('Household', 'Books'

In [7]:
# One line per candidate: accuracy followed by the instruction text.
for score, candidate in population:
    print(score, candidate)

0.6 Based on the given input item description, label the item as one of the following: ['Household', 'Books', 'Clothing & Accessories', 'Electronics'].
0.8 Design an experiment to categorize the input item descriptions into product types: 'Household', 'Books', 'Clothing & Accessories', 'Electronics'. Use a set of predefined criteria and test different approaches to see which yields the highest accuracy.
0.9 List and implement various methods to classify the item descriptions into one of these categories: 'Household', 'Books', 'Clothing & Accessories', 'Electronics'. Evaluate each method's effectiveness and choose the most reliable one.
0.7 Identify and analyze the key factors in the product descriptions that contribute to mislabeling items into categories such as 'Household', 'Books', 'Clothing & Accessories', and 'Electronics'. Adjust the instruction to account for these factors.
0.5 Question the assumption that the provided categories ('Household', 'Books', 'Clothing & Accessories', 

# Critique and refine

## Critique

In [8]:
# Client used to critique instructions; sampled with temperature 0.7 / top_p 0.9.
critique_gen_config = dict(temperature=0.7, top_p=0.9)
critique_client = APIClient(
    url=url,
    api_key=API_KEY,
    model="Qwen/Qwen2.5-32B-Instruct",
)
critique_client.load_generation_config(critique_gen_config)





### Evaluate initial prompt and save wrong examples

In [9]:
# Re-run the seed instruction, this time keeping the raw predictions so the
# misclassified items can be inspected.
ca.instructions = initial_instruction
X, y_true = df_test['text'].to_list(), df_test['label'].to_list()
y_pred = ca.run_samples(X)

# Each entry is (predicted label, true label, text) for one misclassified item.
wrong_examples = [
    (predicted, expected, text)
    for predicted, expected, text in zip(y_pred, y_true, X)
    if predicted != expected
]

  0%|          | 0/10 [00:00<?, ?it/s]

In [13]:
from AutoLLM.modules.prompt_critique import PromptCritic

# Render every misclassified item as a small text record and concatenate the
# records into a single string for the critic.
wrong_examples_text = "".join(
    f"""
[Text]:               {text}
[Correct Label]:      {label_true}
[Predicted Label]:    {label_pred}
"""
    for label_pred, label_true, text in wrong_examples
)

# Ask the critic to review the seed instruction in light of those mistakes;
# it returns two values, printed one after the other.
pc = PromptCritic(critique_client)
critique, feedback = pc.critique_prompt(
    task_description="Label E-commerce products as their product types given their product description.",
    seed_instruction=initial_instruction,
    wrong_examples=wrong_examples_text,
)

print(critique)
print(feedback)

LLM thinking: Examining each wrong example and the agent's output, it appears that the current instruction lacks specificity in guiding the agent to accurately differentiate between product types based on the provided descriptions. The agent seems to be making assumptions based on keywords or general themes without a clear understanding of the context or specific product characteristics that would help in making a precise classification.
["The agent predicted 'Electronics' for the Prestige PIC 3.1 V3 2000-Watt Induction Cooktop due to the presence of technical terms such as 'induction', 'touch panel', and 'power saver technology'. These terms might have misled the agent into classifying the item as an electronic device rather than a household appliance.", "For the DeckUp Meritus-S Wall TV Unit, the agent predicted 'Household' because the description includes terms like 'TV Unit' and 'Product Dimensions', which might be more commonly associated with household items than electronic produ

## Refine