In [1]:
%load_ext autoreload
%autoreload 2

# Data

In [2]:
import pandas as pd
from AutoLLM.utils.helpers import split_dataframe
df = pd.read_excel('data/Ecommerce/ecommerce_classification_dataset.xlsx')
df_, df_test = split_dataframe(df, 100 / df.shape[0], 42, stratify_col='label')
_, df_train = split_dataframe(df_, 100 / df_.shape[0], 42, stratify_col='label')

display(df_train)
display(df_test)


Unnamed: 0,label,text
46240,Electronics,CBSE Chemistry Chapterwise Solved Papers Class...
35852,Clothing & Accessories,BREGEO Men's Cotton Blazer This 1 button slim ...
6340,Household,Home Sizzler Shalimar Frill Panel 20 2 Piece E...
30235,Books,Brief Answers to the Big Questions Exclu
45621,Electronics,Artshai Antique Style Nautical Brass Binocular...
...,...,...
25723,Books,"Klapp Skateboard, Colour May Vary (Small) Size..."
11737,Household,ExclusiveLane 'The Serving Hut Goblets' Hand-P...
49120,Electronics,KARWAN Apple iPhone 6 Toughened Glass Back Mob...
42578,Electronics,Tarkan USB Powered Portable Laptop Cooler with...


Unnamed: 0,label,text
32109,Clothing & Accessories,Rovars Unisex All-in-1 Suit Planning a trip to...
21746,Books,Wiley's Physical Chemistry For JEE (Main & Adv...
1214,Household,Ebee Store Shoe Rack with 5 Shelves (Maroon) G...
25733,Books,Strauss Bronx YB Skateboard Skateboard is a ty...
43918,Electronics,Cables Kart™ Digital to Analogue Optical Audio...
...,...,...
24703,Books,New Saraswati Health and Physical Education Cl...
13919,Household,FRESHWORLD Plastic Vacuum Sealer with Starter ...
24624,Books,Current Affairs Yearly 2019
49880,Electronics,IKALL K16 1.8-inch Mobile(White) Colour:White ...


# Parameter setting

In [3]:
from config import API_KEY, NEBIUS_URL
from AutoLLM.interfaces.api_client import APIClient


task_description = "Classify the following items based on the item descriptions."
initial_instruction = "Classify the following item into [Books, Electronics, Clothing & Accessories, Household] based on the item description."

meta_client = APIClient(NEBIUS_URL, API_KEY, model='Qwen/Qwen2.5-32B-Instruct')
meta_config = {"temperature": 0.7, "top_p": 0.9}

# Get reasoning

In [4]:
from tqdm.notebook import tqdm
from AutoLLM.modules.reasoning_agent import ReasoningAgent
from AutoLLM.utils.helpers import append_dict_to_dataframe

reasoning_agent = ReasoningAgent(meta_client, meta_config)

df_reasoning = pd.DataFrame()

for i, r in tqdm(df_train.iterrows(), total=df_train.shape[0]):
    
    input_text = r['text']
    output_text = r['label']
    print(input_text)
    resp = reasoning_agent.run(
        task_description=task_description, 
        input=input_text, 
        output=output_text,
        instruction=initial_instruction,
    )

    error = resp['error']
    reasoning = resp['reasoning']

    print(reasoning)

    if error:
        continue
    new_row = {
        'input': input_text,
        'output': output_text,
        'reasoning': reasoning,
    }
    
    df_reasoning = append_dict_to_dataframe(df_reasoning, new_row)
    # break

df_reasoning


  0%|          | 0/100 [00:00<?, ?it/s]

CBSE Chemistry Chapterwise Solved Papers Class 12th About the Author An editorial team of highly skilled professionals at Arihant, works hand in glove to ensure that the students receive the best and accurate content through our books. From inception till the book comes out from print, the whole team comprising of authors, editors, proofreaders and various other involved in shaping the book put in their best efforts, knowledge and experience to produce the rigorous content the students receive. Keeping in mind the specific requirements of the students and various examinations, the carefully designed exam oriented and exam ready content comes out only after intensive research and analysis. The experts have adopted whole new style of presenting the content which is easily understandable, leaving behind the old traditional methods which once used to be the most effective. They have been developing the latest content & updates as per the needs and requirements of the students making our bo

Unnamed: 0,input,output,reasoning
0,BREGEO Men's Cotton Blazer This 1 button slim ...,Clothing & Accessories,The reasoning chain for classifying the item '...
1,Home Sizzler Shalimar Frill Panel 20 2 Piece E...,Household,The input describes a curtain set which is use...
2,Brief Answers to the Big Questions Exclu,Books,The input 'Brief Answers to the Big Questions ...
3,Artshai Antique Style Nautical Brass Binocular...,Electronics,The task requires the agent to classify the gi...
4,HISCIN Impression PVC Printed Leaf Bathroom Sh...,Household,The item description provided includes details...
...,...,...,...
90,CLIMAX Table Baby VICE REVOLVING CLAMP 40MM [P...,Household,The item description mentions a 'CLIMAX Table ...
91,ExclusiveLane 'The Serving Hut Goblets' Hand-P...,Household,The item description mentions 'Hand-Painted Bo...
92,KARWAN Apple iPhone 6 Toughened Glass Back Mob...,Electronics,The item description mentions 'Apple iPhone 6 ...
93,Tarkan USB Powered Portable Laptop Cooler with...,Electronics,The item description provided details a USB-po...


# Generalize rules

In [5]:
from AutoLLM.modules.generalize_rules_agent import GeneralizeRulesAgent

generalize_agent = GeneralizeRulesAgent(meta_client, meta_config, verbose=True)

_, df_temp = split_dataframe(df_reasoning, 20/df_reasoning.shape[0], 42, 'output')
reasoning_text = ""
for i, r in df_temp.iterrows():
    reasoning_text += f"- {r['reasoning']}\n"
resp = generalize_agent.run(task_description=task_description, reasoning=reasoning_text)
print(resp)

TypeError: 'type' object is not iterable

In [None]:
rules = resp['rules']

# Generate Synthetic Data

In [None]:
from AutoLLM.modules.synthetic_data_agent import SyntheticDataAgent

synthetic_data_agent = SyntheticDataAgent(meta_client, meta_config)

example_text = ""
count = 0
for i, r in df_reasoning.iterrows():
    example_text += f"input: {r['input']}\noutput: {r['output']}\n\n"
    count += 1
    if count == 10:
        break

print(example_text)

resp = synthetic_data_agent.run(
    task_description=task_description,
    num_variations=5,
    examples=example_text,
    rules=rules,
)

resp

input: BREGEO Men's Cotton Blazer This 1 button slim lapel super slim fit jacket is made with the perfect blend of terry and rayon. That makes it comfortable and stylish at the same time. The jet black color is perfect to carry your evenings in epic style. Size 34, for 34 size waist size must be 28 to 30 inches and stomach size up to 30 inches at navel (max.) and height up to 5ft 5 inches (max.) size 36, for 36 size waist size must be 30 to 32 inches and stomach size up to 32 inches at navel (max.) and height up to 5ft 7 inches (max.) size 38, for 38 size waist size must be 32 to 33 inches and stomach size up to 35 inches at navel (max.) and height up to 5ft8inches (max.) size 40, for 40 size waist size must be 34 to 35 inches and stomach size up to 37 inches at navel (max.) and height up to 5ft10inches (max.)size 42, for 42size waist size, must be 36 to 38 inches and stomach size up to 39 to 40 inches at navel (max.) and height up to 6ft (max.) size 44, for 44 size waist size must be 

{'thinking': 'To generate synthetic inputs, I will consider the categories and rules provided. For each category, I will fabricate a description that includes the necessary elements as per the rules. I will ensure that the descriptions are varied yet maintain the essence of the category they belong to.',
 'synthetic_data': [{'input': 'The Art of War by Sun Tzu This classic military strategy book is a must-have for anyone interested in ancient Chinese military tactics. It delves into the philosophies and strategies of warfare.',
   'output': 'Books',
   'reasoning': "The description includes a title (The Art of War), an author (Sun Tzu), and content related to literature and educational material, fitting the 'Books' category."},
  {'input': 'Bella Decor Waterproof Kitchen Backsplash Tiles These tiles are designed for your kitchen backsplash, made with a waterproof material that resists moisture and stains. They come in a variety of patterns to enhance your kitchen decor.',
   'output': 

In [None]:
assert False

AssertionError: 

In [None]:
example_text = ""
sample_df = df.sample(10, random_state=42)
for i, r in sample_df.iterrows():
    example_text += f"""[input]: {r['text']}\n[output]: {r['label']}\n\n"""
# print(example_text)

resp = sda.run(examples=example_text, num_variations=5)
print(resp)

[{'input': "This men's blazer is made of velvet and comes in a stylish blue shade. It has a simple design, long sleeves, and a button closure, and is suitable for men. It's advised to avoid direct heat and flammable substances to prevent damage.", 'output': 'Clothing & Accessories'}, {'input': 'HealthSense KS 50 Digital Kitchen Scale, Grey. This scale is perfect for precise measurements in the kitchen.', 'output': 'Household'}, {'input': 'Concept of Physics, 2018-2019 edition, a set of two volumes. This book set covers essential physics concepts for students.', 'output': 'Books'}, {'input': "Lista's Multi-Functional Hammer Axe Tool Kit includes a hammer, axe, pliers, knife, screwdriver, can opener, and wood saw. Ideal for home and outdoor activities.", 'output': 'Household'}, {'input': "ADTALA's Solar Security Light with a motion sensor is perfect for outdoor use. It includes a mounting kit and is wireless, providing security for your garden, wall, or path.", 'output': 'Household'}, {'

In [None]:



# Load the ecommerce dataset


# Extract text samples
text_samples = df['Text'].dropna().tolist()

# Initialize the synthetic data agent
agent = SyntheticDataAgent()

# Generate synthetic data
synthetic_data = agent.generate_synthetic_data(text_samples[:100], num_samples=10)

# Save results
output_df = pd.DataFrame({
    'Original_Text': text_samples[:10],
    'Synthetic_Text': synthetic_data
})
output_df.to_csv('synthetic_ecommerce_data.csv', index=False)

print("Generated 10 synthetic examples:")
print(output_df)

# END