# Imports

In [1]:
! pip install pandas numpy -q

import pandas as pd
import numpy as np
import random

# Loading Dataset

In [2]:
df_categories = pd.read_csv('../data/raw/amazon_categories.csv')
df_products = pd.read_csv('../data/processed/amazon_products_cleaned.csv')

print("Amazon Categories:\n", df_categories.head())
print("Amazon Products:\n", df_products.head())

Amazon Categories:
    id                     category_name
0   1          Beading & Jewelry Making
1   2                 Fabric Decorating
2   3       Knitting & Crochet Supplies
3   4              Printmaking Supplies
4   5  Scrapbooking & Stamping Supplies
Amazon Products:
          asin                                              title  \
0  B07BM9CTPF  Christian Religious Rubber Bracelet with Card,...   
1  B07M8GSBRY  2 Pcs Mickey Ears, Minnie Costume Ears Headban...   
2  B000TZQXXO  35 Light 13.5' Green Wire Red Chili Pepper String   
3  B07SHDN9PX  DeYeShiKi Motorcycle Mirrors ATV Mirrors, 360 ...   
4  B09HQQC8CT  Cash Only Sign - 8 x 12 Aluminum - No Credit C...   

                                              imgUrl  \
0  https://m.media-amazon.com/images/I/61AGW4S9Yz...   
1  https://m.media-amazon.com/images/I/71EJyf6JpE...   
2  https://m.media-amazon.com/images/I/613hslJHG1...   
3  https://m.media-amazon.com/images/I/61YUQzpAeV...   
4  https://m.media-amazon.com/ima

# Synthetic QA Generation
Defines templates and logic to create Question-Answer pairs from structured product data.

In [5]:
# Define templates for synthetic questions
question_templates = {
    "ask_price": [
        "What is the price?", "How much does it cost?", "What's the price tag?",
        "Can you tell me the price?", "How expensive is it?"
    ],
    "ask_rating": [
        "What is the rating?", "How many stars does it have?", "Is it highly rated?",
        "What do users think about it?", "Star rating?"
    ],
    "ask_title": [
        "What is the product name?", "What is this item?", "Tell me the title.",
        "Name of the product?"
    ],
    "ask_bestseller": [
        "Is this a best seller?", "Is it a top selling item?", "Is it popular?"
    ]
}

def generate_synthetic_data(df, num_samples=5000):
    qa_pairs = []
    
    # Randomly sample products to keep dataset size manageable initially
    sample_df = df#.sample(n=min(len(df), num_samples), random_state=42)

    for _, row in sample_df.iterrows():
        # Create a "Context" string that represents the product info available to the model
        # The model will eventually take this context + question to predict the answer.
        title = row.get('title', 'Unknown Product')
        price = row.get('price', 'Unknown Price')
        stars = row.get('stars', '0')
        
        context = f"Item: {title} | Cost: {price} | Rating: {stars}"

        # --- Generate Price QA ---
        if pd.notna(price) and price != 0:
            q_text = random.choice(question_templates["ask_price"])
            a_text = str(price)
            # We add 'context' so the model knows what product we are talking about
            qa_pairs.append({"context": context, "question": q_text, "answer": a_text, "type": "price"})

        # --- Generate Rating QA ---
        if pd.notna(stars):
            q_text = random.choice(question_templates["ask_rating"])
            a_text = f"{stars} stars"
            qa_pairs.append({"context": context, "question": q_text, "answer": a_text, "type": "rating"})

        # --- Generate Title QA ---
        if pd.notna(title):
            q_text = random.choice(question_templates["ask_title"])
            a_text = title
            qa_pairs.append({"context": context, "question": q_text, "answer": a_text, "type": "title"})
            
        # --- Generate Best Seller QA ---
        if 'isBestSeller' in row:
             q_text = random.choice(question_templates["ask_bestseller"])
             a_text = "Yes" if row['isBestSeller'] else "No"
             qa_pairs.append({"context": context, "question": q_text, "answer": a_text, "type": "bestseller"})

    return pd.DataFrame(qa_pairs)

In [6]:
# Run generation
print("Generating synthetic QA pairs...")
# Using 2000 products to generate roughly 6000-8000 QA pairs
df_qa = generate_synthetic_data(df_products, num_samples=2000)

# Preview
print(f"Generated {len(df_qa)} QA pairs.")
print(df_qa[['question', 'answer']].head(10))

# Save
output_path = '../data/processed/qa_dataset.csv'
df_qa.to_csv(output_path, index=False)
print(f"Saved processed QA dataset to {output_path}")

Generating synthetic QA pairs...
Generated 397692 QA pairs.
                    question  \
0      What's the price tag?   
1        What is the rating?   
2       Name of the product?   
3  Is it a top selling item?   
4     How much does it cost?   
5        What is the rating?   
6       Name of the product?   
7  Is it a top selling item?   
8         What is the price?   
9               Star rating?   

                                              answer  
0                                              14.99  
1                                          4.3 stars  
2  Christian Religious Rubber Bracelet with Card,...  
3                                                 No  
4                                               8.79  
5                                          4.5 stars  
6  2 Pcs Mickey Ears, Minnie Costume Ears Headban...  
7                                                 No  
8                                              15.99  
9                                    