In [2]:
# Cell 1: Import Libraries

# Import libraries
import pandas as pd
import numpy as np
from faker import Faker
import random
from datetime import datetime, timedelta
import ollama  # Optional: for generating product descriptions
import os
import json

# Initialize Faker
fake = Faker()

# Set random seed for reproducibility
np.random.seed(42)
random.seed(42)



# Create directories if they don't exist
os.makedirs('data', exist_ok=True)
os.makedirs('output', exist_ok=True)

In [3]:
# Cell 2a: Generate Amazon Procut Data (Non-LLM)

# Function to generate Amazon-style Order ID
def generate_order_id():
    """Generate a unique Amazon-style Order ID (e.g., 123-4567890-1234567)."""
    return f"{random.randint(100, 999)}-{random.randint(1000000, 9999999)}-{random.randint(1000000, 9999999)}"

# Function to generate ASIN (Amazon Standard Identification Number)
def generate_asin():
    """Generate a 10-character ASIN."""
    return ''.join(random.choices('ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789', k=10))

In [4]:
import json
import time

# Function to generate product using Ollama
def generate_product(amount: float, category: str) -> dict:
    """Generate a product name and description using the Ollama mistral:7b-instruct-v0.3-q4_0 model.
    Args:
        amount (float): Placeholder for potential pricing (not used currently).
        category (str): Category to guide product generation (e.g., 'electronics', 'home').
    Returns:
        dict: Contains 'product_name' and 'description'.
    """
    prompt = f"""
    Generate a name of a product that can be sold online in the {category} category.
    Rules:
    - Use title case.
    - No punctuation.
    - No parentheses, dashes, or dollar signs.
    - No amounts or numbers as words.
    - Use simple phrases.
    - Examples: 'LED Light Bulbs', 'Bath Towel'
    Additionally, provide a brief description (up to 50 characters) of the product.
    Return the output as a JSON object with 'product_name' and 'description' fields, wrapped in triple backticks (```json\n{{}}\n```).
    Example:
    ```json
    {{
        "product_name": "LED Light Bulbs",
        "description": "Bright energy saving lights"
    }}
    """
    response = ollama.generate(
        model='mistral:7b-instruct-v0.3-q4_0',
        prompt=prompt,
        options={"temperature": 0.8}
    )
    raw_response = response['response'].strip()
    print(f"Full raw LLM response for product: {raw_response}")  # Show full response for debugging
    if not raw_response:
        raise ValueError("Empty response from Ollama")
    if raw_response.startswith('```json'):
        raw_response = raw_response[7:].split('```', 1)[0].strip()  # Take only the first JSON block
    try:
        result = json.loads(raw_response)
        if 'product_name' not in result or 'description' not in result:
            raise ValueError("Ollama response missing 'product_name' or 'description' key")
        result['description'] = result['description'][:50]  # Truncate description to 50 characters
        return result
    except json.JSONDecodeError as e:
        print(f"Failed to parse JSON: {e}. Raw response: {raw_response}")
        raise

# Generate a list of product names to use in orders with minimized interference
num_products = 5  # Increased to ensure variety
product_categories = ['electronics', 'home goods', 'beauty', 'clothing', 'outdoor gear']  # Diverse categories
product_names = []
seen_names = set()  # Track unique names to avoid duplicates
start_time = time.time()
print("Starting LLM generation process...")
while len(product_names) < num_products:
    # Pause briefly to reduce contention with other processes
    time.sleep(0.1)
    # Select a random category to increase diversity
    category = random.choice(product_categories)
    product = generate_product(0.0, category)  # amount is a placeholder, not used
    product_name = product['product_name']
    if product_name not in seen_names:
        seen_names.add(product_name)
        product_names.append(product_name)
end_time = time.time()
print(f"Generated {len(product_names)} unique product names in {end_time - start_time:.2f} seconds: {product_names[:5]}...")

Starting LLM generation process...
Full raw LLM response for product: ```json
    {
        "product_name": "Portable Wireless Charger",
        "description": "Fast, convenient charging for your devices"
    }
    ```
Full raw LLM response for product: ```json
    {
        "product_name": "Wireless Bluetooth Speaker",
        "description": "Compact, high-quality sound device for music streaming"
    }
    ```
Full raw LLM response for product: ```json
    {
        "product_name": "Glow Essence Serum",
        "description": "Radiate youthful glow with this hydrating serum"
    }
   ```
Full raw LLM response for product: ```json
    {
        "product_name": "Ceramic Mug Set",
        "description": "Durable, microwave-safe mugs for your favorite beverages"
    }
    ```
Full raw LLM response for product: ```json
    {
        "product_name": "Modern Kitchen Rack",
        "description": "Sleek and sturdy storage solution for your pots and pans"
    }
    ```
Generated 5 unique prod

In [5]:
# Cell 3: Generate Synthetic Order Data

def generate_orders(n=100):
    """Generate a list of synthetic Amazon orders using pre-generated product names."""
    orders = []
    for _ in range(n):
        order_id = generate_order_id()
        order_date = fake.date_time_between(start_date="-1y", end_date=datetime.now())
        customer_name = fake.name()
        customer_email = fake.email()
        shipping_address = fake.address().replace("\n", ", ")
        num_items = random.randint(1, 5)

        # Generate items using pre-generated product names
        items = []
        subtotal = 0
        for _ in range(num_items):
            product_name = random.choice(product_names)  # Use pre-generated names
            asin = generate_asin()
            price = round(random.uniform(5.99, 199.99), 2)
            quantity = random.randint(1, 3)
            items.append({'product_name': product_name, 'asin': asin, 'price': price, 'quantity': quantity})
            subtotal += price * quantity

        # Shipping and tax
        shipping_method = random.choice(["Prime 2-Day", "Standard Shipping", "Expedited Shipping"])
        shipping_cost = 0 if shipping_method == "Prime 2-Day" else round(random.uniform(3.99, 12.99), 2)
        tax = round(subtotal * random.uniform(0.05, 0.1), 2)
        total = round(subtotal + shipping_cost + tax, 2)
        status = random.choice(["Pending", "Shipped", "Delivered"])
        delivery_date = order_date + timedelta(days=random.randint(1, 7)) if status == "Delivered" else None

        orders.append({
            'order_id': order_id,
            'order_date': order_date,
            'customer_name': customer_name,
            'customer_email': customer_email,
            'shipping_address': shipping_address,
            'items': items,
            'subtotal': subtotal,
            'shipping_method': shipping_method,
            'shipping_cost': shipping_cost,
            'tax': tax,
            'total': total,
            'status': status,
            'delivery_date': delivery_date
        })
    return orders

# Generate orders
orders = generate_orders(n=20)

In [6]:
# Cell 4: Generate Pyament Data and Process

def generate_invoices(orders):
    """Generate a list of invoices from orders."""
    invoices = []
    for order in orders:
        invoice_id = f"INV-{order['order_id'].split('-')[1]}"
        invoice_date = order['order_date'] + timedelta(days=random.randint(0, 2))
        billing_address = order['shipping_address']
        payment_method = random.choice(["Credit Card", "Amazon Pay", "Gift Card"])

        invoices.append({
            'invoice_id': invoice_id,
            'order_id': order['order_id'],
            'invoice_date': invoice_date,
            'customer_name': order['customer_name'],
            'billing_address': billing_address,
            'items': order['items'],
            'subtotal': order['subtotal'],
            'shipping_cost': order['shipping_cost'],
            'tax': order['tax'],
            'total': order['total'],
            'payment_method': payment_method
        })
    return invoices

def flatten_data(data, data_type):
    """Flatten nested items into a tabular format for orders or invoices."""
    flattened = []
    for record in data:
        for item in record['items']:
            flat_record = {
                'id': record['order_id'] if data_type == 'orders' else record['invoice_id'],
                'order_id': record['order_id'],
                'date': record['order_date'] if data_type == 'orders' else record['invoice_date'],
                'customer_name': record['customer_name'],
                'address': record['shipping_address'] if data_type == 'orders' else record['billing_address'],
                'product_name': item['product_name'],
                'asin': item['asin'],
                'price': item['price'],
                'quantity': item['quantity'],
                'subtotal': record['subtotal'],
                'shipping_cost': record['shipping_cost'],
                'tax': record['tax'],
                'total': record['total'],
                'status': record.get('status') if data_type == 'orders' else record['payment_method']
            }
            flattened.append(flat_record)
    return flattened

# Generate invoices
invoices = generate_invoices(orders)

# Flatten and create DataFrames
orders_flat = flatten_data(orders, 'orders')
invoices_flat = flatten_data(invoices, 'invoices')
orders_df = pd.DataFrame(orders_flat)
invoices_df = pd.DataFrame(invoices_flat)

# Save to CSV in data folder
orders_df.to_csv('data/orders.csv', index=False)
invoices_df.to_csv('data/invoices.csv', index=False)

In [7]:
# Cell 4a: Check DataFrame

print("Sample Orders Data:")
display(orders_df.head())
print("\nSample Invoices Data:")
display(invoices_df.head())

Sample Orders Data:


Unnamed: 0,id,order_id,date,customer_name,address,product_name,asin,price,quantity,subtotal,shipping_cost,tax,total,status
0,242-2719583-2458591,242-2719583-2458591,2024-10-06 11:43:16,Shannon Villanueva,"47211 Fletcher Harbors Suite 717, West Patrici...",Ceramic Mug Set,BDIVUZZPQK,174.63,1,651.19,11.88,42.81,705.88,Delivered
1,242-2719583-2458591,242-2719583-2458591,2024-10-06 11:43:16,Shannon Villanueva,"47211 Fletcher Harbors Suite 717, West Patrici...",Ceramic Mug Set,MF8MDD4V30,110.02,2,651.19,11.88,42.81,705.88,Delivered
2,242-2719583-2458591,242-2719583-2458591,2024-10-06 11:43:16,Shannon Villanueva,"47211 Fletcher Harbors Suite 717, West Patrici...",Portable Wireless Charger,T3W5UZBIKC,51.15,1,651.19,11.88,42.81,705.88,Delivered
3,242-2719583-2458591,242-2719583-2458591,2024-10-06 11:43:16,Shannon Villanueva,"47211 Fletcher Harbors Suite 717, West Patrici...",Ceramic Mug Set,KWNNHJ7XVG,147.44,1,651.19,11.88,42.81,705.88,Delivered
4,242-2719583-2458591,242-2719583-2458591,2024-10-06 11:43:16,Shannon Villanueva,"47211 Fletcher Harbors Suite 717, West Patrici...",Ceramic Mug Set,N9XUY41IBL,57.93,1,651.19,11.88,42.81,705.88,Delivered



Sample Invoices Data:


Unnamed: 0,id,order_id,date,customer_name,address,product_name,asin,price,quantity,subtotal,shipping_cost,tax,total,status
0,INV-2719583,242-2719583-2458591,2024-10-08 11:43:16,Shannon Villanueva,"47211 Fletcher Harbors Suite 717, West Patrici...",Ceramic Mug Set,BDIVUZZPQK,174.63,1,651.19,11.88,42.81,705.88,Credit Card
1,INV-2719583,242-2719583-2458591,2024-10-08 11:43:16,Shannon Villanueva,"47211 Fletcher Harbors Suite 717, West Patrici...",Ceramic Mug Set,MF8MDD4V30,110.02,2,651.19,11.88,42.81,705.88,Credit Card
2,INV-2719583,242-2719583-2458591,2024-10-08 11:43:16,Shannon Villanueva,"47211 Fletcher Harbors Suite 717, West Patrici...",Portable Wireless Charger,T3W5UZBIKC,51.15,1,651.19,11.88,42.81,705.88,Credit Card
3,INV-2719583,242-2719583-2458591,2024-10-08 11:43:16,Shannon Villanueva,"47211 Fletcher Harbors Suite 717, West Patrici...",Ceramic Mug Set,KWNNHJ7XVG,147.44,1,651.19,11.88,42.81,705.88,Credit Card
4,INV-2719583,242-2719583-2458591,2024-10-08 11:43:16,Shannon Villanueva,"47211 Fletcher Harbors Suite 717, West Patrici...",Ceramic Mug Set,N9XUY41IBL,57.93,1,651.19,11.88,42.81,705.88,Credit Card


In [8]:
# Cell 5: Populate HTML with Synthetic Data

from jinja2 import Template
from datetime import datetime, timedelta
import pdfkit

def calculate_delivery_date(order_date):
    """Calculate estimated delivery date for Express shipping (1-2 days)."""
    return order_date + timedelta(days=random.randint(1, 2))

def html_to_pdf(html_path, pdf_path):
    """Convert HTML file to PDF using pdfkit with a specified wkhtmltopdf path."""
    try:
        config = pdfkit.configuration(wkhtmltopdf=r"C:\\Program Files\\wkhtmltopdf\\bin\\wkhtmltopdf.exe")
        pdfkit.from_file(html_path, pdf_path, configuration=config)
        print(f"Converted {html_path} to {pdf_path}")
    except Exception as e:
        print(f"Error converting HTML to PDF: {e}")

# Load HTML template and populate with data
order = orders_df.iloc[0]  # Example: first order
delivery_date = calculate_delivery_date(order['date'])
formatted_delivery_date = f"{delivery_date.strftime('%A, %B %d, %Y')} - {delivery_date.strftime('%A, %B %d, %Y')}"
shipping_speed = "Express"  # Assuming Express based on the image

with open('amazon_order_template.html', 'r') as f:
    html_template = Template(f.read())
html_output = html_template.render(
    order_id=order['order_id'],
    customer_name=order['customer_name'],
    delivery_date=formatted_delivery_date,
    shipping_speed=shipping_speed,
    item_subtotal=f"${order['subtotal']:.2f}",
    shipping_handling=f"${order['shipping_cost']:.2f}",
    total_before_tax=f"${order['subtotal'] + order['shipping_cost']:.2f}",
    estimated_tax=f"${order['tax']:.2f}",
    order_total=f"${order['total']:.2f}"
)

# Save HTML to output folder
html_path = 'output/order_confirmation.html'
with open(html_path, 'w') as f:
    f.write(html_output)
print("Generated", html_path)

# Convert HTML to PDF and save to output folder
pdf_path = 'output/order_confirmation.pdf'
html_to_pdf(html_path, pdf_path)

Generated output/order_confirmation.html
Converted output/order_confirmation.html to output/order_confirmation.pdf
