In [0]:
# Step 1: Install dependencies
# `%pip` (rather than `!pip`) targets the environment of the running kernel.
# NOTE(review): no versions are pinned, so a later re-run may install
# different releases of faker/tqdm than the ones this notebook was built with.
%pip install faker tqdm


In [0]:
# Step 2: Imports
import pandas as pd
import numpy as np
import random
from faker import Faker
from datetime import datetime, timedelta
import uuid
from tqdm import tqdm

# Setup
# A single seed constant feeds Faker, NumPy's global RNG and the stdlib
# `random` module, so changing the seed in one place keeps all three in step.
SEED = 42

fake = Faker()
Faker.seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

# Hooks tqdm into pandas.
# NOTE(review): no pandas progress call is visible in this section; kept in
# case a cell outside this view relies on it.
tqdm.pandas()

In [0]:
# Step 3: Configuration
NUM_RECORDS = 400000
GUEST_POOL_SIZE = 50000
# Compared against a uniform draw when a record is generated; draws at or
# below this value produce a transaction with no guest id.
anonymous_ratio = 0.1

# uuid.uuid4() takes its randomness from the operating system and ignores any
# seed, which made the guest pool different on every run. The pool is built
# from a private seeded generator instead: the cell is idempotent when re-run
# and the global `random` stream is left untouched.
GUEST_POOL_SEED = 42
_guest_rng = random.Random(GUEST_POOL_SEED)
guest_pool = [
    str(uuid.UUID(int=_guest_rng.getrandbits(128), version=4))
    for _ in range(GUEST_POOL_SIZE)
]

# Region -> selling markets that can be drawn for that region.
regions = {
    "NA": ["US", "CA"],
    "EMEA": ["GB", "DE", "FR", "NL"],
    "APAC": ["SG", "AU", "IN", "HK"]  # was misspelled "APeC"
}
departments = ["Yoga", "Running", "Outerwear", "Accessories", "Footwear", "Training"]
channels = ["STORE", "ECOM", "APP"]
# The repeated None entries weight the draw: 3 of 7 picks carry no coupon.
promo_codes = ["WINTER20", "BLACKFRIDAY", "CYBER15", "NEWYEAR10", None, None, None]
coupon_descriptions = {
    "WINTER20": "20% off Winter Collection",
    "BLACKFRIDAY": "Black Friday Special",
    "CYBER15": "Cyber Monday Deal",
    "NEWYEAR10": "New Year Offer",
    None: None
}

# Rolling 365-day window that ends at execution time, so generated timestamps
# depend on the day the notebook is run.
end_date = datetime.today()
start_date = end_date - timedelta(days=365)

In [0]:
# Step 4: Data generation function
def generate_transaction_record():
    """Build one synthetic retail transaction as a flat dict.

    Reads the module-level configuration (`fake`, `start_date`, `end_date`,
    `regions`, `anonymous_ratio`, `guest_pool`, `promo_codes`,
    `coupon_descriptions`, `departments`, `channels`) and draws from the
    global `random`, `np.random` and Faker generators, so the output is
    determined by how those generators were seeded.

    Returns:
        dict: one record with the keys TRANSACTION_ID, TXN_TIMESTAMP,
        TXN_DATE_ID, MASTER_GUEST_ID, GUEST_ID, LOCATION_ID, SELLING_MARKET,
        SKU, MMS_DEPT_NAME, QUANTITY, UNIT_REGULAR_PRICE_USD,
        EXTENDED_DISCOUNT_USD, DISCOUNT_PCT, EXTENDED_PRICE_USD,
        GROSS_MARGIN_AMT_USD, COUPON_CODE, COUPON_DESCRIPTION and
        MASTER_ORDER_ORIGIN. Both guest id fields are None for anonymous
        transactions; both coupon fields are None when no coupon was drawn.
    """
    txn_date = fake.date_time_between(start_date=start_date, end_date=end_date)

    # The region only narrows the eligible markets; it is not stored itself.
    market_region = random.choice(list(regions))
    selling_market = random.choice(regions[market_region])

    # A draw at or below `anonymous_ratio` yields an anonymous transaction.
    is_linked_guest = random.random() > anonymous_ratio
    guest_id = random.choice(guest_pool) if is_linked_guest else None
    master_guest_id = guest_id  # already None when the guest is anonymous

    quantity = np.random.randint(1, 4)  # upper bound is exclusive: 1, 2 or 3
    unit_price = round(np.random.uniform(20, 150), 2)
    discount_pct = np.random.choice([0, 10, 15, 20, 25], p=[0.5, 0.2, 0.15, 0.1, 0.05])
    discount_amt = round(unit_price * quantity * (discount_pct / 100), 2)
    extended_price = round(unit_price * quantity - discount_amt, 2)
    # Margin is drawn as 30-60% of the discounted line total.
    gross_margin = round(extended_price * np.random.uniform(0.3, 0.6), 2)

    # NOTE(review): the coupon is drawn independently of discount_pct, so a
    # record can show a coupon with a 0% discount (or a discount and no coupon).
    coupon_code = random.choice(promo_codes)
    coupon_description = coupon_descriptions[coupon_code]

    # uuid.uuid4() ignores every seed; derive the version-4 id from the seeded
    # `random` stream so a re-run with the same seed yields the same ids.
    transaction_id = str(uuid.UUID(int=random.getrandbits(128), version=4))

    return {
        "TRANSACTION_ID": transaction_id,
        "TXN_TIMESTAMP": txn_date,
        "TXN_DATE_ID": int(txn_date.strftime("%Y%m%d")),
        "MASTER_GUEST_ID": master_guest_id,
        "GUEST_ID": guest_id,
        "LOCATION_ID": np.random.randint(1000, 2000),
        "SELLING_MARKET": selling_market,
        # randint's upper bound is exclusive; 1000000 makes 999999 reachable
        # so the full six-digit range is covered.
        "SKU": np.random.randint(100000, 1000000),
        "MMS_DEPT_NAME": random.choice(departments),
        "QUANTITY": quantity,
        "UNIT_REGULAR_PRICE_USD": unit_price,
        "EXTENDED_DISCOUNT_USD": discount_amt,
        "DISCOUNT_PCT": discount_pct,
        "EXTENDED_PRICE_USD": extended_price,
        "GROSS_MARGIN_AMT_USD": gross_margin,
        "COUPON_CODE": coupon_code,
        "COUPON_DESCRIPTION": coupon_description,
        "MASTER_ORDER_ORIGIN": random.choice(channels)
    }


In [0]:
# Step 5: Generate the dataset
# The message is derived from NUM_RECORDS so it stays accurate when the
# configured record count changes.
print(f"Generating {NUM_RECORDS:,} records...")
data = [generate_transaction_record() for _ in tqdm(range(NUM_RECORDS))]
df_transactions = pd.DataFrame(data)

# Cheap sanity checks before the frame is handed on.
assert len(df_transactions) == NUM_RECORDS
assert df_transactions["TRANSACTION_ID"].is_unique

display(df_transactions.head())

In [0]:
# Step 6: Convert to Spark and save to Bronze schema
# One name feeds both the write and the confirmation message; previously the
# message reported a different table than the one actually written.
BRONZE_TABLE = "bits_pilani.bronze_sch.synthetic_retail_transactions"

# NOTE(review): `spark` is not defined in this notebook; presumably the
# session object injected by the hosting runtime - confirm.
df_spark = spark.createDataFrame(df_transactions)
# "overwrite" replaces any existing contents of the table on every run.
df_spark.write.mode("overwrite").saveAsTable(BRONZE_TABLE)
print(f"✅ Data written to table: {BRONZE_TABLE}")
