# TPC-H Dataset Generator — Pure PySpark (Serverless Compatible)

Generates the full **TPC-H** benchmark dataset **entirely in PySpark** — no shell commands, no `dbgen` binary, no access to `dbfs:/tmp/` required.

| Table | SF-1 Rows | SF-100 Rows |
|-------|-----------|-------------|
| REGION | 5 | 5 |
| NATION | 25 | 25 |
| SUPPLIER | 10,000 | 1,000,000 |
| PART | 200,000 | 20,000,000 |
| PARTSUPP | 800,000 | 80,000,000 |
| CUSTOMER | 150,000 | 15,000,000 |
| ORDERS | 1,500,000 | 150,000,000 |
| LINEITEM | ~6,000,000 | ~600,000,000 |

**Storage**: Unity Catalog managed Delta tables (no external paths).

Start with `SCALE_FACTOR = 1` to validate, then bump to 10 or 100.

## Cell 1 — Configuration

In [None]:
# ── Configuration ─────────────────────────────────────────────────────────────
# The target catalog is whichever one the session currently points at; the
# schema name and the scale factor are the two values meant to be edited.
# Approximate data volume per the sizing guide: 1 ≈ 1 GB, 10 ≈ 10 GB, 100 ≈ 100 GB.
# After changing SCALE_FACTOR, re-run every cell so all tables are regenerated.

SCALE_FACTOR = 1                                 # start small; raise to 100 for full scale
SCHEMA       = "tpch"                            # schema that receives the eight tables
CATALOG      = spark.catalog.currentCatalog()   # taken from the active session

# Row counts scale linearly with the scale factor.
SF = SCALE_FACTOR
N_SUPPLIER  = SF * 10_000
N_PART      = SF * 200_000
N_PARTSUPP  = SF * 800_000     # four PARTSUPP rows for every PART row
N_CUSTOMER  = SF * 150_000
N_ORDERS    = SF * 1_500_000
# LINEITEM has no fixed count here: it is produced per order (~4 rows each on average).

full_schema = f"{CATALOG}.{SCHEMA}"

_expected_rows = (
    ("SUPPLIER", N_SUPPLIER),
    ("PART", N_PART),
    ("CUSTOMER", N_CUSTOMER),
    ("ORDERS", N_ORDERS),
)

print("Current catalog:", CATALOG)
print(f"Target schema:   {full_schema}  |  Scale Factor: {SF}")
print("Expected rows — " + "  ".join(f"{label}: {rows:,}" for label, rows in _expected_rows))

## Cell 2 — Ensure Schema (in the current catalog)

In [None]:
# The catalog was obtained from currentCatalog(), so it is not created here;
# only the schema is created (when missing) and then made the active one.
for _statement in (
    f"CREATE SCHEMA IF NOT EXISTS {full_schema}",
    f"USE {full_schema}",
):
    spark.sql(_statement)
print("Using", full_schema)

## Cell 3 — Imports & Helpers

In [None]:
from pyspark.sql import functions as F
from pyspark.sql.types import (
    StructType, StructField,
    IntegerType, LongType, StringType, DoubleType, DateType,
)

# Deterministic random helper (seed-based for reproducibility)
def rand_int(seed, low, high):
    """Return a Column expression: random int in [low, high).

    Args:
        seed: Seed handed to ``F.rand``.
        low: Inclusive lower bound.
        high: Exclusive upper bound.

    Returns:
        An ``IntegerType`` Column.
    """
    scaled = F.rand(seed) * (high - low) + low
    # floor() before the cast: a bare integer cast truncates toward zero, which
    # for a negative ``low`` would shift values upward and make ``low`` itself
    # practically unreachable. For non-negative ranges the result is unchanged.
    return F.floor(scaled).cast(IntegerType())

def rand_double(seed, low, high, decimals=2):
    """Return a Column expression: seeded random double scaled into [low, high), then rounded.

    Rounding to ``decimals`` places is applied after scaling, so a draw that
    lies just below ``high`` can round up to ``high`` itself.
    """
    span = high - low
    unrounded = F.rand(seed) * span + low
    return F.round(unrounded, decimals)

def pick_one(seed, arr_expr):
    """Build a Column that selects one random element of a SQL array expression.

    ``arr_expr`` is SQL text (for example ``array('A','B')``) that is spliced
    verbatim into the generated expression.
    """
    # element_at() is 1-based, hence the "+ 1" after the scaled random index.
    position = f"int(rand({seed}) * size({arr_expr})) + 1"
    return F.expr(f"element_at({arr_expr}, {position})")

def write_table(df, table_name):
    """Save ``df`` as the table ``<full_schema>.<table_name>`` and report its size.

    The table is written with ``saveAsTable`` in overwrite mode (schema
    overwrite enabled), then read back to count its rows. The count is printed
    and returned.
    """
    target = f"{full_schema}.{table_name}"
    writer = df.write.mode("overwrite").option("overwriteSchema", "true")
    writer.saveAsTable(target)
    # Count from the stored table rather than from ``df``.
    row_count = spark.table(target).count()
    print(f"  ✓ {target} — {row_count:,} rows")
    return row_count

# Visible confirmation that this cell (imports and helper definitions) ran.
print("Helpers ready.")

## Cell 4 — REGION (5 rows, static)

In [None]:
# Static REGION content; the region key is the position in this list.
_region_names_and_comments = [
    ("AFRICA", "Special offers for African customers."),
    ("AMERICA", "Customers from the Americas region."),
    ("ASIA", "Customers from the Asia-Pacific region."),
    ("EUROPE", "Customers from the European region."),
    ("MIDDLE EAST", "Customers from the Middle East region."),
]
region_data = [
    (key, name, comment)
    for key, (name, comment) in enumerate(_region_names_and_comments)
]

# Key and name are required; the comment is the only nullable column.
region_schema = (
    StructType()
    .add("r_regionkey", IntegerType(), False)
    .add("r_name", StringType(), False)
    .add("r_comment", StringType(), True)
)

df_region = spark.createDataFrame(region_data, schema=region_schema)
write_table(df_region, "region")

## Cell 5 — NATION (25 rows, static)

In [None]:
# Static NATION content: (n_nationkey, n_name, n_regionkey).
nation_data = [
    (0, "ALGERIA", 0),
    (1, "ARGENTINA", 1),
    (2, "BRAZIL", 1),
    (3, "CANADA", 1),
    (4, "EGYPT", 4),
    (5, "ETHIOPIA", 0),
    (6, "FRANCE", 3),
    (7, "GERMANY", 3),
    (8, "INDIA", 2),
    (9, "INDONESIA", 2),
    (10, "IRAN", 4),
    (11, "IRAQ", 4),
    (12, "JAPAN", 2),
    (13, "JORDAN", 4),
    (14, "KENYA", 0),
    (15, "MOROCCO", 0),
    (16, "MOZAMBIQUE", 0),
    (17, "PERU", 1),
    (18, "CHINA", 2),
    (19, "ROMANIA", 3),
    (20, "SAUDI ARABIA", 4),
    (21, "VIETNAM", 2),
    (22, "RUSSIA", 3),
    (23, "UNITED KINGDOM", 3),
    (24, "UNITED STATES", 1),
]

nation_schema = (
    StructType()
    .add("n_nationkey", IntegerType(), False)
    .add("n_name", StringType(), False)
    .add("n_regionkey", IntegerType(), False)
)

# n_comment is not part of the static rows; it is derived from the nation name.
df_nation = (
    spark.createDataFrame(nation_data, schema=nation_schema)
    .withColumn("n_comment", F.concat(F.lit("Nation comment for "), F.col("n_name")))
)
write_table(df_nation, "nation")

## Cell 6 — SUPPLIER

In [None]:
# SUPPLIER: one row per key in 1..N_SUPPLIER; the remaining attributes are
# seeded random draws or strings derived from the key.
_s_key_text = F.col("s_suppkey").cast("string")
_s_dash = F.lit("-")

_s_address = F.concat(
    F.lit("Addr-"),
    (F.rand(101) * 99999).cast(IntegerType()).cast("string"),
)

# Phone layout: <s_nationkey + 10>-<3 digits>-<3 digits>-<4 digits>, zero-padded.
_s_phone = F.concat(
    (F.col("s_nationkey") + 10).cast("string"),
    _s_dash,
    F.lpad((F.rand(107) * 999).cast(IntegerType()).cast("string"), 3, "0"),
    _s_dash,
    F.lpad((F.rand(109) * 999).cast(IntegerType()).cast("string"), 3, "0"),
    _s_dash,
    F.lpad((F.rand(113) * 9999).cast(IntegerType()).cast("string"), 4, "0"),
)

df_supplier = (
    spark.range(1, N_SUPPLIER + 1)
    .toDF("s_suppkey")
    .withColumn("s_name", F.concat(F.lit("Supplier#"), F.lpad(_s_key_text, 9, "0")))
    .withColumn("s_address", _s_address)
    .withColumn("s_nationkey", rand_int(103, 0, 25))
    # Must come after s_nationkey: the phone prefix reads that column.
    .withColumn("s_phone", _s_phone)
    .withColumn("s_acctbal", rand_double(127, -999.99, 9999.99))
    .withColumn("s_comment", F.concat(F.lit("Supplier comment "), _s_key_text))
)

write_table(df_supplier, "supplier")

## Cell 7 — PART

In [None]:
# TPC-H part vocabulary, kept as SQL array literals so pick_one() can splice
# them into an expression.
_types  = "array('STANDARD','SMALL','MEDIUM','LARGE','ECONOMY','PROMO')"
_metals = "array('TIN','NICKEL','BRASS','STEEL','COPPER')"
_finish = "array('POLISHED','BURNISHED','PLATED','ANODIZED','BRUSHED')"
_containers = "array('SM CASE','SM BOX','SM PACK','SM PKG','SM JAR','SM BAG','SM CAN','SM DRUM','MED CASE','MED BOX','MED PACK','MED PKG','MED JAR','MED BAG','MED CAN','MED DRUM','LG CASE','LG BOX','LG PACK','LG PKG','LG JAR','LG BAG','LG CAN','LG DRUM','WRAP CASE','WRAP BOX','WRAP PACK','WRAP PKG','WRAP JAR','WRAP BAG','WRAP CAN','WRAP DRUM','JUMBO CASE','JUMBO BOX','JUMBO PACK','JUMBO PKG','JUMBO JAR','JUMBO BAG','JUMBO CAN','JUMBO DRUM')"
_brands = "array('Brand#11','Brand#12','Brand#13','Brand#14','Brand#15','Brand#21','Brand#22','Brand#23','Brand#24','Brand#25','Brand#31','Brand#32','Brand#33','Brand#34','Brand#35','Brand#41','Brand#42','Brand#43','Brand#44','Brand#45','Brand#51','Brand#52','Brand#53','Brand#54','Brand#55')"
_colors = "array('almond','antique','aquamarine','azure','beige','bisque','black','blanched','blue','blush','brown','burlywood','burnished','chartreuse','chiffon','chocolate','coral','cornflower','cornsilk','cream','cyan','dark','deep','dim','dodger','drab','firebrick','floral','forest','frosted','gainsboro','ghost','goldenrod','green','grey','honeydew','hot','indian','ivory','khaki','lace','lavender','lawn','lemon','light','lime','linen','magenta','maroon','medium','metallic','midnight','mint','misty','moccasin','navajo','navy','olive','orange','orchid','pale','papaya','peach','peru','pink','plum','powder','puff','purple','red','rosy','royal','saddle','salmon','sandy','seashell','sienna','sky','slate','smoke','snow','spring','steel','tan','thistle','tomato','turquoise','violet','wheat','white','yellow')"

# Retail price per the TPC-H definition, computed in whole cents first:
#   (90000 + ((p_partkey / 10) mod 20001) + 100 * (p_partkey mod 1000)) / 100
# The division by 10 is an integer division in the specification, hence floor().
_p_key = F.col("p_partkey")
_p_retail_cents = (
    F.lit(90000)
    + (F.floor(_p_key / 10) % 20001)
    + (_p_key % 1000) * 100
)

df_part = (
    spark.range(1, N_PART + 1)
    .withColumnRenamed("id", "p_partkey")
    # Five independently drawn colour words joined by single spaces.
    .withColumn("p_name", F.concat_ws(" ",
                    pick_one(201, _colors), pick_one(202, _colors),
                    pick_one(203, _colors), pick_one(204, _colors),
                    pick_one(205, _colors)))
    .withColumn("p_mfgr",        F.concat(F.lit("Manufacturer#"), (rand_int(211, 1, 6)).cast("string")))
    .withColumn("p_brand",       pick_one(213, _brands))
    # TPC-H word order is "<type> <finish> <metal>" (e.g. 'ECONOMY ANODIZED STEEL');
    # the benchmark queries filter on that order (p_type LIKE '%BRASS', LIKE 'PROMO%').
    .withColumn("p_type",        F.concat_ws(" ", pick_one(215, _types), pick_one(219, _finish), pick_one(217, _metals)))
    .withColumn("p_size",        rand_int(221, 1, 51))
    .withColumn("p_container",   pick_one(223, _containers))
    .withColumn("p_retailprice", F.round(_p_retail_cents / 100, 2))
    .withColumn("p_comment",     F.concat(F.lit("Part comment "), F.col("p_partkey").cast("string")))
)

write_table(df_part, "part")

## Cell 8 — PARTSUPP (4 suppliers per part)

In [None]:
# Four PARTSUPP rows per part: every part key is exploded against offsets 0..3.
_ps_offsets = F.array(*(F.lit(offset) for offset in range(4)))
_ps_stride = N_PART // 4 + 1

# Supplier key for a (part, offset) pair, wrapped into 1..N_SUPPLIER.
_ps_suppkey = (
    (F.col("ps_partkey") + F.col("_idx") * _ps_stride) % F.lit(N_SUPPLIER) + 1
).cast(IntegerType())

_ps_comment = F.concat(
    F.lit("PS comment "),
    F.col("ps_partkey").cast("string"),
    F.lit("-"),
    F.col("_idx").cast("string"),
)

df_partsupp = (
    spark.range(1, N_PART + 1)
    .toDF("ps_partkey")
    .withColumn("_idx", F.explode(_ps_offsets))
    .withColumn("ps_suppkey", _ps_suppkey)
    .withColumn("ps_availqty", rand_int(301, 1, 10000))
    .withColumn("ps_supplycost", rand_double(303, 1.0, 1000.0))
    .withColumn("ps_comment", _ps_comment)
    # The offset is only a generation aid and is not stored.
    .drop("_idx")
)

write_table(df_partsupp, "partsupp")

## Cell 9 — CUSTOMER

In [None]:
_segments = "array('AUTOMOBILE','BUILDING','FURNITURE','HOUSEHOLD','MACHINERY')"

# CUSTOMER: one row per key in 1..N_CUSTOMER; the remaining attributes are
# seeded random draws or strings derived from the key.
_c_key_text = F.col("c_custkey").cast("string")
_c_dash = F.lit("-")

_c_address = F.concat(
    F.lit("CustAddr-"),
    (F.rand(401) * 99999).cast(IntegerType()).cast("string"),
)

# Phone layout: <c_nationkey + 10>-<3 digits>-<3 digits>-<4 digits>, zero-padded.
_c_phone = F.concat(
    (F.col("c_nationkey") + 10).cast("string"),
    _c_dash,
    F.lpad((F.rand(407) * 999).cast(IntegerType()).cast("string"), 3, "0"),
    _c_dash,
    F.lpad((F.rand(409) * 999).cast(IntegerType()).cast("string"), 3, "0"),
    _c_dash,
    F.lpad((F.rand(411) * 9999).cast(IntegerType()).cast("string"), 4, "0"),
)

df_customer = (
    spark.range(1, N_CUSTOMER + 1)
    .toDF("c_custkey")
    .withColumn("c_name", F.concat(F.lit("Customer#"), F.lpad(_c_key_text, 9, "0")))
    .withColumn("c_address", _c_address)
    .withColumn("c_nationkey", rand_int(403, 0, 25))
    # Must come after c_nationkey: the phone prefix reads that column.
    .withColumn("c_phone", _c_phone)
    .withColumn("c_acctbal", rand_double(421, -999.99, 9999.99))
    .withColumn("c_mktsegment", pick_one(431, _segments))
    .withColumn("c_comment", F.concat(F.lit("Customer comment "), _c_key_text))
)

write_table(df_customer, "customer")

## Cell 10 — ORDERS

In [None]:
_priorities = "array('1-URGENT','2-HIGH','3-MEDIUM','4-NOT SPECIFIED','5-LOW')"

# TPC-H order dates span 1992-01-01 to 1998-08-02 inclusive.
# 1998-08-02 is day offset 2405 from the start date, so offsets 0..2405 are valid.
_order_start = F.to_date(F.lit("1992-01-01"))
_order_days  = 2405

df_orders = (
    spark.range(1, N_ORDERS + 1)
    .withColumnRenamed("id", "o_orderkey")
    # Customer key drawn from 1..N_CUSTOMER.
    .withColumn("o_custkey",       (F.rand(501) * N_CUSTOMER).cast(IntegerType()) + 1)
    # Same helper as the other categorical columns (previously an inline expression).
    .withColumn("o_orderstatus",   pick_one(503, "array('O','F','P')"))
    .withColumn("o_totalprice",    rand_double(505, 900.0, 550000.0))
    # rand_int's upper bound is exclusive; "+ 1" makes the final day
    # (offset _order_days = 1998-08-02) reachable.
    .withColumn("o_orderdate",     F.date_add(_order_start, rand_int(507, 0, _order_days + 1)))
    .withColumn("o_orderpriority", pick_one(509, _priorities))
    # Clerk numbers run 1 .. 1000 * SF, so Clerk#000000000 is never produced
    # and the highest clerk number can occur.
    .withColumn("o_clerk",         F.concat(F.lit("Clerk#"), F.lpad(rand_int(511, 1, 1000 * SF + 1).cast("string"), 9, "0")))
    .withColumn("o_shippriority",  F.lit(0))
    .withColumn("o_comment",       F.concat(F.lit("Order comment "), F.col("o_orderkey").cast("string")))
)

write_table(df_orders, "orders")

## Cell 11 — LINEITEM (largest table, ~4× ORDERS)

In [None]:
# Each order gets between 1 and 7 line items (avg ~4).
# We generate by exploding a random line-count per order.

_shipmodes  = "array('REG AIR','AIR','RAIL','SHIP','TRUCK','MAIL','FOB')"
_instructs  = "array('DELIVER IN PERSON','COLLECT COD','NONE','TAKE BACK RETURN')"
_rflag      = "array('R','A','N')"
_lstatus    = "array('O','F')"

# Start from orders to inherit o_orderkey and o_orderdate
df_lineitem = (
    spark.table(f"{full_schema}.orders")
    .select("o_orderkey", "o_orderdate")
    # Random number of line items per order (1 to 7)
    .withColumn("_num_lines", rand_int(601, 1, 8))
    .withColumn("l_linenumber", F.explode(F.sequence(F.lit(1), F.col("_num_lines"))))
    .drop("_num_lines")
    .withColumnRenamed("o_orderkey", "l_orderkey")
    # Foreign keys.
    .withColumn("l_partkey",       (F.rand(603) * N_PART).cast(IntegerType()) + 1)
    # (l_partkey, l_suppkey) has to exist in PARTSUPP, otherwise joins on both
    # keys find almost no rows. Pick one of the part's four suppliers using
    # the same key formula the PARTSUPP cell applies to its offsets 0..3.
    .withColumn("_ps_idx",         rand_int(605, 0, 4))
    .withColumn("l_suppkey",       (
                                       (F.col("l_partkey").cast(LongType()) + F.col("_ps_idx") * (N_PART // 4 + 1))
                                       % F.lit(N_SUPPLIER) + 1
                                   ).cast(IntegerType()))
    .drop("_ps_idx")
    # Quantities and pricing
    .withColumn("l_quantity",      rand_int(607, 1, 51).cast(DoubleType()))
    .withColumn("l_extendedprice", F.round(F.col("l_quantity") * rand_double(609, 1.0, 1000.0), 2))
    .withColumn("l_discount",      F.round(F.rand(611) * 0.10, 2))
    .withColumn("l_tax",           F.round(F.rand(613) * 0.08, 2))
    # Status flags
    .withColumn("l_returnflag",    pick_one(615, _rflag))
    .withColumn("l_linestatus",    pick_one(617, _lstatus))
    # Dates: shipdate 1-121 days after orderdate, commitdate 30-90, receiptdate 1-30 after ship
    .withColumn("l_shipdate",      F.date_add(F.col("o_orderdate"), rand_int(619, 1, 122)))
    .withColumn("l_commitdate",    F.date_add(F.col("o_orderdate"), rand_int(621, 30, 91)))
    .withColumn("l_receiptdate",   F.date_add(F.col("l_shipdate"),  rand_int(623, 1, 31)))
    .withColumn("l_shipinstruct",  pick_one(625, _instructs))
    .withColumn("l_shipmode",      pick_one(627, _shipmodes))
    .withColumn("l_comment",       F.concat(F.lit("LI comment "), F.col("l_orderkey").cast("string"), F.lit("-"), F.col("l_linenumber").cast("string")))
    # The order date was only needed to derive the line dates.
    .drop("o_orderdate")
)

write_table(df_lineitem, "lineitem")

## Cell 12 — Verify Row Counts

In [None]:
tables = ["region", "nation", "supplier", "part", "partsupp", "customer", "orders", "lineitem"]

# Two-column report: table name left-aligned, row count right-aligned with
# thousands separators.
print("Table".ljust(12), "Row Count".rjust(15))
print(30 * "-")
for table_name in tables:
    row_count = spark.table(f"{full_schema}.{table_name}").count()
    print(f"{table_name:<12} {row_count:>15,}")

## Cell 13 — Quick Preview: LINEITEM sample

In [None]:
# Show the first ten stored LINEITEM rows.
_lineitem_preview = spark.table(f"{full_schema}.lineitem").limit(10)
display(_lineitem_preview)

## Cell 14 — Quick Preview: ORDERS sample

In [None]:
# Show the first ten stored ORDERS rows.
_orders_preview = spark.table(f"{full_schema}.orders").limit(10)
display(_orders_preview)

## Cell 15 — Sanity Check: TPC-H Query 1 (Pricing Summary Report)

In [None]:
# Classic TPC-H Q1 — validates that data is query-ready.
# The query aggregates LINEITEM per (l_returnflag, l_linestatus) pair for all
# rows shipped on or before 1998-12-01 minus 90 days: summed and averaged
# quantity / price / discount figures plus a row count per group.
# It is an f-string only so the table reference resolves to full_schema.
q1 = f"""
SELECT
    l_returnflag,
    l_linestatus,
    SUM(l_quantity)                                       AS sum_qty,
    SUM(l_extendedprice)                                  AS sum_base_price,
    SUM(l_extendedprice * (1 - l_discount))               AS sum_disc_price,
    SUM(l_extendedprice * (1 - l_discount) * (1 + l_tax)) AS sum_charge,
    AVG(l_quantity)                                       AS avg_qty,
    AVG(l_extendedprice)                                  AS avg_price,
    AVG(l_discount)                                       AS avg_disc,
    COUNT(*)                                              AS count_order
FROM {full_schema}.lineitem
WHERE l_shipdate <= DATE '1998-12-01' - INTERVAL 90 DAY
GROUP BY l_returnflag, l_linestatus
ORDER BY l_returnflag, l_linestatus
"""

# Run the query and hand the result to the notebook's display() helper.
display(spark.sql(q1))

---
All 8 TPC-H tables are now in `<catalog>.tpch` as managed Delta tables.

To scale up, change `SCALE_FACTOR` in Cell 1 and re-run all cells.

Continue with `01_bronze_layer.ipynb`.