### This code snippet generates order-specific JSON files in the raw data path defined in the code


In [0]:
%python
import random
from datetime import datetime
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, TimestampType

# Define the schema for the DataFrame
# Explicit schema for raw orders, built one column at a time.
# Every column is declared nullable (third argument True).
raw_order_schema = (
    StructType()
    .add("order_id", StringType(), True)
    .add("customer_id", StringType(), True)
    .add("product_id", StringType(), True)
    .add("quantity", IntegerType(), True)
    .add("price", FloatType(), True)
    .add("order_timestamp", TimestampType(), True)
    .add("status", StringType(), True)
)

# Function to generate a single order
def generate_order(order_id_prefix="ORD"):
    """Build one randomly generated order record.

    Args:
        order_id_prefix: Text placed before the dash in the order id.

    Returns:
        A 7-tuple ``(order_id, customer_id, product_id, quantity, price,
        order_timestamp, status)`` where the ids are prefixed strings with a
        random numeric suffix, ``quantity`` is an int in 1..10, ``price`` is a
        float between 5 and 500 kept to two decimals, ``order_timestamp`` is
        the current local time and ``status`` is one of the four order states.
    """
    statuses = ["PENDING", "COMPLETED", "CANCELLED", "SHIPPED"]

    # Draw the random parts in a fixed order so seeded runs stay reproducible.
    order_number = random.randint(10000, 99999)
    customer_number = random.randint(3000, 7000)
    product_number = random.randint(450, 650)
    units = random.randint(1, 10)
    # Format to two decimals and parse back, so the price carries cents only.
    unit_price = float(format(random.uniform(5, 500), ".2f"))
    created_at = datetime.now()

    return (
        "{}-{}".format(order_id_prefix, order_number),
        "CUST-{}".format(customer_number),
        "PROD-{}".format(product_number),
        units,
        unit_price,
        created_at,
        random.choice(statuses),
    )

# First batch: one generated order per requested row.
num_orders = 50
orders_data = [generate_order() for _ in range(num_orders)]

# Turn the in-memory rows into a Spark DataFrame using the explicit order schema.
raw_orders_df = spark.createDataFrame(data=orders_data, schema=raw_order_schema)

# Destination for the raw JSON output.
# Alternative DBFS location: "dbfs:/FileStore/"
raw_data_path = "abfss://rawdata@adlsexternalfororders.dfs.core.windows.net/orders/"

# Initial load: replace whatever is already stored at the destination.
raw_orders_df.write.json(raw_data_path, mode="overwrite")

# NOTE(review): the message names "batch_1.json", but the write above targets
# raw_data_path itself -- confirm the wording against the files actually produced.
print(f"Generated {num_orders} raw orders to: {raw_data_path}batch_1.json")

# Simulate a second batch that mixes brand-new orders with updates to batch 1.
num_orders_batch2 = 50
num_updates = 10  # rows in batch 2 that reuse an order_id from batch 1
orders_data_batch2 = [
    generate_order() for _ in range(num_orders_batch2 - num_updates)
]  # the remaining rows are new orders

# Pick distinct order ids from batch 1 to update. The rows are collected to the
# driver and unpacked in plain Python, so no RDD transformation is required.
existing_order_ids = [
    row["order_id"]
    for row in raw_orders_df.select("order_id").distinct().limit(num_updates).collect()
]
for oid in existing_order_ids:
    # Re-emit the order under the same id with fresh attribute values and a
    # final status, so it reads as an update to the batch 1 record.
    orders_data_batch2.append(
        (
            oid,
            f"CUST-{random.randint(4000, 7000)}",  # may differ from batch 1; depends on your key
            f"PROD-{random.randint(550, 650)}",
            random.randint(1, 10),
            float(f"{random.uniform(5, 500):.2f}"),
            datetime.now(),
            random.choice(["COMPLETED", "SHIPPED"]),
        )
    )

raw_orders_df_batch2 = spark.createDataFrame(orders_data_batch2, schema=raw_order_schema)
# Append rather than overwrite: batch 1 lives at the same path, and overwriting
# would delete the very orders this batch is meant to update.
raw_orders_df_batch2.write.format("json").mode("append").save(raw_data_path)
print(f"Generated {num_orders_batch2} raw orders (including updates) to: {raw_data_path}batch_2.json")