In [0]:
%pip install faker 

In [0]:
from faker import Faker
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from pyspark.sql import DataFrame
import pyspark.sql.functions as F
import random 

# Single Faker instance shared by every mock-data generator in this notebook.
faker = Faker()
# Fixed seed so Faker output is repeatable. Note that Python's `random` module,
# which the later cells also use, is not seeded anywhere in the visible cells.
Faker.seed(42)


# Each UDF below returns a fresh random value on every call. Spark treats UDFs
# as deterministic by default and may then eliminate or repeat invocations
# during optimization, so every one is explicitly flagged as nondeterministic.

@F.udf(StringType())
def fake_country():
    """Return a random ISO 3166-1 alpha-2 country code."""
    return faker.country_code(representation="alpha-2")

fake_country = fake_country.asNondeterministic()


@F.udf(StringType())
def fake_state():
    """Return a random state abbreviation from Faker's default locale."""
    return faker.state_abbr()

fake_state = fake_state.asNondeterministic()


@F.udf(StringType())
def fake_city():
    """Return a random city name."""
    return faker.city()

fake_city = fake_city.asNondeterministic()


@F.udf(StringType())
def fake_postal():
    """Return a random postal code."""
    return faker.postcode()

fake_postal = fake_postal.asNondeterministic()


@F.udf(StringType())
def fake_street():
    """Return a random street address."""
    return faker.street_address()

fake_street = fake_street.asNondeterministic()

def sample_n_rows(df: DataFrame, n: int, seed: int = None) -> DataFrame:
    """
    Returns a DataFrame containing (at most) n randomly chosen rows.

    The rows are sorted by a random key and the first ``n`` are kept. The key
    is used only as a sort expression and is never added as a column, so a
    source column that happens to be called ``rand`` is left untouched.

    Without a seed the random key is regenerated each time the returned
    DataFrame is evaluated, so separate actions may see different rows.

    :param df: Source Spark DataFrame
    :param n: Number of random rows to sample
    :param seed: Optional random seed for reproducibility
    :return: Sampled DataFrame with the same columns as ``df``
    """
    # F.rand treats seed=None as "no seed", so one code path covers both cases.
    return df.orderBy(F.rand(seed)).limit(n)


Generate test data for customers

In [0]:
# Load the existing customers file (header row supplies the column names).
df = spark.read.csv("s3://de-40-training-raw/final_exam_data/customers/customers.csv", header=True)

# Draw one seed per run and reuse it for the sample. An unseeded sample is
# recomputed for every action, which would let display() below show different
# rows from the ones that are finally written.
sample_seed = random.randrange(2**31)
sampled_df = sample_n_rows(df, 10, seed=sample_seed)  # Take 10 random rows

sampled_df.display()

# Keep the identifying/customer columns of the sampled rows and replace the
# address columns with generated values (country is fixed to 'US').
mocked_df = sampled_df.select(
    'CUSTOMER_ID',
    'CUST_FIRST_NAME',
    'CUST_LAST_NAME',
    F.lit('US').alias('CUST_ADDRESS.COUNTRY_ID'),
    fake_state().alias('CUST_ADDRESS.STATE_PROVINCE'),
    fake_city().alias('CUST_ADDRESS.CITY'),
    fake_postal().alias('CUST_ADDRESS.POSTAL_CODE'),
    fake_street().alias('CUST_ADDRESS.STREET_ADDRESS'),
    'PHONE_NUMBER',
    'CUST_EMAIL',
    'ACCOUNT_MGR_ID',
    'DATE_OF_BIRTH',
    'MARITAL_STATUS',
    'GENDER',
)

# Append the mocked rows as a single CSV part file next to the source file.
# NOTE(review): the mocked rows reuse existing CUSTOMER_ID values - confirm
# that duplicate ids are intended for this test data.
mocked_df.coalesce(1).write.mode('append').format('csv').option("header", True).save("s3://de-40-training-raw/final_exam_data/customers/")

Generate test data for products

In [0]:
product_root_path = "s3://de-40-training-raw/final_exam_data/products/"
df = spark.read.csv(product_root_path + "products.csv", header=True)

# Value pools the mocked CATEGORY_NAME / PRODUCT_STATUS columns are drawn from.
category_list = [
    "hardware1",
    "hardware3",
    "hardware4",
    "hardware2",
]
product_status_list = [
    "obsolete",
    "planned",
    "under development",
    "orderable",
]


# The UDFs below are random, so each is flagged as nondeterministic. This
# matters most for fake_price, which is used twice in the same select: Spark
# is allowed to collapse identical deterministic UDF calls into one, which
# would give LIST_PRICE and MIN_PRICE the same value on every row.

@F.udf(StringType())
def fake_price():
    """Return a random integer price between 1 and 999 as a string."""
    # The UDF is declared as StringType, so return an actual str instead of
    # relying on Spark to stringify the int.
    return str(faker.random_int(min=1, max=999))

fake_price = fake_price.asNondeterministic()


@F.udf(returnType=StringType())
def rand_category():
    """Return a random entry of category_list."""
    return random.choice(category_list)

rand_category = rand_category.asNondeterministic()


@F.udf(returnType=StringType())
def rand_product_status():
    """Return a random entry of product_status_list."""
    return random.choice(product_status_list)

rand_product_status = rand_product_status.asNondeterministic()


sampled_df = sample_n_rows(df, 10)  # Take 10 random rows

# Keep id, name and weight class of the sampled products; randomize the rest.
# NOTE(review): the two prices are drawn independently, so MIN_PRICE can be
# larger than LIST_PRICE - confirm that is acceptable for the test data.
mocked_df = sampled_df.select(
    'PRODUCT_ID',
    'PRODUCT_NAME',
    rand_category().alias('CATEGORY_NAME'),
    'WEIGHT_CLASS',
    rand_product_status().alias('PRODUCT_STATUS'),
    fake_price().alias('LIST_PRICE'),
    fake_price().alias('MIN_PRICE'),
)
mocked_df.coalesce(1).write.mode('append').format('csv').option("header", True).save(product_root_path)

Generate test data for orders & order items

In [0]:
# How many mocked orders (each with one order item) this cell generates.
number_of_orders = 10


def fake_formatted_date():
    """Pick a random date within the last three years, formatted as upper-case DD-MON-YY."""
    picked_day = faker.date_between(start_date='-3y', end_date='today')
    formatted = picked_day.strftime("%d-%b-%y")
    return formatted.upper()  # e.g., '16-AUG-07'


# Register for SQL use
spark.udf.register("fake_formatted_date", fake_formatted_date)


order_modes = ["direct", "online"]
order_root = "s3://de-40-training-raw/final_exam_data/orders/"
order_item_root = "s3://de-40-training-raw/final_exam_data/order_items/"

# Highest ORDER_ID across every CSV file under order_root (including files
# appended by earlier runs). The CSV is read without schema inference, so
# order_id is a string column: cast it to a number before aggregating,
# otherwise max() compares lexicographically (e.g. "999" > "1000").
max_order_row = (
    spark.read.format('csv').option("header", True).load(order_root)
    .agg(F.max(F.col("order_id").cast("long")).alias("max_order_id"))
    .first()
)
# An empty folder (or no parsable ids) yields NULL; start numbering from 0 then.
max_order = int(max_order_row["max_order_id"] or 0)

# Ids that the generated orders / order items are allowed to reference.
product_id_list = [3057,1791,2415,3399,3072,2252,3060,3354,3054,2253,3350,1782,2359,1792,1772,3355,2459,3353,3061,2453,3069,2243,2257,2410,2245,2395,3334,2254,2236,1755,3065,2302,1768,3234,3127,3071,3064,3400,3331,1726,2255,1743,1797,2430,2404,3073,2406,3155,2414,2382]

customer_id_list = [169,272,155,154,101,162,146,160,184,171,183,276,179,267,174,274,166,268,147,164,177,102,263,167,270,271,156,144,176,150,170,153,148,180,159,158,266,152,149,165,275,172,181,265,157,175,161,151,262,145]

# Read the original files only to capture their column layout; the schemas
# are reused when the mocked rows are turned into DataFrames.
orders_df = spark.read.format('csv').option("header", True).load(order_root + "orders.csv")
orders_df_schema = orders_df.schema

order_items_df = spark.read.format('csv').option("header", True).load(order_item_root + "order_items.csv")
order_items_df_schema = order_items_df.schema

# Build `number_of_orders` mocked orders on the driver, each with exactly one
# order item, then append both sets to their S3 folders.
res_orders = []
res_order_items = []
for i in range(number_of_orders):
    # Start one past the current maximum; `max_order + i` would reuse the
    # existing highest ORDER_ID for the first generated order.
    order_id = max_order + 1 + i
    order_date = fake_formatted_date()
    order_mode = random.choice(order_modes)
    customer_id = random.choice(customer_id_list)
    order_status = 0
    # NOTE(review): order_total is random and unrelated to
    # unit_price * quantity of the single item below - confirm this is intended.
    order_total = faker.pyfloat(left_digits=3, right_digits=2, positive=True)
    sales_rep_id = -1
    promotion_id = '-'
    line_item_id = 1
    product_id = random.choice(product_id_list)
    unit_price = faker.pyfloat(left_digits=2, right_digits=2, positive=True)
    quantity = faker.random_int(min=1, max=9)

    # Tuple order must match the column order of the corresponding CSV schema.
    res_orders.append(
        (order_id, order_date, order_mode, customer_id, order_status, order_total, sales_rep_id, promotion_id)
    )
    res_order_items.append(
        (order_id, line_item_id, product_id, unit_price, quantity)
    )

# The schemas come from header-only CSV reads (no schema inference), so the
# numeric values above end up as strings in the resulting DataFrames.
mocked_orders_df = spark.createDataFrame(res_orders, schema=orders_df_schema)
mocked_orders_df.display()

mocked_order_items_df = spark.createDataFrame(res_order_items, schema=order_items_df_schema)
mocked_order_items_df.display()

# Append each set as a single CSV part file alongside the existing data.
mocked_orders_df.coalesce(1).write.format('csv').mode("append").option("header", True).save(order_root)
mocked_order_items_df.coalesce(1).write.format('csv').mode("append").option("header", True).save(order_item_root)
 

In [0]:
# Show the order_items schema captured from the CSV header (the same schema the mocked rows were built with).
print(order_items_df_schema)