Step 1
Generate Synthetic Data for Bronze Layer

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, DoubleType
import datetime

# Obtain the Spark session explicitly instead of relying on a `spark` global
# injected by the runtime. getOrCreate() returns the already-running session
# when one exists and starts a new one otherwise, so this cell also works on a
# fresh kernel outside a managed Spark environment.
spark = SparkSession.builder.getOrCreate()

# Schema for the synthetic sales records: two string identifiers, the sale
# timestamp, and the sale amount. Every field is declared nullable.
sales_schema = StructType([
    StructField("user_id", StringType(), True),
    StructField("product_id", StringType(), True),
    StructField("sale_date", TimestampType(), True),
    StructField("sale_amount", DoubleType(), True)
])

# Synthetic sales rows as (user_id, product_id, sale_date, sale_amount).
# The datetimes carry no time-of-day component, i.e. they are midnight values.
sales_data = [
    ("user1", "productA", datetime.datetime(2024, 11, 5), 25.99),
    ("user2", "productB", datetime.datetime(2024, 11, 5), 15.49),
    ("user3", "productC", datetime.datetime(2024, 11, 5), 35.00),
    ("user4", "productA", datetime.datetime(2024, 11, 5), 25.99),
    ("user1", "productD", datetime.datetime(2024, 11, 6), 45.00),
]

# Build the DataFrame with the explicit schema so column types are not inferred
sales_df = spark.createDataFrame(sales_data, schema=sales_schema)

# Save this DataFrame as the "sales" table in the Bronze layer; "overwrite"
# replaces any existing table so re-running the cell does not append duplicates
sales_df.write.mode("overwrite").format("parquet").saveAsTable("sales")


Clean and Transform Sales Data in the Silver Layer

In [0]:
from pyspark.sql.functions import to_timestamp

# Read the "sales" table that holds the Bronze-layer data
bronze_sales_df = spark.table("sales")

# Silver-layer cleaning, one step at a time:
#   1. drop rows that are duplicates across all columns
#   2. pass sale_date through to_timestamp so the column is a timestamp
deduplicated_sales_df = bronze_sales_df.dropDuplicates()
silver_sales_df = deduplicated_sales_df.withColumn(
    "sale_date",
    to_timestamp("sale_date"),
)

# Persist the cleaned rows as the Parquet table "cleaned_sales" (Silver layer),
# overwriting whatever an earlier run left behind
silver_sales_df.write.saveAsTable(
    "cleaned_sales",
    format="parquet",
    mode="overwrite",
)


 Aggregate Sales Data in the Gold Layer

In [0]:
# Import the functions module under an alias rather than
# `from pyspark.sql.functions import sum`, which would shadow Python's built-in
# sum() for every cell executed afterwards.
from pyspark.sql import functions as F

# Load the cleaned sales data from the Silver layer
silver_sales_df = spark.table("cleaned_sales")

# Calculate daily total sales.
# sale_date is a timestamp, so grouping on the raw column would produce one row
# per distinct instant rather than per day. Truncating to the day puts every
# sale from the same calendar day in one group, while the output column keeps
# its name and timestamp type (midnight values are unchanged by the truncation).
daily_sales_df = (
    silver_sales_df
    .groupBy(F.date_trunc("day", F.col("sale_date")).alias("sale_date"))
    .agg(F.sum("sale_amount").alias("total_sales"))
)

# Save the aggregated sales data to the Gold layer, replacing any previous run
daily_sales_df.write.mode("overwrite").format("parquet").saveAsTable("daily_sales")
