In [0]:
# Stage the sample files from the shared workspace folder into the DBFS
# directory that the streaming readers watch.
workspace_dir = "file:/Workspace/Shared"
landing_dir = "dbfs:/FileStore/streaming/input"

dbutils.fs.cp(f"{workspace_dir}/sales_data_3.csv", f"{landing_dir}/sales_data_3.csv")
# Kept as the cell's last bare expression so its return value is still displayed.
dbutils.fs.cp(f"{workspace_dir}/customer_data_2.json", f"{landing_dir}/customer_data_2.json")

True

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Get (or reuse) the SparkSession that drives both streaming reads.
spark = SparkSession.builder \
    .appName("StructuredStreamingExample") \
    .getOrCreate()

# The CSV and JSON sample files are staged in the SAME directory. A file
# stream source picks up every file under its path, so each reader below is
# restricted to its own extension via `pathGlobFilter`; without it the CSV
# reader would also parse the JSON file (and vice versa) and emit mis-parsed rows.
input_path = "dbfs:/FileStore/streaming/input/"

# Define the schema for the CSV data
sales_schema = "OrderID INT, OrderDate STRING, CustomerID STRING, Product STRING, Quantity INT, Price DOUBLE"

# Read streaming data from CSV files only
df_sales_stream = spark.readStream \
    .format("csv") \
    .option("header", "true") \
    .option("pathGlobFilter", "*.csv") \
    .schema(sales_schema) \
    .load(input_path)

# Define the schema for the JSON data
customer_schema = "CustomerID STRING, CustomerName STRING, Region STRING, SignupDate STRING"

# Read streaming data from JSON files only
df_customers_stream = spark.readStream \
    .format("json") \
    .option("pathGlobFilter", "*.json") \
    .schema(customer_schema) \
    .load(input_path)

# Show the customer stream's schema as a quick sanity check.
df_customers_stream.printSchema()

root
 |-- CustomerID: string (nullable = true)
 |-- CustomerName: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- SignupDate: string (nullable = true)



In [0]:
from pyspark.sql.functions import current_date, datediff, to_timestamp

# Shape the sales stream: parse OrderDate into a TIMESTAMP, derive the
# per-order TotalAmount (Quantity * Price), and keep only the columns listed
# in the final select (CustomerID is not carried forward).
order_ts = to_timestamp(col("OrderDate"), "yyyy-MM-dd HH:mm:ss")
line_total = col("Quantity") * col("Price")

df_sales_transformed = (
    df_sales_stream
    .withColumn("OrderDate", order_ts)      # replaces the STRING column in place
    .withColumn("TotalAmount", line_total)
    .select("OrderID", "OrderDate", "Product", "Quantity", "Price", "TotalAmount")
)

print("Applied transformations on sales data...")




Applied transformations on sales data...


In [0]:
# Sum TotalAmount per product on the watermarked sales stream; the resulting
# column is named "sum(TotalAmount)".
# NOTE(review): the grouping key has no event-time column or window, so the
# 10-day watermark presumably does not bound the aggregation state -- confirm
# whether a window on OrderDate was intended.
sales_with_watermark = df_sales_transformed.withWatermark("OrderDate", "10 days")
df_sales_aggregated = sales_with_watermark.groupBy("Product").sum("TotalAmount")

print("Aggregated sales data by product...")

Aggregated sales data by product...


In [0]:
# Enrich the customer stream with YearsSinceSignup: the number of days between
# today and SignupDate, divided by 365 (so the value is an approximation that
# ignores leap years).
signup_ts = to_timestamp(col("SignupDate"), "yyyy-MM-dd")
days_since_signup = datediff(current_date(), signup_ts).cast("int")

df_customers_transformed = df_customers_stream.withColumn(
    "YearsSinceSignup", days_since_signup / 365
)

print("Applied transformations on customer data...")

Applied transformations on customer data...


In [0]:
# Debug sink: stream the per-product totals to the console in "update" output
# mode. The writer is configured first, then started to obtain the query handle.
sales_console_writer = (
    df_sales_aggregated.writeStream
    .outputMode("update")
    .format("console")
)
sales_query = sales_console_writer.start()

print("Started streaming query to write aggregated sales data to console...")



Started streaming query to write aggregated sales data to console...


In [0]:
# Debug sink: stream the enriched customer rows to the console in "append"
# output mode and keep the query handle in `customers_query`.
customers_query = (
    df_customers_transformed.writeStream
    .outputMode("append")
    .format("console")
    .start()
)

print("Started streaming query to write transformed customer data to console...")

Started streaming query to write transformed customer data to console...
