 # Populate the Silver layer from the Bronze topic data and clean it

In [0]:
# Silver-layer load: read the raw Bronze records, parse their JSON payload,
# flatten it into columns, drop rows without positive volume, and overwrite
# the Silver Delta table.
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StructField, StringType, DoubleType

# Field names expected inside the "json_message" payload, in output column order.
price_fields = ["symbol", "timestamp", "open", "high", "low", "close", "volume"]

# Every field is read as a nullable string; no numeric or timestamp typing
# is applied at this stage.
json_schema = StructType(
    [StructField(field_name, StringType(), True) for field_name in price_fields]
)

# Source: the Bronze Delta table registered in Unity Catalog.
df = spark.read.table("kafka.bronze.stock_data")

# Attach the parsed payload as a struct column next to the original columns.
parsed_struct = from_json(col("json_message"), json_schema)
df_parsed = df.withColumn("parsed_price_data", parsed_struct)

# Promote each struct field to a top-level column carrying the same name.
flattened_df = df_parsed.select(
    *[
        col("parsed_price_data." + field_name).alias(field_name)
        for field_name in price_fields
    ]
)

# Basic cleaning: keep only rows whose volume is greater than zero.
# NOTE(review): volume is a string column here, so this comparison relies on
# Spark's implicit cast — confirm that is intended.
cleaned_stock_data_df = flattened_df.filter("volume > 0")
display(cleaned_stock_data_df)

# Replace the contents of the Silver Delta table with the cleaned rows.
silver_writer = cleaned_stock_data_df.write.format("delta").mode("overwrite")
silver_writer.saveAsTable("kafka.silver.silver_stock_data")

symbol,timestamp,open,high,low,close,volume
MSFT,2024-10-04 19:59:00,415.88,415.9,415.815,415.815,386
MSFT,2024-10-04 19:58:00,415.86,417.27,415.85,415.88,659
MSFT,2024-10-04 19:57:00,415.87,415.95,415.845,415.8999,1562
MSFT,2024-10-04 19:56:00,415.87,417.51,415.845,415.845,103
MSFT,2024-10-04 19:55:00,415.85,415.92,415.75,415.87,93
MSFT,2024-10-04 19:54:00,415.825,415.92,415.825,415.92,42
MSFT,2024-10-04 19:53:00,415.88,415.88,415.825,415.88,43
MSFT,2024-10-04 19:52:00,415.87,415.9,415.76,415.87,118
MSFT,2024-10-04 19:51:00,415.86,415.9199,415.73,415.825,73
MSFT,2024-10-04 19:50:00,415.82,415.92,415.82,415.92,46
