In [0]:
# Bronze ingest: load the raw engagement CSV from the volume, drop exact
# duplicate rows, and (re)create the bronze table from the result.
bronze_csv_path = '/Volumes/workspace/breakable-toy/streamingdata/social_media_engagement_bronze.csv'
bronze_table_name = 'workspace.`breakable-toy`.social_media_streaming_engagement_bronze'

# The header row supplies the column names; column types are inferred from the data.
raw_df = (
    spark.read.format('csv')
    .options(header=True, inferSchema=True)
    .load(bronze_csv_path)
)

# `df` is reused by the later cells, so the de-duplicated frame keeps that name.
df = raw_df.dropDuplicates()

# createOrReplace() replaces any existing bronze table with this snapshot.
df.writeTo(bronze_table_name).createOrReplace()

In [0]:
# Print the column names and types of `df`. The CSV is read with inferSchema
# enabled, so this shows which types Spark actually inferred before the
# aggregation below relies on them.
df.printSchema()

In [0]:
from pyspark.sql.types import StringType, TimestampType, DateType, DoubleType
from pyspark.sql.functions import first, col, sum, to_date, date_format

# Gold build: collapse the bronze rows in `df` to one row per (post_id, platform),
# derive date/time parts and engagement metrics, cast to the gold column types,
# and append the result to the gold table.

# Namespace import for Spark SQL functions. New function names (`min`, `when`)
# are reached through `F` so they do not shadow Python builtins the way the
# bare `sum` import above does.
from pyspark.sql import functions as F

# Multiplier used to estimate reach from total engagements.
REACH_MULTIPLIER = 3.25

# `min` is used for the timestamp instead of `first`: `first` returns whichever
# row happens to come first in the group, and row order is not guaranteed after
# the groupBy shuffle, so reruns could yield different timestamps. `min` is
# deterministic and yields the same value whenever all rows of a post agree.
# NOTE(review): presumably every row of a post carries the same post_timestamp;
# if not, this now keeps the earliest one - confirm that is the intended value.
aggregated_df = df.groupBy("post_id", "platform").agg(
    F.min("post_timestamp").alias("post_timestamp"),
    F.sum("likes").alias("likes"),
    F.sum("comments").alias("comments"),
    F.sum("shares").alias("shares"),
)

# Derived columns are added AFTER aggregation so they are computed once per
# post rather than once per bronze row.
aggregated_df = (
    aggregated_df
    # Calendar date and wall-clock time split out of the post timestamp.
    .withColumn("post_date", F.to_date(F.col("post_timestamp")))
    .withColumn("post_time", F.date_format(F.col("post_timestamp"), "HH:mm:ss"))
    # Total engagements = likes + comments + shares.
    .withColumn(
        "total_engagements",
        F.col("likes") + F.col("comments") + F.col("shares"),
    )
    # Reach is estimated as a fixed multiple of total engagements.
    .withColumn("reach", F.col("total_engagements") * REACH_MULTIPLIER)
    # Engagement rate as a percentage of reach. The `when` guard (no
    # `otherwise`) yields NULL when reach is 0 instead of dividing by zero,
    # which raises DIVIDE_BY_ZERO when ANSI mode is enabled.
    # NOTE(review): because reach is total_engagements * REACH_MULTIPLIER, this
    # ratio is the same constant (100 / 3.25) for every row with non-zero
    # engagements - confirm whether reach should come from an independent source.
    .withColumn(
        "engagement_rate",
        F.when(
            F.col("reach") != 0,
            (F.col("total_engagements") / F.col("reach")) * 100,
        ),
    )
)

# Fix the column order and types of the gold table explicitly, since the
# bronze schema is inferred from the CSV.
aggregated_df = aggregated_df.select(
    F.col("post_id").cast(StringType()),
    F.col("platform").cast(StringType()),
    F.col("post_timestamp").cast(TimestampType()),
    F.col("post_date").cast(DateType()),
    F.col("post_time").cast(StringType()),
    F.col("likes").cast(DoubleType()),
    F.col("comments").cast(DoubleType()),
    F.col("shares").cast(DoubleType()),
    F.col("reach").cast(DoubleType()),
    F.col("total_engagements").cast(DoubleType()),
    F.col("engagement_rate").cast(DoubleType()),
)

# Write to Gold Table
# NOTE(review): the aggregation above is recomputed from the whole of `df`, so
# with mode("append") every rerun adds another copy of each post's row to the
# gold table. Left as "append" to preserve existing behavior - confirm whether
# "overwrite" or a MERGE on (post_id, platform) is what is wanted.
aggregated_df.write.mode("append").saveAsTable("workspace.`breakable-toy`.social_media_engagement_gold")