# Import Library

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import month, dayofweek, hour, minute, second, date_format,col, dayofweek, round ,to_timestamp

# Initialise Variables

In [0]:
# Path (abfss) of the Bronze Delta table that the crowding stream is read from
bronze_dir = "abfss://bronze@tflopendatalogs.dfs.core.windows.net/Crowding_JSON_Logs"

# Path of the NaPTAN codes CSV inside a Unity Catalog volume
location = '/Volumes/tfl_crowding_analysis_we/default/naptancodes/naptan.csv'

# Load the NaPTAN CSV: the first row is the header and column types are inferred
naptan_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(location)
)

# Rename "naptanID" to "naptonId" so it matches the key used when joining
# with the crowding stream (spelling kept as-is on purpose)
naptan_reNamed_df = naptan_df.withColumnRenamed("naptanID", "naptonId")

# Transformation

In [0]:
from pyspark.sql.functions import to_timestamp, col, round, date_format

# Open the Bronze Delta table as a streaming source
_bronze_reader = spark.readStream.format("delta")
crowd_df = _bronze_reader.load(bronze_dir)

# Enrich the stream with the NaPTAN reference data (join key: naptonId)
crowd_join_df = crowd_df.join(naptan_reNamed_df, on="naptonId")

# Column expressions used below
_baseline_pct = round(col("percentageOfBaseline") * 100, 2)           # x100, rounded to 2 d.p.
_weekday_name = date_format(to_timestamp(col("timeLocal")), "EEEE")   # day name as a string

# Replace percentageOfBaseline with its scaled value and add the "day" column.
# NOTE: "month" (pattern "MMMM") and "year" (pattern "yyyy") string columns can
# be derived from timeLocal in the same way; they are currently disabled.
crowd_data_metrics_df = (
    crowd_join_df
    .withColumn("percentageOfBaseline", _baseline_pct)
    .withColumn("day", _weekday_name)
)

# Write to Catalog

In [0]:
# Configure the streaming writer for the metrics DataFrame
_metrics_writer = (
    crowd_data_metrics_df.writeStream
    .format("delta")                                               # Delta sink
    .outputMode("append")                                          # only new rows are written
    .option("checkpointLocation", "/mnt/checkpoints/crowd_data")   # streaming checkpoint directory
    .trigger(processingTime="1 second")                            # micro-batch interval
)

# Start the streaming query, writing into the target table
query = _metrics_writer.toTable("tfl_crowd.crowd_metrics")