In [None]:
#author: Adrian J
import os
from datetime import datetime

import pytz
from delta import *
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import TimestampType, DateType, IntegerType, StringType
from pyspark.sql import Row

In [None]:
#get or create spark delta session
# Cassandra connection settings are taken from the environment when present so
# that credentials do not have to live in the notebook. The fallbacks are the
# values that used to be hard-coded here, so an unconfigured local run behaves
# exactly as before.
cassandra_host = os.environ.get("CASSANDRA_HOST", "dockertests-cassandra-1")
cassandra_username = os.environ.get("CASSANDRA_USERNAME", "cassandra")
cassandra_password = os.environ.get("CASSANDRA_PASSWORD", "cassandra")

builder = (
    SparkSession
    .builder
    .master("local")
    .appName("p0_stream_triggers")
    .config("spark.cassandra.connection.host", cassandra_host)
    .config("spark.cassandra.auth.username", cassandra_username)
    .config("spark.cassandra.auth.password", cassandra_password)
)
# configure_spark_with_delta_pip is presumably provided by `from delta import *`;
# it wraps the builder before the session is created or fetched.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [None]:
# --- Source: delta table in the raw zone of the data lake ---
source_schema = "assignment_data"
source_table_name = "r_session_events"
source_dl_raw_path = "/home/jovyan/work/data_lake/raw/{}/{}/".format(
    source_schema, source_table_name
)

# --- Sink: delta table in the use-case (uc) zone ---
sink_dl_schema = "uc_assignment"
sink_table_name = "uc_delta_session_events"
sink_dl_uc_path = "/home/jovyan/work/data_lake/use_case/{}/{}".format(
    sink_dl_schema, sink_table_name
)

# Timestamps in the data lake are assumed to be in Finnish local time.
FinlandTimeZone = pytz.timezone("Europe/Helsinki")

# Column names selected from the source table.
source_select_columns = [
    "country",
    "player_id",
    "session_id",
    "ts",
]

In [None]:
#READ START events and WRITE to cassandra
def start_events_write_to_cassandra(microbatch_input, epoch_id):
    """Append one micro-batch of "start" events to Cassandra.

    Sink callback for ``foreachBatch``. Stamps every row of the batch with the
    current Europe/Helsinki wall-clock time (``cs_insert_ts``) and appends the
    batch to table ``api_start_session_by_hour`` in keyspace ``session_events``.

    Args:
        microbatch_input: DataFrame holding the rows of the current micro-batch.
        epoch_id: Micro-batch id supplied by ``foreachBatch``; not used.
    """
    # The insert time must be evaluated here, once per batch. Building it with
    # datetime.now() inside the streaming select evaluates it a single time,
    # when the query is defined, so every later batch would carry the time the
    # stream was started instead of the time the rows were written.
    insert_ts = datetime.now(FinlandTimeZone).strftime('%Y-%m-%d %H:%M:%S')
    # cs_insert_ts is now appended as the last DataFrame column rather than the
    # third. NOTE(review): the Cassandra connector is expected to match
    # DataFrame columns to table columns by name, so position should not matter.
    (microbatch_input
     .withColumn("cs_insert_ts", F.lit(insert_ts).cast(TimestampType()))
     .write
     .format("org.apache.spark.sql.cassandra")
     .option("keyspace", "session_events")
     .option("table", "api_start_session_by_hour")
     .mode("append")
     .save())

#START WRITE to cassandra
# Streams the raw delta table, keeps only "start" events and hands each
# micro-batch (triggered every 30 seconds) to the callback above.
# NOTE(review): no checkpointLocation is configured; a restarted query will
# presumably re-read the source from the beginning - confirm this is intended.
strm_start_events_append_cassandra = (
    spark
    .readStream
    .format("delta")
    .load(source_dl_raw_path)
    .where(F.col("event")=="start")
    .select(F.col("ts").cast(DateType()).cast(StringType()).alias("event_date"),
            F.col("country"),
            F.col("player_id"),
            F.col("session_id"),
            F.col("ts"))
    .writeStream
    .foreachBatch(start_events_write_to_cassandra)
    .outputMode("append")
    .trigger(processingTime="30 seconds")
    .start()
)

In [None]:
#READ END events and WRITE to cassandra
def end_events_write_to_cassandra(microbatch_input, epoch_id):
    """Append one micro-batch of "end" events to Cassandra.

    Sink callback for ``foreachBatch``. Stamps every row of the batch with the
    current Europe/Helsinki wall-clock time (``cs_insert_ts``) and appends the
    batch to table ``api_completed_sessions`` in keyspace ``session_events``.

    Args:
        microbatch_input: DataFrame holding the rows of the current micro-batch.
        epoch_id: Micro-batch id supplied by ``foreachBatch``; not used.
    """
    # The insert time must be evaluated here, once per batch. Building it with
    # datetime.now() inside the streaming select evaluates it a single time,
    # when the query is defined, so every later batch would carry the time the
    # stream was started instead of the time the rows were written.
    insert_ts = datetime.now(FinlandTimeZone).strftime('%Y-%m-%d %H:%M:%S')
    # cs_insert_ts is now appended as the last DataFrame column rather than the
    # third. NOTE(review): the Cassandra connector is expected to match
    # DataFrame columns to table columns by name, so position should not matter.
    (microbatch_input
     .withColumn("cs_insert_ts", F.lit(insert_ts).cast(TimestampType()))
     .write
     .format("org.apache.spark.sql.cassandra")
     .option("keyspace", "session_events")
     .option("table", "api_completed_sessions")
     .mode("append")
     .save())

#START WRITE to cassandra
# Streams the raw delta table, keeps only "end" events and hands each
# micro-batch (triggered every 30 seconds) to the callback above.
# The handle gets its own name: it used to be assigned to
# strm_start_events_append_cassandra, which discarded the reference to the
# start-events query.
# NOTE(review): no checkpointLocation is configured; a restarted query will
# presumably re-read the source from the beginning - confirm this is intended.
strm_end_events_append_cassandra = (
    spark
    .readStream
    .format("delta")
    .load(source_dl_raw_path)
    .where(F.col("event")=="end")
    .select(F.col("ts").cast(DateType()).cast(StringType()).alias("event_date"),
            F.col("country"),
            F.col("player_id"),
            F.col("session_id"),
            F.col("ts"))
    .writeStream
    .foreachBatch(end_events_write_to_cassandra)
    .outputMode("append")
    .trigger(processingTime="30 seconds")
    .start()
)

In [None]:
# Status message emitted once the cells above have run.
# NOTE(review): .start() presumably returns without waiting for data, so this
# confirms the streaming queries were launched, not that rows reached Cassandra.
print("INFO: listener to write into cassandra initiated")