In [None]:
#Notebook to read the raw assignment file and transform into the normalized tables to be uploaded into the cassandra db
#Author: Adrian Jimenez 2022-05
#----
#To-do: add dynamic creation of target path for multiple loads
#To-do: encapsulate the spark session for a potential caller to create and close the session
import os
import pytz
from datetime import datetime
from delta import * #to-do remove the wild import from delta

from pyspark.sql import functions as F
from pyspark.sql import SparkSession 
from pyspark.sql.types import DateType, StructType, StructField, StringType, TimestampType

from etl_utils.table_schema import TableSchema
from etl_utils.general_utils import create_table_in_metastore, persist_table, upsert_into_table

# Development switch: when True, the load loop further down only processes a
# small slice of the distinct event dates instead of the full list.
debug = True

In [None]:
# Get (or create) the local Spark session for this job; the builder is passed
# through configure_spark_with_delta_pip before the session is obtained.
builder = SparkSession.builder.master("local").appName("p2_lz_to_raw")
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [None]:
# Run parameters.
# TODO: these should be supplied by the caller or read from a configuration table in SQL.
schema_root_path = '/home/jovyan/work/data_lake/raw/schemas'
table_name = 'r_session_events'

# Schema used for reading the raw file, obtained from TableSchema.load_schema_json()
# for this table name and schema root.
player_session_schema = TableSchema(table_name, schema_root_path).load_schema_json()

In [None]:
# Read the bz2-compressed JSON-lines landing file with the saved schema applied
# (an explicit schema means Spark does not have to infer one from the data).
landing_file = '/home/jovyan/work/data_lake/landingzone/assignment_data/assignment_data.jsonl.bz2'
df_assignment_data = spark.read.format('json').schema(player_session_schema).load(landing_file)

# Add the data-lake audit timestamps plus the derived partition and row-id columns.
# Assumes Europe/Helsinki is the correct zone; otherwise it can be adjusted to UTC as the standard.
FinlandTimeZone = pytz.timezone('Europe/Helsinki')

# Read the clock exactly once: two separate now() calls can fall on different
# seconds, which would make DL_INSERT_TS != DL_UPDATE_TS on a brand-new row.
# NOTE(review): the formatted string carries no UTC offset, so the cast below
# presumably interprets it in the Spark session time zone -- confirm that the
# session time zone matches the zone used here.
dl_load_ts = datetime.now(FinlandTimeZone).strftime('%Y-%m-%d %H:%M:%S')

df_assignment_data = (
    df_assignment_data
    .select(
        F.col("country"),
        F.col("event"),
        F.col("player_id"),
        F.col("session_id"),
        F.col("ts"),
        F.lit(dl_load_ts).cast(TimestampType()).alias("DL_INSERT_TS"),
        F.lit(dl_load_ts).cast(TimestampType()).alias("DL_UPDATE_TS"),
        # Calendar date of the event; used later as the partition key.
        F.col("ts").cast(DateType()).alias("EVENT_DATE"),
        # Row identifier built as "<event>-<session_id>"; used later as the upsert key.
        F.concat_ws("-", F.col("event"), F.col("session_id")).alias("UNIQUE_ID_ROW")
    )
)
print("INFO: assignment data loaded in dataframe")

In [None]:
# Work-around to mimic a stream of events: the frame is written one event date
# at a time instead of in a single batch.
inputDF = df_assignment_data
partition_key = "EVENT_DATE"
dl_insert_ts_column = "DL_INSERT_TS"
unique_id = "UNIQUE_ID_ROW"
upsert_mode = "append"
schema = 'assignment_data'
table_name = 'r_session_events'
dl_raw_path = f'/home/jovyan/work/data_lake/raw/{schema}/{table_name}/'

# Cache for faster inserts: the frame is scanned once to list the dates and
# then once more for every date that is written.
inputDF.cache()
try:
    # Distinct event dates as strings, in ascending order.
    list_events_dates = (
        inputDF
        .select(F.col("EVENT_DATE").cast(StringType()))
        .distinct()
        .sort(F.col("EVENT_DATE").asc())
        .rdd.flatMap(lambda x: x)
        .collect()
    )
    # For development purposes: only process a small slice of the dates.
    if debug:
        list_events_dates = list_events_dates[4:7]

    for d in list_events_dates:
        # One batch per event date, ordered by event timestamp.
        df_to_write = inputDF.where(F.col(partition_key) == d).sort(F.col("ts"))
        if not os.path.exists(dl_raw_path):
            # Target path does not exist yet: hand the first batch to persist_table.
            persist_table(spark, df_to_write, dl_raw_path, partition_key, table_name)
        else:
            # Target path already exists: follow the upsert pattern.
            # TODO: find the correct column for the upsert
            upsert_into_table(spark, df_to_write, table_name, schema, dl_raw_path, unique_id,  dl_insert_ts_column, upsert_mode)
finally:
    # Always release the cached frame, even when a write fails part-way through;
    # otherwise it stays pinned in the (long-lived) notebook Spark session.
    inputDF.unpersist()