In [26]:
#import required modules
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession

In [27]:
from pyspark.sql.functions import col, lit, to_timestamp
from datetime import datetime

In [42]:
# Import our SCD implementation functions
from scd_component import get_open_and_closed, no_change_or_update, new_rows, deleted_rows,scd

In [29]:
# Get the existing SparkSession or create a new one; no extra builder
# configuration (app name, master, options) is set here.
spark = SparkSession.builder.getOrCreate()

In [30]:
# Execute the local "./Setup" file/notebook before continuing.
# NOTE(review): presumably this is where `courses_df` and `current` (used
# below but not defined in this notebook) come from - confirm.
%run "./Setup"

In [31]:
# Display the existing courses; this is the baseline the history dataframe
# is seeded from.
courses_df.show()

+---------+--------------------+
|course_id|        course_title|
+---------+--------------------+
|        1|    Mastering Python|
|        2|Data Engineering ...|
|        3|   Mastering Pyspark|
|        4|      AWS Essentials|
|        5|          Docker 101|
+---------+--------------------+



In [32]:
# The existing courses need to be tracked over time, so add the tracking
# columns to them and keep the result as the history dataframe.
# Every seeded row is open: valid from a fixed initial start_date until the
# far-future end_date, opened with reason "new", not closed and not deleted.
history = (
    courses_df
    .withColumn("start_date", to_timestamp(lit("2000-01-01 00:01:02"), "yyyy-MM-dd HH:mm:ss"))
    .withColumn("end_date", to_timestamp(lit("2999-12-31 00:00:00"), "yyyy-MM-dd HH:mm:ss"))
    .withColumn("open_reason", lit("new"))
    # A bare lit(None) produces a NullType (void) column. Cast it so that
    # close_reason is a nullable string, matching the string values
    # ("changed_value", "deleted") it holds once rows are closed.
    .withColumn("close_reason", lit(None).cast("string"))
    .withColumn("is_deleted", lit(0))
)
# Key column(s) passed to the SCD helpers together with history and current.
keys_list = ["course_id"]

In [33]:
# Inspect the seeded history: the course columns plus the five tracking columns.
history.show()

+---------+--------------------+-------------------+-------------------+-----------+------------+----------+
|course_id|        course_title|         start_date|           end_date|open_reason|close_reason|is_deleted|
+---------+--------------------+-------------------+-------------------+-----------+------------+----------+
|        1|    Mastering Python|2000-01-01 00:01:02|2999-12-31 00:00:00|        new|        null|         0|
|        2|Data Engineering ...|2000-01-01 00:01:02|2999-12-31 00:00:00|        new|        null|         0|
|        3|   Mastering Pyspark|2000-01-01 00:01:02|2999-12-31 00:00:00|        new|        null|         0|
|        4|      AWS Essentials|2000-01-01 00:01:02|2999-12-31 00:00:00|        new|        null|         0|
|        5|          Docker 101|2000-01-01 00:01:02|2999-12-31 00:00:00|        new|        null|         0|
+---------+--------------------+-------------------+-------------------+-----------+------------+----------+



In [34]:
# Split the history into open (active) and closed (inactive) rows.
# end_date is removed first, so the helper receives every column except end_date.
open_rows, closed_rows = get_open_and_closed(history.drop("end_date"))

In [35]:
# Display the open (active) rows; end_date is absent because it was dropped
# before the split.
open_rows.show()

+---------+--------------------+-------------------+-----------+------------+----------+
|course_id|        course_title|         start_date|open_reason|close_reason|is_deleted|
+---------+--------------------+-------------------+-----------+------------+----------+
|        1|    Mastering Python|2000-01-01 00:01:02|        new|        null|         0|
|        2|Data Engineering ...|2000-01-01 00:01:02|        new|        null|         0|
|        3|   Mastering Pyspark|2000-01-01 00:01:02|        new|        null|         0|
|        4|      AWS Essentials|2000-01-01 00:01:02|        new|        null|         0|
|        5|          Docker 101|2000-01-01 00:01:02|        new|        null|         0|
+---------+--------------------+-------------------+-----------+------------+----------+



In [10]:
# Display the closed (inactive) rows; the recorded output below is empty.
closed_rows.show()

+---------+------------+----------+-----------+------------+----------+
|course_id|course_title|start_date|open_reason|close_reason|is_deleted|
+---------+------------+----------+-----------+------------+----------+
+---------+------------+----------+-----------+------------+----------+



In [36]:
# Display the latest courses, i.e. the `current` dataframe that is passed to
# the SCD helpers alongside the history.
current.show()

+---------+--------------------+
|course_id|        course_title|
+---------+--------------------+
|        1|    Mastering Python|
|        2|Data Engineering ...|
|        3|   Mastering Pyspark|
|        5|          Docker 102|
|        6|              DP 100|
+---------+--------------------+



In [37]:
# Courses that are still present in `current`: either unchanged or updated.
# Deleted records are not part of this result.
merged = no_change_or_update(
    history_open=open_rows,
    current=current,
    keys_list=keys_list,
)

In [38]:
# Display the unchanged and updated rows computed above.
merged.show()



+---------+--------------------+-------------------+-------------+-------------+----------+
|course_id|        course_title|         start_date|  open_reason| close_reason|is_deleted|
+---------+--------------------+-------------------+-------------+-------------+----------+
|        3|   Mastering Pyspark|2000-01-01 00:01:02|          new|         null|         0|
|        1|    Mastering Python|2000-01-01 00:01:02|          new|         null|         0|
|        2|Data Engineering ...|2000-01-01 00:01:02|          new|         null|         0|
|        5|          Docker 102|2022-09-11 10:01:48|changed_value|         null|         0|
|        5|          Docker 101|2000-01-01 00:01:02|          new|changed_value|         0|
+---------+--------------------+-------------------+-------------+-------------+----------+



                                                                                

In [39]:
# Rows for courses that are in the open history but no longer in `current`.
deleted_rows_df = deleted_rows(
    history_open=open_rows,
    current=current,
    keys_list=keys_list,
)

In [40]:
# Display the rows produced for deleted courses.
deleted_rows_df.show()



+---------+--------------+-------------------+-----------+------------+----------+-------------------+
|course_id|  course_title|         start_date|open_reason|close_reason|is_deleted|           end_date|
+---------+--------------+-------------------+-----------+------------+----------+-------------------+
|        4|AWS Essentials|2000-01-01 00:01:02|        new|     deleted|         0|2022-09-11 10:01:48|
|        4|AWS Essentials|2022-09-11 10:01:48|    deleted|        null|         1|2999-12-31 00:00:00|
+---------+--------------+-------------------+-----------+------------+----------+-------------------+



                                                                                

In [41]:
# Get the list of new records (courses in `current` that the history does not
# have yet) and display them.
# NOTE(review): unlike the no_change_or_update / deleted_rows calls, this
# passes the full `history` (which still has end_date) as history_open rather
# than `open_rows` - confirm this is intentional.
new_rows_df = new_rows(history_open=history, current=current, keys_list=keys_list)
new_rows_df.show()



+---------+------------+-------------------+-----------+------------+----------+-------------------+
|course_id|course_title|           end_date|open_reason|close_reason|is_deleted|         start_date|
+---------+------------+-------------------+-----------+------------+----------+-------------------+
|        6|      DP 100|2999-12-31 00:00:00|        new|        null|         0|2022-09-11 10:01:48|
+---------+------------+-------------------+-----------+------------+----------+-------------------+



                                                                                

In [45]:
# Run the scd helper on the full history and the current courses, then
# display the result ordered by course_id.
# NOTE(review): presumably scd combines the individual steps shown above
# (unchanged/updated, deleted, new) - confirm against scd_component.
scd(history=history, current=current, keys_list=keys_list).sort("course_id").show()



+---------+--------------------+-------------------+-------------------+-------------+-------------+----------+
|course_id|        course_title|         start_date|           end_date|  open_reason| close_reason|is_deleted|
+---------+--------------------+-------------------+-------------------+-------------+-------------+----------+
|        1|    Mastering Python|2000-01-01 00:01:02|2999-12-31 00:00:00|          new|         null|         0|
|        2|Data Engineering ...|2000-01-01 00:01:02|2999-12-31 00:00:00|          new|         null|         0|
|        3|   Mastering Pyspark|2000-01-01 00:01:02|2999-12-31 00:00:00|          new|         null|         0|
|        4|      AWS Essentials|2022-09-11 10:01:48|2999-12-31 00:00:00|      deleted|         null|         1|
|        4|      AWS Essentials|2000-01-01 00:01:02|2022-09-11 10:01:48|          new|      deleted|         0|
|        5|          Docker 101|2000-01-01 00:01:02|2022-09-11 10:01:48|          new|changed_value|    

                                                                                