**Setting up Hadoop and PySpark**

In [None]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-3.1.2/spark-3.1.2-bin-hadoop3.2.tgz
!tar -xvzf spark-3.1.2-bin-hadoop3.2.tgz
!pip install -q findspark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.2-bin-hadoop3.2"

In [3]:
from google.colab import drive
drive.mount('/content/drive')
!pip install pyspark


  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.1.2-py2.py3-none-any.whl size=212880768 sha256=f49af4ede8f989bec482892df4ec426210072cefc4970900e42080eee11740e0
  Stored in directory: /root/.cache/pip/wheels/40/1b/2c/30f43be2627857ab80062bef1527c0128f7b4070b6b2d02139
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.1.2


In [None]:
SCD Type 1 example code. Strategy: update existing records and insert new records.

**First Create a Target Table** 

`create external table dev_db.employee(emp_id int, emp_name string, email_id string, state string) stored as ORC location 'hdfs_path';`


In [31]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Create (or reuse) the Spark session used by the rest of the notebook.
spark = (
    SparkSession.builder
    .appName("SCD1_DEMO")
    .getOrCreate()
)

# On the very first run load_date is '2021-07-01'.
# In a real job it would come from the command line:
# load_date = sys.argv[1] #2021-07-01
# It is hardcoded here for testing.
load_date = '2021-07-01'
target_table = 'employee'

# Hive is not available in Colab, so instead of a 'create table' statement the
# target table is created from the first day's file: read the CSV, expose it as
# a temp view, and create the table with the view's definition if it is missing.
df = spark.read.csv(
    "/content/drive/MyDrive/Colab Notebooks/emp_data_{}.csv".format(load_date),
    header=True,
)
df.createOrReplaceTempView('target_table_view')
spark.sql(
    "create table if not exists {} like target_table_view".format(target_table)
)

# Show the resulting table schema.
spark.read.table(target_table).printSchema()


root
 |-- emp_id: string (nullable = true)
 |-- emp_name: string (nullable = true)
 |-- email_id: string (nullable = true)
 |-- state: string (nullable = true)



In [36]:
# Test-only reset: on a re-run the data already exists in Colab, and
# "insert overwrite" into the same table is not supported there, so the
# statements below replace `employee` with an empty table of the same definition.
reset_statements = (
    "drop table if exists employee_temp",
    "create table employee_temp like employee",
    "drop table if exists employee",
    "alter table employee_temp rename to employee",
)
for reset_sql in reset_statements:
  spark.sql(reset_sql)

def load_data(load_date, target_table='employee'):
  """Merge one day's employee extract into the target table (SCD Type 1).

  Reads ``emp_data_<load_date>.csv`` from the Colab Drive folder, full-outer-joins
  it with the current contents of ``target_table`` on ``emp_id``, and for each
  column keeps the incoming value when it is non-null, otherwise the existing
  value. The merged rows then replace the contents of ``target_table`` and the
  table is printed.

  Args:
    load_date: Date suffix of the CSV file to load, e.g. ``'2021-07-01'``.
    target_table: Name of the table to merge into. Defaults to ``'employee'``.

  Note:
    The merge is a per-column coalesce, so a null in the incoming file never
    overwrites an existing non-null value.
  """
  print("Performing data load for '{}'".format(load_date))

  columns = ('emp_id', 'emp_name', 'email_id', 'state')

  # step 1 -- read the current date data from file; every column gets a
  # '_new' suffix so it can sit next to the target columns after the join
  incoming_df = spark.read.csv(
      "/content/drive/MyDrive/Colab Notebooks/emp_data_{}.csv".format(load_date),
      header=True,
  ).select([col(name).alias(name + '_new') for name in columns])

  # step 2 -- read target table data (empty on the first load)
  target_df = spark.read.table(target_table)

  # step 3 -- full join target with incoming rows and, column by column,
  # prefer the incoming value when present; unmatched rows on either side
  # are kept (existing-only rows stay as they are, incoming-only rows are inserted)
  merged_df = (
      target_df
      .join(incoming_df, col('emp_id') == col('emp_id_new'), 'full')
      .select([coalesce(col(name + '_new'), col(name)).alias(name) for name in columns])
  )

  # step 4 -- write merged df to target table.
  # Overwriting the table that is being read is not supported in colab, so the
  # result is first materialized in a temp table and then copied over the target.
  temp_table = '{}_temp'.format(target_table)
  merged_df.write.mode('overwrite').saveAsTable(temp_table)
  spark.sql("insert overwrite table {} select * from {}".format(target_table, temp_table))

  print("Data loaded for '{}'".format(load_date))
  print("Target Table")
  spark.read.table(target_table).show(100)

# Run the initial load followed by the next day's incremental load, in date order.
for run_date in ('2021-07-01', '2021-07-02'):
  load_data(run_date)

Performing data load for '2021-07-01'
+------+--------+--------+-----+
|emp_id|emp_name|email_id|state|
+------+--------+--------+-----+
+------+--------+--------+-----+

Data loaded for '2021-07-01'
Target Table
+------+--------------------+--------------------+-----+
|emp_id|            emp_name|            email_id|state|
+------+--------------------+--------------------+-----+
|     2|Shiela Altenwerth...|ialtenwerth@rolfs...|   GA|
|    20|Dr. Kurt Murazik DDS|lucacummerata@run...|   NH|
|     6|Dr. Drusilla Olso...|concepcion18@hotm...|   NE|
|     8|Mr. Maximo Bayer DDS|johnsonbelva@yaho...|   DC|
|     9|  Doctor Considine I|jamiereynolds@bar...|   NH|
|    12|        Adison Lemke|jacquelinestanton...|   WV|
|    15| Windell Cruickshank|hhodkiewicz@lemke...|   CO|
|    18|         Loni Senger|clueilwitz@muelle...|   MO|
|     1|       Denis Hagenes|leroy83@runolfsdo...|   AS|
|     7| Dr. Cade Shields MD|clevie31@hotmail.com|   CT|
|    16|Miss Michal Carte...|    elam85@gmail.