In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=8829810064f65362440de41c70fb9c8a8d931b88f0369d98191285bfdf4f74f9
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


In [None]:
# Exercise 4: Implementing Incremental Load Pattern using Delta Lake
from datetime import date

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType

# A plain PySpark install does not ship the "delta" data source. Fetch the Delta Lake
# jars and register the Delta SQL extension and catalog so that format("delta") and
# DESCRIBE HISTORY can be resolved.
# The artifact below targets Spark 3.5.x / Scala 2.12 (the pyspark 3.5.2 release shown
# in the install output); adjust it if a different Spark version is installed.
# These settings only take effect when this call actually creates the session, so run
# this cell on a fresh kernel.
spark = (
    SparkSession.builder
    .appName("IncrementalLoadExample")
    .config("spark.jars.packages", "io.delta:delta-spark_2.12:3.2.0")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .getOrCreate()
)

# Define schema for transactions
schema = StructType([
    StructField("TransactionID", IntegerType(), True),
    StructField("TransactionDate", DateType(), True),
    StructField("CustomerID", StringType(), True),
    StructField("Product", StringType(), True),
    StructField("Quantity", IntegerType(), True),
    StructField("Price", IntegerType(), True)
])

# Initial data (first three days).
# DateType fields only accept datetime.date / datetime.datetime values; passing the
# dates as plain strings makes createDataFrame fail during schema verification.
initial_data = [
    (1, date(2024, 9, 1), "C001", "Laptop", 1, 1200),
    (2, date(2024, 9, 2), "C002", "Tablet", 2, 300),
    (3, date(2024, 9, 3), "C001", "Headphones", 5, 50)
]

initial_df = spark.createDataFrame(initial_data, schema)

# Write initial data to Delta table ("overwrite" resets the table on every run,
# which keeps this cell safe to re-execute)
initial_df.write.format("delta").mode("overwrite").save("/content/sample_data/delta/final_transactions")


In [None]:
# 2. Set Up Incremental Data
from datetime import date

# The TransactionDate field of `schema` is a DateType, which only accepts
# datetime.date / datetime.datetime values; plain strings are rejected by
# createDataFrame during schema verification.
incremental_data = [
    (4, date(2024, 9, 4), "C003", "Smartphone", 1, 800),
    (5, date(2024, 9, 5), "C004", "Smartwatch", 3, 200),
    (6, date(2024, 9, 6), "C005", "Keyboard", 4, 100),
    (7, date(2024, 9, 7), "C006", "Mouse", 10, 20)
]

incremental_df = spark.createDataFrame(incremental_data, schema)

# This cell only stages the incoming batch. The rows are written to the Delta table
# by the incremental-load step (section 3); appending them here as well stored every
# incremental transaction twice.


In [None]:
# 3. Implement Incremental Load
# Derive the watermark from the Delta table itself instead of hard-coding it: the
# newest TransactionDate already stored tells us which rows of the batch are new.
# This keeps the cell idempotent - re-running it, or running it after the batch has
# already been appended, adds no duplicate rows.
# latest_date is a datetime.date (or None for an empty table); str(latest_date) still
# renders as 'YYYY-MM-DD'.
latest_date = (
    spark.read.format("delta")
    .load("/content/sample_data/delta/final_transactions")
    .selectExpr("max(TransactionDate) AS latest_date")
    .first()["latest_date"]
)

if latest_date is None:
    # Empty target table: every staged transaction is new.
    new_transactions_df = incremental_df
else:
    # Keep only transactions strictly newer than what the table already holds.
    new_transactions_df = incremental_df.filter(incremental_df["TransactionDate"] > latest_date)

# Append new transactions to the Delta table
new_transactions_df.write.format("delta").mode("append").save("/content/sample_data/delta/final_transactions")


In [None]:
# 4. Monitor Incremental Load
# Pull the commit log of the Delta table (one row per write) and print it in full,
# without truncating wide columns.
history_df = spark.sql(
    "DESCRIBE HISTORY delta.`/content/sample_data/delta/final_transactions`"
)
history_df.show(truncate=False)

# Read the current snapshot of the table back and print its rows to check the load.
final_df = (
    spark.read
    .format("delta")
    .load("/content/sample_data/delta/final_transactions")
)
final_df.show()
