In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=4db2ef1ff8e97f6c182b69c84567e886e67940d41457f2e7e0135bb1434c5d74
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


In [None]:
# Exercise 2: Delta Lake Operations - Read, Write, Update, Delete, Merge
# 1: Read Data from Delta Lake
from delta import configure_spark_with_delta_pip
from pyspark.sql import SparkSession

# Create the `spark` session used by every cell in this notebook.
# The two settings enable the Delta Lake SQL extension and catalog, and
# configure_spark_with_delta_pip adds the Delta jars matching the installed
# delta-spark package. getOrCreate() reuses a session that is already running.
spark_builder = (
    SparkSession.builder
    .appName("DeltaLakeOperations")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)
spark = configure_spark_with_delta_pip(spark_builder).getOrCreate()

# Read the transactional data from the Delta table
transactions_df = spark.read.format("delta").load("/content/sample_data/delta/final_transactions")

# Display the first 5 rows
transactions_df.show(5)

In [None]:
# 2: Write Data to Delta Lake
# Rows to add to the table; the values follow the column order given in `schema` below
new_transactions = [
    (6, "2024-09-06", "C005", "Keyboard", 4, 100),
    (7, "2024-09-07", "C006", "Mouse", 10, 20),
]

# Wrap the rows in a DataFrame, naming the columns explicitly
new_transactions_df = spark.createDataFrame(
    data=new_transactions,
    schema=[
        "TransactionID",
        "TransactionDate",
        "CustomerID",
        "Product",
        "Quantity",
        "Price",
    ],
)

# Append mode adds the rows to the existing Delta table.
# Note: every execution of this cell appends the same two rows again.
(
    new_transactions_df.write
    .format("delta")
    .mode("append")
    .save("/content/sample_data/delta/final_transactions")
)


In [None]:
# 3: Update Data in Delta Lake
# Import only the class that is used, rather than a wildcard, so the notebook
# namespace stays clean and it is clear where DeltaTable comes from.
from delta.tables import DeltaTable

transactions_path = "/content/sample_data/delta/final_transactions"

# Load the Delta table; `delta_table` is reused by the delete and merge cells below
delta_table = DeltaTable.forPath(spark, transactions_path)

# Update the Price of Product 'Laptop'.
# Both the condition and the new value are SQL expression strings,
# so "1300" is evaluated as the literal 1300.
delta_table.update(
    condition="Product = 'Laptop'",
    set={"Price": "1300"}
)

# Verify the update by re-reading the table and showing only the affected rows
transactions_df = spark.read.format("delta").load(transactions_path)
transactions_df.filter("Product = 'Laptop'").show()


In [None]:
# 4: Delete Data from Delta Lake
# Remove every transaction whose Quantity is below 3.
# `delta_table` is the DeltaTable handle created in the update cell.
delta_table.delete(condition="Quantity < 3")

# Re-read the table to confirm the matching rows are gone
transactions_df = (
    spark.read
    .format("delta")
    .load("/content/sample_data/delta/final_transactions")
)
transactions_df.show()

In [None]:
# Task 5: Merge Data into Delta Lake
# Upsert keyed on TransactionID: source rows whose TransactionID already exists
# in the table update that row, all other source rows are inserted.
merge_key = "TransactionID"
merge_columns = ["TransactionID", "TransactionDate", "CustomerID", "Product", "Quantity", "Price"]

# Create new data for merging
merge_data = [
    (1, "2024-09-01", "C001", "Laptop", 1, 1250),  # Updated Price
    (8, "2024-09-08", "C007", "Charger", 2, 30)    # New Transaction
]

# Create a DataFrame for the merge data
merge_df = spark.createDataFrame(merge_data, merge_columns)

# Derive both column mappings from the single column list so they cannot drift apart.
# Inserts copy every column from the source ("s"); updates copy everything except
# the key, which is already equal on matched rows.
insert_values = {name: f"s.{name}" for name in merge_columns}
update_values = {name: expr for name, expr in insert_values.items() if name != merge_key}

# Perform the merge operation ("t" = target Delta table, "s" = source DataFrame)
delta_table.alias("t").merge(
    merge_df.alias("s"),
    f"t.{merge_key} = s.{merge_key}"
).whenMatchedUpdate(
    set=update_values
).whenNotMatchedInsert(
    values=insert_values
).execute()

# Verify the merge, as the update and delete steps do; ordering by the key makes
# the updated and the inserted rows easy to locate in the output
transactions_df = spark.read.format("delta").load("/content/sample_data/delta/final_transactions")
transactions_df.orderBy(merge_key).show()
