In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=121c13993b5f9d06419f75204d8e32e1bfb7dd6df3f587e2ad79086fbb197eea
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


In [None]:
# Exercise 1: Creating a Complete ETL Pipeline using Delta Live Tables (DLT)
# Step 1: Ingest Raw Data from CSV Files
import dlt

@dlt.table
def raw_transactions():
    """Load the transactions CSV file as the raw, untransformed table.

    The first row of the file is read as the header and column types are
    inferred by the reader. `spark` is not defined anywhere in this notebook;
    it is assumed to be supplied by the runtime -- TODO confirm.
    """
    csv_reader = spark.read.options(header=True, inferSchema=True)
    return csv_reader.csv("/content/sample_data/transactions.csv")

# Step 2: Apply Transformations
from pyspark.sql.functions import col

@dlt.table
def transformed_transactions():
    """Derive TotalAmount (Quantity * Price) on top of the raw transactions.

    Reads the "raw_transactions" dataset through `dlt.read` and returns it
    with the TotalAmount column set; every other column is left untouched.
    """
    raw_df = dlt.read("raw_transactions")
    line_total = col("Quantity") * col("Price")
    return raw_df.withColumn("TotalAmount", line_total)
# Step 3: Write the Final Data into a Delta Table
@dlt.table
def final_transactions():
    """Expose the transformed transactions as the final table.

    No further transformation happens here: the "transformed_transactions"
    dataset is read through `dlt.read` and returned as-is.
    """
    transformed_df = dlt.read("transformed_transactions")
    return transformed_df


In [None]:
# Write DLT in Python
import dlt
from pyspark.sql.functions import col

@dlt.table
def raw_transactions():
    # Step 1: Load the raw transactions from the CSV file.
    # The first row is the header; column types are inferred by the reader.
    # NOTE(review): `spark` is not defined in this notebook -- presumably
    # provided by the runtime; confirm.
    source_path = "/content/sample_data/transactions.csv"
    return spark.read.csv(source_path, header=True, inferSchema=True)

@dlt.table
def transformed_transactions():
    # Step 2: Project the raw transaction columns and derive the total
    # transaction amount.
    passthrough_names = [
        "TransactionID",
        "TransactionDate",
        "CustomerID",
        "Product",
        "Quantity",
        "Price",
    ]
    raw_df = dlt.read("raw_transactions")
    projected = [col(name) for name in passthrough_names]
    # TotalAmount = Quantity * Price, appended as the last output column.
    projected.append((col("Quantity") * col("Price")).alias("TotalAmount"))
    return raw_df.select(*projected)


In [None]:
# SQL version of the same two-table pipeline, kept as a string literal because
# this cell is Python. The raw table is loaded with the read_files table-valued
# function: Databricks SQL has no read_csv function, and options are passed as
# named arguments using `=>` (not `=`).
'''
-- Step 1: Create Raw Transactions Table
CREATE OR REFRESH LIVE TABLE raw_transactions AS
SELECT *
FROM read_files('/content/sample_data/transactions.csv', format => 'csv', header => true);

-- Step 2: Create Transformed Transactions Table
CREATE OR REFRESH LIVE TABLE transformed_transactions AS
SELECT
    TransactionID,
    TransactionDate,
    CustomerID,
    Product,
    Quantity,
    Price,
    Quantity * Price AS TotalAmount  -- Calculate total amount
FROM
    LIVE.raw_transactions;
'''

In [None]:
# Step 4: Monitor the Pipeline
# 1. Access the DLT UI:

#   Open the Databricks workspace.

#   Click on "Workflows" in the sidebar.

#   Select "Delta Live Tables."

# 2. View the Pipeline:

#   A list of DLT pipelines will be displayed. Click on the desired pipeline to view its details.

#   The status of each table can be checked, along with any errors or warnings.

# 3. Examine Execution Details:

#   Execution history, logs, and performance metrics for each step in the pipeline can be reviewed.