In [0]:
from pyspark.sql.functions import col

# Load the raw transactions CSV into a DataFrame.
# The first line of the file is treated as a header and column types are
# inferred from the data; toDF() then renames the columns positionally to the
# field names used by the rest of this notebook.
txns_path = "dbfs:/Volumes/inceptez_catalog/inputdb/customerdata/txns"
txns_columns = [
    "txnid",
    "txndate",
    "custid",
    "amount",
    "prodcategory",
    "prodname",
    "city",
    "state",
    "paymenttype",
]
csv_reader = spark.read.format("csv").options(header="true", inferSchema="true")
txns_df = csv_reader.load(txns_path).toDF(*txns_columns)

In [0]:
# Keep only the transactions whose amount is strictly greater than 100
# (DataFrame API; where() is an alias of filter()).
amount_above_100 = col("amount") > 100
filtered_df = txns_df.where(amount_above_100)

In [0]:
# Derive a "tax" column equal to 10% of each transaction's amount.
# withColumn() replaces a column of the same name, so re-running this cell
# simply recomputes "tax" rather than adding a second copy.
tax_amount = col("amount") * 0.10
filtered_df = filtered_df.withColumn("tax", tax_amount)


In [0]:
# Make the filtered transactions queryable from Spark SQL by registering them
# as a temporary view; an existing view with the same name is replaced.
sql_view_name = "filtered_txns_view"
filtered_df.createOrReplaceTempView(sql_view_name)

In [0]:
# Aggregate the filtered transactions per state using Spark SQL.
#
# The grouping key is UPPER(state) -- the same expression that is selected as
# "State". Grouping on the raw `state` column instead would keep values that
# differ only by letter case (e.g. "Texas" / "TEXAS") in separate groups, giving
# several output rows with an identical upper-cased State label and split totals.
#
# Output columns:
#   State        - upper-cased state name (grouping key)
#   txn_count    - number of rows in the group with a non-null txnid
#   total_amount - sum of amount, rounded to 2 decimals
#   total_tax    - sum of tax, rounded to 2 decimals
#   data_dt      - current date when the query is evaluated
#   process_ts   - current timestamp when the query is evaluated
# Rows are ordered by total_amount, largest first.
agg_df = spark.sql("""
    SELECT
        UPPER(state) as State,
        COUNT(txnid) AS txn_count,
        ROUND(SUM(amount),2) AS total_amount,
        ROUND(SUM(tax),2) AS total_tax,
        current_date() as data_dt,
        current_timestamp() as process_ts
    FROM filtered_txns_view
    GROUP BY UPPER(state)
    ORDER BY total_amount DESC
""")
agg_df.display()

In [0]:
# Narrow the per-state aggregate down to states whose (rounded) total amount
# is strictly greater than 500, then render the result (DataFrame API).
high_value_states_df = agg_df.where(col("total_amount") > 500)
high_value_states_df.display()