In [0]:

# Load the transactions CSV: the first line is treated as a header and Spark
# infers the column types. toDF() then assigns the nine column names used
# throughout the rest of the notebook, replacing the names read from the file.
txns_df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("dbfs:/Volumes/inceptez_catalog/inputdb/customerdata/txns")
    .toDF(
        "txnid",
        "txndate",
        "custid",
        "amount",
        "prodcategory",
        "prodname",
        "city",
        "state",
        "paymenttype",
    )
)

txns_df.display()

### withColumn() - Add a new column or modify an existing column.

In [0]:
from pyspark.sql.functions import col, round
# NOTE: `round` here is Spark's column function; from this cell onward it
# shadows Python's built-in round() in the notebook namespace.

# Derive a new `tax` column: 10% of `amount`, rounded to 2 decimal places.
txns1 = txns_df.withColumn(
    "tax",
    round(col("amount") * 0.10, 2),
)
txns1.display()

In [0]:
from pyspark.sql.functions import lit

# SQL equivalent of txns2:
#   select *, round(amount * 0.10, 2) as tax, 'USA' as region from txns
# lit() turns the Python string into a constant column, so every row gets
# region = "USA".
txns2 = txns1.withColumn(
    "region",
    lit("USA"),
)
txns2.printSchema()
txns2.display()

In [0]:
from pyspark.sql.functions import upper

# Passing an existing column name to withColumn() replaces that column:
# `prodcategory` is overwritten with its upper-cased value and the result is
# assigned back to txns_df.
txns_df = txns_df.withColumn(
    "prodcategory",
    upper(col("prodcategory")),
)
(
    txns_df
    .select("txnid", "prodcategory")
    .limit(5)
    .display()
)

### withColumnRenamed() - Rename an existing column.

In [0]:
# Rename column `custid` to `customer_id`. txns_df is reassigned, so the
# cells below that use txns_df must refer to `customer_id`.
txns_df = txns_df.withColumnRenamed(
    "custid",
    "customer_id",
)
(
    txns_df
    .select("txnid", "customer_id")
    .limit(5)
    .display()
)

### drop() - Remove one or more columns from a DataFrame.

In [0]:
# Remove the single column `tax` from txns2 and preview the first five rows.
txns3 = txns2.drop(
    "tax",
)
(
    txns3
    .limit(5)
    .display()
)

In [0]:
# drop() accepts several column names at once: remove both `tax` and `region`
# from txns2 (this reassigns txns3) and preview the first five rows.
txns3 = txns2.drop(
    "tax",
    "region",
)
(
    txns3
    .limit(5)
    .display()
)

### na - Handle missing/null values using drop(), fill(), or replace().

In [0]:
# Drop every row that contains a null in at least one column, then preview
# five of the remaining rows.
# The preview uses limit(5).display(), matching the other preview cells in
# this notebook, rather than show(5), which prints a plain-text table and
# truncates long values by default.
(
    txns_df.na
    .drop()
    .limit(5)
    .display()
)

In [0]:
# Drop only the rows whose `amount` is null; the `subset` argument restricts
# the null check to the listed columns.
(
    txns_df.na
    .drop(subset=["amount"])
    .limit(5)
    .display()
)

In [0]:
# Replace nulls in `city` with the string 'Unknown'. The dict form of fill()
# maps column name -> replacement value.
(
    txns_df.na
    .fill({"city": "Unknown"})
    .limit(5)
    .display()
)

In [0]:
# Fill several columns in one call, each with its own replacement value:
# nulls in `city` become 'Unknown' and nulls in `amount` become 0.0.
(
    txns_df.na
    .fill({"city": "Unknown", "amount": 0.0})
    .limit(5)
    .display()
)

In [0]:
# Replace values in the `state` and `city` columns: 'California' becomes 'CA'
# and 'Long Beach' becomes 'LB'. Both mappings are applied to every column
# listed in `subset`.
(
    txns_df.na
    .replace(
        {"California": "CA", "Long Beach": "LB"},
        subset=["state", "city"],
    )
    .limit(5)
    .display()
)

### selectExpr() - Select columns using SQL expressions.

In [0]:
# Each string passed to selectExpr() is parsed as a SQL expression.

# Column alias: expose `customer_id` (renamed from `custid` earlier in the
# notebook) under the name `custid`.
(
    txns_df
    .selectExpr("txnid", "customer_id as custid", "amount")
    .limit(5)
    .display()
)

# Arithmetic: compute a 10% tax on `amount`, rounded to 2 decimals, using
# SQL's round() inside the expression string.
(
    txns_df
    .selectExpr("txnid", "amount", "round(amount * 0.10,2) as tax")
    .limit(5)
    .display()
)

# String function: upper-case `prodcategory` and return it under a new name.
(
    txns_df
    .selectExpr("upper(prodcategory) as prodcategory_upper")
    .limit(5)
    .display()
)