### Read the CSV file into a Spark DataFrame

In [0]:
from pyspark.sql.functions import col

# Load the transactions CSV from the volume, let Spark infer the column types,
# and then assign explicit column names with toDF().
# NOTE(review): with header=true the first line of the file is consumed as
# column names, which toDF() then overwrites. Confirm the file really has a
# header row; if it does not, the first record is dropped.
txns_df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("dbfs:/Volumes/inceptez_catalog/inputdb/customerdata/txns")
    .toDF(
        "txnid",
        "txndate",
        "custid",
        "amount",
        "prodcategory",
        "prodname",
        "city",
        "state",
        "paymenttype",
    )
)

# Render the loaded DataFrame in the notebook.
txns_df.display()

### select – Choose specific columns

In [0]:
# Project a subset of columns by passing their names as strings.
txns_df.select(
    "txnid",
    "custid",
    "amount",
    "city",
).display()

# The same kind of projection written with Column objects built by col();
# show(5) prints only the first five rows.
txns_df.select(
    col("txnid"),
    col("amount"),
    col("prodcategory"),
).show(5)

### Using filter() - functional-like style

In [0]:
# Build the condition as a Column expression with col() from
# pyspark.sql.functions, then keep only the rows that satisfy it.
amount_over_150 = col("amount") > 150
high_txns = txns_df.filter(amount_over_150)
high_txns.display()

In [0]:
# Bracket indexing on the DataFrame yields a Column: keep rows with amount > 150.
txns_df.filter(txns_df["amount"] > 150).display()

# Attribute access (df.colname) refers to the same column: amount > 150 again.
txns_df.filter(txns_df.amount > 150).display()

# Several conditions at once. Each comparison needs its own parentheses
# because Python's & binds more tightly than > and ==.
txns_df.filter(
    (txns_df.amount > 150) & (txns_df.state == "California")
).display()

# Operators available when building Column conditions:
#   &   and
#   |   or
#   ~   not
#   !=  not equal
#   ==  equal
#   <   less than
#   >   greater than
#   <=  less than or equal to
#   >=  greater than or equal to

### Using SQL expression strings

In [0]:
# filter() also accepts the condition as a SQL expression string.
# Single condition: amount > 150
txns_df.filter("amount > 150").display()

# Multiple conditions are combined inside the string with SQL keywords (AND here);
# string values are quoted with single quotes.
txns_df.filter("amount > 150 AND state='California'").display()

### Using where() - SQL-like style

In [0]:
# where() takes the same Column conditions as filter(); name each predicate
# once so it can be reused and combined.
amount_over_150 = col("amount") > 150
in_california = col("state") == "California"

# Single condition
high_txns2 = txns_df.where(amount_over_150)
high_txns2.display()

# Multiple conditions combined with &
high_txns_ca2 = txns_df.where(amount_over_150 & in_california)
high_txns_ca2.display()


### Using SQL expression string

#Both filter and where can take a string expression, just like in SQL:

# SQL expression string passed to filter(): amount > 150 and state is California
txns_df.filter("amount > 150 AND state='California'").display()

# The identical SQL expression string passed to where()
txns_df.where("amount > 150 AND state='California'").display()