In [0]:
# Load the transactions CSV, let Spark infer the column types, and then
# assign explicit column names to all nine columns.
# NOTE(review): header=true makes Spark treat the first line of the file as
# column names, which toDF() then replaces. If the files have no header row,
# the first record is consumed as a header and lost -- confirm against the data.
txns_df = (
    spark.read
    .format("csv")
    .options(header="true", inferSchema="true")
    .load("dbfs:/Volumes/inceptez_catalog/inputdb/customerdata/txns")
    .toDF(
        "txnid", "txndate", "custid",
        "amount", "prodcategory", "prodname",
        "city", "state", "paymenttype",
    )
)

# Render the loaded DataFrame in the notebook output.
txns_df.display()

### groupBy() - Groups the DataFrame by one or more columns.

In [0]:
# Total transaction amount per state, highest total first.
# SQL equivalent:
#   select state, round(sum(amount), 2) as total
#   from txns_df group by state order by total desc

# Namespaced import: this cell no longer rebinds Python's builtin round().
from pyspark.sql import functions as F

(
    txns_df
    .groupBy("state")
    # Aggregate and alias in one step instead of referring to Spark's
    # auto-generated `sum(amount)` column name through backticks.
    .agg(F.round(F.sum("amount"), 2).alias("total"))
    .orderBy("total", ascending=False)
    .display()
)

### agg() - Performs one or more aggregate functions on a grouped DataFrame.

In [0]:
# Total amount, average amount and transaction count per state.
# SQL equivalent:
#   select state,
#          round(sum(amount), 2) as total_amount,
#          round(avg(amount), 2) as avg_amount,
#          count(txnid)          as txn_count
#   from txns_df group by state limit 5
# These names shadow Python's builtin sum()/round() from here on.
from pyspark.sql.functions import avg, count, round, sum

(
    txns_df
    .groupBy("state")
    .agg(
        round(sum("amount"), 2).alias("total_amount"),
        round(avg("amount"), 2).alias("avg_amount"),
        count("txnid").alias("txn_count"),
    )
    # No ordering is applied, so which five groups are shown is not fixed.
    .limit(5)
    .display()
)


In [0]:
# Same three aggregates as the per-state summary, but grouped by (state, city)
# and sorted by state, then city; only the first 50 rows are displayed.
(
    txns_df
    .groupBy("state", "city")
    .agg(
        round(sum("amount"), 2).alias("total_amount"),
        round(avg("amount"), 2).alias("avg_amount"),
        count("txnid").alias("txn_count"),
    )
    .orderBy("state", "city")
    .limit(50)
    .display()
)

In [0]:
# Smallest and largest sale amount plus transaction count per state,
# ordered by state (first 50 rows).
# SQL equivalent:
#   select state,
#          round(min(amount), 2) as min_salesamt,
#          round(max(amount), 2) as max_salesamt,
#          count(txnid)          as txn_count
#   from txns_df group by state order by state limit 50
# These names shadow Python's builtin min()/max()/round() from here on.
from pyspark.sql.functions import count, max, min, round

(
    txns_df
    .groupBy("state")
    .agg(
        round(min("amount"), 2).alias("min_salesamt"),
        round(max("amount"), 2).alias("max_salesamt"),
        # The documented query selects txn_count; the aggregation omitted it.
        count("txnid").alias("txn_count"),
    )
    .orderBy("state")
    .limit(50)
    .display()
)

### alias() - Assigns a temporary name to a column or expression (useful in select or agg).

In [0]:
from pyspark.sql.functions import col, round, sum

# 1) alias() on an aggregate: name the summed column.
(
    txns_df
    .groupBy("state")
    .agg(sum("amount").alias("total_amount"))
    .limit(5)
    .display()
)

# 2) alias() inside select(): rename a plain column.
(
    txns_df
    .select(col("amount").alias("txn_amount"), "state")
    .limit(5)
    .display()
)

# 3) alias() on a chained expression: round the sum, then name the result.
(
    txns_df
    .groupBy("state")
    .agg(round(sum("amount"), 2).alias("total_amount_rounded"))
    .limit(5)
    .display()
)


### Combining groupBy, agg, and alias

In [0]:
# Per-city summary statistics, showing the five cities with the largest
# total amount. Uses sum/avg/max/min/count imported in earlier cells.
(
    txns_df
    .groupBy("city")
    .agg(
        sum("amount").alias("total_amount"),
        avg("amount").alias("avg_amount"),
        max("amount").alias("max_amount"),
        min("amount").alias("min_amount"),
        count("txnid").alias("txn_count"),
    )
    # Descending sort on the aliased total, then keep the top five rows.
    .orderBy("total_amount", ascending=False)
    .limit(5)
    .display()
)