## Group example: total amount spent per customer

In [1]:
from os import environ
from pyspark.sql import SparkSession, functions as f
from pyspark.sql.types import (StructField, StructType, 
                               IntegerType,
                               FloatType)

In [2]:
# Obtain the Spark session for this notebook (an already-running session is
# reused), registered under the application name "customer_orders".
spark = (
    SparkSession.builder
    .appName("customer_orders")
    .getOrCreate()
)

In [3]:
# Data directory taken from the DATA_LAKE environment variable; a KeyError is
# raised here if the variable is not set.
data_lake_dir = environ['DATA_LAKE']
# The load cell builds the CSV location as file_path + "<file name>", so the
# prefix has to end with a path separator. Add one when DATA_LAKE was set
# without it; otherwise the directory and file name would be fused together.
if data_lake_dir and not data_lake_dir.endswith("/"):
    data_lake_dir += "/"
file_path = "file:///" + data_lake_dir

# Explicit schema for the orders CSV: three nullable columns, in file order.
# NOTE(review): FloatType is single precision; confirm that is precise enough
# for summing monetary amounts, otherwise consider DoubleType or DecimalType.
fields = [
        StructField("cust_id", IntegerType(), True),
        StructField("item_id", IntegerType(), True),
        StructField("amount_spent", FloatType(), True),
        ]
final_struct = StructType(fields)

In [4]:
# Read the customer orders CSV using the explicit schema defined above
# (final_struct), then print a preview of the resulting DataFrame.
orders_location = file_path + "customer-orders.csv"
input_df = (
    spark.read
    .format("csv")
    .schema(final_struct)
    .load(orders_location)
)
input_df.show()

+-------+-------+------------+
|cust_id|item_id|amount_spent|
+-------+-------+------------+
|     44|   8602|       37.19|
|     35|   5368|       65.89|
|      2|   3391|       40.64|
|     47|   6694|       14.98|
|     29|    680|       13.08|
|     91|   8900|       24.59|
|     70|   3959|       68.68|
|     85|   1733|       28.53|
|     53|   9900|       83.55|
|     14|   1505|        4.32|
|     51|   3378|        19.8|
|     42|   6926|       57.77|
|      2|   4424|       55.77|
|     79|   9291|       33.17|
|     50|   3901|       23.57|
|     20|   6633|        6.49|
|     15|   6148|       65.53|
|     44|   8331|       99.19|
|      5|   3505|       64.18|
|     48|   5539|       32.42|
+-------+-------+------------+
only showing top 20 rows



In [5]:
# Total spend per customer: sum amount_spent within each cust_id, round the
# sum to two decimals, and order the rows by that total.
rounded_total = f.round(f.sum("amount_spent"), 2).alias("total_spent")
total_by_customer = (
    input_df
    .groupBy("cust_id")
    .agg(rounded_total)
    .sort("total_spent")
)

In [6]:
# Spot-check a single customer: display the aggregated total for cust_id 68.
is_customer_68 = total_by_customer['cust_id'] == 68
total_by_customer.filter(is_customer_68).show()

+-------+-----------+
|cust_id|total_spent|
+-------+-----------+
|     68|    6375.45|
+-------+-----------+



In [7]:
# Stop the Spark session now that the notebook has finished using it.
spark.stop()