In [5]:
from pyspark import SparkConf
from pyspark.sql import SparkSession, Row
from pyspark.sql import functions as func
from pyspark.sql.types import (
    StructType,
    StructField,
    StringType,
    IntegerType,
    FloatType,
    DoubleType,
)

In [None]:
# Obtain the Spark session for this notebook under the "CustomerOrders"
# application name (getOrCreate hands back an existing session if one is active).
spark = (
    SparkSession.builder
    .appName("CustomerOrders")
    .getOrCreate()
)

In [None]:
# Column names and types for the orders CSV. The file is read without a
# header option, so its first line is treated as data and the names below
# are applied positionally.
schema = StructType(
    [
        StructField("user_id", IntegerType(), True),
        StructField("product_id", IntegerType(), True),
        # DoubleType rather than FloatType: a 32-bit float cannot hold typical
        # currency amounts closely enough (37.19 becomes ~37.189998), and that
        # error accumulates when costs are summed per customer.
        StructField("cost", DoubleType(), True),
    ]
)

# Load the orders with the explicit schema and preview the first rows.
df = spark.read.schema(schema).csv("./ml-32m/customer-orders.csv")
df.show(5)

+-------+----------+-----+
|user_id|product_id| cost|
+-------+----------+-----+
|     44|      8602|37.19|
|     35|      5368|65.89|
|      2|      3391|40.64|
|     47|      6694|14.98|
|     29|       680|13.08|
+-------+----------+-----+
only showing top 5 rows



In [None]:
# Total spent per customer, highest spenders first (top 5 shown).
(
    df.groupBy("user_id")
    .agg(func.round(func.sum("cost"), 2).alias("total_spent"))
    .sort("total_spent", ascending=False)
    .show(5)
)

+-------+-----------+
|user_id|total_spent|
+-------+-----------+
|     68|    6375.45|
|     73|     6206.2|
|     39|    6193.11|
|     54|    6065.39|
|     71|    5995.66|
+-------+-----------+
only showing top 5 rows



In [None]:
# Single highest-spending customer: aggregate cost per user, sort by the
# rounded total in descending order, and take the top row (None if there
# are no rows at all).
most_spending_customer = (
    df.groupBy("user_id")
    .agg(func.round(func.sum("cost"), 2).alias("total_spent"))
    .sort("total_spent", ascending=False)
    .head()
)
print(f"Most spending customer: {most_spending_customer}")

Most spending customer: Row(user_id=68, total_spent=6375.45)
