In [3]:
from pyspark.sql import DataFrame, SparkSession

# Create (or reuse, if one already exists) the Spark session for this notebook.
# The "local[3]" master runs Spark locally with three worker threads.
spark: SparkSession = (
    SparkSession.builder  # type: ignore
    .master("local[3]")
    .appName("Aggregate Example")
    .getOrCreate()
)

# Last expression of the cell: display the Spark version in use.
spark.version

25/04/08 11:46:20 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


'3.5.5'

In [6]:
# Load the invoices CSV into a DataFrame.
#   header="true"      -> the first line of the file supplies the column names
#   inferSchema="true" -> column types are inferred from the data instead of
#                         defaulting every column to string
invoice_df: DataFrame = (
    spark.read
    .options(header="true", inferSchema="true")
    .csv("data/invoices.csv")
)

                                                                                

# Simple Aggregations

Simple aggregations summarize the whole DataFrame into a single result row.

> These aggregates are functions, so we can use them either in column-object expressions or in SQL-like string expressions.


In [13]:
from pyspark.sql import functions as F

# Whole-frame aggregation using column-object expressions: with no grouping,
# the aggregates below collapse `invoice_df` into a single summary row.
#
# Every aggregate is referenced through the `F` namespace. A bare `count` is
# never imported in this notebook, so calling it raises NameError when the
# notebook is run top to bottom on a fresh kernel.
invoice_df.select(
    F.count("*").alias("Total Count"),  # number of rows
    F.sum("Quantity").alias("Total Quantity"),
    F.avg("UnitPrice").alias("Average Unit Price"),
    F.count_distinct("InvoiceNo").alias("Distinct Invoice Count"),
).show()



+-----------+--------------+------------------+----------------------+
|Total Count|Total Quantity|Average Unit Price|Distinct Invoice Count|
+-----------+--------------+------------------+----------------------+
|     541909|       5176450| 4.611113626088481|                 25900|
+-----------+--------------+------------------+----------------------+



                                                                                

In [14]:
# The same kind of whole-frame summary, written as SQL-like string expressions.
summary_exprs = [
    # COUNT(1) counts every row, even ones holding nulls (same as COUNT(*)).
    "COUNT(1) AS `Total Count`",
    # COUNT(<column>) counts only the rows where that column is non-null.
    "COUNT(StockCode) AS `Total Stock Count`",
    "SUM(Quantity) AS `Total Quantity`",
    "AVG(UnitPrice) AS `Average Unit Price`",
]

invoice_df.selectExpr(*summary_exprs).show()

[Stage 16:>                                                         (0 + 3) / 3]

+-----------+-----------------+--------------+------------------+
|Total Count|Total Stock Count|Total Quantity|Average Unit Price|
+-----------+-----------------+--------------+------------------+
|     541909|           541908|       5176450| 4.611113626086849|
+-----------+-----------------+--------------+------------------+



                                                                                

## Exercise 1

- Based on this [_dataset_](code/01-Aggregate/data/invoices.csv), produce a result shaped like the table below:

| Country        | InvoiceNo | TotalQuantity | InvoiceValue |
|----------------|-----------|---------------|--------------|
| United Kingdom | 536446    | 329           | 440.89       |
| United Kingdom | 536508    | 216           | 155.52       |
| United Kingdom | 537018    | -3            | 0.0          |
| United Kingdom | 537401    | -24           | 0.0          |
| United Kingdom | 537811    | 74            | 268.86       |

- Group by `Country` and `InvoiceNo`.
- `TotalQuantity` is the sum of `Quantity`.
- `InvoiceValue` is the sum of `Quantity * UnitPrice`, rounded to 2 decimal places.

In [15]:
# Exercise 1, solved with plain SQL through spark.sql.

# Expose the DataFrame under the name "invoices" so the query can reference it.
invoice_df.createOrReplaceTempView("invoices")

# One output row per (Country, InvoiceNo): the summed quantity and the
# invoice value (sum of Quantity * UnitPrice) rounded to 2 decimals.
exercise1_query = """
    SELECT
        Country,
        InvoiceNo,
        SUM(Quantity) AS TotalQuantity,
        ROUND(SUM(Quantity * UnitPrice), 2) AS InvoiceValue
    FROM
        invoices
    GROUP BY
        Country,
        InvoiceNo
    """

exercise1 = spark.sql(exercise1_query)

# Preview the first 5 result rows.
exercise1.show(5)



+--------------+---------+-------------+------------+
|       Country|InvoiceNo|TotalQuantity|InvoiceValue|
+--------------+---------+-------------+------------+
|United Kingdom|   536446|          329|      440.89|
|United Kingdom|   536508|          216|      155.52|
|United Kingdom|   537018|           -3|         0.0|
|United Kingdom|   537401|          -24|         0.0|
|United Kingdom|   537811|           74|      268.86|
+--------------+---------+-------------+------------+
only showing top 5 rows



                                                                                

In [17]:
# OR using pyspark.sql.functions

# agg(*exprs: Column) -> DataFrame
# Computes the given aggregates for each group and returns the result as a `DataFrame`.

# NOTE: the quantity total must go through `F.sum`. A bare `sum` resolves to
# Python's builtin, which tries to add up the characters of the column name
# and raises TypeError instead of building an aggregate Column.
exercise1_alt = invoice_df.groupBy("Country", "InvoiceNo").agg(
    F.sum("Quantity").alias("TotalQuantity"),
    # Invoice value written as one SQL expression string ...
    F.expr("ROUND(SUM(Quantity * UnitPrice), 2)").alias("InvoiceValue"),
    # ... OR the same value composed from column functions around an
    # expression for the per-line amount.
    F.round(F.sum(F.expr("Quantity * UnitPrice")), 2).alias("InvoiceValueExpr"),
)

# Preview the first 5 result rows.
exercise1_alt.show(5)



+--------------+---------+-------------+------------+----------------+
|       Country|InvoiceNo|TotalQuantity|InvoiceValue|InvoiceValueExpr|
+--------------+---------+-------------+------------+----------------+
|United Kingdom|   536446|          329|      440.89|          440.89|
|United Kingdom|   536508|          216|      155.52|          155.52|
|United Kingdom|   537018|           -3|         0.0|             0.0|
|United Kingdom|   537401|          -24|         0.0|             0.0|
|United Kingdom|   537811|           74|      268.86|          268.86|
+--------------+---------+-------------+------------+----------------+
only showing top 5 rows



                                                                                