# Pyspark GroupBy and Aggregate function

In [1]:
import pyspark
import pandas as pd
from pyspark.sql import SparkSession

In [2]:
pizza_dataset_path = "../datasets/pizza_sales/order_details.csv"

In [3]:
spark = SparkSession.builder.appName('Learning Spark').getOrCreate()
spark

In [4]:
data = spark.read.option('header', 'true').csv(pizza_dataset_path, inferSchema=True)

In [5]:
data.show()

+----------------+--------+--------------+--------+
|order_details_id|order_id|      pizza_id|quantity|
+----------------+--------+--------------+--------+
|               1|       1|    hawaiian_m|       1|
|               2|       2| classic_dlx_m|       1|
|               3|       2| five_cheese_l|       1|
|               4|       2|   ital_supr_l|       1|
|               5|       2|    mexicana_m|       1|
|               6|       2|    thai_ckn_l|       1|
|               7|       3|   ital_supr_m|       1|
|               8|       3|  prsc_argla_l|       1|
|               9|       4|   ital_supr_m|       1|
|              10|       5|   ital_supr_m|       1|
|              11|       6|     bbq_ckn_s|       1|
|              12|       6|   the_greek_s|       1|
|              13|       7|spinach_supr_s|       1|
|              14|       8|spinach_supr_s|       1|
|              15|       9| classic_dlx_s|       1|
|              16|       9|green_garden_s|       1|
|           

In [6]:
data.printSchema()

root
 |-- order_details_id: integer (nullable = true)
 |-- order_id: integer (nullable = true)
 |-- pizza_id: string (nullable = true)
 |-- quantity: integer (nullable = true)



## GroupBy

In [10]:
data.groupBy('pizza_id').sum().show()

+--------------+---------------------+-------------+-------------+
|      pizza_id|sum(order_details_id)|sum(order_id)|sum(quantity)|
+--------------+---------------------+-------------+-------------+
|   ital_supr_s|              4719859|      2077653|          196|
|peppr_salami_s|              8169555|      3596003|          322|
|peppr_salami_l|             16307609|      7177280|          696|
|    hawaiian_l|             22112860|      9734186|          919|
| spinach_fet_s|              9853468|      4337023|          439|
| classic_dlx_l|             11664553|      5133768|          473|
|     bbq_ckn_l|             23030923|     10138539|          992|
|    sicilian_l|             14950783|      6580901|          613|
|peppr_salami_m|             10366140|      4562623|          428|
|spinach_supr_l|              6534365|      2876282|          283|
|   pepperoni_s|             17842438|      7856154|          751|
|   calabrese_s|              2354591|      1036061|          

In [13]:
data.groupBy('pizza_id').count().show()

+--------------+-----+
|      pizza_id|count|
+--------------+-----+
|   ital_supr_s|  194|
|peppr_salami_s|  318|
|peppr_salami_l|  680|
|    hawaiian_l|  896|
| spinach_fet_s|  437|
| classic_dlx_l|  471|
|     bbq_ckn_l|  967|
|    sicilian_l|  596|
|peppr_salami_m|  424|
|spinach_supr_l|  280|
|   pepperoni_s|  739|
|   calabrese_s|   99|
|    mexicana_m|  452|
|     bbq_ckn_s|  479|
|    thai_ckn_s|  476|
|green_garden_m|  300|
| soppressata_s|  288|
| pep_msh_pep_l|  381|
|   ital_supr_l|  735|
| soppressata_m|  268|
+--------------+-----+
only showing top 20 rows



In [14]:
data.groupBy('pizza_id').max().show()

+--------------+---------------------+-------------+-------------+
|      pizza_id|max(order_details_id)|max(order_id)|max(quantity)|
+--------------+---------------------+-------------+-------------+
|   ital_supr_s|                48400|        21251|            2|
|peppr_salami_s|                48614|        21347|            2|
|peppr_salami_l|                48592|        21339|            2|
|    hawaiian_l|                48556|        21322|            3|
| spinach_fet_s|                48578|        21331|            2|
| classic_dlx_l|                48488|        21295|            2|
|     bbq_ckn_l|                48576|        21331|            3|
|    sicilian_l|                48543|        21315|            2|
|peppr_salami_m|                48517|        21303|            2|
|spinach_supr_l|                48404|        21251|            2|
|   pepperoni_s|                48551|        21318|            2|
|   calabrese_s|                47321|        20795|          