In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import StringType

# Reuse an already-running session if there is one; otherwise start a new
# one named for this notebook.
spark = (
    SparkSession.builder
    .appName('AGGREGATE_FUNCTIONS')
    .getOrCreate()
)

In [2]:
# Column names, in the same order as the fields of each tuple below.
schema = ["employee_name", "department", "salary"]

# Sample rows used by every aggregate example in this notebook.
# Note that ("James", "Sales", 3000) is listed twice, so the data contains
# one fully duplicated row.
simpleData = [
    ("James", "Sales", 3000),
    ("Michael", "Sales", 4600),
    ("Robert", "Sales", 4100),
    ("Maria", "Finance", 3000),
    ("James", "Sales", 3000),
    ("Scott", "Finance", 3300),
    ("Jen", "Finance", 3900),
    ("Jeff", "Marketing", 3000),
    ("Kumar", "Marketing", 2000),
    ("Saif", "Sales", 4100),
]

df = spark.createDataFrame(simpleData, schema)

# truncate=False prints full cell values instead of cutting long strings.
df.show(truncate=False)

+-------------+----------+------+
|employee_name|department|salary|
+-------------+----------+------+
|James        |Sales     |3000  |
|Michael      |Sales     |4600  |
|Robert       |Sales     |4100  |
|Maria        |Finance   |3000  |
|James        |Sales     |3000  |
|Scott        |Finance   |3300  |
|Jen          |Finance   |3900  |
|Jeff         |Marketing |3000  |
|Kumar        |Marketing |2000  |
|Saif         |Sales     |4100  |
+-------------+----------+------+



### approx_count_distinct - returns the approximate number of distinct values in a column
#### collect() - retrieves all rows of the DataFrame to the driver program as a list of Row objects
#### collect()[0] - accesses the first Row in the list returned by collect()
#### collect()[0][0] - extracts the value in the first column of the first Row


In [10]:
from pyspark.sql.functions import approx_count_distinct

# Run the aggregation once and reuse the collected rows for all three prints,
# instead of launching the same Spark job once per print.
approx_rows = df.select(approx_count_distinct("department")).collect()

print("approx_count_distinct: " + str(approx_rows))        # the full list of Row objects
print("approx_count_distinct: " + str(approx_rows[0]))     # the first (only) Row
print("approx_count_distinct: " + str(approx_rows[0][0]))  # the first column of that Row
# Printing the DataFrame itself shows only its column names and types, not its rows.
print(df)

approx_count_distinct: [Row(approx_count_distinct(department)=3)]
approx_count_distinct: Row(approx_count_distinct(department)=3)
approx_count_distinct: 3
DataFrame[employee_name: string, department: string, salary: bigint]


### Average

In [4]:
from pyspark.sql.functions import avg

# Mean of the salary column, read from the single row the aggregate returns.
print("avg: {}".format(df.select(avg("salary")).collect()[0][0]))

avg: 3400.0


### collect_list - returns all values from an input column with duplicates.


In [5]:
from pyspark.sql.functions import collect_list

# Gather every salary, duplicates included, into one array-valued column.
(
    df
    .select(collect_list("salary"))
    .show(truncate=False)
)

+------------------------------------------------------------+
|collect_list(salary)                                        |
+------------------------------------------------------------+
|[3000, 4600, 4100, 3000, 3000, 3300, 3900, 3000, 2000, 4100]|
+------------------------------------------------------------+



### collect_set - returns all values from an input column with duplicate values eliminated.

In [6]:
from pyspark.sql.functions import collect_set

# Gather the salaries into one array-valued column with duplicates removed.
(
    df
    .select(collect_set("salary"))
    .show(truncate=False)
)

+------------------------------------+
|collect_set(salary)                 |
+------------------------------------+
|[4600, 3000, 3900, 4100, 3300, 2000]|
+------------------------------------+



### countDistinct - returns the number of distinct values in a column, or of distinct value combinations when several columns are given

In [7]:
from pyspark.sql.functions import countDistinct

# Two columns are passed, so this counts distinct (department, salary)
# combinations rather than distinct values of a single column.
df2 = df.select(
    countDistinct("department", "salary")
)
df2.show(truncate=False)
print("Distinct Count of Department & Salary: {}".format(df2.collect()[0][0]))

+----------------------------------+
|count(DISTINCT department, salary)|
+----------------------------------+
|8                                 |
+----------------------------------+

Distinct Count of Department & Salary: 8


### count - returns the number of non-null values in a column

In [8]:
from pyspark.sql.functions import count

# Number of salary values, read from the single row the aggregate returns.
print("count: {}".format(df.select(count("salary")).collect()[0][0]))

count: 10


### grouping - indicates whether a column is aggregated (1) or not (0) in a given row of a rollup/cube result

In [9]:
from pyspark.sql import functions as F
# Show the input rows again so they can be compared with the rollup below.
df.show()

# rollup() adds subtotal rows: per (department, employee_name), per
# department, and one grand total. grouping(col) is 1 on rows where that
# column has been rolled up (its value shows as NULL) and 0 otherwise.
result_rollup = (
    df.rollup("department", "employee_name")
    .agg(
        F.sum("salary").alias("total_salary"),
        F.grouping("department").alias("is_department_grouped"),
        F.grouping("employee_name").alias("is_employee_name_grouped"),
    )
)
result_rollup.show()

+-------------+----------+------+
|employee_name|department|salary|
+-------------+----------+------+
|        James|     Sales|  3000|
|      Michael|     Sales|  4600|
|       Robert|     Sales|  4100|
|        Maria|   Finance|  3000|
|        James|     Sales|  3000|
|        Scott|   Finance|  3300|
|          Jen|   Finance|  3900|
|         Jeff| Marketing|  3000|
|        Kumar| Marketing|  2000|
|         Saif|     Sales|  4100|
+-------------+----------+------+

+----------+-------------+------------+---------------------+------------------------+
|department|employee_name|total_salary|is_department_grouped|is_employee_name_grouped|
+----------+-------------+------------+---------------------+------------------------+
|     Sales|        James|        6000|                    0|                       0|
|      NULL|         NULL|       34000|                    1|                       1|
|     Sales|         NULL|       18800|                    0|                       1|
|