# <font color=Blue>Aggregate Functions</font>

* Aggregate functions operate on a group of rows and return a single value for that group

In [None]:
# Column names, in the same order as the fields of each tuple below.
schema = ["employee_name", "department", "salary"]

# Sample employee records. ("James", "Sales", 3000) is listed twice, so the
# data contains one fully duplicated row and several repeated salaries.
simpleData = [
    ("James", "Sales", 3000),
    ("Michael", "Sales", 4600),
    ("Robert", "Sales", 4100),
    ("Maria", "Finance", 3000),
    ("James", "Sales", 3000),
    ("Scott", "Finance", 3300),
    ("Jen", "Finance", 3900),
    ("Jeff", "Marketing", 3000),
    ("Kumar", "Marketing", 2000),
    ("Saif", "Sales", 4100),
]

# Build the DataFrame used by every example below and print it untruncated.
df = spark.createDataFrame(data=simpleData, schema=schema)
df.show(truncate=False)

In [None]:
# Output
+-------------+----------+------+
|employee_name|department|salary|
+-------------+----------+------+
|        James|     Sales|  3000|
|      Michael|     Sales|  4600|
|       Robert|     Sales|  4100|
|        Maria|   Finance|  3000|
|        James|     Sales|  3000|
|        Scott|   Finance|  3300|
|          Jen|   Finance|  3900|
|         Jeff| Marketing|  3000|
|        Kumar| Marketing|  2000|
|         Saif|     Sales|  4100|
+-------------+----------+------+

### 1) avg()

In [None]:
# Average of the salary column over all rows of df (one-row result).
res_df = df.select(
    avg("salary"),
)
res_df.show()

In [None]:
Prints avg: 3400.0

### 2) collect_list()

* collect_list() returns all values from an input column **with duplicates**

In [None]:
# Gather every salary, repeated values included, into a single array column.
res_df = df.select(
    collect_list("salary"),
)
# truncate=False keeps the whole array visible instead of cutting it short.
res_df.show(truncate=False)

In [None]:
+------------------------------------------------------------+
|collect_list(salary)                                        |
+------------------------------------------------------------+
|[3000, 4600, 4100, 3000, 3000, 3300, 3900, 3000, 2000, 4100]|
+------------------------------------------------------------+

### 3) collect_set()

* collect_set() returns all values from an input column **without duplicates**

In [None]:
# collect_set() requires the column to aggregate; calling it with no argument
# raises a TypeError. Pass "salary" so the cell matches the output shown below.
# The order of the elements in the resulting array is not guaranteed.
res_df = df.select(collect_set("salary"))
res_df.show(truncate=False)

In [None]:
+------------------------------------+
|collect_set(salary)                 |
+------------------------------------+
|[4600, 3000, 3900, 4100, 3300, 2000]|
+------------------------------------+

### 4) countDistinct()

* countDistinct() returns the number of distinct values in a column; when several columns are passed, it counts the distinct combinations of their values

In [None]:
# Two columns are passed, so this counts distinct (department, salary) pairs
# rather than distinct values of a single column.
res_df = df.select(
    countDistinct("department", "salary"),
)
res_df.show()

### 5) count()

* count() returns the number of non-null values in a column

In [None]:
# Count the salary values; df has 10 rows and none of the salaries is null.
res_df = df.select(
    count("salary"),
)
# collect() brings the single result row back to the driver as a list of Rows.
res_df.collect()
# 10

### 6) first()

* first() returns the first value in a column. By default this is the first value it sees, which may be null; pass `ignorenulls=True` to get the first non-null value.

In [None]:
# Keep the DataFrame in res_df and print it in a separate step: show() only
# prints and returns None, so chaining it into the assignment would leave
# res_df bound to None.
res_df = df.select(first("salary"))
res_df.show()
# 3000

### 7) last()

* last() returns the last value in a column. By default this is the last value it sees, which may be null; pass `ignorenulls=True` to get the last non-null value.

In [None]:
# Keep the DataFrame in res_df and print it in a separate step: show() only
# prints and returns None, so chaining it into the assignment would leave
# res_df bound to None.
res_df = df.select(last("salary"))
res_df.show()
# 4100

### 8) max()

In [None]:
# Returns the max value of the column.
# NOTE(review): max must be the Spark aggregate (pyspark.sql.functions.max),
# not the Python builtin of the same name - confirm how it is imported.
# show() returns None, so it is called separately to keep the DataFrame in
# res_df.
res_df = df.select(max("salary"))
res_df.show()
# 4600

### 9) min()

In [None]:
# Returns the min value of the column.
# NOTE(review): min must be the Spark aggregate (pyspark.sql.functions.min),
# not the Python builtin of the same name - confirm how it is imported.
res_df = df.select(min("salary"))
# Display the result, as the earlier examples do.
res_df.show()
# 2000

### 10) mean()

In [None]:
# Returns the average of the values of a column.
res_df = df.select(mean("salary"))
# Display the result, as the earlier examples do.
res_df.show()
# 3400.0

### 11) sum()

* returns sum of all the values in a column

In [None]:
# Total of all ten salaries, repeated values included.
# NOTE(review): sum must be the Spark aggregate (pyspark.sql.functions.sum),
# not the Python builtin of the same name - confirm how it is imported.
res_df = df.select(sum("salary"))
# Display the result, as the earlier examples do.
res_df.show()
# 34,000

### 12) sumDistinct()

* Returns sum of all distinct values in a column

In [None]:
# Each distinct salary is added once:
# 3000 + 4600 + 4100 + 3300 + 3900 + 2000 = 20,900.
# NOTE: sumDistinct() is deprecated since Spark 3.2 in favour of
# sum_distinct(); it is kept here so the cell still runs on older versions.
res_df = df.select(sumDistinct("salary"))
# Display the result, as the earlier examples do.
res_df.show()
# 20,900

### 13) stddev() *or* stddev_samp()

* stddev() and stddev_samp() both return the sample standard deviation of the values in a column (stddev() is an alias for stddev_samp())

In [None]:
# Select both spellings side by side so the two result columns can be compared.
res_df = df.select(
    stddev("salary"),
    stddev_samp("salary"),
)
res_df.show()

### 14) stddev_pop()

* stddev_pop() returns population standard deviation of values in a column

In [None]:
# Population standard deviation of the salary column.
res_df = df.select(stddev_pop("salary"))
# Display the result, as the earlier examples do.
res_df.show()

### 15) variance() *or* var_samp()

* returns unbiased variance of the values in a column

In [None]:
# Select variance() and var_samp() side by side so the two result columns can
# be compared.
res_df = df.select(variance("salary"), var_samp("salary"))
# Display the result, as the earlier examples do.
res_df.show()

### 16) var_pop()

* returns population variance of the values in a column

In [None]:
# Population variance of the salary column.
res_df = df.select(var_pop("salary"))
# Display the result, as the earlier examples do.
res_df.show()