### PySpark GroupBy And Aggregate Functions

In [1]:
# Mount Google Drive into the Colab runtime at /content/drive so the dataset stored there can be read
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Location of the 'PySpark_test3.csv' dataset on the mounted Google Drive, kept in file_path
file_path = '/content/drive/MyDrive/Datasets/PySpark_test3.csv'

In [3]:
# Importing pyspark
import pyspark

In [4]:
# Import SparkSession and create (or reuse) a session with the app name 'AggreGroup', bound to `spark`
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName('AggreGroup')
    .getOrCreate()
)

In [5]:
# Display the SparkSession object to inspect its details
spark

In [7]:
# Load 'PySpark_test3.csv' into df_org: the first row supplies the column names and
# the column types are inferred from the data; then display the rows
df_org = spark.read.csv(
    file_path,
    header=True,
    inferSchema=True,
)
df_org.show()

+---------+------------+------+
|     Name| Departments|salary|
+---------+------------+------+
|    Krish|Data Science| 10000|
|    Krish|         IOT|  5000|
|   Mahesh|    Big Data|  4000|
|    Krish|    Big Data|  4000|
|   Mahesh|Data Science|  3000|
|Sudhanshu|Data Science| 20000|
|Sudhanshu|         IOT| 10000|
|Sudhanshu|    Big Data|  5000|
|    Sunny|Data Science| 10000|
|    Sunny|    Big Data|  2000|
+---------+------------+------+



In [8]:
# Print the schema of df_org (column names, inferred types and nullability)
df_org.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Departments: string (nullable = true)
 |-- salary: integer (nullable = true)



In [9]:
# Total salary per person: group the rows by Name and sum the numeric columns
(
    df_org
    .groupBy('Name')
    .sum()
    .show()
)

+---------+-----------+
|     Name|sum(salary)|
+---------+-----------+
|Sudhanshu|      35000|
|    Sunny|      12000|
|    Krish|      19000|
|   Mahesh|       7000|
+---------+-----------+



In [10]:
# Average salary per person: group the rows by Name and average the numeric columns
(
    df_org
    .groupBy('Name')
    .avg()
    .show()
)

+---------+------------------+
|     Name|       avg(salary)|
+---------+------------------+
|Sudhanshu|11666.666666666666|
|    Sunny|            6000.0|
|    Krish| 6333.333333333333|
|   Mahesh|            3500.0|
+---------+------------------+



In [11]:
# Total salary paid out by each department; compare the sums to see which department pays the most overall
(
    df_org
    .groupBy('Departments')
    .sum()
    .show()
)

+------------+-----------+
| Departments|sum(salary)|
+------------+-----------+
|         IOT|      15000|
|    Big Data|      15000|
|Data Science|      43000|
+------------+-----------+



In [12]:
# Average salary per department; mean() is an alias of avg(), so the result column is named avg(salary)
df_org.groupBy('Departments').mean().show()

+------------+-----------+
| Departments|avg(salary)|
+------------+-----------+
|         IOT|     7500.0|
|    Big Data|     3750.0|
|Data Science|    10750.0|
+------------+-----------+



In [13]:
# Number of rows in the data set for each department
(
    df_org
    .groupBy('Departments')
    .count()
    .show()
)

+------------+-----+
| Departments|count|
+------------+-----+
|         IOT|    2|
|    Big Data|    4|
|Data Science|    4|
+------------+-----+



In [14]:
# Total salary paid across the whole data set (aggregation without any grouping).
# NOTE: the schema names this column 'salary'; the 'Salary' key still resolves because Spark's
# column lookup is case-insensitive by default (spark.sql.caseSensitive=false), and the key's
# spelling is what appears in the result header. Prefer the exact column name in new code.
df_org.agg({'Salary':'sum'}).show()

+-----------+
|sum(Salary)|
+-----------+
|      73000|
+-----------+



In [15]:
# Highest salary each person receives across all departments (one row per Name).
# This lists every person's maximum rather than picking out a single top earner;
# sort the result by max(salary) in descending order to read off who earns the most.
df_org.groupBy('Name').max().show()

+---------+-----------+
|     Name|max(salary)|
+---------+-----------+
|Sudhanshu|      20000|
|    Sunny|      10000|
|    Krish|      10000|
|   Mahesh|       4000|
+---------+-----------+

