# **Setup**

In [69]:
import pyspark
from pyspark.sql import SparkSession

# Fetch the active SparkSession, or start a new one with the application name 'DemoSession01'.
spark = SparkSession.builder.appName('DemoSession01').getOrCreate()
# Bare last expression so the notebook displays the session object.
spark

# **Create DataFrame**

In [70]:
# Build a one-column DataFrame of the integers 0..999 and name the column 'number'.
# show() prints only the first 20 rows by default.
spark.range(1000).toDF('number').show()

+------+
|number|
+------+
|     0|
|     1|
|     2|
|     3|
|     4|
|     5|
|     6|
|     7|
|     8|
|     9|
|    10|
|    11|
|    12|
|    13|
|    14|
|    15|
|    16|
|    17|
|    18|
|    19|
+------+
only showing top 20 rows



# **Read CSV File**

In [71]:
# header=True takes the column names from the first line of the file;
# inferSchema=True asks Spark to infer column types instead of reading every column as a string.
df_pyspark = spark.read.csv('tips.csv',header=True,inferSchema=True)

# Confirm the reader returned a Spark DataFrame, then preview it (show() defaults to 20 rows).
print(type(df_pyspark))
df_pyspark.show()

<class 'pyspark.sql.dataframe.DataFrame'>
+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2|
|     35.26| 5.0|Female|    No|Sun|Dinner|   4|
|     15.42|1.57|  Male|    No|Sun|Dinner|   2|
|     18.43| 3.0|  Male|    No|Sun|Dinner|   4|
|     14.83|3.02|Female|    No|Sun|Dinner|   2|
|     21.58|3.92|  Male|    No|Sun|Dinner|   2|
|     10.33|1.67|Female|    No|Sun|Dinner|   3

## **Head Function**

In [72]:
# head(n) returns the first n rows as a Python list of Row objects, not as a DataFrame.
df_pyspark.head(6)

[Row(total_bill=16.99, tip=1.01, sex='Female', smoker='No', day='Sun', time='Dinner', size=2),
 Row(total_bill=10.34, tip=1.66, sex='Male', smoker='No', day='Sun', time='Dinner', size=3),
 Row(total_bill=21.01, tip=3.5, sex='Male', smoker='No', day='Sun', time='Dinner', size=3),
 Row(total_bill=23.68, tip=3.31, sex='Male', smoker='No', day='Sun', time='Dinner', size=2),
 Row(total_bill=24.59, tip=3.61, sex='Female', smoker='No', day='Sun', time='Dinner', size=4),
 Row(total_bill=25.29, tip=4.71, sex='Male', smoker='No', day='Sun', time='Dinner', size=4)]

## **Get Schema**

In [73]:
# Print the DataFrame schema as a tree: column name, inferred type and nullability.
df_pyspark.printSchema()

root
 |-- total_bill: double (nullable = true)
 |-- tip: double (nullable = true)
 |-- sex: string (nullable = true)
 |-- smoker: string (nullable = true)
 |-- day: string (nullable = true)
 |-- time: string (nullable = true)
 |-- size: integer (nullable = true)



## **Get Dtypes**

In [74]:
# dtypes is a list of (column name, type string) tuples.
df_pyspark.dtypes

[('total_bill', 'double'),
 ('tip', 'double'),
 ('sex', 'string'),
 ('smoker', 'string'),
 ('day', 'string'),
 ('time', 'string'),
 ('size', 'int')]

## **Get Column Names**

In [75]:
# columns is a plain Python list of the column names, in schema order.
df_pyspark.columns

['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size']

## **Describe Data**

In [76]:
# describe() returns a DataFrame of count/mean/stddev/min/max per column.
# mean and stddev are NULL for the string columns; their min/max are shown as strings.
df_pyspark.describe().show()

+-------+------------------+------------------+------+------+----+------+------------------+
|summary|        total_bill|               tip|   sex|smoker| day|  time|              size|
+-------+------------------+------------------+------+------+----+------+------------------+
|  count|               244|               244|   244|   244| 244|   244|               244|
|   mean|19.785942622950824|2.9982786885245902|  NULL|  NULL|NULL|  NULL| 2.569672131147541|
| stddev| 8.902411954856857|1.3836381890011815|  NULL|  NULL|NULL|  NULL|0.9510998047322347|
|    min|              3.07|               1.0|Female|    No| Fri|Dinner|                 1|
|    max|             50.81|              10.0|  Male|   Yes|Thur| Lunch|                 6|
+-------+------------------+------------------+------+------+----+------+------------------+



# **Working With Columns**

In [77]:
# Preview the first 10 rows.
df_pyspark.show(10)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|
+----------+----+------+------+---+------+----+
only showing top 10 rows



## **Select Columns**

In [78]:
# Indexing with a column name returns a Column expression (see the repr below), not the data itself.
df_pyspark['total_bill']

Column<'total_bill'>

In [79]:
df_pyspark.select(['total_bill','tip','time']).show(10)

# select() returns a new PySpark DataFrame containing only the requested columns.

+----------+----+------+
|total_bill| tip|  time|
+----------+----+------+
|     16.99|1.01|Dinner|
|     10.34|1.66|Dinner|
|     21.01| 3.5|Dinner|
|     23.68|3.31|Dinner|
|     24.59|3.61|Dinner|
|     25.29|4.71|Dinner|
|      8.77| 2.0|Dinner|
|     26.88|3.12|Dinner|
|     15.04|1.96|Dinner|
|     14.78|3.23|Dinner|
+----------+----+------+
only showing top 10 rows



## **Add New Column**

In [80]:
# Current DataFrame, before any columns are added.
df_pyspark.show(5)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
+----------+----+------+------+---+------+----+
only showing top 5 rows



In [81]:
# Add a new column holding the same constant value in every row.
# lit() wraps the Python value in a Column, which is what withColumn() expects.
from pyspark.sql.functions import lit

# withColumn() returns a new DataFrame; rebinding df_pyspark keeps 'Status' available to later cells.
df_pyspark = df_pyspark.withColumn('Status',lit('Yes'))
df_pyspark.show()

+----------+----+------+------+---+------+----+------+
|total_bill| tip|   sex|smoker|day|  time|size|Status|
+----------+----+------+------+---+------+----+------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|   Yes|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|   Yes|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|   Yes|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|   Yes|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|   Yes|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|   Yes|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|   Yes|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|   Yes|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|   Yes|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|   Yes|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2|   Yes|
|     35.26| 5.0|Female|    No|Sun|Dinner|   4|   Yes|
|     15.42|1.57|  Male|    No|Sun|Dinner|   2|   Yes|
|     18.43| 3.0|  Male|    No|Sun|Dinner|   4|   Yes|
|     14.83|3.02|Female|    No|Sun|Dinner|   2|   Yes|
|     21.5

In [82]:
# Add a new column by concatenating 'day' and 'time' with a '-' separator.
# NOTE: lit() is not imported here; this cell relies on the import made in the previous cell.
from pyspark.sql.functions import concat, col

# The separator is passed as lit('-') because concat() treats a bare string as a column name.
df_pyspark = df_pyspark.withColumn("daytime", concat(col("day"),lit('-'),col("time")))
df_pyspark.show(5)

+----------+----+------+------+---+------+----+------+----------+
|total_bill| tip|   sex|smoker|day|  time|size|Status|   daytime|
+----------+----+------+------+---+------+----+------+----------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|   Yes|Sun-Dinner|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|   Yes|Sun-Dinner|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|   Yes|Sun-Dinner|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|   Yes|Sun-Dinner|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|   Yes|Sun-Dinner|
+----------+----+------+------+---+------+----+------+----------+
only showing top 5 rows



## **Drop Columns**

In [83]:
# NOTE(review): df_pyspark has no column named 'New_Column', and drop() silently ignores
# names that are not in the schema, so this call removes nothing -- the output below still
# lists every column. To actually demonstrate dropping, pass an existing column such as 'Status'.
df_pyspark.drop('New_Column').show()

+----------+----+------+------+---+------+----+------+----------+
|total_bill| tip|   sex|smoker|day|  time|size|Status|   daytime|
+----------+----+------+------+---+------+----+------+----------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|   Yes|Sun-Dinner|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|   Yes|Sun-Dinner|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|   Yes|Sun-Dinner|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|   Yes|Sun-Dinner|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|   Yes|Sun-Dinner|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|   Yes|Sun-Dinner|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|   Yes|Sun-Dinner|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|   Yes|Sun-Dinner|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|   Yes|Sun-Dinner|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|   Yes|Sun-Dinner|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2|   Yes|Sun-Dinner|
|     35.26| 5.0|Female|    No|Sun|Dinner|   4|   Yes|Sun-Dinner|
|     15.4

## **Rename Columns**

In [84]:
# Rename 'total_bill' to 'bill'. The result is not assigned, so df_pyspark keeps its original column name.
df_pyspark.withColumnRenamed("total_bill", "bill").show()

+-----+----+------+------+---+------+----+------+----------+
| bill| tip|   sex|smoker|day|  time|size|Status|   daytime|
+-----+----+------+------+---+------+----+------+----------+
|16.99|1.01|Female|    No|Sun|Dinner|   2|   Yes|Sun-Dinner|
|10.34|1.66|  Male|    No|Sun|Dinner|   3|   Yes|Sun-Dinner|
|21.01| 3.5|  Male|    No|Sun|Dinner|   3|   Yes|Sun-Dinner|
|23.68|3.31|  Male|    No|Sun|Dinner|   2|   Yes|Sun-Dinner|
|24.59|3.61|Female|    No|Sun|Dinner|   4|   Yes|Sun-Dinner|
|25.29|4.71|  Male|    No|Sun|Dinner|   4|   Yes|Sun-Dinner|
| 8.77| 2.0|  Male|    No|Sun|Dinner|   2|   Yes|Sun-Dinner|
|26.88|3.12|  Male|    No|Sun|Dinner|   4|   Yes|Sun-Dinner|
|15.04|1.96|  Male|    No|Sun|Dinner|   2|   Yes|Sun-Dinner|
|14.78|3.23|  Male|    No|Sun|Dinner|   2|   Yes|Sun-Dinner|
|10.27|1.71|  Male|    No|Sun|Dinner|   2|   Yes|Sun-Dinner|
|35.26| 5.0|Female|    No|Sun|Dinner|   4|   Yes|Sun-Dinner|
|15.42|1.57|  Male|    No|Sun|Dinner|   2|   Yes|Sun-Dinner|
|18.43| 3.0|  Male|    N

# **Data Cleaning**

In [85]:
# Read a new dataset that contains missing values (shown as NULL in the output).
df = spark.read.csv('test2.csv', header=True, inferSchema=True)

# Show the full dataset.
df.show()

+---------+----+----------+------+
|     Name| age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         1| 15000|
|  Shubham|  23|         2| 18000|
|   Mahesh|NULL|      NULL| 40000|
|     NULL|  34|        10| 38000|
|     NULL|  36|      NULL|  NULL|
+---------+----+----------+------+



## **Drop All Missing Values**

In [90]:
# Drop rows that contain missing values.
# With no arguments, na.drop() behaves like how='any': a row is removed if any of its columns is null.

df.na.drop().show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [91]:
# how='any': drop a row if at least one of its values is null (this is also the default behaviour).

df.na.drop(how='any').show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [93]:
# how='all': drop a row only if every value in it is null.
# No row in this dataset is entirely null, so nothing is removed.

df.na.drop(how='all').show()

+---------+----+----------+------+
|     Name| age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         1| 15000|
|  Shubham|  23|         2| 18000|
|   Mahesh|NULL|      NULL| 40000|
|     NULL|  34|        10| 38000|
|     NULL|  36|      NULL|  NULL|
+---------+----+----------+------+



In [105]:
# thresh=1: keep only rows that have at least 1 non-null value.
# When thresh is given it overrides `how`, so `how` is not passed.

df.na.drop(thresh=1).show()

+---------+----+----------+------+
|     Name| age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         1| 15000|
|  Shubham|  23|         2| 18000|
|   Mahesh|NULL|      NULL| 40000|
|     NULL|  34|        10| 38000|
|     NULL|  36|      NULL|  NULL|
+---------+----+----------+------+



In [106]:
# thresh=2: keep only rows that have at least 2 non-null values.
# When thresh is given it overrides `how`, so `how` is not passed.

df.na.drop(thresh=2).show()

+---------+----+----------+------+
|     Name| age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         1| 15000|
|  Shubham|  23|         2| 18000|
|   Mahesh|NULL|      NULL| 40000|
|     NULL|  34|        10| 38000|
+---------+----+----------+------+



In [107]:
# thresh=3: keep only rows that have at least 3 non-null values.
# When thresh is given it overrides `how`, so `how` is not passed.

df.na.drop(thresh=3).show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
|     NULL| 34|        10| 38000|
+---------+---+----------+------+



In [108]:
# thresh=4: keep only rows that have at least 4 non-null values
# (with 4 columns, that means rows with no nulls at all).
# When thresh is given it overrides `how`, so `how` is not passed.

df.na.drop(thresh=4).show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [110]:
# thresh=1 with subset: only 'Name' and 'age' are inspected, and a row is kept
# when at least 1 of those two columns is non-null.
# When thresh is given it overrides `how`, so `how` is not passed.

df.na.drop(thresh=1, subset=['Name','age']).show()

+---------+----+----------+------+
|     Name| age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         1| 15000|
|  Shubham|  23|         2| 18000|
|   Mahesh|NULL|      NULL| 40000|
|     NULL|  34|        10| 38000|
|     NULL|  36|      NULL|  NULL|
+---------+----+----------+------+



## **Replace Missing Values**

In [113]:
# Fill missing values: replace nulls in the 'age' column with 18.
# The column name is spelled exactly as in the schema so the call does not depend on
# Spark's case-insensitive column resolution.
df.na.fill(18, subset=['age']).show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
|   Mahesh| 18|      NULL| 40000|
|     NULL| 34|        10| 38000|
|     NULL| 36|      NULL|  NULL|
+---------+---+----------+------+



In [114]:
# df itself is unchanged: na.fill() returned a new DataFrame that was only displayed, never assigned.
df.show()

+---------+----+----------+------+
|     Name| age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         1| 15000|
|  Shubham|  23|         2| 18000|
|   Mahesh|NULL|      NULL| 40000|
|     NULL|  34|        10| 38000|
|     NULL|  36|      NULL|  NULL|
+---------+----+----------+------+



In [116]:
# Impute missing values with the column mean
from pyspark.ml.feature import Imputer

# Numeric columns to impute; each one gets a companion '<name>_imputed' output column.
impute_cols = ['age','Experience','Salary']
imputed_names = ["{}_imputed".format(name) for name in impute_cols]

imputer = Imputer(strategy="mean", inputCols=impute_cols, outputCols=imputed_names)

# fit() learns the per-column means from df; transform() appends the imputed columns
# next to the original ones.
imputer_model = imputer.fit(df)
imputer_model.transform(df).show()

+---------+----+----------+------+-----------+------------------+--------------+
|     Name| age|Experience|Salary|age_imputed|Experience_imputed|Salary_imputed|
+---------+----+----------+------+-----------+------------------+--------------+
|    Krish|  31|        10| 30000|         31|                10|         30000|
|Sudhanshu|  30|         8| 25000|         30|                 8|         25000|
|    Sunny|  29|         4| 20000|         29|                 4|         20000|
|     Paul|  24|         3| 20000|         24|                 3|         20000|
|   Harsha|  21|         1| 15000|         21|                 1|         15000|
|  Shubham|  23|         2| 18000|         23|                 2|         18000|
|   Mahesh|NULL|      NULL| 40000|         28|                 5|         40000|
|     NULL|  34|        10| 38000|         34|                10|         38000|
|     NULL|  36|      NULL|  NULL|         36|                 5|         25750|
+---------+----+----------+-

# **Data Filtering**

- EQUAL (==)
- NOT (~) — negates a condition, e.g. to express NOT EQUAL TO
- AND (&)
- OR (|)

In [144]:
# Load a dataset with no missing values for the filtering examples.
# This rebinds df, replacing the test2.csv DataFrame used in the data-cleaning section.
df = spark.read.csv("test1.csv", header=True, inferSchema=True)
df.show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



## **Equal To**

In [120]:
# Equal To: keep the rows whose Salary is exactly 20000.
df.filter(df['Salary'] == 20000).show()

+-----+---+----------+------+
| Name|age|Experience|Salary|
+-----+---+----------+------+
|Sunny| 29|         4| 20000|
| Paul| 24|         3| 20000|
+-----+---+----------+------+



In [121]:
# Greater Than Or Equal To: keep the rows whose Salary is at least 20000.
df.filter(df['Salary'] >= 20000).show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
+---------+---+----------+------+



## **Not Equal To**

In [124]:
# ~ negates the boolean Column produced by the comparison.
# The inner parentheses are required because ~ binds tighter than == in Python.
df.filter(~(df["Salary"]==20000)).show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



## **Multiple Filters With AND**

In [126]:
# Combine conditions with & (both must hold).
# Each comparison is parenthesised because & binds tighter than < and > in Python.
df.filter((df["Salary"]<20000) & (df["Salary"]>15000)).show()

+-------+---+----------+------+
|   Name|age|Experience|Salary|
+-------+---+----------+------+
|Shubham| 23|         2| 18000|
+-------+---+----------+------+



## **Multiple Filters With OR**

In [127]:
# Combine conditions with | (either may hold).
# Each comparison is parenthesised because | binds tighter than < and > in Python.
df.filter((df["Salary"]>20000) | (df["Salary"]<15000)).show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
+---------+---+----------+------+



# **Data Sorting**

In [151]:
# Sort by age, highest first.
# The column name is spelled exactly as in the schema ('age') so the lookup does not
# depend on Spark's case-insensitive column resolution.
df.orderBy(df["age"], ascending=False).show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|  Shubham| 23|         2| 18000|
|   Harsha| 21|         1| 15000|
+---------+---+----------+------+



# **Grouping & Aggregation**

In [152]:
# Load the dataset used for the grouping examples; this rebinds df once more.
df = spark.read.csv("test3.csv", header=True, inferSchema=True)

df.show()

+---------+------------+------+
|     Name| Departments|salary|
+---------+------------+------+
|    Krish|Data Science| 10000|
|    Krish|         IOT|  5000|
|   Mahesh|    Big Data|  4000|
|    Krish|    Big Data|  4000|
|   Mahesh|Data Science|  3000|
|Sudhanshu|Data Science| 20000|
|Sudhanshu|         IOT| 10000|
|Sudhanshu|    Big Data|  5000|
|    Sunny|Data Science| 10000|
|    Sunny|    Big Data|  2000|
+---------+------------+------+



## **Grouping & Aggregations**

In [143]:
# groupBy() on its own returns a GroupedData object (see the repr below);
# an aggregation such as sum(), max() or count() must be applied to get a DataFrame back.
df.groupBy("Name")

GroupedData[grouping expressions: [Name], value: [Name: string, Departments: string ... 1 more field], type: GroupBy]

In [131]:
# Summation of Salary by Name
# The column in the file is 'salary'; 'Salary' still resolves to it, and the spelling
# used here is what appears in the result header, sum(Salary).
df.groupBy("Name").sum("Salary").show()

+---------+-----------+
|     Name|sum(Salary)|
+---------+-----------+
|Sudhanshu|      35000|
|    Sunny|      12000|
|    Krish|      19000|
|   Mahesh|       7000|
+---------+-----------+



In [134]:
# Max Salary By Department: one output row per distinct value of 'Departments'.
df.groupBy('Departments').max('Salary').show()

+------------+-----------+
| Departments|max(Salary)|
+------------+-----------+
|         IOT|      10000|
|    Big Data|       5000|
|Data Science|      20000|
+------------+-----------+



In [135]:
# Mean Salary By Department; the result column is labelled avg(Salary) and is a floating-point value.
df.groupBy('Departments').mean('Salary').show()

+------------+-----------+
| Departments|avg(Salary)|
+------------+-----------+
|         IOT|     7500.0|
|    Big Data|     3750.0|
|Data Science|    10750.0|
+------------+-----------+



In [136]:
# Count By Department: number of rows in each 'Departments' group.
df.groupBy('Departments').count().show()

+------------+-----+
| Departments|count|
+------------+-----+
|         IOT|    2|
|    Big Data|    4|
|Data Science|    4|
+------------+-----+



## **Aggregation Only** 

In [137]:
# Aggregate over the whole DataFrame (no grouping) using a {column: function} mapping;
# the result is a one-row DataFrame.
df.agg({'Salary':'sum'}).show()

+-----------+
|sum(Salary)|
+-----------+
|      73000|
+-----------+



# **References**

- https://github.com/krishnaik06/Pyspark-With-Python 
- https://www.youtube.com/watch?v=_C8kWso4ne4