In [0]:
%pip install pyspark
# %pip (instead of !pip) installs into the environment of the kernel that runs this
# notebook; !pip runs whichever pip is first on the shell PATH, which may be a
# different interpreter.
import pyspark
from pyspark.sql import SparkSession

# getOrCreate() returns the already-active session if there is one; otherwise it
# starts a new session registered under the application name "tutorials".
spark = SparkSession.builder.appName("tutorials").getOrCreate()

You should consider upgrading via the '/local_disk0/.ephemeral_nfs/envs/pythonEnv-3c77d121-ce11-48bc-bcc6-84bcaa893839/bin/python -m pip install --upgrade pip' command.[0m


In [0]:
# Read the sample CSV into a DataFrame. The first line of the file is used as the
# column names (header) and column types are inferred from the data (inferSchema).
SAMPLE_CSV_PATH = "/FileStore/tables/PySparkTutoria/sample3.csv"
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(SAMPLE_CSV_PATH)
)
df.show()

+---------+----+----------+------+
|     Name| Age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         1| 15000|
|   Subham|  23|         2| 18000|
|   Mahesh|null|      null| 40000|
|     null|  34|        10| 38000|
|     null|  36|      null|  null|
+---------+----+----------+------+



In [0]:
# Remove every record that contains at least one null value.
# na.drop() defaults to how="any", so a single null in any column is enough to
# drop the row; only the fully populated rows remain.
df.na.drop().show()

+---------+---+----------+------+
|     Name|Age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|   Subham| 23|         2| 18000|
+---------+---+----------+------+



In [0]:
# Parameters of na.drop()
# 1) how: either "any" or "all". With "any" (the default), a row is dropped if it contains at least one null value. With "all", a row is dropped only if every value in it is null.

+---------+----+----------+------+
|     Name| Age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         1| 15000|
|   Subham|  23|         2| 18000|
|   Mahesh|null|      null| 40000|
|     null|  34|        10| 38000|
|     null|  36|      null|  null|
+---------+----+----------+------+



In [0]:
# how="all" drops a row only when every column in it is null. No row in this
# dataset is entirely null, so all nine rows are kept.
rows_not_entirely_null = df.na.drop(how="all")
rows_not_entirely_null.show()

+---------+----+----------+------+
|     Name| Age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         1| 15000|
|   Subham|  23|         2| 18000|
|   Mahesh|null|      null| 40000|
|     null|  34|        10| 38000|
|     null|  36|      null|  null|
+---------+----+----------+------+



In [0]:
# how="any" drops a row as soon as one of its columns is null, leaving only the
# fully populated rows.
rows_fully_populated = df.na.drop(how="any")
rows_fully_populated.show()

+---------+---+----------+------+
|     Name|Age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|   Subham| 23|         2| 18000|
+---------+---+----------+------+



In [0]:
# 2) thresh: an integer giving the minimum number of non-null values a row must contain to be kept. Rows with fewer non-null values are dropped. When thresh is given, it takes precedence over how.

In [0]:
# Keep only rows that have at least two non-null values; here that removes just the
# row whose only non-null value is Age.
# `how` is deliberately not passed: when thresh is specified it overrides how, so
# passing how="any" alongside it has no effect and only misleads the reader.
df.na.drop(thresh=2).show()

+---------+----+----------+------+
|     Name| Age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         1| 15000|
|   Subham|  23|         2| 18000|
|   Mahesh|null|      null| 40000|
|     null|  34|        10| 38000|
+---------+----+----------+------+



In [0]:
# 3) subset: restricts the null check to the listed columns. For example, the following cell drops every row where Experience is null, regardless of nulls in other columns.

In [0]:
# Only the Experience column is checked for nulls: a row is dropped when its
# Experience is null, while nulls in other columns (e.g. Name) are tolerated.
rows_with_experience = df.na.drop(subset=["Experience"])
rows_with_experience.show()

+---------+---+----------+------+
|     Name|Age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|   Subham| 23|         2| 18000|
|     null| 34|        10| 38000|
+---------+---+----------+------+



In [0]:
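# Filling null values with a replacement value using na.fill()
# There are 2 parameters here.
# 1) value: mandatory; the value that replaces null. Columns whose data type does not match the type of value are left unchanged.
# 2) subset: optional; limits the replacement to the listed columns, similar to subset in na.drop().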

In [0]:
# Replace nulls in the Name column with the placeholder "Missing"; nulls in the
# other columns are left as they are.
names_filled = df.na.fill(value="Missing", subset=["Name"])
names_filled.show()

+---------+----+----------+------+
|     Name| Age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         1| 15000|
|   Subham|  23|         2| 18000|
|   Mahesh|null|      null| 40000|
|  Missing|  34|        10| 38000|
|  Missing|  36|      null|  null|
+---------+----+----------+------+



In [0]:
# Handling missing values in specific columns based on the mean or median of each column. We need the Imputer transformer from pyspark.ml.feature for that.

In [0]:
# Fill the missing Experience values with the mean of the Experience column. The
# result is written to a new Experience_Imputed column, so the original column
# stays available for comparison.
from pyspark.ml.feature import Imputer
imputer = Imputer(
    strategy='mean',
    inputCols=['Experience'],
    outputCols=['Experience_Imputed'],
)
# fit() learns the fill value from df; transform() appends the imputed column.
# NOTE(review): the displayed fill value is 5 although the mean of the non-null
# Experience values is 38/7 ≈ 5.43 — presumably because the output column keeps
# the integer type of Experience. Cast the column to double first if the
# fractional part matters.
experience_imputer_model = imputer.fit(df)
experience_imputer_model.transform(df).show()

+---------+----+----------+------+------------------+
|     Name| Age|Experience|Salary|Experience_Imputed|
+---------+----+----------+------+------------------+
|    Krish|  31|        10| 30000|                10|
|Sudhanshu|  30|         8| 25000|                 8|
|    Sunny|  29|         4| 20000|                 4|
|     Paul|  24|         3| 20000|                 3|
|   Harsha|  21|         1| 15000|                 1|
|   Subham|  23|         2| 18000|                 2|
|   Mahesh|null|      null| 40000|                 5|
|     null|  34|        10| 38000|                10|
|     null|  36|      null|  null|                 5|
+---------+----+----------+------+------------------+

