# PySpark: Handling Missing Values
- Dropping columns
- Dropping rows
- Various parameters of the drop functionality (`how`, `thresh`, `subset`)
- Handling missing values by imputing the mean, median, or mode


In [1]:
from pyspark.sql import SparkSession

In [3]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('Practice')
    .getOrCreate()
)
spark

In [9]:
# Load the sample data; the first line is the header and column types are inferred.
df_pyspark = spark.read.csv(
    'tut3-clearning.csv',
    header=True,
    inferSchema=True,
)

In [10]:
# Display the raw data, including the rows that contain nulls.
df_pyspark.show()

+-------+----+----------+------+
|   Name| Age|Experience|Salary|
+-------+----+----------+------+
|  Krish|  31|        10| 30000|
|    Sam|  30|         8| 25000|
|  Sunny|  29|         4| 20000|
|   Paul|  24|         3| 20000|
| Harsha|  21|         1| 15000|
|Shubham|  23|         2| 18000|
| Mahesh|null|      null| 40000|
|   null|  34|        10| 38000|
|   null|  36|      null|  null|
+-------+----+----------+------+



In [12]:
# Drop the 'Name' column. drop() returns a new DataFrame; df_pyspark itself
# is not modified (later cells still show the Name column).
df_pyspark.drop('Name').show()

+----+----------+------+
| Age|Experience|Salary|
+----+----------+------+
|  31|        10| 30000|
|  30|         8| 25000|
|  29|         4| 20000|
|  24|         3| 20000|
|  21|         1| 15000|
|  23|         2| 18000|
|null|      null| 40000|
|  34|        10| 38000|
|  36|      null|  null|
+----+----------+------+



In [13]:
# Remove every row that contains at least one null value.
# DataFrame.dropna() is the alias of DataFrame.na.drop().
df_pyspark.dropna().show()

+-------+---+----------+------+
|   Name|Age|Experience|Salary|
+-------+---+----------+------+
|  Krish| 31|        10| 30000|
|    Sam| 30|         8| 25000|
|  Sunny| 29|         4| 20000|
|   Paul| 24|         3| 20000|
| Harsha| 21|         1| 15000|
|Shubham| 23|         2| 18000|
+-------+---+----------+------+



In [16]:
### how='any' drops a row if any of its columns is null.
### The result is identical to calling na.drop() with no arguments.
df_pyspark.na.drop(how ='any').show()

+-------+---+----------+------+
|   Name|Age|Experience|Salary|
+-------+---+----------+------+
|  Krish| 31|        10| 30000|
|    Sam| 30|         8| 25000|
|  Sunny| 29|         4| 20000|
|   Paul| 24|         3| 20000|
| Harsha| 21|         1| 15000|
|Shubham| 23|         2| 18000|
+-------+---+----------+------+



In [21]:
# thresh=N keeps only rows with at least N non-null values.
# With thresh=0 no row can fall below the threshold, so every row is kept;
# use thresh=3 to require at least three non-null values per row.
df_pyspark.na.drop(thresh=0).show()


+-------+----+----------+------+
|   Name| Age|Experience|Salary|
+-------+----+----------+------+
|  Krish|  31|        10| 30000|
|    Sam|  30|         8| 25000|
|  Sunny|  29|         4| 20000|
|   Paul|  24|         3| 20000|
| Harsha|  21|         1| 15000|
|Shubham|  23|         2| 18000|
| Mahesh|null|      null| 40000|
|   null|  34|        10| 38000|
|   null|  36|      null|  null|
+-------+----+----------+------+



In [22]:
# Only the 'Experience' column is inspected: rows whose Experience is null
# are dropped, while nulls in other columns are left in place.
rows_with_experience = df_pyspark.na.drop(how='any', subset=['Experience'])
rows_with_experience.show()

+-------+---+----------+------+
|   Name|Age|Experience|Salary|
+-------+---+----------+------+
|  Krish| 31|        10| 30000|
|    Sam| 30|         8| 25000|
|  Sunny| 29|         4| 20000|
|   Paul| 24|         3| 20000|
| Harsha| 21|         1| 15000|
|Shubham| 23|         2| 18000|
|   null| 34|        10| 38000|
+-------+---+----------+------+



In [23]:
### Fill missing values with a placeholder string.
### A string replacement is applied only to string columns, so nulls in the
### numeric columns (Age, Experience, Salary) remain untouched.
### DataFrame.fillna() is the alias of DataFrame.na.fill().
df_pyspark.fillna('Missing Values').show()

+--------------+----+----------+------+
|          Name| Age|Experience|Salary|
+--------------+----+----------+------+
|         Krish|  31|        10| 30000|
|           Sam|  30|         8| 25000|
|         Sunny|  29|         4| 20000|
|          Paul|  24|         3| 20000|
|        Harsha|  21|         1| 15000|
|       Shubham|  23|         2| 18000|
|        Mahesh|null|      null| 40000|
|Missing Values|  34|        10| 38000|
|Missing Values|  36|      null|  null|
+--------------+----+----------+------+



In [25]:
from pyspark.ml.feature import Imputer


`Imputer` is an imputation estimator that completes missing values in numeric columns, using the mean, median, or mode of each column.


In [43]:
# Configure an Imputer that will replace nulls with the column mean.
# Nothing is replaced yet: the values are filled in when the imputer is
# fitted and the resulting model transforms a DataFrame.
#
# Input and output column names are derived from the same list so they can
# never get out of sync (previously the outputs came from the positional
# slice df_pyspark.columns[1:], which silently depends on the CSV layout).
imputed_cols = ['Age', 'Experience', 'Salary']
imputer = Imputer(
    inputCols=imputed_cols,
    outputCols=[col_name + "_imputed" for col_name in imputed_cols],
).setStrategy("mean")

In [44]:
# Fit the imputer to learn the replacement value for each input column,
# then add the *_imputed columns and display the result.
imputer_model = imputer.fit(df_pyspark)
imputer_model.transform(df_pyspark).show()

+-------+----+----------+------+-----------+------------------+--------------+
|   Name| Age|Experience|Salary|Age_imputed|Experience_imputed|Salary_imputed|
+-------+----+----------+------+-----------+------------------+--------------+
|  Krish|  31|        10| 30000|         31|                10|         30000|
|    Sam|  30|         8| 25000|         30|                 8|         25000|
|  Sunny|  29|         4| 20000|         29|                 4|         20000|
|   Paul|  24|         3| 20000|         24|                 3|         20000|
| Harsha|  21|         1| 15000|         21|                 1|         15000|
|Shubham|  23|         2| 18000|         23|                 2|         18000|
| Mahesh|null|      null| 40000|         28|                 5|         40000|
|   null|  34|        10| 38000|         34|                10|         38000|
|   null|  36|      null|  null|         36|                 5|         25750|
+-------+----+----------+------+-----------+--------