### Pyspark Handling missing values
- Dropping columns
- Dropping rows
- Various parameters of the dropping functions (`how`, `thresh`, `subset`)
- Handling Missing values by mean, median and mode

In [1]:
from pyspark.sql import SparkSession

In [2]:
# Build (or reuse) the Spark session used by every cell in this notebook.
spark = (
    SparkSession.builder
    .appName('missing values')
    .getOrCreate()
)

22/07/27 17:44:56 WARN Utils: Your hostname, Ayushmans-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 192.168.1.66 instead (on interface en0)
22/07/27 17:44:56 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/07/27 17:44:56 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/07/27 17:44:57 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
# Load the sample CSV: take column names from the header row and infer column types.
df_spark = spark.read.csv(
    'Pyspark-With-Python-main/test2.csv',
    header=True,
    inferSchema=True,
)

In [7]:
# Show the column names and the types that were inferred when the CSV was read
df_spark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [12]:
### Drop a column: display the data without 'Name'
df_spark.drop('Name').show()

+----+----------+------+
| age|Experience|Salary|
+----+----------+------+
|  31|        10| 30000|
|  30|         8| 25000|
|  29|         4| 20000|
|  24|         3| 20000|
|  21|         1| 15000|
|  23|         2| 18000|
|null|      null| 40000|
|  34|        10| 38000|
|  36|      null|  null|
+----+----------+------+



In [17]:
### Drop every row that contains at least one null value
df_spark.na.drop().show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [20]:
### .drop(how)
## how='all' - drop a row only when every value in it is null
## how='any' - drop a row as soon as it contains a single null (what .na.drop() did above)
df_spark.na.drop(how='all').show()

+---------+----+----------+------+
|     Name| age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         1| 15000|
|  Shubham|  23|         2| 18000|
|   Mahesh|null|      null| 40000|
|     null|  34|        10| 38000|
|     null|  36|      null|  null|
+---------+----+----------+------+



In [25]:
### .drop(thresh)
## thresh=3 keeps only rows with at least 3 non-null values; rows with fewer are dropped
df_spark.na.drop(thresh=3).show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
|     null| 34|        10| 38000|
+---------+---+----------+------+



In [27]:
### .drop(subset)
### subset limits the null check to the listed columns
### subset=['Experience'] drops only the rows where Experience is null
df_spark.na.drop(subset=['Experience']).show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
|     null| 34|        10| 38000|
+---------+---+----------+------+



### Filling Missing Values

In [29]:
### .fill(value)
### value - what to put in place of the null entries
### subset - which columns to apply the fill to (same idea as in drop)
### Note: in the output below this string value fills only Name; the integer columns keep their nulls
df_spark.na.fill(value='Missing Values').show()

+--------------+----+----------+------+
|          Name| age|Experience|Salary|
+--------------+----+----------+------+
|         Krish|  31|        10| 30000|
|     Sudhanshu|  30|         8| 25000|
|         Sunny|  29|         4| 20000|
|          Paul|  24|         3| 20000|
|        Harsha|  21|         1| 15000|
|       Shubham|  23|         2| 18000|
|        Mahesh|null|      null| 40000|
|Missing Values|  34|        10| 38000|
|Missing Values|  36|      null|  null|
+--------------+----+----------+------+



In [33]:
## Replacing nulls with the per-column median (the strategy configured here is 'median')
## The Imputer from pyspark.ml writes the filled values to new *_imputeds columns
from pyspark.ml.feature import Imputer
imputer = Imputer(
    strategy='median',
    inputCols=['age', 'Experience', 'Salary'],
    outputCols=[col + '_imputeds' for col in ['age', 'Experience', 'Salary']],
)

In [34]:
## Fit the imputer on df_spark, then display df_spark with the imputed columns appended
## (the transformed DataFrame is only shown here, not assigned to a name)
(
    imputer
    .fit(df_spark)
    .transform(df_spark)
    .show()
)

+---------+----+----------+------+------------+-------------------+---------------+
|     Name| age|Experience|Salary|age_imputeds|Experience_imputeds|Salary_imputeds|
+---------+----+----------+------+------------+-------------------+---------------+
|    Krish|  31|        10| 30000|          31|                 10|          30000|
|Sudhanshu|  30|         8| 25000|          30|                  8|          25000|
|    Sunny|  29|         4| 20000|          29|                  4|          20000|
|     Paul|  24|         3| 20000|          24|                  3|          20000|
|   Harsha|  21|         1| 15000|          21|                  1|          15000|
|  Shubham|  23|         2| 18000|          23|                  2|          18000|
|   Mahesh|null|      null| 40000|          29|                  4|          40000|
|     null|  34|        10| 38000|          34|                 10|          38000|
|     null|  36|      null|  null|          36|                  4|         