# PySpark: Handling Missing Values

- Dropping Columns
- Dropping Rows
- Various parameters of the drop functionality (`how`, `thresh`, `subset`)
- Handling Missing Values by Mean, Median and Mode

In [1]:
from pyspark.sql import SparkSession
# Obtain the Spark session for this notebook via getOrCreate(),
# registering the application under the name 'Practice'.
spark = (
    SparkSession.builder
    .appName('Practice')
    .getOrCreate()
)

25/02/26 17:32:30 WARN Utils: Your hostname, Ardens-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.100.50 instead (on interface en0)
25/02/26 17:32:30 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/02/26 17:32:31 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Load the CSV with header=True (first line holds the column names) and
# inferSchema=True (let Spark infer each column's type).
df_pyspark = spark.read.csv(
    'name_data.csv',
    header=True,
    inferSchema=True,
)

In [3]:
# Preview the loaded data; the output shows NULLs in all four columns.
df_pyspark.show()

+-----------------+----+----------+------+
|             Name| Age|Experience|Salary|
+-----------------+----+----------+------+
|      Kaizer Tena|  32|        10|100000|
|       Lizel Cruz|  31|        12| 34000|
|             NULL|  35|        15| 58000|
|  Mabel Sarmiento|  28|        14| 65000|
| Aerold Sarmiento|  30|      NULL| 79000|
|  Arden Sarmiento|  29|         8|  NULL|
|  Aaron Sarmiento|  34|         9| 66000|
|Abigael Sarmiento|NULL|         7| 99000|
|             NULL|  36|      NULL| 11000|
+-----------------+----+----------+------+



In [4]:
## Drop a column: drop() returns a new DataFrame without 'Name';
## df_pyspark itself is not modified (see the next cell's output).
df_pyspark.drop('Name').show()

+----+----------+------+
| Age|Experience|Salary|
+----+----------+------+
|  32|        10|100000|
|  31|        12| 34000|
|  35|        15| 58000|
|  28|        14| 65000|
|  30|      NULL| 79000|
|  29|         8|  NULL|
|  34|         9| 66000|
|NULL|         7| 99000|
|  36|      NULL| 11000|
+----+----------+------+



In [5]:
# The original DataFrame still contains the 'Name' column.
df_pyspark.show()

+-----------------+----+----------+------+
|             Name| Age|Experience|Salary|
+-----------------+----+----------+------+
|      Kaizer Tena|  32|        10|100000|
|       Lizel Cruz|  31|        12| 34000|
|             NULL|  35|        15| 58000|
|  Mabel Sarmiento|  28|        14| 65000|
| Aerold Sarmiento|  30|      NULL| 79000|
|  Arden Sarmiento|  29|         8|  NULL|
|  Aaron Sarmiento|  34|         9| 66000|
|Abigael Sarmiento|NULL|         7| 99000|
|             NULL|  36|      NULL| 11000|
+-----------------+----+----------+------+



In [6]:
# Drop every row that contains at least one null value
# (called with no arguments; the result matches how="any" in the next cell).
df_pyspark.na.drop().show()  

+---------------+---+----------+------+
|           Name|Age|Experience|Salary|
+---------------+---+----------+------+
|    Kaizer Tena| 32|        10|100000|
|     Lizel Cruz| 31|        12| 34000|
|Mabel Sarmiento| 28|        14| 65000|
|Aaron Sarmiento| 34|         9| 66000|
+---------------+---+----------+------+



In [7]:
# how="any" drops a row if at least one of its columns is null
# (the alternative, how="all", requires every column to be null - see the PySpark docs).

df_pyspark.na.drop(how="any").show()

+---------------+---+----------+------+
|           Name|Age|Experience|Salary|
+---------------+---+----------+------+
|    Kaizer Tena| 32|        10|100000|
|     Lizel Cruz| 31|        12| 34000|
|Mabel Sarmiento| 28|        14| 65000|
|Aaron Sarmiento| 34|         9| 66000|
+---------------+---+----------+------+



In [8]:
## Threshold: thresh=N keeps only the rows that have at least N non-null values.
## `how` is ignored whenever `thresh` is given, so it is not passed here.
## With 4 columns, thresh=4 keeps only fully populated rows (same result as how="any").
df_pyspark.na.drop(thresh=4).show()


+---------------+---+----------+------+
|           Name|Age|Experience|Salary|
+---------------+---+----------+------+
|    Kaizer Tena| 32|        10|100000|
|     Lizel Cruz| 31|        12| 34000|
|Mabel Sarmiento| 28|        14| 65000|
|Aaron Sarmiento| 34|         9| 66000|
+---------------+---+----------+------+



In [9]:
## Subset: only the listed columns are checked for nulls. Rows whose 'Name'
## is null are dropped; nulls in the other columns are kept.
df_pyspark.na.drop(
    how="any",
    subset=['Name'],
).show()


+-----------------+----+----------+------+
|             Name| Age|Experience|Salary|
+-----------------+----+----------+------+
|      Kaizer Tena|  32|        10|100000|
|       Lizel Cruz|  31|        12| 34000|
|  Mabel Sarmiento|  28|        14| 65000|
| Aerold Sarmiento|  30|      NULL| 79000|
|  Arden Sarmiento|  29|         8|  NULL|
|  Aaron Sarmiento|  34|         9| 66000|
|Abigael Sarmiento|NULL|         7| 99000|
+-----------------+----+----------+------+



In [10]:
# Filling the missing values: with a string fill value, only the nulls in the
# string column ('Name') are replaced; the numeric columns keep their NULLs,
# as the output below shows.
df_pyspark.na.fill('Missing Values').show()

+-----------------+----+----------+------+
|             Name| Age|Experience|Salary|
+-----------------+----+----------+------+
|      Kaizer Tena|  32|        10|100000|
|       Lizel Cruz|  31|        12| 34000|
|   Missing Values|  35|        15| 58000|
|  Mabel Sarmiento|  28|        14| 65000|
| Aerold Sarmiento|  30|      NULL| 79000|
|  Arden Sarmiento|  29|         8|  NULL|
|  Aaron Sarmiento|  34|         9| 66000|
|Abigael Sarmiento|NULL|         7| 99000|
|   Missing Values|  36|      NULL| 11000|
+-----------------+----+----------+------+



In [11]:
# Filling the missing values in selected columns only: `subset` restricts
# the fill to the listed columns (here just 'Name').
df_pyspark.na.fill(value='Missing Values', subset=['Name']).show()

+-----------------+----+----------+------+
|             Name| Age|Experience|Salary|
+-----------------+----+----------+------+
|      Kaizer Tena|  32|        10|100000|
|       Lizel Cruz|  31|        12| 34000|
|   Missing Values|  35|        15| 58000|
|  Mabel Sarmiento|  28|        14| 65000|
| Aerold Sarmiento|  30|      NULL| 79000|
|  Arden Sarmiento|  29|         8|  NULL|
|  Aaron Sarmiento|  34|         9| 66000|
|Abigael Sarmiento|NULL|         7| 99000|
|   Missing Values|  36|      NULL| 11000|
+-----------------+----+----------+------+



In [12]:
# df_pyspark is unchanged by the fill() calls above: its NULLs are still present.
df_pyspark.show()

+-----------------+----+----------+------+
|             Name| Age|Experience|Salary|
+-----------------+----+----------+------+
|      Kaizer Tena|  32|        10|100000|
|       Lizel Cruz|  31|        12| 34000|
|             NULL|  35|        15| 58000|
|  Mabel Sarmiento|  28|        14| 65000|
| Aerold Sarmiento|  30|      NULL| 79000|
|  Arden Sarmiento|  29|         8|  NULL|
|  Aaron Sarmiento|  34|         9| 66000|
|Abigael Sarmiento|NULL|         7| 99000|
|             NULL|  36|      NULL| 11000|
+-----------------+----+----------+------+



In [19]:
from pyspark.ml.feature import Imputer

# Columns whose nulls should be imputed; defined once so the input and
# output column lists cannot drift apart.
impute_cols = ['Age', 'Experience', 'Salary']

# Each input column gets a companion "<name>_imputed" output column in which
# nulls are replaced using the "mean" strategy.
imputer = Imputer(
    inputCols=impute_cols,
    outputCols=[f"{col}_imputed" for col in impute_cols],
    strategy="mean",
)

In [None]:
# Fit the imputer on df_pyspark, then transform the same DataFrame to append
# the *_imputed columns; the original columns are kept alongside them.
# NOTE(review): in the output the filled values are whole numbers (e.g.
# Experience -> 10 although the mean of the non-null values is ~10.7) -
# presumably because the input columns are integer-typed; confirm if exact
# means matter.
imputer.fit(df_pyspark).transform(df_pyspark).show()

+-----------------+----+----------+------+-----------+------------------+--------------+
|             Name| Age|Experience|Salary|Age_imputed|Experience_imputed|Salary_imputed|
+-----------------+----+----------+------+-----------+------------------+--------------+
|      Kaizer Tena|  32|        10|100000|         32|                10|        100000|
|       Lizel Cruz|  31|        12| 34000|         31|                12|         34000|
|             NULL|  35|        15| 58000|         35|                15|         58000|
|  Mabel Sarmiento|  28|        14| 65000|         28|                14|         65000|
| Aerold Sarmiento|  30|      NULL| 79000|         30|                10|         79000|
|  Arden Sarmiento|  29|         8|  NULL|         29|                 8|         64000|
|  Aaron Sarmiento|  34|         9| 66000|         34|                 9|         66000|
|Abigael Sarmiento|NULL|         7| 99000|         31|                 7|         99000|
|             NULL|  

25/02/26 18:14:31 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 904335 ms exceeds timeout 120000 ms
25/02/26 18:14:31 WARN SparkContext: Killing executors is not supported by current scheduler.
25/02/26 18:14:32 WARN Executor: Issue communicating with driver in heartbeater
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEndpointRef.askSync(RpcEndpointRef.scala:101)
	at org.apache.spark.rpc.RpcEndpointRef.askSync(RpcEndpointRef.scala:85)
	at org.apache.spark.storage.BlockManagerMaster.registerBlockManager(BlockManagerMaster.scala:80)
	at org.apache.spark.storage.BlockManager.reregister(BlockManager.scala:642)
	at org.apache.spark.executor.Executor.reportHeartBeat(Executor.scala:1223)
	at o