In [33]:
import os
import pyspark
from pyspark.sql import SparkSession
from pyspark.ml.feature import Imputer

In [2]:
# Point PySpark at the local JDK installation.
# NOTE(review): this is a machine-specific absolute path — adjust it when running elsewhere.
os.environ["JAVA_HOME"] = "C:/Program Files/Java/jdk-22"

# Put the JDK's bin directory first on PATH.
# os.pathsep is ";" on Windows and ":" on POSIX; a hard-coded ":" would fuse this
# entry with the one that follows it on Windows and invalidate both.
java_bin = os.environ["JAVA_HOME"] + "/bin"
# Skip the prepend when the entry is already first, so re-running the cell is harmless.
if not os.environ["PATH"].startswith(java_bin + os.pathsep):
    os.environ["PATH"] = java_bin + os.pathsep + os.environ["PATH"]

In [3]:
# Obtain the Spark session used by every later cell (an existing session is reused if there is one).
spark = (
    SparkSession.builder
    .appName("Handle Dataframe")
    .getOrCreate()
)

In [4]:
# Bare expression: the notebook renders the session object as this cell's output.
spark

In [24]:
# Load the sample CSV: the first row supplies the column names and column types are inferred from the data.
df_pyspark = spark.read.csv(
    "Datasets/testfile2.csv",
    header=True,
    inferSchema=True,
)

In [25]:
# Print the loaded rows as a text table; the nulls visible here are what the following cells handle.
df_pyspark.show()

+---------+----+----------+------+
|     Name| Age|Experience|Salary|
+---------+----+----------+------+
|     Yash|  23|         1|  8000|
|    Mohan|  25|         4| 25000|
|Sudhanshu|  30|         8| 35000|
|   Mahesh|NULL|         6| 32000|
|    Krish|  21|      NULL|  6000|
|    Harsh|  42|        16| 65000|
|  Shubham|  56|        23| 77000|
|     NULL|  26|      NULL|  NULL|
|     NULL|NULL|         6| 56000|
+---------+----+----------+------+



In [8]:
# With no arguments, a whole row is removed as soon as any one of its columns is null (or NaN).
df_pyspark.dropna().show()

+---------+---+----------+------+
|     Name|Age|Experience|Salary|
+---------+---+----------+------+
|     Yash| 23|         1|  8000|
|    Mohan| 25|         4| 25000|
|Sudhanshu| 30|         8| 35000|
|    Harsh| 42|        16| 65000|
|  Shubham| 56|        23| 77000|
+---------+---+----------+------+



In [9]:
# how="all": a row is removed only when every one of its columns is null.
# No row in this dataset is entirely null, so nothing is dropped.
df_pyspark.dropna(how="all").show()

+---------+----+----------+------+
|     Name| Age|Experience|Salary|
+---------+----+----------+------+
|     Yash|  23|         1|  8000|
|    Mohan|  25|         4| 25000|
|Sudhanshu|  30|         8| 35000|
|   Mahesh|NULL|         6| 32000|
|    Krish|  21|      NULL|  6000|
|    Harsh|  42|        16| 65000|
|  Shubham|  56|        23| 77000|
|     NULL|  26|      NULL|  NULL|
|     NULL|NULL|         6| 56000|
+---------+----+----------+------+



In [10]:
# how="any": a row is removed when at least one of its columns is null.
# This is the default, so the result matches a plain drop with no arguments.
df_pyspark.dropna(how="any").show()

+---------+---+----------+------+
|     Name|Age|Experience|Salary|
+---------+---+----------+------+
|     Yash| 23|         1|  8000|
|    Mohan| 25|         4| 25000|
|Sudhanshu| 30|         8| 35000|
|    Harsh| 42|        16| 65000|
|  Shubham| 56|        23| 77000|
+---------+---+----------+------+



In [17]:
# thresh=2: keep only the rows that have at least 2 non-null values; rows with fewer are dropped.
# `how` is ignored whenever `thresh` is supplied, so it is not passed here.
df_pyspark.na.drop(thresh=2).show()

+---------+----+----------+------+
|     Name| Age|Experience|Salary|
+---------+----+----------+------+
|     Yash|  23|         1|  8000|
|    Mohan|  25|         4| 25000|
|Sudhanshu|  30|         8| 35000|
|   Mahesh|NULL|         6| 32000|
|    Krish|  21|      NULL|  6000|
|    Harsh|  42|        16| 65000|
|  Shubham|  56|        23| 77000|
|     NULL|NULL|         6| 56000|
+---------+----+----------+------+



In [18]:
# thresh=3: keep only the rows that have at least 3 non-null values; rows with fewer are dropped.
# `how` is ignored whenever `thresh` is supplied, so it is not passed here.
df_pyspark.na.drop(thresh=3).show()

+---------+----+----------+------+
|     Name| Age|Experience|Salary|
+---------+----+----------+------+
|     Yash|  23|         1|  8000|
|    Mohan|  25|         4| 25000|
|Sudhanshu|  30|         8| 35000|
|   Mahesh|NULL|         6| 32000|
|    Krish|  21|      NULL|  6000|
|    Harsh|  42|        16| 65000|
|  Shubham|  56|        23| 77000|
+---------+----+----------+------+



In [19]:
# subset restricts the null check to the listed columns:
# only rows whose Experience is null are removed (how defaults to "any").
df_pyspark.dropna(subset=["Experience"]).show()

+---------+----+----------+------+
|     Name| Age|Experience|Salary|
+---------+----+----------+------+
|     Yash|  23|         1|  8000|
|    Mohan|  25|         4| 25000|
|Sudhanshu|  30|         8| 35000|
|   Mahesh|NULL|         6| 32000|
|    Harsh|  42|        16| 65000|
|  Shubham|  56|        23| 77000|
|     NULL|NULL|         6| 56000|
+---------+----+----------+------+



In [20]:
# subset restricts the null check to the listed columns:
# only rows whose Age is null are removed (how defaults to "any").
df_pyspark.dropna(subset=["Age"]).show()

+---------+---+----------+------+
|     Name|Age|Experience|Salary|
+---------+---+----------+------+
|     Yash| 23|         1|  8000|
|    Mohan| 25|         4| 25000|
|Sudhanshu| 30|         8| 35000|
|    Krish| 21|      NULL|  6000|
|    Harsh| 42|        16| 65000|
|  Shubham| 56|        23| 77000|
|     NULL| 26|      NULL|  NULL|
+---------+---+----------+------+



In [27]:
## Filling missing values
# A string replacement is applied only to string columns, so Name is filled while the
# integer columns (Age, Experience, Salary) keep their nulls in the output below.
# To get the placeholder into those columns as well, the CSV is re-read further down with
# inferSchema=False, which makes every column a string.
df_pyspark.fillna("Missing Value").show()

+-------------+----+----------+------+
|         Name| Age|Experience|Salary|
+-------------+----+----------+------+
|         Yash|  23|         1|  8000|
|        Mohan|  25|         4| 25000|
|    Sudhanshu|  30|         8| 35000|
|       Mahesh|NULL|         6| 32000|
|        Krish|  21|      NULL|  6000|
|        Harsh|  42|        16| 65000|
|      Shubham|  56|        23| 77000|
|Missing Value|  26|      NULL|  NULL|
|Missing Value|NULL|         6| 56000|
+-------------+----+----------+------+



In [31]:
# Reload the same file without schema inference so that every column is read as a string.
df_pyspark = spark.read.csv(
    "Datasets/testfile2.csv",
    header=True,
    inferSchema=False,
)

In [32]:
# All columns are strings at this point, so the placeholder replaces the nulls in every column.
df_pyspark.fillna("Missing Value").show()

+-------------+-------------+-------------+-------------+
|         Name|          Age|   Experience|       Salary|
+-------------+-------------+-------------+-------------+
|         Yash|           23|            1|         8000|
|        Mohan|           25|            4|        25000|
|    Sudhanshu|           30|            8|        35000|
|       Mahesh|Missing Value|            6|        32000|
|        Krish|           21|Missing Value|         6000|
|        Harsh|           42|           16|        65000|
|      Shubham|           56|           23|        77000|
|Missing Value|           26|Missing Value|Missing Value|
|Missing Value|Missing Value|            6|        56000|
+-------------+-------------+-------------+-------------+



In [37]:
# Fill nulls with the column mean (median or mode can be chosen as the strategy instead).
# The file has to be read with inferSchema=True so the numeric columns are typed as
# numbers rather than strings.
df_pyspark = spark.read.csv(
    "Datasets/testfile2.csv",
    header=True,
    inferSchema=True,
)

# Columns to impute; each one gets a companion "<name>_imputed" output column.
numeric_cols = ["Age", "Experience", "Salary"]
imputer = Imputer(
    strategy="mean",
    inputCols=numeric_cols,
    outputCols=[f"{col_name}_imputed" for col_name in numeric_cols],
)

# Fit on the data to learn the per-column fill values, then apply them to the same frame.
imputer_model = imputer.fit(df_pyspark)
imputer_model.transform(df_pyspark).show()

+---------+----+----------+------+-----------+------------------+--------------+
|     Name| Age|Experience|Salary|Age_imputed|Experience_imputed|Salary_imputed|
+---------+----+----------+------+-----------+------------------+--------------+
|     Yash|  23|         1|  8000|         23|                 1|          8000|
|    Mohan|  25|         4| 25000|         25|                 4|         25000|
|Sudhanshu|  30|         8| 35000|         30|                 8|         35000|
|   Mahesh|NULL|         6| 32000|         31|                 6|         32000|
|    Krish|  21|      NULL|  6000|         21|                 9|          6000|
|    Harsh|  42|        16| 65000|         42|                16|         65000|
|  Shubham|  56|        23| 77000|         56|                23|         77000|
|     NULL|  26|      NULL|  NULL|         26|                 9|         38000|
|     NULL|NULL|         6| 56000|         31|                 6|         56000|
+---------+----+----------+-