### Pyspark Handling Missing Values
- Dropping Columns
- Dropping Rows based on null values
- Various parameters of the drop functionality
- Handling missing values with mean, median and mode

In [2]:
# Mount Google Drive into the Colab runtime at /content/drive so the dataset stored there can be read
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Importing pyspark
import pyspark

In [4]:
# Import SparkSession and get (or create) a session with the app name 'HandNullVal'
# The session object is bound to the name `spark`
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('HandNullVal').getOrCreate()

In [5]:
# Display the details of the SparkSession object
spark

In [6]:
# Path of the 'PySpark_test2.csv' file on the mounted Google Drive
file_path = '/content/drive/MyDrive/Datasets/PySpark_test2.csv'

In [15]:
# Read the 'PySpark_test2.csv' file with the first row as header and with inferSchema = True
# so that the column types are inferred from the data
# The resulting Spark DataFrame is named df_org
df_org = spark.read.csv(file_path, header = True, inferSchema = True)

In [16]:
# Print the schema that was inferred for df_org
df_org.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [17]:
# Display the rows of the df_org DataFrame
df_org.show()

+---------+----+----------+------+
|     Name| age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         1| 15000|
|  Shubham|  23|         2| 18000|
|   Mahesh|NULL|      NULL| 40000|
|     NULL|  34|        10| 38000|
|     NULL|  36|      NULL|  NULL|
+---------+----+----------+------+



In [18]:
# Drop a column from df_org: here the 'Name' column
# drop() is not an in-place operation; it returns a new DataFrame, so assign the result to a variable
df_dropname = df_org.drop('Name')

In [19]:
# Check the df_dropname DataFrame, i.e. df_org without the 'Name' column
# The .show() function is used
df_dropname.show()

+----+----------+------+
| age|Experience|Salary|
+----+----------+------+
|  31|        10| 30000|
|  30|         8| 25000|
|  29|         4| 20000|
|  24|         3| 20000|
|  21|         1| 15000|
|  23|         2| 18000|
|NULL|      NULL| 40000|
|  34|        10| 38000|
|  36|      NULL|  NULL|
+----+----------+------+



In [20]:
# Handling missing values: drop rows having null entries
# To do that, the DataFrame's .na attribute (no parentheses) is used together with its drop() function
# Through the same .na attribute we can fill, replace or remove entries
# This is not an in-place operation; the result must be assigned to a variable
df_dropnullrows = df_org.na.drop()

In [21]:
# Check the df_dropnullrows DataFrame, which has fewer rows than df_org
df_dropnullrows.show() # It has no missing values

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [22]:
# Use the how = 'any' parameter for dropping rows with missing values from df_org
# If a row contains at least one missing value, the row is dropped
df_dropany = df_org.na.drop(how = 'any')

In [24]:
# Check the result of the df_dropany DataFrame
df_dropany.show() # Same result as df_org.na.drop() called without arguments

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [25]:
# Use the how = 'all' parameter for dropping rows with missing values from df_org
# A row is dropped only if all of its values are missing; a single non-null value is enough to keep the row
df_dropall = df_org.na.drop(how = 'all')

In [27]:
# Check the result of the df_dropall DataFrame
df_dropall.show() # It gives back all rows of df_org, as no row has all values missing

+---------+----+----------+------+
|     Name| age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         1| 15000|
|  Shubham|  23|         2| 18000|
|   Mahesh|NULL|      NULL| 40000|
|     NULL|  34|        10| 38000|
|     NULL|  36|      NULL|  NULL|
+---------+----+----------+------+



In [33]:
# Dropping rows with a threshold on the number of non-null values
# thresh = 3 keeps only the rows that have at least 3 non-null values;
# rows with fewer than 3 non-null values are dropped
# Here that drops the 'Mahesh' row (2 non-null values) and the last row (1 non-null value)
# Note: when thresh is specified it overrides how, so how is not passed here
df_dropthresh3 = df_org.na.drop(thresh = 3)

In [34]:
# Check the result of dropping rows with thresh = 3
df_dropthresh3.show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
|     NULL| 34|        10| 38000|
+---------+---+----------+------+



In [35]:
# Dropping rows from df_org based on a specified subset of columns
# This is useful to drop rows according to the null values of specific columns only
# Here the rows with a null value in the 'age' column are dropped, with how set to 'any' and no threshold
df_dropsubset = df_org.na.drop(how = 'any', subset=['age'])

In [37]:
# Check the result of the df_dropsubset DataFrame
df_dropsubset.show() # Only the 'Mahesh' row (null age) is dropped; nulls in the other columns remain

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
|     NULL| 34|        10| 38000|
|     NULL| 36|      NULL|  NULL|
+---------+---+----------+------+



In [41]:
# Filling the missing values
# The .na attribute provides another function called fill() that fills the missing values with a specified value
# Suppose we want to fill the missing values with the string entry 'Missing'
# Store the result in a variable called df_fillmissname
df_fillmissname = df_org.na.fill('Missing')

In [42]:
# Check the df_fillmissname DataFrame; the integer columns still contain missing values
df_fillmissname.show() # As 'Missing' is a string, it only fills the string column 'Name'

# To fill the age, Experience and Salary columns, we need to specify an integer value

+---------+----+----------+------+
|     Name| age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         1| 15000|
|  Shubham|  23|         2| 18000|
|   Mahesh|NULL|      NULL| 40000|
|  Missing|  34|        10| 38000|
|  Missing|  36|      NULL|  NULL|
+---------+----+----------+------+



In [43]:
# Fill the missing integer values with an integer, say 20
# This fills the nulls in the age, Experience and Salary columns
df_fillmissint = df_fillmissname.na.fill(20)
df_fillmissint.show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
|   Mahesh| 20|        20| 40000|
|  Missing| 34|        10| 38000|
|  Missing| 36|        20|    20|
+---------+---+----------+------+



In [46]:
# Restrict fill() to specific columns through its subset argument
# Only the nulls in 'age' and 'Experience' are replaced by 20; the other columns are left as they are
df_fillage_exp = df_org.na.fill(20, subset=['age', 'Experience'])
df_fillage_exp.show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
|   Mahesh| 20|        20| 40000|
|     NULL| 34|        10| 38000|
|     NULL| 36|        20|  NULL|
+---------+---+----------+------+



In [47]:
# Show the original df_org DataFrame again; it still contains its null values
df_org.show()

+---------+----+----------+------+
|     Name| age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         1| 15000|
|  Shubham|  23|         2| 18000|
|   Mahesh|NULL|      NULL| 40000|
|     NULL|  34|        10| 38000|
|     NULL|  36|      NULL|  NULL|
+---------+----+----------+------+



In [48]:
# Print the schema (column names and types) of the df_org DataFrame
df_org.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [49]:
# Import the Imputer transformer
# It fills the missing values of a column with the mean, median or mode of that specific column
# Here the 'age', 'Experience' and 'Salary' columns are filled with their column median
from pyspark.ml.feature import Imputer

# Columns to impute, listed once so that inputCols and outputCols cannot get out of sync
impute_cols = ['age', 'Experience', 'Salary']
imputer = Imputer(
    strategy="median",
    inputCols=impute_cols,
    # The imputed values are written to new '<column>_imputed' columns
    outputCols=["{}_imputed".format(c) for c in impute_cols]
    )

In [51]:
# Fit the imputer defined above on df_org, then transform df_org with the fitted model
# The result keeps the original columns (still containing nulls) and adds the '*_imputed' columns without missing values
df_imputed = imputer.fit(df_org).transform(df_org)

In [52]:
# Check the result of the imputer fit and transform: original columns next to their imputed versions
df_imputed.show()

+---------+----+----------+------+-----------+------------------+--------------+
|     Name| age|Experience|Salary|age_imputed|Experience_imputed|Salary_imputed|
+---------+----+----------+------+-----------+------------------+--------------+
|    Krish|  31|        10| 30000|         31|                10|         30000|
|Sudhanshu|  30|         8| 25000|         30|                 8|         25000|
|    Sunny|  29|         4| 20000|         29|                 4|         20000|
|     Paul|  24|         3| 20000|         24|                 3|         20000|
|   Harsha|  21|         1| 15000|         21|                 1|         15000|
|  Shubham|  23|         2| 18000|         23|                 2|         18000|
|   Mahesh|NULL|      NULL| 40000|         29|                 4|         40000|
|     NULL|  34|        10| 38000|         34|                10|         38000|
|     NULL|  36|      NULL|  NULL|         36|                 4|         20000|
+---------+----+----------+-