# Handling with missing values

Let us read the csv file, with missing values.

In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Missing_values').getOrCreate()

In [2]:
# Reading the file with missing values
df_pyspark = spark.read.csv('data/names_and_ages_missing_val.csv', header = True , inferSchema = True , sep = ';')
df_pyspark.show()

+-------+----+----------+-----------+
|   Name| Age|Experience|Salary(USD)|
+-------+----+----------+-----------+
|  Alice|  25|         2|       1882|
|    Bob|  30|         4|       6503|
|Charlie|  22|         7|       6914|
|  David|  35|        12|       8108|
|   Emma|  28|         9|       5268|
|  Frank|NULL|      NULL|       NULL|
|  Grace|  23|         3|       6035|
|  Henry|  32|        14|       7443|
|  Irene|NULL|        25|       9129|
|   Jack|  33|      NULL|       NULL|
|  Karen|  26|         3|       6901|
|    Leo|  29|         1|       5013|
|   NULL|  31|         0|       2893|
| Nathan|  37|      NULL|       5647|
|   NULL|  24|         3|       7004|
|   Paul|  38|         1|       7891|
|  Quinn|  21|        14|       8890|
| Rachel|  34|         3|       1872|
|    Sam|NULL|         6|       4916|
| Taylor|  36|      NULL|       5554|
+-------+----+----------+-----------+
only showing top 20 rows



## Using the method .drop

In [3]:
# Drop the rows with NULL values
df_pyspark.na.drop().show()

+--------+---+----------+-----------+
|    Name|Age|Experience|Salary(USD)|
+--------+---+----------+-----------+
|   Alice| 25|         2|       1882|
|     Bob| 30|         4|       6503|
| Charlie| 22|         7|       6914|
|   David| 35|        12|       8108|
|    Emma| 28|         9|       5268|
|   Grace| 23|         3|       6035|
|   Henry| 32|        14|       7443|
|   Karen| 26|         3|       6901|
|     Leo| 29|         1|       5013|
|    Paul| 38|         1|       7891|
|   Quinn| 21|        14|       8890|
|  Rachel| 34|         3|       1872|
|Victoria| 20|         6|       6827|
|  Xander| 28|         2|       7977|
| Yasmine| 31|         8|       3342|
| Zachary| 29|         2|       5914|
|     Ava| 26|         4|       2593|
|Benjamin| 32|         3|       9328|
|   Chloe| 23|         6|       2089|
|  Daniel| 35|        21|       6377|
+--------+---+----------+-----------+
only showing top 20 rows



In [4]:
# To drop all the columns which has NULL values in all the line,
# but this command doesn't drop the rows with one, two or three null entries
# in the same line
df_pyspark.na.drop(how = 'all').show()

+-------+----+----------+-----------+
|   Name| Age|Experience|Salary(USD)|
+-------+----+----------+-----------+
|  Alice|  25|         2|       1882|
|    Bob|  30|         4|       6503|
|Charlie|  22|         7|       6914|
|  David|  35|        12|       8108|
|   Emma|  28|         9|       5268|
|  Frank|NULL|      NULL|       NULL|
|  Grace|  23|         3|       6035|
|  Henry|  32|        14|       7443|
|  Irene|NULL|        25|       9129|
|   Jack|  33|      NULL|       NULL|
|  Karen|  26|         3|       6901|
|    Leo|  29|         1|       5013|
|   NULL|  31|         0|       2893|
| Nathan|  37|      NULL|       5647|
|   NULL|  24|         3|       7004|
|   Paul|  38|         1|       7891|
|  Quinn|  21|        14|       8890|
| Rachel|  34|         3|       1872|
|    Sam|NULL|         6|       4916|
| Taylor|  36|      NULL|       5554|
+-------+----+----------+-----------+
only showing top 20 rows



In [5]:
# To drop all the columns which contains at least one NULL value in the line
df_pyspark.na.drop(how = 'any').show()

+--------+---+----------+-----------+
|    Name|Age|Experience|Salary(USD)|
+--------+---+----------+-----------+
|   Alice| 25|         2|       1882|
|     Bob| 30|         4|       6503|
| Charlie| 22|         7|       6914|
|   David| 35|        12|       8108|
|    Emma| 28|         9|       5268|
|   Grace| 23|         3|       6035|
|   Henry| 32|        14|       7443|
|   Karen| 26|         3|       6901|
|     Leo| 29|         1|       5013|
|    Paul| 38|         1|       7891|
|   Quinn| 21|        14|       8890|
|  Rachel| 34|         3|       1872|
|Victoria| 20|         6|       6827|
|  Xander| 28|         2|       7977|
| Yasmine| 31|         8|       3342|
| Zachary| 29|         2|       5914|
|     Ava| 26|         4|       2593|
|Benjamin| 32|         3|       9328|
|   Chloe| 23|         6|       2089|
|  Daniel| 35|        21|       6377|
+--------+---+----------+-----------+
only showing top 20 rows



In [6]:
# threshold of only ONE null value
df_pyspark.na.drop(how = 'any' , thresh = 1).show()

+-------+----+----------+-----------+
|   Name| Age|Experience|Salary(USD)|
+-------+----+----------+-----------+
|  Alice|  25|         2|       1882|
|    Bob|  30|         4|       6503|
|Charlie|  22|         7|       6914|
|  David|  35|        12|       8108|
|   Emma|  28|         9|       5268|
|  Frank|NULL|      NULL|       NULL|
|  Grace|  23|         3|       6035|
|  Henry|  32|        14|       7443|
|  Irene|NULL|        25|       9129|
|   Jack|  33|      NULL|       NULL|
|  Karen|  26|         3|       6901|
|    Leo|  29|         1|       5013|
|   NULL|  31|         0|       2893|
| Nathan|  37|      NULL|       5647|
|   NULL|  24|         3|       7004|
|   Paul|  38|         1|       7891|
|  Quinn|  21|        14|       8890|
| Rachel|  34|         3|       1872|
|    Sam|NULL|         6|       4916|
| Taylor|  36|      NULL|       5554|
+-------+----+----------+-----------+
only showing top 20 rows



In [7]:
# threshold of only TWO null value
df_pyspark.na.drop(how = 'any' , thresh = 2).show()

+-------+----+----------+-----------+
|   Name| Age|Experience|Salary(USD)|
+-------+----+----------+-----------+
|  Alice|  25|         2|       1882|
|    Bob|  30|         4|       6503|
|Charlie|  22|         7|       6914|
|  David|  35|        12|       8108|
|   Emma|  28|         9|       5268|
|  Grace|  23|         3|       6035|
|  Henry|  32|        14|       7443|
|  Irene|NULL|        25|       9129|
|   Jack|  33|      NULL|       NULL|
|  Karen|  26|         3|       6901|
|    Leo|  29|         1|       5013|
|   NULL|  31|         0|       2893|
| Nathan|  37|      NULL|       5647|
|   NULL|  24|         3|       7004|
|   Paul|  38|         1|       7891|
|  Quinn|  21|        14|       8890|
| Rachel|  34|         3|       1872|
|    Sam|NULL|         6|       4916|
| Taylor|  36|      NULL|       5554|
|   NULL|  42|         7|       2574|
+-------+----+----------+-----------+
only showing top 20 rows



In [8]:
# threshold of only THREE null value
df_pyspark.na.drop(how = 'any' , thresh = 3).show()

+--------+----+----------+-----------+
|    Name| Age|Experience|Salary(USD)|
+--------+----+----------+-----------+
|   Alice|  25|         2|       1882|
|     Bob|  30|         4|       6503|
| Charlie|  22|         7|       6914|
|   David|  35|        12|       8108|
|    Emma|  28|         9|       5268|
|   Grace|  23|         3|       6035|
|   Henry|  32|        14|       7443|
|   Irene|NULL|        25|       9129|
|   Karen|  26|         3|       6901|
|     Leo|  29|         1|       5013|
|    NULL|  31|         0|       2893|
|  Nathan|  37|      NULL|       5647|
|    NULL|  24|         3|       7004|
|    Paul|  38|         1|       7891|
|   Quinn|  21|        14|       8890|
|  Rachel|  34|         3|       1872|
|     Sam|NULL|         6|       4916|
|  Taylor|  36|      NULL|       5554|
|    NULL|  42|         7|       2574|
|Victoria|  20|         6|       6827|
+--------+----+----------+-----------+
only showing top 20 rows



In [10]:
# To drop according to a subset
df_pyspark.na.drop(how = 'any', subset = ['Experience']).show()

+--------+----+----------+-----------+
|    Name| Age|Experience|Salary(USD)|
+--------+----+----------+-----------+
|   Alice|  25|         2|       1882|
|     Bob|  30|         4|       6503|
| Charlie|  22|         7|       6914|
|   David|  35|        12|       8108|
|    Emma|  28|         9|       5268|
|   Grace|  23|         3|       6035|
|   Henry|  32|        14|       7443|
|   Irene|NULL|        25|       9129|
|   Karen|  26|         3|       6901|
|     Leo|  29|         1|       5013|
|    NULL|  31|         0|       2893|
|    NULL|  24|         3|       7004|
|    Paul|  38|         1|       7891|
|   Quinn|  21|        14|       8890|
|  Rachel|  34|         3|       1872|
|     Sam|NULL|         6|       4916|
|    NULL|  42|         7|       2574|
|Victoria|  20|         6|       6827|
|  Walter|NULL|         5|       4873|
|  Xander|  28|         2|       7977|
+--------+----+----------+-----------+
only showing top 20 rows



## Filling the missing values

In [12]:
df_pyspark.na.fill('Missing Values', ['Experience' , 'Age']).show()

+-------+----+----------+-----------+
|   Name| Age|Experience|Salary(USD)|
+-------+----+----------+-----------+
|  Alice|  25|         2|       1882|
|    Bob|  30|         4|       6503|
|Charlie|  22|         7|       6914|
|  David|  35|        12|       8108|
|   Emma|  28|         9|       5268|
|  Frank|NULL|      NULL|       NULL|
|  Grace|  23|         3|       6035|
|  Henry|  32|        14|       7443|
|  Irene|NULL|        25|       9129|
|   Jack|  33|      NULL|       NULL|
|  Karen|  26|         3|       6901|
|    Leo|  29|         1|       5013|
|   NULL|  31|         0|       2893|
| Nathan|  37|      NULL|       5647|
|   NULL|  24|         3|       7004|
|   Paul|  38|         1|       7891|
|  Quinn|  21|        14|       8890|
| Rachel|  34|         3|       1872|
|    Sam|NULL|         6|       4916|
| Taylor|  36|      NULL|       5554|
+-------+----+----------+-----------+
only showing top 20 rows



In [None]:
#42:45