In [27]:
import findspark
findspark.init()

# REplace empty values with Null
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Empty Values').getOrCreate()
data = [("","CA","",None), ("Julia","","R",32),("Robert","","C",33),("","NJ","",None)]
df =spark.createDataFrame(data,["firstname","state","lastname","age"])
df.printSchema()
df.show()


root
 |-- firstname: string (nullable = true)
 |-- state: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- age: long (nullable = true)

+---------+-----+--------+----+
|firstname|state|lastname| age|
+---------+-----+--------+----+
|         |   CA|        |null|
|    Julia|     |       R|  32|
|   Robert|     |       C|  33|
|         |   NJ|        |null|
+---------+-----+--------+----+



In [28]:
#Replace empty string with None value
from pyspark.sql.functions import col,when
df.withColumn("firstname",when(col("firstname")=="", None,).otherwise(col("firstname"))).show()


+---------+-----+--------+----+
|firstname|state|lastname| age|
+---------+-----+--------+----+
|     null|   CA|        |null|
|    Julia|     |       R|  32|
|   Robert|     |       C|  33|
|     null|   NJ|        |null|
+---------+-----+--------+----+



In [29]:
#Replace empty string with None for all columns
df2=df.select([when(col(c)=="",None).otherwise(col(c)).alias(c) for c in df.columns])
df2.show()

+---------+-----+--------+----+
|firstname|state|lastname| age|
+---------+-----+--------+----+
|     null|   CA|    null|null|
|    Julia| null|       R|  32|
|   Robert| null|       C|  33|
|     null|   NJ|    null|null|
+---------+-----+--------+----+



In [30]:
#Replace empty string with None on selected columns
replaceCols=["firstname","lastname"]
df3=df.select([when(col(c)=="",None).otherwise(col(c)).alias(c) for c in replaceCols])
df3.show()

+---------+--------+
|firstname|lastname|
+---------+--------+
|     null|    null|
|    Julia|       R|
|   Robert|       C|
|     null|    null|
+---------+--------+



In [31]:

#PySpark provides DataFrame.fillna() and DataFrameNaFunctions.fill() 
#to replace NULL/None values. These two are aliases of each other and returns the same results.

df2.fillna(value=0).show()
df2.fillna(value=0,subset=["age"]).show()
df2.na.fill(value=0).show()
df2.na.fill(value=0,subset=["age"]).show()



+---------+-----+--------+---+
|firstname|state|lastname|age|
+---------+-----+--------+---+
|     null|   CA|    null|  0|
|    Julia| null|       R| 32|
|   Robert| null|       C| 33|
|     null|   NJ|    null|  0|
+---------+-----+--------+---+

+---------+-----+--------+---+
|firstname|state|lastname|age|
+---------+-----+--------+---+
|     null|   CA|    null|  0|
|    Julia| null|       R| 32|
|   Robert| null|       C| 33|
|     null|   NJ|    null|  0|
+---------+-----+--------+---+

+---------+-----+--------+---+
|firstname|state|lastname|age|
+---------+-----+--------+---+
|     null|   CA|    null|  0|
|    Julia| null|       R| 32|
|   Robert| null|       C| 33|
|     null|   NJ|    null|  0|
+---------+-----+--------+---+

+---------+-----+--------+---+
|firstname|state|lastname|age|
+---------+-----+--------+---+
|     null|   CA|    null|  0|
|    Julia| null|       R| 32|
|   Robert| null|       C| 33|
|     null|   NJ|    null|  0|
+---------+-----+--------+---+



In [32]:

df2.fillna(value="").show()
df2.na.fill(value="").show()


+---------+-----+--------+----+
|firstname|state|lastname| age|
+---------+-----+--------+----+
|         |   CA|        |null|
|    Julia|     |       R|  32|
|   Robert|     |       C|  33|
|         |   NJ|        |null|
+---------+-----+--------+----+

+---------+-----+--------+----+
|firstname|state|lastname| age|
+---------+-----+--------+----+
|         |   CA|        |null|
|    Julia|     |       R|  32|
|   Robert|     |       C|  33|
|         |   NJ|        |null|
+---------+-----+--------+----+



In [33]:

df2.fillna("na",["lastname"]).fillna("na",["firstname"]).show()

df2.fillna({"lastname": "na", "firstname": "na"}) .show()


+---------+-----+--------+----+
|firstname|state|lastname| age|
+---------+-----+--------+----+
|       na|   CA|      na|null|
|    Julia| null|       R|  32|
|   Robert| null|       C|  33|
|       na|   NJ|      na|null|
+---------+-----+--------+----+

+---------+-----+--------+----+
|firstname|state|lastname| age|
+---------+-----+--------+----+
|       na|   CA|      na|null|
|    Julia| null|       R|  32|
|   Robert| null|       C|  33|
|       na|   NJ|      na|null|
+---------+-----+--------+----+



In [34]:

df2.na.fill("unknown",["lastname"]) \
    .na.fill("",["firstname"]).show()

df2.na.fill({"lastname": "unknown", "firstname": ""}) \
    .show()

+---------+-----+--------+----+
|firstname|state|lastname| age|
+---------+-----+--------+----+
|         |   CA| unknown|null|
|    Julia| null|       R|  32|
|   Robert| null|       C|  33|
|         |   NJ| unknown|null|
+---------+-----+--------+----+

+---------+-----+--------+----+
|firstname|state|lastname| age|
+---------+-----+--------+----+
|         |   CA| unknown|null|
|    Julia| null|       R|  32|
|   Robert| null|       C|  33|
|         |   NJ| unknown|null|
+---------+-----+--------+----+



In [38]:
#fillna is used to replace null values and you have '' (empty string) in the firstname column; 
#To replace a general value you can use na.replace method:
df.na.replace('', 'Empty String', 'firstname').show()
df.na.replace({'': 'Empty String'}, 'firstname').show()
df.replace('', 'Empty String', 'firstname').show()


+------------+-----+--------+----+
|   firstname|state|lastname| age|
+------------+-----+--------+----+
|Empty String|   CA|        |null|
|       Julia|     |       R|  32|
|      Robert|     |       C|  33|
|Empty String|   NJ|        |null|
+------------+-----+--------+----+

+------------+------------+------------+----+
|   firstname|       state|    lastname| age|
+------------+------------+------------+----+
|Empty String|          CA|Empty String|null|
|       Julia|Empty String|           R|  32|
|      Robert|Empty String|           C|  33|
|Empty String|          NJ|Empty String|null|
+------------+------------+------------+----+

+------------+-----+--------+----+
|   firstname|state|lastname| age|
+------------+-----+--------+----+
|Empty String|   CA|        |null|
|       Julia|     |       R|  32|
|      Robert|     |       C|  33|
|Empty String|   NJ|        |null|
+------------+-----+--------+----+



In [40]:
# Drop null values
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Handling Null Values').getOrCreate()
data = [("","CA"), ("Julia",""),("Robert",""),("","NJ"),("Ram","TX"),("","")]
df =spark.createDataFrame(data,["name","state"])
df.show()
df2=df.select([when(col(c)=="",None).otherwise(col(c)).alias(c) for c in df.columns])
df2.show()
# drop all null columns using drop()
df2.na.drop().show()
df2.na.drop("any").show()

+------+-----+
|  name|state|
+------+-----+
|      |   CA|
| Julia|     |
|Robert|     |
|      |   NJ|
|   Ram|   TX|
|      |     |
+------+-----+

+------+-----+
|  name|state|
+------+-----+
|  null|   CA|
| Julia| null|
|Robert| null|
|  null|   NJ|
|   Ram|   TX|
|  null| null|
+------+-----+

+----+-----+
|name|state|
+----+-----+
| Ram|   TX|
+----+-----+

+----+-----+
|name|state|
+----+-----+
| Ram|   TX|
+----+-----+



In [41]:
# Drop rows when all rows are null

df2.na.drop("all").show()

+------+-----+
|  name|state|
+------+-----+
|  null|   CA|
| Julia| null|
|Robert| null|
|  null|   NJ|
|   Ram|   TX|
+------+-----+



In [43]:
#drop those rows that has null values in specific columns
df2.na.drop(subset=["name"]).show()

+------+-----+
|  name|state|
+------+-----+
| Julia| null|
|Robert| null|
|   Ram|   TX|
+------+-----+



In [44]:
# Similar to na.drop
df2.dropna().show()

+----+-----+
|name|state|
+----+-----+
| Ram|   TX|
+----+-----+



In [45]:
# To select rows that have a null value on a selected column use filter() with isNULL()
# Create DataFrame
data = [
    ("James",None,"M"),
    ("Anna","NY","F"),
    ("Julia",None,None)
  ]

columns = ["name","state","gender"]
df = spark.createDataFrame(data,columns)
df.show()

# Using isNull()
df.filter("state is NULL").show()
df.filter(df.state.isNull()).show()


+-----+-----+------+
| name|state|gender|
+-----+-----+------+
|James| null|     M|
| Anna|   NY|     F|
|Julia| null|  null|
+-----+-----+------+

+-----+-----+------+
| name|state|gender|
+-----+-----+------+
|James| null|     M|
|Julia| null|  null|
+-----+-----+------+

+-----+-----+------+
| name|state|gender|
+-----+-----+------+
|James| null|     M|
|Julia| null|  null|
+-----+-----+------+



In [46]:

from pyspark.sql.functions import col
df.filter(col("state").isNull()).show()

df.filter("state IS NULL AND gender IS NULL").show()
df.filter(df.state.isNull() & df.gender.isNull()).show()


+-----+-----+------+
| name|state|gender|
+-----+-----+------+
|James| null|     M|
|Julia| null|  null|
+-----+-----+------+

+-----+-----+------+
| name|state|gender|
+-----+-----+------+
|Julia| null|  null|
+-----+-----+------+

+-----+-----+------+
| name|state|gender|
+-----+-----+------+
|Julia| null|  null|
+-----+-----+------+



In [47]:

from pyspark.sql.functions import isnull
df.select(isnull(df.state)).show()

# Using isNotNull()
from pyspark.sql.functions import col
df.filter("state IS NOT NULL").show()
df.filter("NOT state IS NULL").show()
df.filter(df.state.isNotNull()).show()
df.filter(col("state").isNotNull()).show()

df.na.drop(subset=["state"]).show()


+---------------+
|(state IS NULL)|
+---------------+
|           true|
|          false|
|           true|
+---------------+

+----+-----+------+
|name|state|gender|
+----+-----+------+
|Anna|   NY|     F|
+----+-----+------+

+----+-----+------+
|name|state|gender|
+----+-----+------+
|Anna|   NY|     F|
+----+-----+------+

+----+-----+------+
|name|state|gender|
+----+-----+------+
|Anna|   NY|     F|
+----+-----+------+

+----+-----+------+
|name|state|gender|
+----+-----+------+
|Anna|   NY|     F|
+----+-----+------+

+----+-----+------+
|name|state|gender|
+----+-----+------+
|Anna|   NY|     F|
+----+-----+------+



In [48]:

# Using pySpark SQL
df.createOrReplaceTempView("DATA")
spark.sql("SELECT * FROM DATA where STATE IS NULL").show()
spark.sql("SELECT * FROM DATA where STATE IS NULL AND GENDER IS NULL").show()
spark.sql("SELECT * FROM DATA where STATE IS NOT NULL").show()


+-----+-----+------+
| name|state|gender|
+-----+-----+------+
|James| null|     M|
|Julia| null|  null|
+-----+-----+------+

+-----+-----+------+
| name|state|gender|
+-----+-----+------+
|Julia| null|  null|
+-----+-----+------+

+----+-----+------+
|name|state|gender|
+----+-----+------+
|Anna|   NY|     F|
+----+-----+------+

