# Missing data

Missing values typically show up as `NaN` (Not a Number).

In [1]:
import org.apache.spark.sql.functions._

## DataFrame with missing values ('NaN')

In [2]:
 // Sample frame: one string label column plus three double columns,
 // with Double.NaN standing in for the missing measurements.
 val df = Seq(
   ("blue",  6.0,        Double.NaN, 6.0),
   ("green", Double.NaN, 4.0,        Double.NaN),
   ("red",   2.0,        3.0,        5.0)
 ).toDF("index", "ball", "mug", "pen")

df.show()

+-----+----+---+---+
|index|ball|mug|pen|
+-----+----+---+---+
| blue| 6.0|NaN|6.0|
|green| NaN|4.0|NaN|
|  red| 2.0|3.0|5.0|
+-----+----+---+---+



df = [index: string, ball: double ... 2 more fields]


[index: string, ball: double ... 2 more fields]

## Drop NaNs

In [3]:
// Keep only the rows with no missing value in any column ("any" is the default mode).
df.na.drop("any").show()

+-----+----+---+---+
|index|ball|mug|pen|
+-----+----+---+---+
|  red| 2.0|3.0|5.0|
+-----+----+---+---+



## Replace 'NaN' with '0'

In [4]:
// Replace NaN with 0 across the frame; it prints as 0.0 because the columns are doubles.
df.na.fill(0L).show()

+-----+----+---+---+
|index|ball|mug|pen|
+-----+----+---+---+
| blue| 6.0|0.0|6.0|
|green| 0.0|4.0|0.0|
|  red| 2.0|3.0|5.0|
+-----+----+---+---+



In [5]:
// Same replacement, restricted to "pen" and "mug"; "ball" keeps its NaN.
df.na.fill(0L, Seq("pen", "mug")).show()

+-----+----+---+---+
|index|ball|mug|pen|
+-----+----+---+---+
| blue| 6.0|0.0|6.0|
|green| NaN|4.0|0.0|
|  red| 2.0|3.0|5.0|
+-----+----+---+---+



## Replace 'NaN' with the mean

In [6]:
// Mean of "pen" over the rows where it is present; rows with a missing "pen"
// are dropped first so they do not take part in the average.
// Collected as a one-element Array[Double], read later as M(0).
val M = df.na.drop(Seq("pen")).agg(mean("pen")).as[Double].collect()

M = Array(5.5)


[5.5]

In [7]:
// Fill the missing "pen" values with the mean held in M; other columns are left as they are.
df.na.fill(M.head, Seq("pen")).show()

+-----+----+---+---+
|index|ball|mug|pen|
+-----+----+---+---+
| blue| 6.0|NaN|6.0|
|green| NaN|4.0|5.5|
|  red| 2.0|3.0|5.0|
+-----+----+---+---+



In [8]:
// Per-column replacements: "pen" gets the mean held in M, "mug" gets 0 and "ball" gets 1.
// The constants are Double literals so the map is inferred as Map[String, Double]
// rather than widening to Map[String, AnyVal] from a mix of Int and Double values.
df.na.fill(Map("pen" -> M(0), "mug" -> 0.0, "ball" -> 1.0)).show()

+-----+----+---+---+
|index|ball|mug|pen|
+-----+----+---+---+
| blue| 6.0|0.0|6.0|
|green| 1.0|4.0|5.5|
|  red| 2.0|3.0|5.0|
+-----+----+---+---+



## Replace values

In [9]:
// Value substitution across all columns ("*"): 6.0 becomes 7.0 and 2.0 becomes 3.5.
// NaN entries are not touched by this call.
df.na.replace(
  "*",
  Map(
    6.0 -> 7.0,
    2.0 -> 3.5
  )
).show()

+-----+----+---+---+
|index|ball|mug|pen|
+-----+----+---+---+
| blue| 7.0|NaN|7.0|
|green| NaN|4.0|NaN|
|  red| 3.5|3.0|5.0|
+-----+----+---+---+



In [10]:
// Same substitution, limited to "pen" and "mug"; values in "ball" stay unchanged.
df.na.replace(
  Seq("pen", "mug"),
  Map(
    6.0 -> 7.0,
    2.0 -> 3.5
  )
).show()

+-----+----+---+---+
|index|ball|mug|pen|
+-----+----+---+---+
| blue| 6.0|NaN|7.0|
|green| NaN|4.0|NaN|
|  red| 2.0|3.0|5.0|
+-----+----+---+---+

