In [16]:
from pyspark.sql import SparkSession

In [17]:
# Build a SparkSession for this notebook under the application name 'miss';
# getOrCreate() hands back an already-running session when there is one.
spark = (
    SparkSession.builder
    .appName('miss')
    .getOrCreate()
)

In [18]:
# Read the sample CSV into a DataFrame. The first line supplies the column
# names (header=True) and Spark infers each column's type (inferSchema=True).
df = spark.read.csv(
    'data/ContainsNull.csv',
    inferSchema=True,
    header=True,
)

In [19]:
# Display the loaded rows; the Name and Sales columns contain nulls.
df.show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp2| null| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [20]:
# The DataFrame's `na` accessor groups the missing-data helpers (drop, fill, ...).
# Called with no arguments, drop() removes every row that contains a null value.
df.na.drop().show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp4|Cindy|456.0|
+----+-----+-----+



In [21]:
# thresh=N keeps only the rows that have at least N non-null values.
# With thresh=2, a row with fewer than two non-null values is dropped
# (emp2 has only its Id set, so it is the one row removed).
df.na.drop(thresh=2).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [22]:
# how='all' drops a row only if all of its values are null.
# how='any' drops a row if it contains any null value.
# No row in this data is entirely null, so how='all' removes nothing here.
df.na.drop(how='all').show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp2| null| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [23]:
# subset restricts the null check to the listed columns: only rows whose
# Sales value is null are dropped, and nulls in other columns are ignored.
df.na.drop(subset=["Sales"]).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [24]:
# Print each column's inferred type; the fill() examples that follow depend on
# whether a column is a string or a numeric type.
df.printSchema()

root
 |-- Id: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sales: double (nullable = true)



In [25]:
# A string fill value replaces nulls only in string columns (Name here);
# the numeric Sales column is left untouched.
df.na.fill("FILL Value").show()

+----+----------+-----+
|  Id|      Name|Sales|
+----+----------+-----+
|emp1|      John| null|
|emp2|FILL Value| null|
|emp3|FILL Value|345.0|
|emp4|     Cindy|456.0|
+----+----------+-----+



In [26]:
# A numeric fill value replaces nulls only in numeric columns (here Sales,
# a double); string columns such as Name are left untouched.
df.na.fill(0).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John|  0.0|
|emp2| null|  0.0|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [27]:
# subset limits the fill to the listed columns, here only Name.
df.na.fill("FILL Value",subset=['Name']).show()

+----+----------+-----+
|  Id|      Name|Sales|
+----+----------+-----+
|emp1|      John| null|
|emp2|FILL Value| null|
|emp3|FILL Value|345.0|
|emp4|     Cindy|456.0|
+----+----------+-----+



In [28]:
from pyspark.sql.functions import mean

In [34]:
# Average the Sales column (nulls are not counted: (345.0 + 456.0) / 2 = 400.5)
# and collect() the aggregated result back as a list of rows.
mean_val = df.select(mean(df["Sales"])).collect()
# The result has one row with one column, so [0][0] extracts the mean itself.
mean_sales = mean_val[0][0]
mean_sales

400.5

In [36]:
 # Replace the missing Sales values with the column mean held in mean_sales.
 df.na.fill(mean_sales,subset=['Sales']).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John|400.5|
|emp2| null|400.5|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+

