In [1]:
import findspark
# Initialise findspark before any pyspark module is imported further down.
# NOTE(review): presumably this relies on SPARK_HOME (or a discoverable local
# Spark install) being available in the kernel's environment — confirm.
findspark.init()

In [2]:
from pyspark.context import SparkContext
from pyspark.sql import SparkSession

In [3]:
# Obtain the session through the builder so this cell is safe to re-run:
# getOrCreate() reuses an already-active session/context instead of trying to
# construct a second SparkContext (which raises ValueError).
spark = SparkSession.builder.appName('NullData').getOrCreate()

# Keep `sc` available under its original name for any cell that uses it.
sc = spark.sparkContext

In [4]:
# Read the sample file: the first row supplies the column names and the column
# types are inferred from the data (Id/Name come back as strings, Sales as double).
df_nulls = spark.read.csv(
    'data/ContainsNull.csv',
    header=True,
    inferSchema=True,
)

df_nulls.printSchema()

root
 |-- Id: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sales: double (nullable = true)



In [5]:
# Show the raw rows; both Name and Sales contain nulls, Id never does.
df_nulls.show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp2| null| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [7]:
# No arguments: discard every row that has a null in any column.
df_nulls.dropna().show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp4|Cindy|456.0|
+----+-----+-----+



In [8]:
# thresh=2: keep only the rows that have at least two non-null values.
df_nulls.dropna(thresh=2).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [9]:
# how='all': drop a row only when every column is null. Id is always populated
# in this data set, so nothing is removed.
df_nulls.dropna(how='all').show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp2| null| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [10]:
# how='any': drop a row as soon as a single column is null.
df_nulls.dropna(how='any').show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp4|Cindy|456.0|
+----+-----+-----+



In [11]:
# subset=['Sales']: only nulls in Sales cause a row to be dropped; nulls in
# other columns (e.g. Name) are ignored.
df_nulls.dropna(subset=['Sales']).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [14]:
# A string replacement is applied to string columns only, so the nulls in the
# numeric Sales column are left as they are.
df_nulls.fillna('Fill Val').show()

+----+--------+-----+
|  Id|    Name|Sales|
+----+--------+-----+
|emp1|    John| null|
|emp2|Fill Val| null|
|emp3|Fill Val|345.0|
|emp4|   Cindy|456.0|
+----+--------+-----+



In [15]:
# A numeric replacement is applied to numeric columns only, so the nulls in the
# string Name column are left as they are.
df_nulls.fillna(100).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John|100.0|
|emp2| null|100.0|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [16]:
# Restrict the replacement to the Name column explicitly.
df_nulls.fillna('No Name', subset=['Name']).show()

+----+-------+-----+
|  Id|   Name|Sales|
+----+-------+-----+
|emp1|   John| null|
|emp2|No Name| null|
|emp3|No Name|345.0|
|emp4|  Cindy|456.0|
+----+-------+-----+



In [18]:
# Per-column replacements via a dict. Each value is given in its column's own
# type (Name is a string, Sales is a double) instead of passing the string '100'
# and depending on Spark to cast it to a number.
df_nulls.na.fill({'Name': 'NA', 'Sales': 100.0}).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John|100.0|
|emp2|   NA|100.0|
|emp3|   NA|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [19]:
from pyspark.sql.functions import mean

In [26]:
# Average of the non-null Sales values, brought back to the driver as a plain
# Python number (the aggregate yields a single row with a single column).
mean_sales = df_nulls.agg(mean('Sales')).first()[0]
mean_sales

400.5

In [27]:
# Impute missing Sales with the column mean computed earlier in `mean_sales`.
df_nulls.fillna(mean_sales, subset=['Sales']).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John|400.5|
|emp2| null|400.5|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+

