In [1]:
from pyspark.sql import functions as F

In [2]:
# Small sample of weekly revenue per store. The rows include nulls (one row
# is entirely null), an exact duplicate row and a WeekInMonth value of 44.
# Only column names are given, so Spark infers the column types.
raw_df = spark.createDataFrame(
    data=[
        ('Store 1', 1, 448),
        ('Store 1', 2, None),
        ('Store 1', 3, 499),
        ('Store 1', 44, 432),
        (None, None, None),
        ('Store 2', 1, 355),
        ('Store 2', 1, 355),
        ('Store 2', None, 345),
        ('Store 2', 3, 387),
        ('Store 2', 4, 312),
    ],
    schema=['Store', 'WeekInMonth', 'Revenue'],
)

In [3]:
# print the schema Spark inferred for raw_df (only column names were supplied)
raw_df.printSchema()

root
 |-- Store: string (nullable = true)
 |-- WeekInMonth: long (nullable = true)
 |-- Revenue: long (nullable = true)



In [4]:
# total number of rows; rows containing nulls are counted too
raw_df.count()

                                                                                

10

In [5]:
# display the raw rows before any null handling
raw_df.show()

+-------+-----------+-------+
|  Store|WeekInMonth|Revenue|
+-------+-----------+-------+
|Store 1|          1|    448|
|Store 1|          2|   null|
|Store 1|          3|    499|
|Store 1|         44|    432|
|   null|       null|   null|
|Store 2|          1|    355|
|Store 2|          1|    355|
|Store 2|       null|    345|
|Store 2|          3|    387|
|Store 2|          4|    312|
+-------+-----------+-------+



In [6]:
# summary statistics per column; the `count` row counts non-null values only,
# so comparing it with raw_df.count() reveals how many nulls each column has
raw_df.describe().show()



+-------+-------+------------------+------------------+
|summary|  Store|       WeekInMonth|           Revenue|
+-------+-------+------------------+------------------+
|  count|      9|                 8|                 8|
|   mean|   null|             7.375|           391.625|
| stddev|   null|14.841423689890979|62.741960213469355|
|    min|Store 1|                 1|               312|
|    max|Store 2|                44|               499|
+-------+-------+------------------+------------------+



                                                                                

In [10]:
# count the null values in each column
raw_df.select(
    [
        F.count(
            # when() yields a non-null marker for null cells and null for
            # everything else; count() ignores nulls, so it tallies exactly
            # the null cells. An explicit literal is used as the marker
            # instead of relying on a bare column-name string being coerced
            # to a literal.
            F.when(F.col(c).isNull(), F.lit(1))
        ).alias(c)
        for c in raw_df.columns
    ]
).show()

+-----+-----------+-------+
|Store|WeekInMonth|Revenue|
+-----+-----------+-------+
|    1|          2|      2|
+-----+-----------+-------+



In [11]:
from functools import reduce

In [12]:
# show every row in which at least one column is null:
# build one "is null" test per column and OR them all together
raw_df.filter(
    reduce(
        lambda any_null, is_null: any_null | is_null,
        [F.isnull(name) for name in raw_df.columns],
    )
).show()

+-------+-----------+-------+
|  Store|WeekInMonth|Revenue|
+-------+-----------+-------+
|Store 1|          2|   null|
|   null|       null|   null|
|Store 2|       null|    345|
+-------+-----------+-------+



In [13]:
# drop only the rows in which every column is null;
# rows with some (but not all) nulls are kept
noNullRow_df = raw_df.dropna(how='all')
noNullRow_df.show()

+-------+-----------+-------+
|  Store|WeekInMonth|Revenue|
+-------+-----------+-------+
|Store 1|          1|    448|
|Store 1|          2|   null|
|Store 1|          3|    499|
|Store 1|         44|    432|
|Store 2|          1|    355|
|Store 2|          1|    355|
|Store 2|       null|    345|
|Store 2|          3|    387|
|Store 2|          4|    312|
+-------+-----------+-------+



In [16]:
# drop rows where Store or WeekInMonth is null;
# nulls in columns outside the subset (e.g. Revenue) do not drop a row
(
    raw_df
    .dropna(how='any', subset=['Store', 'WeekInMonth'])
    .show()
)

+-------+-----------+-------+
|  Store|WeekInMonth|Revenue|
+-------+-----------+-------+
|Store 1|          1|    448|
|Store 1|          2|   null|
|Store 1|          3|    499|
|Store 1|         44|    432|
|Store 2|          1|    355|
|Store 2|          1|    355|
|Store 2|          3|    387|
|Store 2|          4|    312|
+-------+-----------+-------+



In [17]:
# keep only complete rows: a null in any column removes the row
valid_df = raw_df.dropna(how='any')
valid_df.show()

+-------+-----------+-------+
|  Store|WeekInMonth|Revenue|
+-------+-----------+-------+
|Store 1|          1|    448|
|Store 1|          3|    499|
|Store 1|         44|    432|
|Store 2|          1|    355|
|Store 2|          1|    355|
|Store 2|          3|    387|
|Store 2|          4|    312|
+-------+-----------+-------+



In [18]:
# replace nulls with 0 in the Revenue column only;
# nulls in the other columns are left as they are
raw_df.fillna(0, subset=['Revenue']).show()

+-------+-----------+-------+
|  Store|WeekInMonth|Revenue|
+-------+-----------+-------+
|Store 1|          1|    448|
|Store 1|          2|      0|
|Store 1|          3|    499|
|Store 1|         44|    432|
|   null|       null|      0|
|Store 2|          1|    355|
|Store 2|          1|    355|
|Store 2|       null|    345|
|Store 2|          3|    387|
|Store 2|          4|    312|
+-------+-----------+-------+



In [19]:
# replace nulls with 0 on all numeric columns
# (an integer fill value does not apply to string columns, so Store stays null)
raw_df.fillna(0).show()

+-------+-----------+-------+
|  Store|WeekInMonth|Revenue|
+-------+-----------+-------+
|Store 1|          1|    448|
|Store 1|          2|      0|
|Store 1|          3|    499|
|Store 1|         44|    432|
|   null|          0|      0|
|Store 2|          1|    355|
|Store 2|          1|    355|
|Store 2|          0|    345|
|Store 2|          3|    387|
|Store 2|          4|    312|
+-------+-----------+-------+



In [20]:
# replace nulls column by column using a dict of fill values:
# 0 on the numeric columns and 'Assume_Store 1' on the Store (string) column
raw_df.fillna({
    'Store': 'Assume_Store 1',
    'WeekInMonth': 0,
    'Revenue': 0,
}).show()

+--------------+-----------+-------+
|         Store|WeekInMonth|Revenue|
+--------------+-----------+-------+
|       Store 1|          1|    448|
|       Store 1|          2|      0|
|       Store 1|          3|    499|
|       Store 1|         44|    432|
|Assume_Store 1|          0|      0|
|       Store 2|          1|    355|
|       Store 2|          1|    355|
|       Store 2|          0|    345|
|       Store 2|          3|    387|
|       Store 2|          4|    312|
+--------------+-----------+-------+



## Treat Nulls with Imputation Technique

In [21]:
from pyspark.ml.feature import Imputer

In [22]:
# Add RevenueD, a double-typed copy of Revenue, for the Imputer to work on
# (presumably because the Imputer needs a floating-point column -- confirm
# for the Spark version in use).
correctedType_df = raw_df.withColumn(
    'RevenueD',
    F.col('Revenue').cast('double'),
)

# Input and output column are the same, so imputed values overwrite RevenueD
# in place; the original Revenue column is left untouched.
imputer = Imputer(
    strategy='median',
    inputCols=['RevenueD'],
    outputCols=['RevenueD'],
)

In [23]:
# Fit the imputer on Store 1 rows only, then apply it to the whole frame.
# NOTE: every null RevenueD -- including rows of other stores and rows whose
# Store is null -- is therefore filled with the median learned from Store 1.
(
    imputer
    .fit(correctedType_df.filter(F.col('Store') == 'Store 1'))
    .transform(correctedType_df)
    .show()
)

+-------+-----------+-------+--------+
|  Store|WeekInMonth|Revenue|RevenueD|
+-------+-----------+-------+--------+
|Store 1|          1|    448|   448.0|
|Store 1|          2|   null|   448.0|
|Store 1|          3|    499|   499.0|
|Store 1|         44|    432|   432.0|
|   null|       null|   null|   448.0|
|Store 2|          1|    355|   355.0|
|Store 2|          1|    355|   355.0|
|Store 2|       null|    345|   345.0|
|Store 2|          3|    387|   387.0|
|Store 2|          4|    312|   312.0|
+-------+-----------+-------+--------+

