### 1. Generate raw data (semi-structured)

In [1]:
import pyspark.sql.functions as F

In [14]:
# Toy weekly-revenue data containing the data-quality problems explored below
# (they appear to be deliberate): nulls, an empty row, a duplicate, an odd week.
raw_df = spark.createDataFrame(
    data=[
        ('Store 1', 1, 448),
        ('Store 1', 2, None),    # missing Revenue
        ('Store 1', 3, 499),
        ('Store 1', 44, 432),    # NOTE(review): 44 looks out of range for a week-in-month
        (None, None, None),      # row with every field missing
        ('Store 2', 1, 355),
        ('Store 2', 1, 355),     # exact duplicate of the row above
        ('Store 2', None, 345),  # missing WeekInMonth
        ('Store 2', 3, 387),
        ('Store 2', 4, 312),
    ],
    # Only column names are given; the column types are inferred from the rows.
    schema=['Store', 'WeekInMonth', 'Revenue'],
)

In [15]:
# Inspect the inferred schema: the integer columns come out as nullable longs
# and Store as a nullable string.
raw_df.printSchema()

root
 |-- Store: string (nullable = true)
 |-- WeekInMonth: long (nullable = true)
 |-- Revenue: long (nullable = true)



In [16]:
# Total number of rows, including the all-null row and the duplicate row.
raw_df.count()

10

In [17]:
# Display the raw rows so the missing values are visible before any treatment.
raw_df.show()

+-------+-----------+-------+
|  Store|WeekInMonth|Revenue|
+-------+-----------+-------+
|Store 1|          1|    448|
|Store 1|          2|   null|
|Store 1|          3|    499|
|Store 1|         44|    432|
|   null|       null|   null|
|Store 2|          1|    355|
|Store 2|          1|    355|
|Store 2|       null|    345|
|Store 2|          3|    387|
|Store 2|          4|    312|
+-------+-----------+-------+



### 2. Find `null`s

In [18]:
# Summary statistics per column. The `count` row only counts non-null values,
# so any count below the total row count signals nulls in that column.
raw_df.describe().show()

+-------+-------+------------------+------------------+
|summary|  Store|       WeekInMonth|           Revenue|
+-------+-------+------------------+------------------+
|  count|      9|                 8|                 8|
|   mean|   null|             7.375|           391.625|
| stddev|   null|14.841423689890979|62.741960213469355|
|    min|Store 1|                 1|               312|
|    max|Store 2|                44|               499|
+-------+-------+------------------+------------------+



In [19]:
# Number of nulls per column. `when` produces a non-null marker (the column
# name as a literal) only for null cells and null otherwise; `count` tallies
# non-null values, so each result is that column's null count.
null_count_exprs = [
    F.count(F.when(F.col(name).isNull(), F.lit(name))).alias(name)
    for name in raw_df.columns
]
raw_df.select(null_count_exprs).show()

+-----+-----------+-------+
|Store|WeekInMonth|Revenue|
+-----+-----------+-------+
|    1|          2|      2|
+-----+-----------+-------+



In [20]:
# Row-by-row view of where the nulls are: a cell shows its column name (as a
# string literal) when the original value is null, and null everywhere else.
raw_df.select(
    *(
        F.when(F.col(name).isNull(), F.lit(name)).alias(name)
        for name in raw_df.columns
    )
).show()

+-----+-----------+-------+
|Store|WeekInMonth|Revenue|
+-----+-----------+-------+
| null|       null|   null|
| null|       null|Revenue|
| null|       null|   null|
| null|       null|   null|
|Store|WeekInMonth|Revenue|
| null|       null|   null|
| null|       null|   null|
| null|WeekInMonth|   null|
| null|       null|   null|
| null|       null|   null|
+-----+-----------+-------+



In [21]:
from functools import reduce

# Keep only the rows in which at least one column is null: build one
# `isNull` flag per column, then OR them together into a single predicate.
null_flags = [F.col(name).isNull() for name in raw_df.columns]
any_column_is_null = reduce(lambda combined, flag: combined | flag, null_flags)
raw_df.filter(any_column_is_null).show()

+-------+-----------+-------+
|  Store|WeekInMonth|Revenue|
+-------+-----------+-------+
|Store 1|          2|   null|
|   null|       null|   null|
|Store 2|       null|    345|
+-------+-----------+-------+



### 3. Treat `null`s by dropping the rows that contain them

In [22]:
# how='all' drops a row only when every one of its columns is null,
# so just the completely empty row is removed.
no_null_row_df = raw_df.dropna(how='all')
no_null_row_df.show()

+-------+-----------+-------+
|  Store|WeekInMonth|Revenue|
+-------+-----------+-------+
|Store 1|          1|    448|
|Store 1|          2|   null|
|Store 1|          3|    499|
|Store 1|         44|    432|
|Store 2|          1|    355|
|Store 2|          1|    355|
|Store 2|       null|    345|
|Store 2|          3|    387|
|Store 2|          4|    312|
+-------+-----------+-------+



In [23]:
# Drop rows that have a null in Store or WeekInMonth; nulls in Revenue
# are tolerated because Revenue is not part of the subset.
raw_df.dropna(
    how='any',
    subset=['Store', 'WeekInMonth'],
).show()

+-------+-----------+-------+
|  Store|WeekInMonth|Revenue|
+-------+-----------+-------+
|Store 1|          1|    448|
|Store 1|          2|   null|
|Store 1|          3|    499|
|Store 1|         44|    432|
|Store 2|          1|    355|
|Store 2|          1|    355|
|Store 2|          3|    387|
|Store 2|          4|    312|
+-------+-----------+-------+



In [24]:
# Strictest variant: drop every row that has a null in any column.
raw_df.dropna(how='any').show()

+-------+-----------+-------+
|  Store|WeekInMonth|Revenue|
+-------+-----------+-------+
|Store 1|          1|    448|
|Store 1|          3|    499|
|Store 1|         44|    432|
|Store 2|          1|    355|
|Store 2|          1|    355|
|Store 2|          3|    387|
|Store 2|          4|    312|
+-------+-----------+-------+



### 4. Treat `null`s with replacement

In [25]:
# Replace nulls with 0 in the Revenue column only; other columns keep their nulls.
raw_df.fillna(0, subset=['Revenue']).show()

+-------+-----------+-------+
|  Store|WeekInMonth|Revenue|
+-------+-----------+-------+
|Store 1|          1|    448|
|Store 1|          2|      0|
|Store 1|          3|    499|
|Store 1|         44|    432|
|   null|       null|      0|
|Store 2|          1|    355|
|Store 2|          1|    355|
|Store 2|       null|    345|
|Store 2|          3|    387|
|Store 2|          4|    312|
+-------+-----------+-------+



In [26]:
# With no subset, the numeric fill value 0 is applied to the numeric columns
# (WeekInMonth, Revenue); the string column Store keeps its null.
raw_df.fillna(value=0).show()

+-------+-----------+-------+
|  Store|WeekInMonth|Revenue|
+-------+-----------+-------+
|Store 1|          1|    448|
|Store 1|          2|      0|
|Store 1|          3|    499|
|Store 1|         44|    432|
|   null|          0|      0|
|Store 2|          1|    355|
|Store 2|          1|    355|
|Store 2|          0|    345|
|Store 2|          3|    387|
|Store 2|          4|    312|
+-------+-----------+-------+



In [27]:
# Per-column replacement values. Each value is given in the type of its
# column (string for Store, int for WeekInMonth and Revenue) so the fill
# does not depend on an implicit string-to-long cast.
raw_df.fillna(
    {
        'Store': 'Assume_Store 1',
        'WeekInMonth': 2,
        'Revenue': 3,
    }
).show()

+--------------+-----------+-------+
|         Store|WeekInMonth|Revenue|
+--------------+-----------+-------+
|       Store 1|          1|    448|
|       Store 1|          2|      3|
|       Store 1|          3|    499|
|       Store 1|         44|    432|
|Assume_Store 1|          2|      3|
|       Store 2|          1|    355|
|       Store 2|          1|    355|
|       Store 2|          2|    345|
|       Store 2|          3|    387|
|       Store 2|          4|    312|
+--------------+-----------+-------+



### 5. Treat `null`s by imputation

In [28]:
from pyspark.ml.feature import Imputer

In [29]:
# Add a double-typed copy of Revenue for the imputer to work on.
# NOTE(review): presumably needed because Imputer expects floating-point
# input columns on the Spark version in use — confirm.
corrected_type_df = raw_df.withColumn(
    'RevenueD',
    raw_df['Revenue'].cast('double'),
)

In [31]:
# Check the new RevenueD column: same values as Revenue as doubles, nulls preserved.
corrected_type_df.show()

+-------+-----------+-------+--------+
|  Store|WeekInMonth|Revenue|RevenueD|
+-------+-----------+-------+--------+
|Store 1|          1|    448|   448.0|
|Store 1|          2|   null|    null|
|Store 1|          3|    499|   499.0|
|Store 1|         44|    432|   432.0|
|   null|       null|   null|    null|
|Store 2|          1|    355|   355.0|
|Store 2|          1|    355|   355.0|
|Store 2|       null|    345|   345.0|
|Store 2|          3|    387|   387.0|
|Store 2|          4|    312|   312.0|
+-------+-----------+-------+--------+



In [30]:
# Median imputer for RevenueD. Input and output column are the same, so the
# imputed values overwrite RevenueD rather than landing in a new column.
imputer = Imputer(
    strategy='median',
    inputCols=['RevenueD'],
    outputCols=['RevenueD'],
)

In [32]:
# Fit the imputer on Store 1 rows only, then apply the fitted model to the
# whole DataFrame.
# NOTE(review): the Store 1 median is therefore used for every null RevenueD,
# including the row whose Store is null — confirm this is intended.
store_1_rows = corrected_type_df.where(corrected_type_df['Store'] == 'Store 1')
imputer_model = imputer.fit(store_1_rows)
imputer_model.transform(corrected_type_df).show()

+-------+-----------+-------+--------+
|  Store|WeekInMonth|Revenue|RevenueD|
+-------+-----------+-------+--------+
|Store 1|          1|    448|   448.0|
|Store 1|          2|   null|   448.0|
|Store 1|          3|    499|   499.0|
|Store 1|         44|    432|   432.0|
|   null|       null|   null|   448.0|
|Store 2|          1|    355|   355.0|
|Store 2|          1|    355|   355.0|
|Store 2|       null|    345|   345.0|
|Store 2|          3|    387|   387.0|
|Store 2|          4|    312|   312.0|
+-------+-----------+-------+--------+



### END