In [2]:
from pyspark.sql import SparkSession

In [3]:
# Obtain the SparkSession used by every cell below; the application is
# registered under the name 'miss'.
spark = (
    SparkSession.builder
    .appName('miss')
    .getOrCreate()
)

In [5]:
# Load null.csv: the first row supplies the column names and the column
# types are inferred from the data instead of defaulting to strings.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("null.csv")
)

In [6]:
# Print the loaded frame (show()'s defaults spelled out: up to 20 rows,
# long cell values truncated).
df.show(n=20, truncate=True)

+----+----------+------+
| age|experience|salary|
+----+----------+------+
|  25|         2| 45000|
|  32|         7| 62000|
|NULL|         5| 54000|
|  45|      NULL| 90000|
|  29|         4|  NULL|
|  38|        12| 78000|
|  41|      NULL| 85000|
|NULL|         3| 50000|
|  27|         1| 42000|
|  50|        25|  NULL|
|  36|        10| 70000|
|  31|      NULL| 61000|
|NULL|         6| 56000|
|  42|        15|  NULL|
|  28|         2| 48000|
|  34|         8| 65000|
|  39|      NULL| 80000|
|NULL|         4| 52000|
|  46|        20|  NULL|
|  30|         5| 58000|
+----+----------+------+



In [23]:
## Drop a column
# Display the frame without the 'age' column. The result is only shown,
# not assigned, so `df` itself is not rebound by this cell.
(
    df
    .drop('age')
    .show()
)

+----------+------+
|experience|salary|
+----------+------+
|         2| 45000|
|         7| 62000|
|         5| 54000|
|      NULL| 90000|
|         4|  NULL|
|        12| 78000|
|      NULL| 85000|
|         3| 50000|
|         1| 42000|
|        25|  NULL|
|        10| 70000|
|      NULL| 61000|
|         6| 56000|
|        15|  NULL|
|         2| 48000|
|         8| 65000|
|      NULL| 80000|
|         4| 52000|
|        20|  NULL|
|         5| 58000|
+----------+------+



In [9]:
## Drop rows using the options of drop
# how='any' removes a row when any of the columns listed in `subset` is
# NULL; here only 'age' is checked, so NULLs in other columns are kept.
df.dropna(how='any', subset=['age']).show()

+---+----------+------+
|age|experience|salary|
+---+----------+------+
| 25|         2| 45000|
| 32|         7| 62000|
| 45|      NULL| 90000|
| 29|         4|  NULL|
| 38|        12| 78000|
| 41|      NULL| 85000|
| 27|         1| 42000|
| 50|        25|  NULL|
| 36|        10| 70000|
| 31|      NULL| 61000|
| 42|        15|  NULL|
| 28|         2| 48000|
| 34|         8| 65000|
| 39|      NULL| 80000|
| 46|        20|  NULL|
| 30|         5| 58000|
+---+----------+------+



In [12]:
### Filling the missing values
# Replace NULLs with the constant 0, restricted to the 'salary' column;
# the other columns keep their NULLs.
df.fillna(0, subset=['salary']).show()

+----+----------+------+
| age|experience|salary|
+----+----------+------+
|  25|         2| 45000|
|  32|         7| 62000|
|NULL|         5| 54000|
|  45|      NULL| 90000|
|  29|         4|     0|
|  38|        12| 78000|
|  41|      NULL| 85000|
|NULL|         3| 50000|
|  27|         1| 42000|
|  50|        25|     0|
|  36|        10| 70000|
|  31|      NULL| 61000|
|NULL|         6| 56000|
|  42|        15|     0|
|  28|         2| 48000|
|  34|         8| 65000|
|  39|      NULL| 80000|
|NULL|         4| 52000|
|  46|        20|     0|
|  30|         5| 58000|
+----+----------+------+



In [13]:
from pyspark.ml.feature import Imputer

In [16]:
# Columns whose NULLs will be replaced. Each one gets a companion
# "<name>_imputed" output column, so the original columns stay as they are.
impute_cols = ['age', 'experience', 'salary']

# Use the median of each input column as the replacement value.
imputer = Imputer(
    strategy="median",
    inputCols=impute_cols,
    outputCols=[f"{c}_imputed" for c in impute_cols],
)

In [20]:
# Fit the imputer on df, then apply the fitted model to the same frame.
# The result carries the original columns plus the imputed output columns.
df_imputed = (
    imputer
    .fit(df)
    .transform(df)
)

In [21]:
# Print the original columns side by side with their imputed counterparts
# (show()'s defaults spelled out: up to 20 rows, long values truncated).
df_imputed.show(n=20, truncate=True)

+----+----------+------+-----------+------------------+--------------+
| age|experience|salary|age_imputed|experience_imputed|salary_imputed|
+----+----------+------+-----------+------------------+--------------+
|  25|         2| 45000|         25|                 2|         45000|
|  32|         7| 62000|         32|                 7|         62000|
|NULL|         5| 54000|         34|                 5|         54000|
|  45|      NULL| 90000|         45|                 5|         90000|
|  29|         4|  NULL|         29|                 4|         58000|
|  38|        12| 78000|         38|                12|         78000|
|  41|      NULL| 85000|         41|                 5|         85000|
|NULL|         3| 50000|         34|                 3|         50000|
|  27|         1| 42000|         27|                 1|         42000|
|  50|        25|  NULL|         50|                25|         58000|
|  36|        10| 70000|         36|                10|         70000|
|  31|

In [22]:
### Count the NULLs remaining in each imputed column

# Import Spark's aggregate under an alias. A bare `sum` import would replace
# Python's built-in sum() for every cell executed after this one.
from pyspark.sql.functions import col, sum as spark_sum

# Imputed columns to check; each yields one "<name>_nulls" count in the output.
imputed_cols = ["age_imputed", "experience_imputed", "salary_imputed"]

# isNull() gives a boolean per row; casting it to int (1/0) and summing
# counts the NULL rows of that column.
df_imputed.select(
    *[
        spark_sum(col(c).isNull().cast("int")).alias(f"{c}_nulls")
        for c in imputed_cols
    ]
).show()


+-----------------+------------------------+--------------------+
|age_imputed_nulls|experience_imputed_nulls|salary_imputed_nulls|
+-----------------+------------------------+--------------------+
|                0|                       0|                   0|
+-----------------+------------------------+--------------------+

