In [1]:
from pyspark.sql import SparkSession
import os
# Point PySpark at the local JDK before the Spark session is created.
# Raw string: the Windows path contains backslash sequences (\P, \J, \j) that
# are not valid escapes and raise an invalid-escape warning in a normal string.
# NOTE(review): machine-specific absolute path -- adjust to the local JDK install.
os.environ["JAVA_HOME"] = r"C:\Program Files\Java\jdk-22"

In [2]:
# Build the Spark session used by every cell below; getOrCreate() hands back
# an already-running session when one exists.
spark = (
    SparkSession.builder
    .appName('Practise')
    .getOrCreate()
)

In [29]:
# Load the sample CSV: the first row supplies the column names and the column
# types are inferred from the data instead of defaulting to strings.
df_spark = spark.read.csv(
    'nulls_handling.csv',
    header=True,
    inferSchema=True,
)

In [15]:
# Print the loaded DataFrame as a text table; NULL marks the missing values
# that the following cells drop or fill.
df_spark.show()

+--------+----+----------+
|    Name| Age|Experience|
+--------+----+----------+
|    John|  25|         7|
|   Emily|  30|      NULL|
| Michael|  28|         9|
|   Sarah|NULL|      NULL|
|   David|  40|         8|
| Jessica|NULL|         5|
|  Daniel|  33|      NULL|
|  Rachel|  29|         6|
| Matthew|NULL|         4|
|Jennifer|  31|        10|
+--------+----+----------+



In [6]:
# Drop a column and display the result (df_spark itself is not reassigned).
# The column is spelled with its exact case ('Name'): the lowercase spelling
# only matches while spark.sql.caseSensitive is left at its default (false);
# with case sensitivity on, drop() treats an unknown name as a silent no-op.
df_spark.drop('Name').show()

+----+----------+
| Age|Experience|
+----+----------+
|  25|         7|
|  30|      NULL|
|  28|         9|
|NULL|         2|
|  40|         8|
|NULL|         5|
|  33|      NULL|
|  29|         6|
|NULL|         4|
|  31|        10|
+----+----------+



In [8]:
# Remove every row that contains at least one null value
# (how="any" is the default, spelled out here for clarity).
df_spark.na.drop(how="any").show()

+--------+---+----------+
|    Name|Age|Experience|
+--------+---+----------+
|    John| 25|         7|
| Michael| 28|         9|
|   David| 40|         8|
|  Rachel| 29|         6|
|Jennifer| 31|        10|
+--------+---+----------+



In [9]:
# how="all": a row is removed only when every one of its columns is null,
# so rows that still carry at least one value are kept.
(
    df_spark.na
    .drop(how="all")
    .show()
)

+--------+----+----------+
|    Name| Age|Experience|
+--------+----+----------+
|    John|  25|         7|
|   Emily|  30|      NULL|
| Michael|  28|         9|
|   Sarah|NULL|         2|
|   David|  40|         8|
| Jessica|NULL|         5|
|  Daniel|  33|      NULL|
|  Rachel|  29|         6|
| Matthew|NULL|         4|
|Jennifer|  31|        10|
+--------+----+----------+



In [20]:
# thresh=2: keep only the rows that have at least two non-null values.
# `how` is intentionally omitted -- when thresh is given it overrides `how`,
# so passing how="all" alongside it had no effect and was misleading.
df_spark.na.drop(thresh=2).show()

+--------+----+----------+
|    Name| Age|Experience|
+--------+----+----------+
|    John|  25|         7|
|   Emily|  30|      NULL|
| Michael|  28|         9|
|   David|  40|         8|
| Jessica|NULL|         5|
|  Daniel|  33|      NULL|
|  Rachel|  29|         6|
| Matthew|NULL|         4|
|Jennifer|  31|        10|
+--------+----+----------+



In [22]:
# subset: only the listed columns are inspected for nulls, so here the rows
# whose Age is null are removed regardless of the other columns.
df_spark.na.drop(
    how="all",
    subset=['Age'],
).show()

+--------+---+----------+
|    Name|Age|Experience|
+--------+---+----------+
|    John| 25|         7|
|   Emily| 30|      NULL|
| Michael| 28|         9|
|   David| 40|         8|
|  Daniel| 33|      NULL|
|  Rachel| 29|         6|
|Jennifer| 31|        10|
+--------+---+----------+



In [25]:
# Replace missing values with a constant: the nulls in Age and Experience
# are shown as 1 in the output below.
df_spark.na.fill(value=1).show()

+--------+---+----------+
|    Name|Age|Experience|
+--------+---+----------+
|    John| 25|         7|
|   Emily| 30|         1|
| Michael| 28|         9|
|   Sarah|  1|         1|
|   David| 40|         8|
| Jessica|  1|         5|
|  Daniel| 33|         1|
|  Rachel| 29|         6|
| Matthew|  1|         4|
|Jennifer| 31|        10|
+--------+---+----------+



In [26]:
# Restrict the fill to the Experience column; nulls in Age are left as they are.
df_spark.na.fill(value=1, subset=['Experience']).show()

+--------+----+----------+
|    Name| Age|Experience|
+--------+----+----------+
|    John|  25|         7|
|   Emily|  30|         1|
| Michael|  28|         9|
|   Sarah|NULL|         1|
|   David|  40|         8|
| Jessica|NULL|         5|
|  Daniel|  33|         1|
|  Rachel|  29|         6|
| Matthew|NULL|         4|
|Jennifer|  31|        10|
+--------+----+----------+



In [27]:
# Re-display df_spark: the drop/fill calls above were only shown, never
# assigned back, so the original nulls are still present.
df_spark.show()

+--------+----+----------+
|    Name| Age|Experience|
+--------+----+----------+
|    John|  25|         7|
|   Emily|  30|      NULL|
| Michael|  28|         9|
|   Sarah|NULL|      NULL|
|   David|  40|         8|
| Jessica|NULL|         5|
|  Daniel|  33|      NULL|
|  Rachel|  29|         6|
| Matthew|NULL|         4|
|Jennifer|  31|        10|
+--------+----+----------+



In [32]:
# List the column names of df_spark.
# NOTE(review): the recorded output includes 'Salary', which the earlier
# show() outputs do not contain -- presumably the CSV was changed and re-read
# after those cells ran; confirm with Restart Kernel -> Run All.
df_spark.columns

['Name', 'Age', 'Experience', 'Salary']

In [28]:
#replace nulls with mean 
from pyspark.ml.feature import Imputer


In [None]:
# Columns to impute, and the name of the new column that receives each result.
input_cols = ['Age', 'Experience', 'Salary']
output_cols = [f"{name}_imputed" for name in input_cols]

# One Imputer stage per column; each replaces nulls with that column's mean.
imputers = [
    Imputer(inputCol=source, outputCol=target, strategy="mean")
    for source, target in zip(input_cols, output_cols)
]

In [39]:
from pyspark.ml import Pipeline

# Chain the per-column imputers into a single pipeline, fit it on df_spark,
# and use the fitted model to produce df_imputed from the same frame.
pipeline = Pipeline(stages=imputers)
model = pipeline.fit(df_spark)
df_imputed = model.transform(df_spark)


In [41]:
# Show the original columns side by side with their *_imputed counterparts.
df_imputed.show()

+--------+----+----------+------+-----------+------------------+--------------+
|    Name| Age|Experience|Salary|Age_imputed|Experience_imputed|Salary_imputed|
+--------+----+----------+------+-----------+------------------+--------------+
|    John|  25|         7| 20000|         25|                 7|         20000|
|   Emily|  30|      NULL| 15000|         30|                 7|         15000|
| Michael|  28|         9|  8000|         28|                 9|          8000|
|   Sarah|NULL|      NULL| 14000|         30|                 7|         14000|
|   David|  40|         8| 25000|         40|                 8|         25000|
| Jessica|NULL|         5|  NULL|         30|                 5|         14625|
|  Daniel|  33|      NULL|  5000|         33|                 7|          5000|
|  Rachel|  29|         6| 18000|         29|                 6|         18000|
| Matthew|NULL|         4|  NULL|         30|                 4|         14625|
|Jennifer|  31|        10| 12000|       

In [42]:
# Median imputation: same input columns and output names as the mean-based run.
input_cols = ['Age', 'Experience', 'Salary']
output_cols = [f"{name}_imputed" for name in input_cols]

# One Imputer stage per column; each replaces nulls with that column's median.
imputers = [
    Imputer(inputCol=source, outputCol=target, strategy="median")
    for source, target in zip(input_cols, output_cols)
]

# Fit the stages on df_spark and display the result. This rebinds imputers,
# pipeline, model and df_imputed, replacing the values from the mean-based run.
pipeline = Pipeline(stages=imputers)
model = pipeline.fit(df_spark)
df_imputed = model.transform(df_spark)
df_imputed.show()

+--------+----+----------+------+-----------+------------------+--------------+
|    Name| Age|Experience|Salary|Age_imputed|Experience_imputed|Salary_imputed|
+--------+----+----------+------+-----------+------------------+--------------+
|    John|  25|         7| 20000|         25|                 7|         20000|
|   Emily|  30|      NULL| 15000|         30|                 7|         15000|
| Michael|  28|         9|  8000|         28|                 9|          8000|
|   Sarah|NULL|      NULL| 14000|         30|                 7|         14000|
|   David|  40|         8| 25000|         40|                 8|         25000|
| Jessica|NULL|         5|  NULL|         30|                 5|         14000|
|  Daniel|  33|      NULL|  5000|         33|                 7|          5000|
|  Rachel|  29|         6| 18000|         29|                 6|         18000|
| Matthew|NULL|         4|  NULL|         30|                 4|         14000|
|Jennifer|  31|        10| 12000|       