In [0]:
# Load the employee dataset: pipe-delimited, single-quote quoted, with a
# header row; column types are inferred by Spark rather than declared.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .option("quote", "'")
    .option("sep", "|")
    .csv("/Volumes/quickstart_catalog/quickstart_schema/sandbox/dataset/employee.csv")
)
df.display()
 

# Handling Missing Records

## Dropping records where any column is null

In [0]:
# how="any" (the default) removes a row as soon as one of its columns is null.
df.dropna(how="any").display()

## Dropping records when any of the specified columns is null (OR condition)

In [0]:
# Only "id" and "name" are inspected; a null in either one drops the row.
df.dropna(how="any", subset=["id", "name"]).display()

## Dropping records based on specific columns with an AND condition

In [0]:
from pyspark.sql.functions import col

# AND condition: a row is dropped only when "name" AND "company" are both null.
# By De Morgan, that means keeping rows where at least one of them is non-null,
# hence the `|` between the two isNotNull() checks. (Combining them with `&`
# would drop a row when either column is null, i.e. the OR behaviour.)
# Equivalent shorthand: df.na.drop(how="all", subset=["name", "company"]).
df.filter(col("name").isNotNull() | col("company").isNotNull()).display()

# Filling Missing Records

### Fill all null values with a default value

In [0]:
# Two passes because a fill value only applies to columns of a matching type:
# the string sentinel targets string columns, -1 targets numeric columns.
(
    df
    .fillna("NULL_IN_SOURCE")
    .fillna(-1)
    .display()
)

### Fill null values with a different value for each column

In [0]:
# Per-column replacement values; columns not listed here keep their nulls.
COLUMNS_WITH_DEFAULT = dict(id=-1, name="Check", exp=0)
df.fillna(COLUMNS_WITH_DEFAULT).display()

### Fill null values with the average

In [0]:
from pyspark.sql.functions import avg

# A global aggregate yields exactly one row; its single cell is the mean of "exp".
avg_exp = df.agg(avg("exp")).first()[0]

# int() truncates the mean toward zero before it is used as the "exp" default.
COLUMNS_WITH_DEFAULT = dict(id=-1, name="Check", exp=int(avg_exp))
df.fillna(COLUMNS_WITH_DEFAULT).display()

### Fill categorical data with the most frequent value (mode)

In [0]:
# Mode of "gen" over non-null values only. groupBy() treats null as its own
# group, so without the filter a column dominated by nulls would yield
# mode_gender = None, which is not a valid replacement value for na.fill().
# The secondary sort on "gen" makes the result deterministic when counts tie.
mode_row = (
    df.filter(col("gen").isNotNull())
    .groupBy("gen")
    .count()
    .orderBy(col("count").desc(), col("gen").asc())
    .first()
)
# first() returns None when there is no non-null "gen" value at all.
mode_gender = mode_row["gen"] if mode_row is not None else None
print(mode_gender)

# avg_exp is the mean of "exp" computed in an earlier cell.
fill_values = {
    "id": -1,
    "name": "Anonymous",
    "exp": avg_exp,
}
if mode_gender is not None:
    # Only fill "gen" when a mode actually exists.
    fill_values["gen"] = mode_gender
df.na.fill(fill_values).display()

### Replace negative or null exp with the average exp

In [0]:
from pyspark.sql.functions import when

# Average over the valid values only. "Invalid" below means null or negative,
# so zero is a valid experience value and must be part of the average; using
# `> 0` here would silently exclude it and bias the mean upward. Null rows
# are excluded as well, because a comparison against null is not true.
avg_exp = df.filter(col("exp") >= 0).agg(avg("exp")).collect()[0][0]
print(avg_exp)

# "expSecond" carries the cleaned value; the original "exp" column is left
# untouched so the two can be compared side by side.
df.withColumn(
    "expSecond",
    when(col("exp").isNull() | (col("exp") < 0), avg_exp).otherwise(col("exp")),
).display()