In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Sample employee rows: (id, name, status, age, salary, date-as-text).
# Some names carry leading/trailing spaces.
data = [
    (1, " Bhushan  ", "ACTIVE", 25, 50000, "2025-01-10"),
    (2, "  Ashish  ", "INACTIVE", 30, 60000, "2025-02-10"),
    (3, "  Aniket  ", "ACTIVE", 35, 70000, "2025-03-10"),
    (4, "John", "ACTIVE", 40, 80000, "2025-03-10"),
    (5, "  Priya  ", "INACTIVE", 45, 90000, "2025-06-10"),
    (6, "Vedashri", "ACTIVE", 50, 100000, "2025-08-10")
]

# Explicit schema: every column is nullable, and `date` is loaded as a string.
schema = (
    StructType()
    .add("id", IntegerType(), True)
    .add("name", StringType(), True)
    .add("status", StringType(), True)
    .add("age", IntegerType(), True)
    .add("salary", IntegerType(), True)
    .add("date", StringType(), True)
)

# Build the DataFrame and parse the text dates (yyyy-MM-dd) into dates in one chain.
df = spark.createDataFrame(data, schema).withColumn(
    "date", to_date(col("date"), "yyyy-MM-dd")
)
df.show()

Concept 1: Immutability
👉 DataFrames are immutable: every transformation returns a new DataFrame instead of modifying the original.


In [0]:
def trim_name(df):
    """Return a new DataFrame with leading/trailing spaces stripped from ``name``.

    Args:
        df: DataFrame that has a ``name`` column.

    Returns:
        A new DataFrame in which ``name`` is replaced by its trimmed value;
        ``df`` itself is not modified.
    """
    cleaned_name = trim(col("name"))
    return df.withColumn("name", cleaned_name)

Concept 2: withColumn = Column Transformation

In [0]:
def add_salary_bonus(df, bonus_multiplier=1.10):
    """Return a new DataFrame with a ``salary_with_bonus`` column added.

    Args:
        df: DataFrame that has a ``salary`` column.
        bonus_multiplier: Factor applied to ``salary``. Defaults to 1.10
            (salary plus 10%), which is the value that used to be hard-coded.

    Returns:
        A new DataFrame with ``salary_with_bonus = salary * bonus_multiplier``;
        ``df`` itself is not modified.
    """
    return df.withColumn("salary_with_bonus", col("salary") * bonus_multiplier)

Concept 3: NULL Handling

In [0]:
def handle_null_salary(df):
    """Return a new DataFrame where missing or negative salaries become 0.

    Args:
        df: DataFrame that has a ``salary`` column.

    Returns:
        A new DataFrame whose ``salary`` is 0 when the original value is NULL
        or below zero, and the original value otherwise.
    """
    salary = col("salary")
    # Despite the function name, negative values are zeroed as well as NULLs.
    is_invalid = salary.isNull() | (salary < 0)
    cleaned_salary = when(is_invalid, 0).otherwise(salary)
    return df.withColumn("salary", cleaned_salary)

Concept 4: Filter vs Where

In [0]:
def filter_active_records(df):
    """Return only the rows whose ``status`` equals ``"ACTIVE"``.

    Args:
        df: DataFrame that has a ``status`` column.

    Returns:
        A new DataFrame containing the matching rows. The comparison is an
        exact, case-sensitive string match.
    """
    # DataFrame.where is an alias of DataFrame.filter; either accepts this predicate.
    is_active = col("status") == "ACTIVE"
    return df.filter(is_active)

# Apply the filter to the sample DataFrame and render the ACTIVE rows.
de = filter_active_records(df)
# NOTE(review): .display() is not part of core PySpark — presumably the
# Databricks notebook helper; confirm the target runtime provides it.
de.display()

Concept 5: Derived Columns

In [0]:
def derive_age_group(df):
    """Return a new DataFrame with an ``age_group`` label derived from ``age``.

    Buckets:
        * ``age < 30``        -> ``"YOUNG"``
        * ``30 <= age <= 40`` -> ``"MID"``
        * ``age > 40``        -> ``"Senior"``
        * ``age`` is NULL     -> NULL (no label)

    Args:
        df: DataFrame that has an ``age`` column.

    Returns:
        A new DataFrame with the extra ``age_group`` column; ``df`` itself is
        not modified.
    """
    age = col("age")
    # The last bucket is an explicit comparison rather than `.otherwise(...)`:
    # comparisons against NULL are never true, so a catch-all branch would
    # label rows with an unknown age as "Senior". With no `.otherwise`, rows
    # that match no branch get NULL.
    # NOTE(review): "Senior" is cased differently from "YOUNG"/"MID"; left
    # unchanged here because downstream code may match on the exact label.
    age_group = (
        when(age < 30, "YOUNG")
        .when(age <= 40, "MID")
        .when(age > 40, "Senior")
    )
    return df.withColumn("age_group", age_group)

# Add the derived age_group column to the sample DataFrame and render the result.
fd = derive_age_group(df)
# NOTE(review): .display() is not part of core PySpark — presumably the
# Databricks notebook helper; confirm the target runtime provides it.
fd.display()