In [0]:
from pyspark.sql import Row

data = [
    Row(id=1, name="John",   age=28,  city="New York"),
    Row(id=2, name=None,     age=35,  city="Chicago"),
    Row(id=3, name="Alice",  age=None, city=None),
    Row(id=4, name=None,     age=None, city="Houston"),
    Row(id=5, name="Bob",    age=30,  city=None)
]

df = spark.createDataFrame(data)
df.show()
df.printSchema()


+---+-----+----+--------+
| id| name| age|    city|
+---+-----+----+--------+
|  1| John|  28|New York|
|  2| NULL|  35| Chicago|
|  3|Alice|NULL|    NULL|
|  4| NULL|NULL| Houston|
|  5|  Bob|  30|    NULL|
+---+-----+----+--------+

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)
 |-- city: string (nullable = true)



In [0]:
df.filter(df.name.isNull()).show()


+---+----+----+-------+
| id|name| age|   city|
+---+----+----+-------+
|  2|NULL|  35|Chicago|
|  4|NULL|NULL|Houston|
+---+----+----+-------+



In [0]:
df.filter(df.age.isNotNull()).show()


+---+----+---+--------+
| id|name|age|    city|
+---+----+---+--------+
|  1|John| 28|New York|
|  2|NULL| 35| Chicago|
|  5| Bob| 30|    NULL|
+---+----+---+--------+



In [0]:
df_fill = df.fillna({
    "name": "Unknown",
    "age": 0,
    "city": "Not Available"
})
df_fill.show()


+---+-------+---+-------------+
| id|   name|age|         city|
+---+-------+---+-------------+
|  1|   John| 28|     New York|
|  2|Unknown| 35|      Chicago|
|  3|  Alice|  0|Not Available|
|  4|Unknown|  0|      Houston|
|  5|    Bob| 30|Not Available|
+---+-------+---+-------------+



In [0]:
df.fillna(0).show()


+---+-----+---+--------+
| id| name|age|    city|
+---+-----+---+--------+
|  1| John| 28|New York|
|  2| NULL| 35| Chicago|
|  3|Alice|  0|    NULL|
|  4| NULL|  0| Houston|
|  5|  Bob| 30|    NULL|
+---+-----+---+--------+



In [0]:
df.fillna("Missing", subset=["name", "city"]).show()


+---+-------+----+--------+
| id|   name| age|    city|
+---+-------+----+--------+
|  1|   John|  28|New York|
|  2|Missing|  35| Chicago|
|  3|  Alice|NULL| Missing|
|  4|Missing|NULL| Houston|
|  5|    Bob|  30| Missing|
+---+-------+----+--------+



In [0]:
df.dropna().show()


+---+----+---+--------+
| id|name|age|    city|
+---+----+---+--------+
|  1|John| 28|New York|
+---+----+---+--------+



In [0]:
df.dropna(how="all").show()


+---+-----+----+--------+
| id| name| age|    city|
+---+-----+----+--------+
|  1| John|  28|New York|
|  2| NULL|  35| Chicago|
|  3|Alice|NULL|    NULL|
|  4| NULL|NULL| Houston|
|  5|  Bob|  30|    NULL|
+---+-----+----+--------+



In [0]:
df.dropna(thresh=2).show()


+---+-----+----+--------+
| id| name| age|    city|
+---+-----+----+--------+
|  1| John|  28|New York|
|  2| NULL|  35| Chicago|
|  3|Alice|NULL|    NULL|
|  4| NULL|NULL| Houston|
|  5|  Bob|  30|    NULL|
+---+-----+----+--------+



In [0]:
df_replace = df.na.replace("John", "Jonathan", subset=["name"])
df_replace.show()


+---+--------+----+--------+
| id|    name| age|    city|
+---+--------+----+--------+
|  1|Jonathan|  28|New York|
|  2|    NULL|  35| Chicago|
|  3|   Alice|NULL|    NULL|
|  4|    NULL|NULL| Houston|
|  5|     Bob|  30|    NULL|
+---+--------+----+--------+



In [0]:
df_replace2 = df.na.replace(
    ["Chicago", "Houston"], 
    ["CHI", "HOU"],
    subset=["city"]
)
df_replace2.show()


+---+-----+----+--------+
| id| name| age|    city|
+---+-----+----+--------+
|  1| John|  28|New York|
|  2| NULL|  35|     CHI|
|  3|Alice|NULL|    NULL|
|  4| NULL|NULL|     HOU|
|  5|  Bob|  30|    NULL|
+---+-----+----+--------+



In [0]:
df = (spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("/databricks-datasets/airlines/part-00000"))

df.show(5)
df.printSchema()


+----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+------------+------------+
|Year|Month|DayofMonth|DayOfWeek|DepTime|CRSDepTime|ArrTime|CRSArrTime|UniqueCarrier|FlightNum|TailNum|ActualElapsedTime|CRSElapsedTime|AirTime|ArrDelay|DepDelay|Origin|Dest|Distance|TaxiIn|TaxiOut|Cancelled|CancellationCode|Diverted|CarrierDelay|WeatherDelay|NASDelay|SecurityDelay|LateAircraftDelay|IsArrDelayed|IsDepDelayed|
+----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+------------+------------+
|1987|   10|    

In [0]:
from pyspark.sql.functions import *


In [0]:
df2 = df.withColumn(
    "flight_date",
    to_date(expr("concat(Year,'-',Month,'-',DayofMonth)"), "yyyy-M-d")
)
df2.select("Year","Month","DayofMonth","flight_date").show(5)


+----+-----+----------+-----------+
|Year|Month|DayofMonth|flight_date|
+----+-----+----------+-----------+
|1987|   10|        14| 1987-10-14|
|1987|   10|        15| 1987-10-15|
|1987|   10|        17| 1987-10-17|
|1987|   10|        18| 1987-10-18|
|1987|   10|        19| 1987-10-19|
+----+-----+----------+-----------+
only showing top 5 rows


In [0]:
df2.select(
    current_date().alias("today"),
    current_timestamp().alias("current_time")
).show(1)


+----------+--------------------+
|     today|        current_time|
+----------+--------------------+
|2025-12-12|2025-12-12 06:02:...|
+----------+--------------------+
only showing top 1 row


In [0]:
df2.select(
    "flight_date",
    date_format("flight_date", "MMM dd, yyyy").alias("formatted_date")
).show(5)


+-----------+--------------+
|flight_date|formatted_date|
+-----------+--------------+
| 1987-10-14|  Oct 14, 1987|
| 1987-10-15|  Oct 15, 1987|
| 1987-10-17|  Oct 17, 1987|
| 1987-10-18|  Oct 18, 1987|
| 1987-10-19|  Oct 19, 1987|
+-----------+--------------+
only showing top 5 rows
