In [0]:
from pyspark.sql import SparkSession

# Configure the application name first, then obtain the session from the builder
# (getOrCreate hands back an already-active session when there is one).
session_builder = SparkSession.builder.appName("Column Operations")
spark = session_builder.getOrCreate()

In [0]:
# Load the departure-delay CSV: the first row supplies column names and the
# column types are inferred from the data.
# NOTE(review): `date` ends up inferred as an integer (see the printed schema);
# if the raw values carry leading zeros they would be dropped -- confirm whether
# an explicit string schema is wanted here.
flights_csv_path = "/databricks-datasets/flights/departuredelays.csv"
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(flights_csv_path)
)

# Preview the first rows, then print the inferred schema.
df.show(n=5)
df.printSchema()


+-------+-----+--------+------+-----------+
|   date|delay|distance|origin|destination|
+-------+-----+--------+------+-----------+
|1011245|    6|     602|   ABE|        ATL|
|1020600|   -8|     369|   ABE|        DTW|
|1021245|   -2|     602|   ABE|        ATL|
|1020605|   -4|     602|   ABE|        ATL|
|1031245|   -4|     602|   ABE|        ATL|
+-------+-----+--------+------+-----------+
only showing top 5 rows
root
 |-- date: integer (nullable = true)
 |-- delay: integer (nullable = true)
 |-- distance: integer (nullable = true)
 |-- origin: string (nullable = true)
 |-- destination: string (nullable = true)



In [0]:
# Project a subset of the columns by name and preview the first rows.
preview_columns = ["date", "delay", "origin"]
df.select(*preview_columns).show(n=5)


+-------+-----+------+
|   date|delay|origin|
+-------+-----+------+
|1011245|    6|   ABE|
|1020600|   -8|   ABE|
|1021245|   -2|   ABE|
|1020605|   -4|   ABE|
|1031245|   -4|   ABE|
+-------+-----+------+
only showing top 5 rows


In [0]:
from pyspark.sql.functions import col

# Arithmetic on a Column produces a new Column expression; it is left unaliased,
# so the result column takes its name from the expression itself.
delay_plus_ten = col("delay") + 10
df.select(delay_plus_ten, col("origin")).show(n=5)


+------------+------+
|(delay + 10)|origin|
+------------+------+
|          16|   ABE|
|           2|   ABE|
|           8|   ABE|
|           6|   ABE|
|           6|   ABE|
+------------+------+
only showing top 5 rows


In [0]:
# Same kind of projection written as SQL expression strings: `distance` is
# multiplied by 1.609 and exposed under the alias `distance_km`.
sql_projection = ["date", "delay", "distance * 1.609 as distance_km"]
df.selectExpr(*sql_projection).show(n=5)


+-------+-----+-----------+
|   date|delay|distance_km|
+-------+-----+-----------+
|1011245|    6|    968.618|
|1020600|   -8|    593.721|
|1021245|   -2|    968.618|
|1020605|   -4|    968.618|
|1031245|   -4|    968.618|
+-------+-----+-----------+
only showing top 5 rows


In [0]:
# Append a derived column: `delay` divided by 60. `df` itself is untouched;
# the result is bound to a new DataFrame.
delay_hours_expr = col("delay") / 60
df2 = df.withColumn("delay_in_hours", delay_hours_expr)
df2.show(n=5)


+-------+-----+--------+------+-----------+--------------------+
|   date|delay|distance|origin|destination|      delay_in_hours|
+-------+-----+--------+------+-----------+--------------------+
|1011245|    6|     602|   ABE|        ATL|                 0.1|
|1020600|   -8|     369|   ABE|        DTW|-0.13333333333333333|
|1021245|   -2|     602|   ABE|        ATL|-0.03333333333333333|
|1020605|   -4|     602|   ABE|        ATL|-0.06666666666666667|
|1031245|   -4|     602|   ABE|        ATL|-0.06666666666666667|
+-------+-----+--------+------+-----------+--------------------+
only showing top 5 rows


In [0]:
# Passing an existing column name to withColumn replaces that column's values
# instead of appending a new column: `distance` now holds distance * 1.609.
scaled_distance = col("distance") * 1.609
df3 = df.withColumn("distance", scaled_distance)
df3.show(n=5)


+-------+-----+-----------------+------+-----------+
|   date|delay|         distance|origin|destination|
+-------+-----+-----------------+------+-----------+
|1011245|    6|968.6179999999999|   ABE|        ATL|
|1020600|   -8|          593.721|   ABE|        DTW|
|1021245|   -2|968.6179999999999|   ABE|        ATL|
|1020605|   -4|968.6179999999999|   ABE|        ATL|
|1031245|   -4|968.6179999999999|   ABE|        ATL|
+-------+-----+-----------------+------+-----------+
only showing top 5 rows


In [0]:
# Rename `delay` to `delay_minutes`; all other columns are carried over as-is.
old_name, new_name = "delay", "delay_minutes"
df_renamed = df.withColumnRenamed(old_name, new_name)
df_renamed.show(n=5)


+-------+-------------+--------+------+-----------+
|   date|delay_minutes|distance|origin|destination|
+-------+-------------+--------+------+-----------+
|1011245|            6|     602|   ABE|        ATL|
|1020600|           -8|     369|   ABE|        DTW|
|1021245|           -2|     602|   ABE|        ATL|
|1020605|           -4|     602|   ABE|        ATL|
|1031245|           -4|     602|   ABE|        ATL|
+-------+-------------+--------+------+-----------+
only showing top 5 rows


In [0]:
# Produce a DataFrame without the `origin` column.
column_to_remove = "origin"
df_drop = df.drop(column_to_remove)
df_drop.show(n=5)


+-------+-----+--------+-----------+
|   date|delay|distance|destination|
+-------+-----+--------+-----------+
|1011245|    6|     602|        ATL|
|1020600|   -8|     369|        DTW|
|1021245|   -2|     602|        ATL|
|1020605|   -4|     602|        ATL|
|1031245|   -4|     602|        ATL|
+-------+-----+--------+-----------+
only showing top 5 rows


In [0]:
# Deduplicate on the (origin, destination) pair only, so one row per pair is kept.
# NOTE(review): nothing here pins down WHICH row of each pair survives, so the
# preview below may differ between runs -- confirm if a specific row is needed.
route_key = ["origin", "destination"]
df_no_dup = df.dropDuplicates(route_key)
df_no_dup.show(n=5)


+-------+-----+--------+------+-----------+
|   date|delay|distance|origin|destination|
+-------+-----+--------+------+-----------+
|1011835|  120|     532|   BNA|        TPA|
|1020655|  202|     194|   FAR|        MSP|
|1011150|   -3|     250|   BOI|        GEG|
|1012030|   -5|     700|   FLL|        RIC|
|1011915|   -3|     217|   ATL|        VPS|
+-------+-----+--------+------+-----------+
only showing top 5 rows


In [0]:
from pyspark.sql.functions import lit

# Add a constant-valued column: lit() wraps the Python string as a Column so
# every row gets the same `airline` value.
airline_placeholder = lit("Unknown Airline")
df_lit = df.withColumn("airline", airline_placeholder)
df_lit.show(n=5)


+-------+-----+--------+------+-----------+---------------+
|   date|delay|distance|origin|destination|        airline|
+-------+-----+--------+------+-----------+---------------+
|1011245|    6|     602|   ABE|        ATL|Unknown Airline|
|1020600|   -8|     369|   ABE|        DTW|Unknown Airline|
|1021245|   -2|     602|   ABE|        ATL|Unknown Airline|
|1020605|   -4|     602|   ABE|        ATL|Unknown Airline|
|1031245|   -4|     602|   ABE|        ATL|Unknown Airline|
+-------+-----+--------+------+-----------+---------------+
only showing top 5 rows


In [0]:
# Select `origin` alongside an unaliased expression that doubles `delay`.
doubled_delay = col("delay") * 2
df_col = df.select(col("origin"), doubled_delay)
df_col.show(n=5)


+------+-----------+
|origin|(delay * 2)|
+------+-----------+
|   ABE|         12|
|   ABE|        -16|
|   ABE|         -4|
|   ABE|         -8|
|   ABE|         -8|
+------+-----------+
only showing top 5 rows


In [0]:
from pyspark.sql.functions import expr

# Label each row through a SQL CASE expression: delays strictly greater than 60
# become 'High Delay'; every row where that comparison is not true falls
# through to the ELSE branch and becomes 'Low Delay'.
delay_category_sql = "CASE WHEN delay > 60 THEN 'High Delay' ELSE 'Low Delay' END"
df_expr = df.withColumn("delay_category", expr(delay_category_sql))
df_expr.show(n=5)


+-------+-----+--------+------+-----------+--------------+
|   date|delay|distance|origin|destination|delay_category|
+-------+-----+--------+------+-----------+--------------+
|1011245|    6|     602|   ABE|        ATL|     Low Delay|
|1020600|   -8|     369|   ABE|        DTW|     Low Delay|
|1021245|   -2|     602|   ABE|        ATL|     Low Delay|
|1020605|   -4|     602|   ABE|        ATL|     Low Delay|
|1031245|   -4|     602|   ABE|        ATL|     Low Delay|
+-------+-----+--------+------+-----------+--------------+
only showing top 5 rows
