In [0]:
df = spark.read.csv(
    "/databricks-datasets/flights/departuredelays.csv",
    header=True,
    inferSchema=True
)

df.show(5)
df.printSchema()


+-------+-----+--------+------+-----------+
|   date|delay|distance|origin|destination|
+-------+-----+--------+------+-----------+
|1011245|    6|     602|   ABE|        ATL|
|1020600|   -8|     369|   ABE|        DTW|
|1021245|   -2|     602|   ABE|        ATL|
|1020605|   -4|     602|   ABE|        ATL|
|1031245|   -4|     602|   ABE|        ATL|
+-------+-----+--------+------+-----------+
only showing top 5 rows
root
 |-- date: integer (nullable = true)
 |-- delay: integer (nullable = true)
 |-- distance: integer (nullable = true)
 |-- origin: string (nullable = true)
 |-- destination: string (nullable = true)



In [0]:
from pyspark.sql.functions import (
    lower, upper, trim, ltrim, rtrim, length, substring,
    split, concat, concat_ws, regexp_replace, regexp_extract, translate, col
)


In [0]:
df_lower = df.select(col("origin"), lower(col("origin")).alias("origin_lower"))
df_lower.show(5)



+------+------------+
|origin|origin_lower|
+------+------------+
|   ABE|         abe|
|   ABE|         abe|
|   ABE|         abe|
|   ABE|         abe|
|   ABE|         abe|
+------+------------+
only showing top 5 rows


In [0]:

df_upper = df.select(col("destination"), upper(col("destination")).alias("dest_upper"))
df_upper.show(5)

+-----------+----------+
|destination|dest_upper|
+-----------+----------+
|        ATL|       ATL|
|        DTW|       DTW|
|        ATL|       ATL|
|        ATL|       ATL|
|        ATL|       ATL|
+-----------+----------+
only showing top 5 rows


In [0]:
df_trim = df.select(
    trim(col("origin")).alias("trimmed"),
    ltrim(col("origin")).alias("ltrimmed"),
    rtrim(col("origin")).alias("rtrimmed")
)
df_trim.show(5)


+-------+--------+--------+
|trimmed|ltrimmed|rtrimmed|
+-------+--------+--------+
|    ABE|     ABE|     ABE|
|    ABE|     ABE|     ABE|
|    ABE|     ABE|     ABE|
|    ABE|     ABE|     ABE|
|    ABE|     ABE|     ABE|
+-------+--------+--------+
only showing top 5 rows


In [0]:
df_length = df.select(col("origin"), length(col("origin")).alias("origin_length"))
df_length.show(5)


+------+-------------+
|origin|origin_length|
+------+-------------+
|   ABE|            3|
|   ABE|            3|
|   ABE|            3|
|   ABE|            3|
|   ABE|            3|
+------+-------------+
only showing top 5 rows


In [0]:
df_sub = df.select(
    col("date"),
    substring(col("date"), 1, 4).alias("year"),
    substring(col("date"), 5, 2).alias("month")
)
df_sub.show(5)


+-------+----+-----+
|   date|year|month|
+-------+----+-----+
|1011245|1011|   24|
|1020600|1020|   60|
|1021245|1021|   24|
|1020605|1020|   60|
|1031245|1031|   24|
+-------+----+-----+
only showing top 5 rows


In [0]:
df_split = df.select(
    col("date"),
    split(col("date"), "").alias("split_chars")     # splits into characters
)
df_split.show(5)


+-------+--------------------+
|   date|         split_chars|
+-------+--------------------+
|1011245|[1, 0, 1, 1, 2, 4...|
|1020600|[1, 0, 2, 0, 6, 0...|
|1021245|[1, 0, 2, 1, 2, 4...|
|1020605|[1, 0, 2, 0, 6, 0...|
|1031245|[1, 0, 3, 1, 2, 4...|
+-------+--------------------+
only showing top 5 rows


In [0]:
df_concat = df.select(
    concat(col("origin"), col("destination")).alias("route")
)
df_concat.show(5)


+------+
| route|
+------+
|ABEATL|
|ABEDTW|
|ABEATL|
|ABEATL|
|ABEATL|
+------+
only showing top 5 rows


In [0]:
df_concat_ws = df.select(
    concat_ws(" -> ", col("origin"), col("destination")).alias("route_pretty")
)
df_concat_ws.show(5)


+------------+
|route_pretty|
+------------+
|  ABE -> ATL|
|  ABE -> DTW|
|  ABE -> ATL|
|  ABE -> ATL|
|  ABE -> ATL|
+------------+
only showing top 5 rows


In [0]:
df_reg_replace = df.select(
    col("delay"),
    regexp_replace(col("delay"), "^0+", "").alias("clean_delay")
)
df_reg_replace.show(5)


+-----+-----------+
|delay|clean_delay|
+-----+-----------+
|    6|          6|
|   -8|         -8|
|   -2|         -2|
|   -4|         -4|
|   -4|         -4|
+-----+-----------+
only showing top 5 rows


In [0]:
df_reg_replace = df.select(
    col("delay"),
    regexp_replace(col("delay"), "^0+", "").alias("clean_delay")
)
df_reg_replace.show(5)


+-----+-----------+
|delay|clean_delay|
+-----+-----------+
|    6|          6|
|   -8|         -8|
|   -2|         -2|
|   -4|         -4|
|   -4|         -4|
+-----+-----------+
only showing top 5 rows


In [0]:
df_translate = df.select(
    col("origin"),
    translate(col("origin"), "AEO", "123").alias("translated")
)
df_translate.show(5)


+------+----------+
|origin|translated|
+------+----------+
|   ABE|       1B2|
|   ABE|       1B2|
|   ABE|       1B2|
|   ABE|       1B2|
|   ABE|       1B2|
+------+----------+
only showing top 5 rows
