In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col,
    expr,
    monotonically_increasing_id,
    to_date,
    when,
)
from pyspark.sql.types import IntegerType

# Application name handed to the session builder.
app_name = "MiscTransformations"

# Configure a local[3] master first, then obtain the session from the
# builder (an already-running session is reused by getOrCreate()).
session_builder = SparkSession.builder.master("local[3]").appName(app_name)
spark: SparkSession = session_builder.getOrCreate()

25/04/07 18:30:55 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Show the version string of the active Spark session as the cell output.
spark.version

'3.5.5'

# Quick method to create dataframe

- mainly for testing, exploring some techniques
- skipped parallelizing the data, creating an RDD, creating a schema definition, etc.

In [3]:
# Sample rows as plain tuples; the "Abdul" row appears twice.
dataList = [
    ("Ravi", 28, 1, 2002),
    ("Abdul", 23, 5, 81),
    ("John", 12, 12, 6),
    ("Rosy", 7, 8, 63),
    ("Abdul", 23, 5, 81),
]

# No schema is supplied, so the columns come out as _1.._4 with inferred
# types. A namedtuple could be used instead to carry column names.
rawDF = spark.createDataFrame(data=dataList)
rawDF.show()
rawDF.printSchema()

                                                                                

+-----+---+---+----+
|   _1| _2| _3|  _4|
+-----+---+---+----+
| Ravi| 28|  1|2002|
|Abdul| 23|  5|  81|
| John| 12| 12|   6|
| Rosy|  7|  8|  63|
|Abdul| 23|  5|  81|
+-----+---+---+----+

root
 |-- _1: string (nullable = true)
 |-- _2: long (nullable = true)
 |-- _3: long (nullable = true)
 |-- _4: long (nullable = true)



# Quick way to attach column names

- `toDF()`: _returns a new DataFrame with the specified column names_

In [4]:
# Attach readable column names in one step by unpacking them into toDF().
column_names = ("name", "day", "month", "year")
rawDFCol = spark.createDataFrame(dataList).toDF(*column_names)

rawDFCol.show()
rawDFCol.printSchema()

+-----+---+-----+----+
| name|day|month|year|
+-----+---+-----+----+
| Ravi| 28|    1|2002|
|Abdul| 23|    5|  81|
| John| 12|   12|   6|
| Rosy|  7|    8|  63|
|Abdul| 23|    5|  81|
+-----+---+-----+----+

root
 |-- name: string (nullable = true)
 |-- day: long (nullable = true)
 |-- month: long (nullable = true)
 |-- year: long (nullable = true)



# Some problems

- changed to string to show the problem

In [5]:
# Same rows as before, but every numeric value is now a string so that the
# type-related problem can be reproduced.
dataList = [
    ("Ravi", "28", "1", "2002"),
    ("Abdul", "23", "5", "81"),
    ("John", "12", "12", "6"),
    ("Rosy", "7", "8", "63"),
    ("Abdul", "23", "5", "81"),
]

# Build the frame first, then name its columns.
unnamed_df = spark.createDataFrame(dataList)
rawDF = unnamed_df.toDF("name", "day", "month", "year")
rawDF.show()
rawDF.printSchema()

+-----+---+-----+----+
| name|day|month|year|
+-----+---+-----+----+
| Ravi| 28|    1|2002|
|Abdul| 23|    5|  81|
| John| 12|   12|   6|
| Rosy|  7|    8|  63|
|Abdul| 23|    5|  81|
+-----+---+-----+----+

root
 |-- name: string (nullable = true)
 |-- day: string (nullable = true)
 |-- month: string (nullable = true)
 |-- year: string (nullable = true)



# How to add monotonically increasing id

- `from pyspark.sql.functions import monotonically_increasing_id`
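- generates 64-bit integers that are guaranteed to be unique and monotonically increasing, but not consecutive across partitions (within a single partition they are consecutive).
- The ID starts at 0 in the first partition; every later partition starts at a much larger offset (see `8589934592` = 2^33 and `17179869184` = 2 × 2^33 in the output below).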

In [6]:
# Spread the rows over three partitions so that a local run shows the real
# behavior of the generated ids; remove the repartition in production.
# With repartition(1) the ids come out consecutive.
named_df = spark.createDataFrame(dataList).toDF("name", "day", "month", "year")
rawDF = named_df.repartition(3)

rawDF.show()

# withColumn(colName: str, col: Column) -> "DataFrame"
# returns a new DataFrame, either adding the column or replacing an
# existing column that has the same name.
df1 = rawDF.withColumn("id", monotonically_increasing_id())
df1.show()
df1.printSchema()

+-----+---+-----+----+
| name|day|month|year|
+-----+---+-----+----+
| Ravi| 28|    1|2002|
|Abdul| 23|    5|  81|
|Abdul| 23|    5|  81|
| John| 12|   12|   6|
| Rosy|  7|    8|  63|
+-----+---+-----+----+

+-----+---+-----+----+-----------+
| name|day|month|year|         id|
+-----+---+-----+----+-----------+
| Ravi| 28|    1|2002|          0|
|Abdul| 23|    5|  81|          1|
|Abdul| 23|    5|  81| 8589934592|
| John| 12|   12|   6|17179869184|
| Rosy|  7|    8|  63|17179869185|
+-----+---+-----+----+-----------+

root
 |-- name: string (nullable = true)
 |-- day: string (nullable = true)
 |-- month: string (nullable = true)
 |-- year: string (nullable = true)
 |-- id: long (nullable = false)



# How to use CASE, WHEN, THEN

- avoid lengthy if-else statements
- _let's fix the two-digit year problem using it_

In [7]:
# Reference copy of the input rows. Rebinding dataList here does not feed
# df1, which was built in an earlier cell; the trailing comments record the
# four-digit year each short year is expected to become.
dataList = [
    ("Ravi", "28", "1", "2002"),
    ("Abdul", "23", "5", "81"),  # 1981
    ("John", "12", "12", "6"),  # 2006
    ("Rosy", "7", "8", "63"),  # 1963
    ("Abdul", "23", "5", "81"),  # 1981
]

# SQL CASE expression that expands short years: values below 25 are placed
# in the 2000s, values below 100 in the 1900s, anything else is kept.
year_case_sql = """
    CASE
        WHEN year < 25 THEN year + 2000
        WHEN year < 100 THEN year + 1900
        ELSE
            year
    END
    """

# year is still a string column at this point, which is what produces the
# "1981.0"-style values in the output.
df2 = df1.withColumn("year", expr(year_case_sql))

df2.show()
df2.printSchema()

+-----+---+-----+------+-----------+
| name|day|month|  year|         id|
+-----+---+-----+------+-----------+
| Ravi| 28|    1|  2002|          0|
|Abdul| 23|    5|1981.0|          1|
|Abdul| 23|    5|1981.0| 8589934592|
| John| 12|   12|2006.0|17179869184|
| Rosy|  7|    8|1963.0|17179869185|
+-----+---+-----+------+-----------+

root
 |-- name: string (nullable = true)
 |-- day: string (nullable = true)
 |-- month: string (nullable = true)
 |-- year: string (nullable = true)
 |-- id: long (nullable = false)



- `year` is a string, but the arithmetic was carried out on a floating-point number, so the computed years show up as `1981.0`, `2006.0` and `1963.0`.
- **_REASON:_** incorrect datatype and automatic type promotion. The `year` field in the dataframe is a string, but we performed an arithmetic operation on it, so the Spark SQL engine automatically promoted it to a double. After that it is converted back to a string, because the `ELSE year` branch is still a string and the whole `CASE` expression therefore resolves to a string column.
- **_FIX:_** cast the fields explicitly.

# How to cast fields?

Two common approaches
- **_Inline cast_**
- **_Change the Schema_**

# Inline Cast

In [8]:
# Cast inside the arithmetic branches so the additions run on integers and
# no ".0" suffix appears. The ELSE branch is not cast, and the printed
# schema still reports year as a string column.
inline_cast_sql = """
        CASE
            WHEN year < 25 THEN CAST(year AS INT) + 2000
            WHEN year < 100 THEN CAST(year AS INT) + 1900
            ELSE
                YEAR
        END
        """

df3 = df1.withColumn("year", expr(inline_cast_sql))
df3.show()
df3.printSchema()

+-----+---+-----+----+-----------+
| name|day|month|year|         id|
+-----+---+-----+----+-----------+
| Ravi| 28|    1|2002|          0|
|Abdul| 23|    5|1981|          1|
|Abdul| 23|    5|1981| 8589934592|
| John| 12|   12|2006|17179869184|
| Rosy|  7|    8|1963|17179869185|
+-----+---+-----+----+-----------+

root
 |-- name: string (nullable = true)
 |-- day: string (nullable = true)
 |-- month: string (nullable = true)
 |-- year: string (nullable = true)
 |-- id: long (nullable = false)



# Change the Schema

In [9]:
# Same CASE logic with no casts inside the SQL text; the cast is applied to
# the whole expression instead, which changes the column type in the schema.
uncast_case_sql = """
        CASE
            WHEN year < 25 THEN year + 2000
            WHEN year < 100 THEN year + 1900
            ELSE
                YEAR
        END
        """
year_as_int = expr(uncast_case_sql).cast(IntegerType())

df4 = df1.withColumn("year", year_as_int)

# df4 = df4.withColumn("year", df4["year"].cast("int"))  # this works too
df4.show()
df4.printSchema()

+-----+---+-----+----+-----------+
| name|day|month|year|         id|
+-----+---+-----+----+-----------+
| Ravi| 28|    1|2002|          0|
|Abdul| 23|    5|1981|          1|
|Abdul| 23|    5|1981| 8589934592|
| John| 12|   12|2006|17179869184|
| Rosy|  7|    8|1963|17179869185|
+-----+---+-----+----+-----------+

root
 |-- name: string (nullable = true)
 |-- day: string (nullable = true)
 |-- month: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- id: long (nullable = false)



# The right way

We should have fixed the data types at the beginning, which avoids casting inside the expressions later.

In [10]:
# Recap of the starting point: df1 still holds day, month and year as strings.
df1.show()
df1.printSchema()

+-----+---+-----+----+-----------+
| name|day|month|year|         id|
+-----+---+-----+----+-----------+
| Ravi| 28|    1|2002|          0|
|Abdul| 23|    5|  81|          1|
|Abdul| 23|    5|  81| 8589934592|
| John| 12|   12|   6|17179869184|
| Rosy|  7|    8|  63|17179869185|
+-----+---+-----+----+-----------+

root
 |-- name: string (nullable = true)
 |-- day: string (nullable = true)
 |-- month: string (nullable = true)
 |-- year: string (nullable = true)
 |-- id: long (nullable = false)



In [11]:
# Fix the data types once, up front, so that later expressions work on
# integers and need no inline casts. A single withColumns() call replaces
# three chained withColumn() calls (one projection instead of three).
# `col` comes from the import cell at the top of the notebook.
df5 = df1.withColumns(
    {name: col(name).cast(IntegerType()) for name in ("day", "month", "year")}
)

# With integer input the plain CASE expression yields integer years.
df6 = df5.withColumn(
    "year",
    expr("""
        CASE
            WHEN year < 25 THEN year + 2000
            WHEN year < 100 THEN year + 1900
            ELSE
                year
        END
        """),
)
df6.show()
df6.printSchema()

+-----+---+-----+----+-----------+
| name|day|month|year|         id|
+-----+---+-----+----+-----------+
| Ravi| 28|    1|2002|          0|
|Abdul| 23|    5|1981|          1|
|Abdul| 23|    5|1981| 8589934592|
| John| 12|   12|2006|17179869184|
| Rosy|  7|    8|1963|17179869185|
+-----+---+-----+----+-----------+

root
 |-- name: string (nullable = true)
 |-- day: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- id: long (nullable = false)



> Remember, incorrect types can give some unexpected results. Explicit casting is always a good option

# Alternative Method to CASE expression

- A SQL-like expression is more convenient and can be preferred.
- we learnt to build our expressions using columns, functions.

Let's see column object expression for the CASE expression using same `withColumn()` and work with the same `year` field.

In [12]:
# Starting point for the column-object version: df5 has day, month and year
# already cast to integer, with the short years not yet expanded.
df5.show()
df5.printSchema()

+-----+---+-----+----+-----------+
| name|day|month|year|         id|
+-----+---+-----+----+-----------+
| Ravi| 28|    1|2002|          0|
|Abdul| 23|    5|  81|          1|
|Abdul| 23|    5|  81| 8589934592|
| John| 12|   12|   6|17179869184|
| Rosy|  7|    8|  63|17179869185|
+-----+---+-----+----+-----------+

root
 |-- name: string (nullable = true)
 |-- day: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- id: long (nullable = false)



In [13]:
# Column-object version of the CASE expression: the
# when(...).when(...).otherwise(...) chain mirrors WHEN / WHEN / ELSE
# branch for branch. `when` and `col` come from the import cell at the top
# of the notebook.
df7 = df5.withColumn(
    "year",
    when(col("year") < 25, col("year") + 2000)
    .when(col("year") < 100, col("year") + 1900)
    .otherwise(col("year")),
)

df7.show()
df7.printSchema()

+-----+---+-----+----+-----------+
| name|day|month|year|         id|
+-----+---+-----+----+-----------+
| Ravi| 28|    1|2002|          0|
|Abdul| 23|    5|1981|          1|
|Abdul| 23|    5|1981| 8589934592|
| John| 12|   12|2006|17179869184|
| Rosy|  7|    8|1963|17179869185|
+-----+---+-----+----+-----------+

root
 |-- name: string (nullable = true)
 |-- day: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- id: long (nullable = false)



# How to Add/Remove columns and duplicates?

- we already added column previously using `monotonically_increasing_id()`
- let's add one more column, `dob`, by combining the `day`, `month` and `year` columns.

In [14]:
# Input for the dob column: df7 holds integer day/month values and the
# corrected four-digit years.
df7.show()
df7.printSchema()

+-----+---+-----+----+-----------+
| name|day|month|year|         id|
+-----+---+-----+----+-----------+
| Ravi| 28|    1|2002|          0|
|Abdul| 23|    5|1981|          1|
|Abdul| 23|    5|1981| 8589934592|
| John| 12|   12|2006|17179869184|
| Rosy|  7|    8|1963|17179869185|
+-----+---+-----+----+-----------+

root
 |-- name: string (nullable = true)
 |-- day: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- id: long (nullable = false)



In [15]:
# Assemble a d/M/y string from the three integer columns and parse it into
# a date column, all inside one SQL expression.
dob_sql = "to_date(CONCAT(day, '/', month, '/', year), 'd/M/y')"

df8 = df7.withColumn("dob", expr(dob_sql))
df8.show()
df8.printSchema()

+-----+---+-----+----+-----------+----------+
| name|day|month|year|         id|       dob|
+-----+---+-----+----+-----------+----------+
| Ravi| 28|    1|2002|          0|2002-01-28|
|Abdul| 23|    5|1981|          1|1981-05-23|
|Abdul| 23|    5|1981| 8589934592|1981-05-23|
| John| 12|   12|2006|17179869184|2006-12-12|
| Rosy|  7|    8|1963|17179869185|1963-08-07|
+-----+---+-----+----+-----------+----------+

root
 |-- name: string (nullable = true)
 |-- day: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- id: long (nullable = false)
 |-- dob: date (nullable = true)



**OR**

- `to_date` from `pyspark.sql.functions`: _outside `expr`_

In [16]:
# Same dob column, but to_date() is called as a Python function around the
# CONCAT expression instead of being written inside the SQL string.
# `to_date` comes from the import cell at the top of the notebook.
df9 = df7.withColumn(
    "dob",
    to_date(expr("CONCAT(day, '/', month, '/', year)"), "d/M/y"),
)
df9.show()
df9.printSchema()

+-----+---+-----+----+-----------+----------+
| name|day|month|year|         id|       dob|
+-----+---+-----+----+-----------+----------+
| Ravi| 28|    1|2002|          0|2002-01-28|
|Abdul| 23|    5|1981|          1|1981-05-23|
|Abdul| 23|    5|1981| 8589934592|1981-05-23|
| John| 12|   12|2006|17179869184|2006-12-12|
| Rosy|  7|    8|1963|17179869185|1963-08-07|
+-----+---+-----+----+-----------+----------+

root
 |-- name: string (nullable = true)
 |-- day: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- id: long (nullable = false)
 |-- dob: date (nullable = true)



In [17]:
# Step 1: derive dob from the day / month / year columns.
dob_column = to_date(expr("CONCAT(day, '/', month, '/', year)"), "d/M/y")
people_with_dob = df7.withColumn("dob", dob_column)

# Step 2: drop the source columns and keep one row per (name, dob) pair.
unique_people = people_with_dob.drop("day", "month", "year").dropDuplicates(
    ["name", "dob"]
)

# Step 3: latest date of birth first.
# Note: .sort(expr("dob DESC")) will give ascending order, so the direction
# is passed through the `ascending` keyword instead.
df10 = unique_people.sort("dob", ascending=False)

df10.show()
df10.printSchema()

+-----+-----------+----------+
| name|         id|       dob|
+-----+-----------+----------+
| John|17179869184|2006-12-12|
| Ravi|          0|2002-01-28|
|Abdul|          1|1981-05-23|
| Rosy|17179869185|1963-08-07|
+-----+-----------+----------+

root
 |-- name: string (nullable = true)
 |-- id: long (nullable = false)
 |-- dob: date (nullable = true)

