In [0]:
# Load one part-file of the airlines dataset as CSV. The first row supplies
# the column names, and column types are inferred from a small sample of rows
# (samplingRatio) rather than from a full scan.
airlines_raw_df = (
    spark.read
    .format("csv")
    .options(inferSchema="true", header="true", samplingRatio="0.0001")
    .load("/databricks-datasets/airlines/part-00000")
)


In [0]:
# Project the date fields and flight number, derive `delay` as actual minus
# scheduled departure time, and keep only the rows for flight 1451.
# NOTE(review): if DepTime / CRSDepTime hold hhmm-encoded clock values, this
# subtraction is not a true minute count across hour boundaries - confirm.
airlines_df = (
    airlines_raw_df
    .selectExpr(
        "year",
        "month",
        "dayofmonth",
        "flightnum",
        "DepTime - CRSDepTime as delay",
    )
    .where("FlightNum == 1451")
)

In [0]:
# Render the filtered flight-1451 rows with the notebook's display helper.
display(airlines_df)

year,month,dayofmonth,flightnum,delay
1987,10,14,1451,11.0
1987,10,15,1451,-1.0
1987,10,17,1451,11.0
1987,10,18,1451,-1.0
1987,10,19,1451,19.0
1987,10,21,1451,-2.0
1987,10,22,1451,-2.0
1987,10,23,1451,1.0
1987,10,24,1451,14.0
1987,10,25,1451,-1.0



Our objective is to add a unique id for each record.

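Approaches:
1. DataFrame function: monotonically_increasing_id() (generates unique, increasing 64-bit integers; the values are not guaranteed to be consecutive)
2. Built-in SQL function: uuid() (generates a random UUID string for each row)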


Built-in SQL functions such as uuid() can be called from the DataFrame API by wrapping them in the expr() function.

In [0]:
from pyspark.sql.functions import monotonically_increasing_id, expr, lit

# Attach three identifier columns and print the first five rows untruncated:
#   id       - value from monotonically_increasing_id()
#   RecordID - per-row UUID string from the built-in SQL function uuid()
#   BatchID  - one constant literal shared by every row
(
    airlines_df
    .withColumn("id", monotonically_increasing_id())
    .withColumn("RecordID", expr("uuid()"))
    .withColumn("BatchID", lit("batch-123456789"))
    .show(n=5, truncate=False)
)

+----+-----+----------+---------+-----+---+------------------------------------+---------------+
|year|month|dayofmonth|flightnum|delay|id |RecordID                            |BatchID        |
+----+-----+----------+---------+-----+---+------------------------------------+---------------+
|1987|10   |14        |1451     |11   |0  |31a0d153-ebe1-4d67-8bad-83ccb0f2791d|batch-123456789|
|1987|10   |15        |1451     |-1   |1  |d6b703d8-206d-4447-a0f0-4f62b4febb9c|batch-123456789|
|1987|10   |17        |1451     |11   |2  |10e50d64-7bf2-407e-bf85-9033313262dd|batch-123456789|
|1987|10   |18        |1451     |-1   |3  |ab548888-941c-429e-8ac7-1695e3ef0830|batch-123456789|
|1987|10   |19        |1451     |19   |4  |e71839c2-64d5-4a8d-b7e2-f3d793174d74|batch-123456789|
+----+-----+----------+---------+-----+---+------------------------------------+---------------+
only showing top 5 rows



In [0]:
# Alternate version: the batch tag is a random UUID generated once in Python
# rather than a hard-coded string, so every row shares the same BatchID.
import uuid

uid = str(uuid.uuid4())

# Add the per-record identifiers plus the shared batch tag, then print the
# first five rows without truncating the long UUID strings.
(
    airlines_df
    .withColumn("id", monotonically_increasing_id())
    .withColumn("RecordID", expr("uuid()"))
    .withColumn("BatchID", lit(uid))
    .show(n=5, truncate=False)
)

+----+-----+----------+---------+-----+---+------------------------------------+------------------------------------+
|year|month|dayofmonth|flightnum|delay|id |RecordID                            |BatchID                             |
+----+-----+----------+---------+-----+---+------------------------------------+------------------------------------+
|1987|10   |14        |1451     |11   |0  |4133b1d1-1104-44e3-b06a-8da4c68bb1fa|e136c5ac-00f2-460c-a5b1-185b86040d4a|
|1987|10   |15        |1451     |-1   |1  |c4c4f3cb-fb0d-4767-9312-2163e7f9e536|e136c5ac-00f2-460c-a5b1-185b86040d4a|
|1987|10   |17        |1451     |11   |2  |57552401-cad3-4553-9183-98896b167432|e136c5ac-00f2-460c-a5b1-185b86040d4a|
|1987|10   |18        |1451     |-1   |3  |73ae4316-90a8-42ad-b75c-863a1ea52f36|e136c5ac-00f2-460c-a5b1-185b86040d4a|
|1987|10   |19        |1451     |19   |4  |0b183307-4839-4a2c-b12a-5de9f0ea95de|e136c5ac-00f2-460c-a5b1-185b86040d4a|
+----+-----+----------+---------+-----+---+-------------