In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('instance').getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/06/25 00:11:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/06/25 00:11:45 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


## Overview Spark UDF
* Spark provides robust predefined set of functions - `pyspark.sql.functions`
* At times you might need to develop custom UDF
  * no function availabe for our requrement while applying row transformation
  * we might have to use multple functions which readibility of the code is compromised

## Registering Spark UDFs
Steps needed to develop and use UDF 
* develop function logic in Python
* register the function using `spark.udf.register` - typically assigned to the variable which is a type of function
* variable can be used as part of Data Frame APIs such as `select`, `filter`, etc.
* when register we register with the name - can be used as part of `selectExpr` or as part of Spark SQL queries - `spark.sql`

## Using Spark UDFs as part of Data Frame APIs

In [40]:
from pyspark.sql.types import IntegerType
dc = spark.udf.register('date_convert', lambda d: int(d[:10].replace('-', '')), IntegerType())
dc

24/06/25 00:50:01 WARN SimpleFunctionRegistry: The function date_convert replaced a previously registered function.


<function __main__.<lambda>(d)>

In [41]:
df = spark.read.parquet('/Users/adhoc/git/retail_db/parquet/orders')
df.show()

+-----------------+--------------------+--------+---------------+----+-----+---+
|order_customer_id|          order_date|order_id|   order_status|year|month|day|
+-----------------+--------------------+--------+---------------+----+-----+---+
|             6471|2013-11-03T00:00:...|   15793|       COMPLETE|2013|   11|  3|
|             5323|2013-11-03T00:00:...|   15794|     PROCESSING|2013|   11|  3|
|            10096|2013-11-03T00:00:...|   15795|         CLOSED|2013|   11|  3|
|            11665|2013-11-03T00:00:...|   15796|       COMPLETE|2013|   11|  3|
|             6249|2013-11-03T00:00:...|   15797|PENDING_PAYMENT|2013|   11|  3|
|            10736|2013-11-03T00:00:...|   15798|       COMPLETE|2013|   11|  3|
|             5475|2013-11-03T00:00:...|   15799|       COMPLETE|2013|   11|  3|
|             7417|2013-11-03T00:00:...|   15800|     PROCESSING|2013|   11|  3|
|             4021|2013-11-03T00:00:...|   15801|       COMPLETE|2013|   11|  3|
|             2284|2013-11-0

In [42]:
df.select(dc('order_date').alias('order_date')).show()
df.filter(dc('order_date')==20140101).show()

+----------+
|order_date|
+----------+
|  20131103|
|  20131103|
|  20131103|
|  20131103|
|  20131103|
|  20131103|
|  20131103|
|  20131103|
|  20131103|
|  20131103|
|  20131103|
|  20131103|
|  20131103|
|  20131103|
|  20131103|
|  20131103|
|  20131103|
|  20131103|
|  20131103|
|  20131103|
+----------+
only showing top 20 rows

+-----------------+--------------------+--------+---------------+----+-----+---+
|order_customer_id|          order_date|order_id|   order_status|year|month|day|
+-----------------+--------------------+--------+---------------+----+-----+---+
|             3414|2014-01-01T00:00:...|   25876|PENDING_PAYMENT|2014|    1|  1|
|             5549|2014-01-01T00:00:...|   25877|PENDING_PAYMENT|2014|    1|  1|
|             9084|2014-01-01T00:00:...|   25878|        PENDING|2014|    1|  1|
|             5118|2014-01-01T00:00:...|   25879|        PENDING|2014|    1|  1|
|            10146|2014-01-01T00:00:...|   25880|       CANCELED|2014|    1|  1|
|             

In [15]:
df. \
    groupBy(dc('order_date').alias('order_date')). \
    count(). \
    withColumnRenamed('count', 'order_count'). \
    show()

+----------+-----------+
|order_date|order_count|
+----------+-----------+
|  20140303|        266|
|  20130824|        265|
|  20140619|        276|
|  20130914|        276|
|  20140219|        268|
|  20140613|        260|
|  20140201|        278|
|  20131105|        278|
|  20140402|        266|
|  20140421|        266|
|  20131231|        266|
|  20140331|        263|
|  20140516|        263|
|  20140715|        274|
|  20130905|        261|
|  20140409|        261|
|  20130925|        277|
|  20130906|        276|
|  20130927|        264|
|  20140720|        285|
+----------+-----------+
only showing top 20 rows



## Using Spark UDFs as part of Spark SQL

In [22]:
df.selectExpr("date_convert(order_date) order_date").show()

+----------+
|order_date|
+----------+
|  20131103|
|  20131103|
|  20131103|
|  20131103|
|  20131103|
|  20131103|
|  20131103|
|  20131103|
|  20131103|
|  20131103|
|  20131103|
|  20131103|
|  20131103|
|  20131103|
|  20131103|
|  20131103|
|  20131103|
|  20131103|
|  20131103|
|  20131103|
+----------+
only showing top 20 rows



In [24]:
df.filter("date_convert(order_date) = 20140101").show()

+-----------------+--------------------+--------+---------------+----+-----+---+
|order_customer_id|          order_date|order_id|   order_status|year|month|day|
+-----------------+--------------------+--------+---------------+----+-----+---+
|             3414|2014-01-01T00:00:...|   25876|PENDING_PAYMENT|2014|    1|  1|
|             5549|2014-01-01T00:00:...|   25877|PENDING_PAYMENT|2014|    1|  1|
|             9084|2014-01-01T00:00:...|   25878|        PENDING|2014|    1|  1|
|             5118|2014-01-01T00:00:...|   25879|        PENDING|2014|    1|  1|
|            10146|2014-01-01T00:00:...|   25880|       CANCELED|2014|    1|  1|
|             3205|2014-01-01T00:00:...|   25881|PENDING_PAYMENT|2014|    1|  1|
|             4598|2014-01-01T00:00:...|   25882|       COMPLETE|2014|    1|  1|
|            11764|2014-01-01T00:00:...|   25883|        PENDING|2014|    1|  1|
|             7904|2014-01-01T00:00:...|   25884|PENDING_PAYMENT|2014|    1|  1|
|             7253|2014-01-0

In [27]:
df.createOrReplaceTempView('orders')
spark.sql('show tables').show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
|         |   orders|       true|
+---------+---------+-----------+



In [35]:
spark. \
    sql('''
    select *, date_convert(order_date) order_date_as_int 
    from orders
    where date_convert(order_date)=20140101
    '''). \
    show()

+-----------------+--------------------+--------+---------------+----+-----+---+-----------------+
|order_customer_id|          order_date|order_id|   order_status|year|month|day|order_date_as_int|
+-----------------+--------------------+--------+---------------+----+-----+---+-----------------+
|             3414|2014-01-01T00:00:...|   25876|PENDING_PAYMENT|2014|    1|  1|         20140101|
|             5549|2014-01-01T00:00:...|   25877|PENDING_PAYMENT|2014|    1|  1|         20140101|
|             9084|2014-01-01T00:00:...|   25878|        PENDING|2014|    1|  1|         20140101|
|             5118|2014-01-01T00:00:...|   25879|        PENDING|2014|    1|  1|         20140101|
|            10146|2014-01-01T00:00:...|   25880|       CANCELED|2014|    1|  1|         20140101|
|             3205|2014-01-01T00:00:...|   25881|PENDING_PAYMENT|2014|    1|  1|         20140101|
|             4598|2014-01-01T00:00:...|   25882|       COMPLETE|2014|    1|  1|         20140101|
|         

In [37]:
spark. \
    sql('''
    select date_convert(order_date) order_date, count(1) order_count
    from orders
    group by 1 
    '''). \
    show()

+----------+-----------+
|order_date|order_count|
+----------+-----------+
|  20140303|        266|
|  20130824|        265|
|  20140619|        276|
|  20130914|        276|
|  20140219|        268|
|  20140613|        260|
|  20140201|        278|
|  20131105|        278|
|  20140402|        266|
|  20140421|        266|
|  20131231|        266|
|  20140331|        263|
|  20140516|        263|
|  20140715|        274|
|  20130905|        261|
|  20140409|        261|
|  20130925|        277|
|  20130906|        276|
|  20130927|        264|
|  20140720|        285|
+----------+-----------+
only showing top 20 rows



## Create Spark UDF to cleanse data in Spark Data Frame

In [61]:
import pandas as pd

courses = {
    'course_id': ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10'],
    'course_name': ['Mastering SQL', 'Streaming Pipelines - Python', 'Head First Python',
                    'Designing Data-Intensive Applications', 'Distributed Systems', 'Database Internals',
                    'Art of Immutable Architecture', 'Graph Databases', 'Building MicroServices',
                    'Kubernetes Patterns'],
    'course_author': ['Mike Jack', 'Bob Davis', 'Elvis Presley', 'Martin Kleppmann', 'Sukumar Ghosh', 
                      'Alex Petrov', 'Michael L. Perry', 'Ian Robinson', 'Sam Newman', 'Rolan Hub'],
    'course_status': ['   published   ', '   inactive   ', '\\N', 'published  ', '\\N', '   inactive',
                      'published   ', '\\N', '  inactive ', 'published   '],
    'course_published_dt': ['2020-07-08', '2020-03-10', '\\N', '2021-02-27', '\\N', '2021-05-14',
                            '2021-04-18', '\\N', '2020-12-15', '2021-07-11']
}
courses_df = spark.createDataFrame(pd.DataFrame(courses))
courses_df.show()
courses_df.dtypes

+---------+--------------------+----------------+---------------+-------------------+
|course_id|         course_name|   course_author|  course_status|course_published_dt|
+---------+--------------------+----------------+---------------+-------------------+
|        1|       Mastering SQL|       Mike Jack|   published   |         2020-07-08|
|        2|Streaming Pipelin...|       Bob Davis|    inactive   |         2020-03-10|
|        3|   Head First Python|   Elvis Presley|             \N|                 \N|
|        4|Designing Data-In...|Martin Kleppmann|    published  |         2021-02-27|
|        5| Distributed Systems|   Sukumar Ghosh|             \N|                 \N|
|        6|  Database Internals|     Alex Petrov|       inactive|         2021-05-14|
|        7|Art of Immutable ...|Michael L. Perry|   published   |         2021-04-18|
|        8|     Graph Databases|    Ian Robinson|             \N|                 \N|
|        9|Building MicroSer...|      Sam Newman|     

[('course_id', 'string'),
 ('course_name', 'string'),
 ('course_author', 'string'),
 ('course_status', 'string'),
 ('course_published_dt', 'string')]

In [62]:
users = {
    'user_id': ['1001', '1002', '1003', '1004', '1005', '1006'],
    'user_name': ['BenJohnson   ', '  Halley Battles ', '  Laura Anderson  ', '  Rolanda Garza ',
                  'Angela Fox  ', 'Kerl Goldinger '],
    'user_email': ['benjohn@gmail.com', '\\N', '\\N', 'garza.roland@gmail.com', 'nshaiary@aol.com',
                   'k.gold@live.com1'],
    'user_gender': ['Male', 'Male', 'Female', 'Male', 'Female', 'Male']
}
users_df = spark.createDataFrame(pd.DataFrame(users))
users_df.show()
users_df.dtypes

+-------+------------------+--------------------+-----------+
|user_id|         user_name|          user_email|user_gender|
+-------+------------------+--------------------+-----------+
|   1001|     BenJohnson   |   benjohn@gmail.com|       Male|
|   1002|   Halley Battles |                  \N|       Male|
|   1003|  Laura Anderson  |                  \N|     Female|
|   1004|    Rolanda Garza |garza.roland@gmai...|       Male|
|   1005|      Angela Fox  |    nshaiary@aol.com|     Female|
|   1006|   Kerl Goldinger |    k.gold@live.com1|       Male|
+-------+------------------+--------------------+-----------+



[('user_id', 'string'),
 ('user_name', 'string'),
 ('user_email', 'string'),
 ('user_gender', 'string')]

In [63]:
course_enrolments = {
    'course_id': ['3', '5', '8', '5', '6', '8', '7', '3'],
    'user_id': ['1001', '1001', '1003', '1003', '1005', '1006', '1001', '1001'],
    'enrollment_id': ['9010', '9020', '9030', '9040', '9050', '9060', '9070', '9080'],
    'grade': ['A', '\\N', 'A', '\\N', 'B', 'C', '\\N', 'A'],
    'department': ['AI  ', 'ML', '  CS', '  DS', '  AI', 'ML', '  CS', 'DS  ']
}
course_enrolments_df = spark.createDataFrame(pd.DataFrame(course_enrolments))

course_enrolments_df.show()
course_enrolments_df.dtypes

+---------+-------+-------------+-----+----------+
|course_id|user_id|enrollment_id|grade|department|
+---------+-------+-------------+-----+----------+
|        3|   1001|         9010|    A|      AI  |
|        5|   1001|         9020|   \N|        ML|
|        8|   1003|         9030|    A|        CS|
|        5|   1003|         9040|   \N|        DS|
|        6|   1005|         9050|    B|        AI|
|        8|   1006|         9060|    C|        ML|
|        7|   1001|         9070|   \N|        CS|
|        3|   1001|         9080|    A|      DS  |
+---------+-------+-------------+-----+----------+



[('course_id', 'string'),
 ('user_id', 'string'),
 ('enrollment_id', 'string'),
 ('grade', 'string'),
 ('department', 'string')]

In [70]:
def data_cleanse(c):
    return c.strip() if c.strip != '\\N' else None

In [71]:
data_cleanse = spark.udf.register('data_cleanse', data_cleanse)

24/06/25 01:20:32 WARN SimpleFunctionRegistry: The function data_cleanse replaced a previously registered function.


In [67]:
courses_df. \
    withColumn('course_status', data_cleanse('course_status')). \
    withColumn('course_published_dt', data_cleanse('course_published_dt')). \
    show()

+---------+--------------------+----------------+-------------+-------------------+
|course_id|         course_name|   course_author|course_status|course_published_dt|
+---------+--------------------+----------------+-------------+-------------------+
|        1|       Mastering SQL|       Mike Jack|    published|         2020-07-08|
|        2|Streaming Pipelin...|       Bob Davis|     inactive|         2020-03-10|
|        3|   Head First Python|   Elvis Presley|           \N|                 \N|
|        4|Designing Data-In...|Martin Kleppmann|    published|         2021-02-27|
|        5| Distributed Systems|   Sukumar Ghosh|           \N|                 \N|
|        6|  Database Internals|     Alex Petrov|     inactive|         2021-05-14|
|        7|Art of Immutable ...|Michael L. Perry|    published|         2021-04-18|
|        8|     Graph Databases|    Ian Robinson|           \N|                 \N|
|        9|Building MicroSer...|      Sam Newman|     inactive|         2020

In [68]:
courses_df.createOrReplaceTempView('courses')
spark.sql('show tables').show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
|         |  courses|       true|
|         |   orders|       true|
+---------+---------+-----------+



In [72]:
spark. \
    sql("select course_id, data_cleanse(course_status), data_cleanse(course_published_dt) from courses"). \
    show()

+---------+---------------------------+---------------------------------+
|course_id|data_cleanse(course_status)|data_cleanse(course_published_dt)|
+---------+---------------------------+---------------------------------+
|        1|                  published|                       2020-07-08|
|        2|                   inactive|                       2020-03-10|
|        3|                         \N|                               \N|
|        4|                  published|                       2021-02-27|
|        5|                         \N|                               \N|
|        6|                   inactive|                       2021-05-14|
|        7|                  published|                       2021-04-18|
|        8|                         \N|                               \N|
|        9|                   inactive|                       2020-12-15|
|       10|                  published|                       2021-07-11|
+---------+---------------------------