# Using UDFs in PySpark

In [0]:
# Let's load our streaming events data
file_path = "/pyspark/video-streaming-data/module3-transform/optimization/complex_events.csv"
df = spark.read.option("header", "true").option("inferSchema", "true").csv(file_path)

### USER DEFINED FUNCTIONS (UDFs)

In [0]:
# UDFs allow us to apply custom Python functions to our data

from pyspark.sql.functions import udf
from pyspark.sql.types import StringType, IntegerType, FloatType

# Let's create a simple UDF to calculate streaming quality score
@udf(FloatType())
def calculate_quality_score(duration, buffering_count):
    if duration is None or buffering_count is None:
        return None
    
    # Calculate a simple score where longer duration is good but buffering is bad
    # Higher score is better
    return (duration / 60.0) - (buffering_count * 10.0)

# Apply our UDF to the DataFrame
df_with_quality = df.withColumn("streaming_score", 
                               calculate_quality_score(df["duration_seconds"], df["buffering_count"]))

print("Data with streaming scores (Regular UDF):")
df_with_quality.select("duration_seconds", "buffering_count", "streaming_score").show(5)


Data with streaming scores (Regular UDF):
+----------------+---------------+---------------+
|duration_seconds|buffering_count|streaming_score|
+----------------+---------------+---------------+
|             565|              4|     -30.583334|
|            2018|              1|      23.633333|
|            2900|              3|      18.333334|
|            3242|              3|      24.033333|
|            4248|              1|           60.8|
+----------------+---------------+---------------+
only showing top 5 rows



In [0]:
# UDFs can significantly slow down processing since they operate row by row
# Let's check the execution plan to see the impact
print("Execution plan with Regular UDF:")
print("--------------------------------")
df_with_quality.explain(mode="formatted")

Execution plan with Regular UDF:
--------------------------------
== Physical Plan ==
* Project (3)
+- BatchEvalPython (2)
   +- Scan csv  (1)


(1) Scan csv 
Output [12]: [event_id#23, user_id#24, content_id#25, timestamp#26, duration_seconds#27, device_type#28, quality#29, buffering_count#30, error_type#31, ip_address#32, country#33, session_id#34]
Batched: false
Location: InMemoryFileIndex [dbfs:/pyspark/video-streaming-data/module3-transform/optimization/complex_events.csv]
ReadSchema: struct<event_id:string,user_id:string,content_id:string,timestamp:timestamp,duration_seconds:int,device_type:string,quality:string,buffering_count:int,error_type:string,ip_address:string,country:string,session_id:string>

(2) BatchEvalPython
Input [12]: [event_id#23, user_id#24, content_id#25, timestamp#26, duration_seconds#27, device_type#28, quality#29, buffering_count#30, error_type#31, ip_address#32, country#33, session_id#34]
Arguments: [calculate_quality_score(duration_seconds#27, buffering_cou

### VECTORIZED UDFs

In [0]:
# VECTORIZED UDFs
# Vectorized UDFs process data in batches using pandas, which dramatically improves performance

import pandas as pd
from pyspark.sql.functions import pandas_udf

# Define a vectorized UDF for calculating the SAME streaming score
# But using pandas for vectorized processing
@pandas_udf(FloatType())
def calculate_quality_score_vectorized(duration: pd.Series, buffering: pd.Series) -> pd.Series:
    # Same logic as the regular UDF, but operates on entire Series at once
    # This uses pandas' optimized C implementation for fast vector operations
    return (duration / 60.0) - (buffering * 10.0)

# Apply the vectorized UDF
df_with_score_vectorized = df.withColumn("streaming_score", 
                             calculate_quality_score_vectorized(df["duration_seconds"], df["buffering_count"]))

print("Data with streaming scores (Vectorized UDF):")
df_with_score_vectorized.select("duration_seconds", "buffering_count", "streaming_score").show(5)

Data with streaming scores (Vectorized UDF):
+----------------+---------------+---------------+
|duration_seconds|buffering_count|streaming_score|
+----------------+---------------+---------------+
|             565|              4|     -30.583334|
|            2018|              1|      23.633333|
|            2900|              3|      18.333334|
|            3242|              3|      24.033333|
|            4248|              1|           60.8|
+----------------+---------------+---------------+
only showing top 5 rows



In [0]:
# Let's see the execution plan difference with vectorized UDF
print("\nExecution plan with Vectorized (Pandas) UDF:")
print("-------------------------------------------")
df_with_score_vectorized.explain(mode="formatted")


Execution plan with Vectorized (Pandas) UDF:
-------------------------------------------
== Physical Plan ==
* Project (3)
+- ArrowEvalPython (2)
   +- Scan csv  (1)


(1) Scan csv 
Output [12]: [event_id#23, user_id#24, content_id#25, timestamp#26, duration_seconds#27, device_type#28, quality#29, buffering_count#30, error_type#31, ip_address#32, country#33, session_id#34]
Batched: false
Location: InMemoryFileIndex [dbfs:/pyspark/video-streaming-data/module3-transform/optimization/complex_events.csv]
ReadSchema: struct<event_id:string,user_id:string,content_id:string,timestamp:timestamp,duration_seconds:int,device_type:string,quality:string,buffering_count:int,error_type:string,ip_address:string,country:string,session_id:string>

(2) ArrowEvalPython
Input [12]: [event_id#23, user_id#24, content_id#25, timestamp#26, duration_seconds#27, device_type#28, quality#29, buffering_count#30, error_type#31, ip_address#32, country#33, session_id#34]
Arguments: [calculate_quality_score_vectorized

In [0]:
# Comparing performance between regular UDF and vectorized UDF
import time

# Force Spark to clear any previous caches or plans
spark.catalog.clearCache()

# Time the regular UDF
start_time = time.time()
result1 = df_with_quality.select("streaming_score")  # Use the DataFrame we already created with regular UDF
result1.count()  # Force execution
regular_time = time.time() - start_time
print(f"Regular UDF execution time: {regular_time:.2f} seconds")

# Time the vectorized UDF
start_time = time.time()
result2 = df_with_score_vectorized.select("streaming_score")  # Use the DataFrame we already created with vectorized UDF
result2.count()  # Force execution
vectorized_time = time.time() - start_time
print(f"Vectorized UDF execution time: {vectorized_time:.2f} seconds")

print(f"Speedup factor: {regular_time/vectorized_time:.2f}x faster with vectorized UDF")


Regular UDF execution time: 2.15 seconds
Vectorized UDF execution time: 0.55 seconds
Speedup factor: 3.90x faster with vectorized UDF


In [0]:
# Key differences in the explain output:
# 1. "ArrowEvalPython" instead of "BatchEvalPython" - uses Apache Arrow for efficient data transfer
# 2. Fewer conversion operations than regular UDFs
# 3. Better integration with Spark's optimization engine
# Note: Both UDF types still break Photon optimization, but vectorized UDFs are much more efficient