In [1]:
# Step 1: Setup Spark session with safe temp directory
import os
from pyspark.sql import SparkSession

# Scratch and warehouse locations, defined once so the environment variable,
# the directory creation, and the Spark config below cannot drift apart.
SPARK_LOCAL_DIR = "/tmp/spark-temp"
SPARK_WAREHOUSE_DIR = "/tmp/spark-warehouse"

# Make sure the scratch directory exists before Spark is pointed at it.
os.makedirs(SPARK_LOCAL_DIR, exist_ok=True)

# Use a writable local directory to avoid read-only errors.
os.environ["SPARK_LOCAL_DIRS"] = SPARK_LOCAL_DIR

# Create (or reuse) the Spark session.
# NOTE: Spark logs a WARN that spark.local.dir is overridden by
# SPARK_LOCAL_DIRS. Both are set to the same directory here, so whichever
# one Spark honours, scratch files land in SPARK_LOCAL_DIR.
spark = (
    SparkSession.builder
    .appName("UDF Example")
    .master("local[*]")
    .config("spark.local.dir", SPARK_LOCAL_DIR)
    .config("spark.sql.warehouse.dir", SPARK_WAREHOUSE_DIR)
    .getOrCreate()
)

print("Spark started:", spark.version)


Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/25 07:19:37 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/05/25 07:19:37 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in standalone/kubernetes and LOCAL_DIRS in YARN).


Spark started: 4.0.0


In [2]:
# Step 2: Build a small single-column DataFrame to run the UDF against
# Each one-element tuple is a row; the list of names passed as `schema`
# labels the single column "id".
data = [(value,) for value in (1, 2, 3)]
df = spark.createDataFrame(data, schema=["id"])
df.show()


                                                                                

+---+
| id|
+---+
|  1|
|  2|
|  3|
+---+



In [3]:
# Step 3: Define and apply a UDF
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

def square(x):
    """Return ``x`` multiplied by itself, or ``None`` when ``x`` is ``None``.

    The function is wrapped as a Spark UDF below. A Python UDF receives
    SQL NULL values as ``None``, and ``None * None`` raises ``TypeError``,
    which would fail the whole job. Returning ``None`` instead lets a NULL
    input produce a NULL output.

    Args:
        x: A number, or ``None`` for a missing value.

    Returns:
        ``x * x``, or ``None`` if ``x`` is ``None``.
    """
    if x is None:
        return None
    return x * x

# Wrap the plain Python function as a Spark UDF that declares an integer result.
squared_udf = udf(square, returnType=IntegerType())

# Build the UDF column expression first, then attach it to the DataFrame
# under the name "id_squared" and display the result.
id_squared_col = squared_udf(df.id)
df_result = df.withColumn("id_squared", id_squared_col)
df_result.show()

+---+----------+
| id|id_squared|
+---+----------+
|  1|         1|
|  2|         4|
|  3|         9|
+---+----------+



In [4]:
# Step 4: Stop the Spark session created in Step 1
# Run this last: the DataFrame cells above all rely on the `spark` session.
spark.stop()