In [48]:
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

spark = SparkSession.builder.appName('UDF_Practice').getOrCreate()

In [77]:
help(udf)

Help on function udf in module pyspark.sql.functions:

udf(f: Union[Callable[..., Any], ForwardRef('DataTypeOrString'), NoneType] = None, returnType: 'DataTypeOrString' = StringType(), *, useArrow: Optional[bool] = None) -> Union[ForwardRef('UserDefinedFunctionLike'), Callable[[Callable[..., Any]], ForwardRef('UserDefinedFunctionLike')]]
    Creates a user defined function (UDF).
    
    .. versionadded:: 1.3.0
    
    .. versionchanged:: 3.4.0
        Supports Spark Connect.
    
    Parameters
    ----------
    f : function
        python function if used as a standalone function
    returnType : :class:`pyspark.sql.types.DataType` or str
        the return type of the user-defined function. The value can be either a
        :class:`pyspark.sql.types.DataType` object or a DDL-formatted type string.
    useArrow : bool or None
        whether to use Arrow to optimize the (de)serialization. When it is None, the
        Spark config "spark.sql.execution.pythonUDF.arrow.enabled" takes

In [67]:
columns = ["Seqno","Name"]
data = [("1", "john jones"),
    ("2", "tracey smith"),
    ("3", "amy sanders")]

df = spark.createDataFrame(data=data,schema=columns)

df.show(truncate=False)

+-----+------------+
|Seqno|Name        |
+-----+------------+
|1    |john jones  |
|2    |tracey smith|
|3    |amy sanders |
+-----+------------+



In [68]:
def convertCase(str):
    resStr = ''
    arr = str.split(' ')
    for x in arr:
        resStr = resStr + x[0:1].upper()+x[1:len(x)]+' '
    return resStr

In [69]:
convertUDF = udf(lambda z:convertCase(z))

In [72]:
df.select(col('Seqno'), convertUDF(col('Name')).alias('Name')).show()

+-----+-------------+
|Seqno|         Name|
+-----+-------------+
|    1|  John Jones |
|    2|Tracey Smith |
|    3| Amy Sanders |
+-----+-------------+



In [53]:
def upperCase(str):
    return str.upper()

upperCaseUDF = udf(lambda z: upperCase(z))

In [54]:
df.withColumn('curated Name', upperCaseUDF(col('Name'))).show()

+-----+------------+------------+
|Seqno|        Name|curated Name|
+-----+------------+------------+
|    1|  john jones|  JOHN JONES|
|    2|tracey smith|TRACEY SMITH|
|    3| amy sanders| AMY SANDERS|
+-----+------------+------------+



In [55]:
spark.udf.register('convertUDFSQL', convertCase)
df.createOrReplaceTempView('name_table')
spark.sql('select seqno, convertUDFSQL(name) as name from name_table').show()

24/01/27 19:23:27 WARN SimpleFunctionRegistry: The function convertudfsql replaced a previously registered function.


+-----+-------------+
|seqno|         name|
+-----+-------------+
|    1|  John Jones |
|    2|Tracey Smith |
|    3| Amy Sanders |
+-----+-------------+



In [56]:
df.select('name', convertUDF(col('name')).alias('name')).show()

+------------+-------------+
|        name|         name|
+------------+-------------+
|  john jones|  John Jones |
|tracey smith|Tracey Smith |
| amy sanders| Amy Sanders |
+------------+-------------+



In [57]:
@udf
def upperC(str):
    return str.upper() if str is not None else ' '

df.withColumn('Curated name', upperC(col('name'))).show()

+-----+------------+------------+
|Seqno|        Name|Curated name|
+-----+------------+------------+
|    1|  john jones|  JOHN JONES|
|    2|tracey smith|TRACEY SMITH|
|    3| amy sanders| AMY SANDERS|
+-----+------------+------------+



In [58]:

df.withColumn('Curated name', upperC(col('name'))).show()

+-----+------------+------------+
|Seqno|        Name|Curated name|
+-----+------------+------------+
|    1|  john jones|  JOHN JONES|
|    2|tracey smith|TRACEY SMITH|
|    3| amy sanders| AMY SANDERS|
+-----+------------+------------+



In [59]:
data = [(1,), (2,), (3,), (4,), (5,), (6,), (7,), (8,), (9,), (10,)]
columns = ["numbers"]
df = spark.createDataFrame(data, columns)

# Define a filter condition using boolean expressions
filtered_df = df.filter((col("numbers") % 2 == 0) & (col("numbers") > 5))

# Show the result
filtered_df.show()

+-------+
|numbers|
+-------+
|      6|
|      8|
|     10|
+-------+



In [38]:
x = 't'
print(x.upper())

T


In [73]:
""" null check """

columns = ["Seqno","Name"]
data = [("1", "john jones"),
    ("2", "tracey smith"),
    ("3", "amy sanders"),
    ('4',None)]

df2 = spark.createDataFrame(data=data,schema=columns)
df2.show(truncate=False)
df2.createOrReplaceTempView("NAME_TABLE2")
spark.udf.register('convertUDFS',convertCase)
spark.sql("select convertUDFS(Name) from NAME_TABLE2") \
     .show(truncate=False)

+-----+------------+
|Seqno|Name        |
+-----+------------+
|1    |john jones  |
|2    |tracey smith|
|3    |amy sanders |
|4    |NULL        |
+-----+------------+



24/01/27 19:29:57 ERROR Executor: Exception in task 10.0 in stage 101.0 (TID 543)
org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/tmp/ipykernel_4665/1347373182.py", line 3, in convertCase
AttributeError: 'NoneType' object has no attribute 'split'

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:572)
	at org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:94)
	at org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:75)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:525)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:491)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spar

PythonException: 
  An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
  File "/tmp/ipykernel_4665/1347373182.py", line 3, in convertCase
AttributeError: 'NoneType' object has no attribute 'split'


In [74]:
spark.udf.register("_nullsafeUDF", lambda str: convertCase(str) if not str is None else "" , StringType())

<function __main__.<lambda>(str)>

In [76]:
spark.sql("select _nullsafeUDF(Name) from NAME_TABLE2") \
     .show(truncate=False)

+------------------+
|_nullsafeUDF(Name)|
+------------------+
|John Jones        |
|Tracey Smith      |
|Amy Sanders       |
|                  |
+------------------+



In [78]:
from pyspark.sql.types import IntegerType
import random
random_udf = udf(lambda: int(random.random() * 100), IntegerType()).asNondeterministic()
