In [None]:
from pyspark.sql import SparkSession
# Reuse the active Spark session if one exists; otherwise start a new one
# registered under this notebook's application name.
spark = (
    SparkSession.builder
    .appName("User_defined_functions")
    .getOrCreate()
)

In [None]:
# Sample rows: one (user_id, score) pair per user.
data = [("U1", 85), ("U2", 72), ("U3", 40)]

# Column names are given explicitly; column types are inferred from the tuples.
df = spark.createDataFrame(data, schema=["user_id", "score"])


In [None]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

def classify(score):
  """Map a numeric score to a performance label.

  Args:
    score: Numeric score, or None when the source value is null.

  Returns:
    "High" for scores of 80 and above, "Medium" for scores from 50 up to
    (but not including) 80, "Low" for anything smaller, and None when
    score is None.
  """
  # This function is wrapped as a Spark UDF, and Spark passes SQL NULLs to
  # Python UDFs as None. Comparing None with an int raises TypeError and
  # fails the whole job, so propagate the null instead.
  if score is None:
    return None
  if score >= 80:
    return "High"
  if score >= 50:
    return "Medium"
  return "Low"

# Wrap the plain Python classifier as a Spark UDF that produces string labels.
performance_udf = udf(classify, returnType=StringType())

# Add (or replace) the "performance" column computed from each row's score,
# then display the resulting DataFrame.
df = df.withColumn("performance", performance_udf(df["score"]))
df.show()





+-------+-----+-----------+
|user_id|score|performance|
+-------+-----+-----------+
|     U1|   85|       High|
|     U2|   72|     Medium|
|     U3|   40|        Low|
+-------+-----+-----------+



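Built-in column functions such as `when`/`otherwise` (shown below) are more efficient than Python UDFs, so prefer them whenever the logic can be expressed with built-ins; use a UDF only when the requirement cannot be met any other way.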

In [None]:
from pyspark.sql.functions import when, col

# Same High/Medium/Low banding expressed with built-in column functions.
# The conditions are evaluated in order, so the >= 80 check takes precedence
# over the >= 50 check.
score_col = col("score")
performance_label = (
    when(score_col >= 80, "High")
    .when(score_col >= 50, "Medium")
    .otherwise("Low")
)

df.withColumn("performance", performance_label).show()

+-------+-----+-----------+
|user_id|score|performance|
+-------+-----+-----------+
|     U1|   85|       High|
|     U2|   72|     Medium|
|     U3|   40|        Low|
+-------+-----+-----------+



In [None]:
# Sort by the performance label first (string order, so "Low" sorts before
# "Medium"), then by score from highest to lowest within each label.
(
    df
    .orderBy("performance", df["score"].desc())
    .show()
)

+-------+-----+-----------+
|user_id|score|performance|
+-------+-----+-----------+
|     U1|   85|       High|
|     U3|   40|        Low|
|     U2|   72|     Medium|
+-------+-----+-----------+

