In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import when, col

# Create (or reuse) the Spark session shared by every cell in this notebook.
spark = (
    SparkSession.builder
    .appName("UserDefinedFunc")
    .getOrCreate()
)

In [3]:
# Sample rows as (user_id, score) tuples.
data = [("U1", 85), ("U2", 72), ("U3", 40)]

# Build the base DataFrame; column names are given explicitly as the schema.
df = spark.createDataFrame(data, schema=["user_id", "score"])

In [10]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

def classify(score):
    """Map a numeric score to a performance label.

    Args:
        score: Numeric score, or None for a missing value.

    Returns:
        "High" when score >= 80, "Medium" when 50 <= score < 80,
        "Low" for anything below 50, and None when score is None.
    """
    # When used as a Spark UDF, SQL NULLs arrive as None; comparing None with
    # a number raises TypeError, so propagate the null instead of failing.
    if score is None:
        return None
    if score >= 80:
        return "High"
    if score >= 50:
        return "Medium"
    return "Low"

# Wrap the plain-Python classifier as a Spark UDF producing a string column.
performance_udf = udf(classify, returnType=StringType())

# Reassign df so the new "performance" column is available to later cells.
df = df.withColumn("performance", performance_udf(col("score")))
df.show()

+-------+-----+-----------+
|user_id|score|performance|
+-------+-----+-----------+
|     U1|   85|       High|
|     U2|   72|     Medium|
|     U3|   40|        Low|
+-------+-----+-----------+



In [5]:
# Same classification as the UDF cell, expressed with native Spark column
# functions (when/otherwise) instead of Python code.
score_col = col("score")
performance_expr = (
    when(score_col >= 80, "High")
    .when((score_col >= 50) & (score_col < 80), "Medium")
    .otherwise("Low")
)

df.withColumn("performance", performance_expr).show()

+-------+-----+-----------+
|user_id|score|performance|
+-------+-----+-----------+
|     U1|   85|       High|
|     U2|   72|     Medium|
|     U3|   40|        Low|
+-------+-----+-----------+



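Prefer Spark's native column functions (e.g. `when`/`otherwise`) over a Python UDF. Python UDFs are slow because each row has to be passed between the JVM and a Python worker process. TODO: read more about UDF performance.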

In [7]:
# Sort rows by score, lowest first (ascending is the default order).
df.orderBy(col("score").asc()).show()

+-------+-----+
|user_id|score|
+-------+-----+
|     U3|   40|
|     U2|   72|
|     U1|   85|
+-------+-----+



In [8]:
# Sort rows by score, highest first.
df.orderBy(col("score").desc()).show()

+-------+-----+
|user_id|score|
+-------+-----+
|     U1|   85|
|     U2|   72|
|     U3|   40|
+-------+-----+



In [11]:
# Multi-key sort: "performance" ascending first, then score descending.
# "performance" holds strings, so it sorts alphabetically (High, Low, Medium).
#
# An earlier run of this cell failed because the "performance" column had been
# added with withColumn() without assigning the result back to df. Assigning
# df = df.withColumn(...) in the UDF cell makes the column available here.
df.orderBy(col("performance").asc(), col("score").desc()).show()


+-------+-----+-----------+
|user_id|score|performance|
+-------+-----+-----------+
|     U1|   85|       High|
|     U3|   40|        Low|
|     U2|   72|     Medium|
+-------+-----+-----------+

