In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.types import LongType

In [3]:
ss = SparkSession.builder\
.appName('UDF')\
.getOrCreate()

ss

24/12/11 16:09:22 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [1]:
datas = [
    ("A", "2022-04-16", 31200),
    ("B", "2022-04-17", 41200),
    ("C", "2022-04-11", 31500),
    ("D", "2022-04-12", 21500),
    ("E", "2022-04-13", 51000)
]
columns = ["product", "date", "price"]

In [4]:
df = ss.createDataFrame(data=datas, schema = columns)
df.show()

[Stage 0:>                                                          (0 + 1) / 1]

+-------+----------+-----+
|product|      date|price|
+-------+----------+-----+
|      A|2022-04-16|31200|
|      B|2022-04-17|41200|
|      C|2022-04-11|31500|
|      D|2022-04-12|21500|
|      E|2022-04-13|51000|
+-------+----------+-----+



                                                                                

### UDF(User Defined Function)
- 쿼리문에서 사용하는 사용자 정의 함수

In [15]:
def squared(n):
    return n*n #함수를 정의하면 return이 집합으로 반환함

In [17]:
ss.udf.register('udf_squared', squared, LongType())

<function __main__.squared(n)>

In [11]:
df.createOrReplaceTempView('datas')

In [44]:
ss.sql('''
    SELECT
        udf_squared(price) as udf_result
    FROM datas
''').show()

ss.sql('''
    SELECT
        format_number(udf_squared(price),0) as udf_result
    FROM datas
''').show()

+----------+
|udf_result|
+----------+
| 973440000|
|1697440000|
| 992250000|
| 462250000|
|2601000000|
+----------+

+-------------+
|   udf_result|
+-------------+
|  973,440,000|
|1,697,440,000|
|  992,250,000|
|  462,250,000|
|2,601,000,000|
+-------------+



In [47]:
def read_number(n):
    units = ['','십 ', '백 ', '천 ', '만 ', '억 ', '조 ', '경 ','해 ', '자 ', '양 ', '구 ', '간 ', '정 ', '재 ', '극 ', '항하사 ', '아승기 ', '나유타 ', '불가사의 ', '무량수/무량대수 ']
    nums = '일이삼사오육칠팔구'
    result = []
    i = 0
    while n>0:
        n,r = divmod(n,10)
        if r > 0:
            result.append(nums[r-1]+units[i])
        i += 1
    return ''.join(reversed(result))

In [48]:
read_number(123121231231323123)

'일아승기 이항하사 삼극 일재 이정 일간 이구 삼양 일자 이해 삼경 일조 삼억 이만 삼천 일백 이십 삼'

In [39]:
ss.udf.register('udf_read_number', read_number)

<function __main__.read_number(n)>

In [41]:
ss.sql('''
    select
        udf_read_number(price)
    from datas        
''').show()

+----------------------+
|udf_read_number(price)|
+----------------------+
|          삼만일천이백|
|          사만일천이백|
|          삼만일천오백|
|          이만일천오백|
|              오만일천|
+----------------------+



In [49]:
ss.stop()