#### Spark Session 생성

In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkConf

# Cap the memory available to the driver and to each executor.

spark = (
    SparkSession.builder
    .appName("Python Spark UDF")
    .master("local[2]")
    .config("spark.driver.memory", "2g")
    .config("spark.executor.memory", "2g")
    .getOrCreate()
)

#### DataFrame/SQL에 UDF 사용해보기 #1

In [2]:
# Three sample rows of (Seqno, Name) used by the upper-casing UDF examples.
data = [
    (1, "john jones"),
    (2, "tracey smith"),
    (3, "amy sanders"),
]
columns = ["Seqno", "Name"]

df = spark.createDataFrame(data, schema=columns)

df.show(truncate=False)

+-----+------------+
|Seqno|Name        |
+-----+------------+
|1    |john jones  |
|2    |tracey smith|
|3    |amy sanders |
+-----+------------+



In [3]:
import pyspark.sql.functions as F
from pyspark.sql.types import *

# Wrap an inline lambda as a UDF. StringType is what F.udf uses when no
# return type is given; it is spelled out here for clarity.
upperUDF = F.udf(lambda z: z.upper(), returnType=StringType())

(
    df
    .withColumn("Curated Name", upperUDF(F.col("Name")))
    .show(truncate=False)
)

+-----+------------+------------+
|Seqno|Name        |Curated Name|
+-----+------------+------------+
|1    |john jones  |JOHN JONES  |
|2    |tracey smith|TRACEY SMITH|
|3    |amy sanders |AMY SANDERS |
+-----+------------+------------+



In [4]:
def upper_udf(s):
    """Return ``s`` converted to upper case.

    A NULL cell is handed to a Python UDF as ``None``; it is passed through
    unchanged so that one missing value does not fail the whole job with an
    ``AttributeError``.

    Args:
        s: The string to convert, or ``None``.

    Returns:
        The upper-cased string, or ``None`` when ``s`` is ``None``.
    """
    if s is None:
        return None
    return s.upper()

In [5]:
# Re-bind upperUDF to the named function, with an explicit string return type.
upperUDF = F.udf(upper_udf, returnType=StringType())

(
    df
    .withColumn("Curated Name", upperUDF(F.col("Name")))
    .show(truncate=False)
)

+-----+------------+------------+
|Seqno|Name        |Curated Name|
+-----+------------+------------+
|1    |john jones  |JOHN JONES  |
|2    |tracey smith|TRACEY SMITH|
|3    |amy sanders |AMY SANDERS |
+-----+------------+------------+



In [6]:
# The same UDF used inside select(); alias() names the computed column.
df.select(
    F.col("Name"),
    upperUDF(F.col("Name")).alias("Curated Name"),
).show()

+------------+------------+
|        Name|Curated Name|
+------------+------------+
|  john jones|  JOHN JONES|
|tracey smith|TRACEY SMITH|
| amy sanders| AMY SANDERS|
+------------+------------+



In [7]:
from pyspark.sql.functions import pandas_udf
import pandas as pd

@pandas_udf(StringType())
def upper_udf_f(s: pd.Series) -> pd.Series:
    """Upper-case a whole pandas Series of strings via the ``.str`` accessor."""
    uppercased = s.str.upper()
    return uppercased

In [8]:
# Register the pandas UDF under the SQL name "upper_udf"; the returned handle
# is kept so the same function can also be applied to DataFrame columns.
upperUDF = spark.udf.register(name="upper_udf", f=upper_udf_f)
spark.sql("SELECT upper_udf('aBcD')").show()

+---------------+
|upper_udf(aBcD)|
+---------------+
|           ABCD|
+---------------+



In [9]:
# Apply the registered UDF handle through the DataFrame API.
df.select(
    "name",
    upperUDF("name"),
).show()

+------------+---------------+
|        name|upper_udf(name)|
+------------+---------------+
|  john jones|     JOHN JONES|
|tracey smith|   TRACEY SMITH|
| amy sanders|    AMY SANDERS|
+------------+---------------+



In [10]:
# Expose df to Spark SQL as the temp view "test", then call the registered
# UDF by its SQL name and label the result column `Curated Name`.
df.createOrReplaceTempView("test")
spark.sql("""
    SELECT name, upper_udf(name) `Curated Name` FROM test
""").show()

+------------+------------+
|        name|Curated Name|
+------------+------------+
|  john jones|  JOHN JONES|
|tracey smith|TRACEY SMITH|
| amy sanders| AMY SANDERS|
+------------+------------+



#### DataFrame/SQL에 UDF 사용해보기 #2

In [11]:
# Two sample rows with integer columns "a" and "b".
data = [
    {"a": 1, "b": 2},
    {"a": 5, "b": 5},
]

df = spark.createDataFrame(data)
# F.udf falls back to StringType when no return type is given, which would
# make "c" a string column. Declare LongType so the sum stays numeric.
df.withColumn("c", F.udf(lambda x, y: x + y, LongType())("a", "b")).show()

+---+---+---+
|  a|  b|  c|
+---+---+---+
|  1|  2|  3|
|  5|  5| 10|
+---+---+---+



In [12]:
def plus(x, y):
    """Return ``x + y``, propagating ``None`` the way SQL propagates NULL.

    A NULL cell is handed to a Python UDF as ``None``; adding it would raise
    ``TypeError`` and fail the job, so a missing operand yields ``None``.

    Args:
        x: Left operand, or ``None``.
        y: Right operand, or ``None``.

    Returns:
        ``x + y``, or ``None`` when either operand is ``None``.
    """
    if x is None or y is None:
        return None
    return x + y

# Register `plus` for SQL use. Without an explicit return type the registered
# UDF returns strings, so declare LongType to keep the sum numeric.
plusUDF = spark.udf.register("plus", plus, LongType())
spark.sql("SELECT plus(1, 2) sum").show()

+---+
|sum|
+---+
|  3|
+---+



In [13]:
# Use the handle returned by register() directly on DataFrame columns.
(
    df
    .withColumn("p", plusUDF(F.col("a"), F.col("b")))
    .show()
)

+---+---+---+
|  a|  b|  p|
+---+---+---+
|  1|  2|  3|
|  5|  5| 10|
+---+---+---+



In [14]:
# Re-point the temp view "test" at the current df (columns a, b), then call
# the registered `plus` UDF from SQL.
df.createOrReplaceTempView("test")
spark.sql("SELECT a, b, plus(a, b) sum FROM test").show()

+---+---+---+
|  a|  b|sum|
+---+---+---+
|  1|  2|  3|
|  5|  5| 10|
+---+---+---+



#### DataFrame에서 UDAF 사용해보기

In [15]:
from pyspark.sql.functions import pandas_udf
import pandas as pd

@pandas_udf(FloatType())
def average_udf_f(v: pd.Series) -> float:
    """Reduce a pandas Series to its mean; the result is declared as FloatType."""
    mean_value = v.mean()
    return mean_value

# Register the aggregating pandas UDF under the SQL name "average_udf" and
# try it on column b of the "test" view.
averageUDF = spark.udf.register(name='average_udf', f=average_udf_f)
spark.sql('SELECT average_udf(b) FROM test').show()

+--------------+
|average_udf(b)|
+--------------+
|           3.5|
+--------------+



In [10]:
# Use the registered aggregate UDF through the DataFrame API.
# NOTE(review): the alias "count" labels an average (3.5 for b = 2, 5);
# a name such as "average" looks more accurate. Confirm before renaming.
df.agg(averageUDF("b").alias("count")).show()

+-----+
|count|
+-----+
|  3.5|
+-----+



#### DataFrame에 explode 사용해보기

In [16]:
# Sample rows whose array/map columns cover the interesting cases: fully
# populated, containing None, containing empty strings, entirely None, and
# an empty map.
arrayData = [
    ('James', ['Java', 'Scala'], {'hair': 'black', 'eye': 'brown'}),
    ('Michael', ['Spark', 'Java', None], {'hair': 'brown', 'eye': None}),
    ('Robert', ['CSharp', ''], {'hair': 'red', 'eye': ''}),
    ('Washington', None, None),
    ('Jefferson', ['1', '2'], {}),
]

df = spark.createDataFrame(
    arrayData,
    schema=['name', 'knownLanguages', 'properties'],
)
df.show()

+----------+-------------------+--------------------+
|      name|     knownLanguages|          properties|
+----------+-------------------+--------------------+
|     James|      [Java, Scala]|{eye -> brown, ha...|
|   Michael|[Spark, Java, NULL]|{eye -> NULL, hai...|
|    Robert|         [CSharp, ]|{eye -> , hair ->...|
|Washington|               NULL|                NULL|
| Jefferson|             [1, 2]|                  {}|
+----------+-------------------+--------------------+



In [17]:
from pyspark.sql.functions import explode
# explode() emits one row per array element (in a column named "col"); the
# row whose array is NULL yields no output rows at all.
df2 = df.select(df["name"], explode(df["knownLanguages"]))
df2.printSchema()
df2.show()

root
 |-- name: string (nullable = true)
 |-- col: string (nullable = true)

+---------+------+
|     name|   col|
+---------+------+
|    James|  Java|
|    James| Scala|
|  Michael| Spark|
|  Michael|  Java|
|  Michael|  NULL|
|   Robert|CSharp|
|   Robert|      |
|Jefferson|     1|
|Jefferson|     2|
+---------+------+



#### 하나의 레코드에서 다수의 레코드를 만들어내는 예제(Order to 1+Items)
데이터 경로지정

In [18]:
import os
# Move the process working directory to the shared data folder.
# NOTE(review): the path is relative to the current working directory, so
# running this cell a second time would presumably move up again or fail.
# Run it once per session, or confirm an absolute path is not preferable.
os.chdir("../../data")

In [20]:
# Capture the current working directory; it is used as the base path when
# reading orders.csv.
path = os.getcwd()

#### Spark로 해보기

In [21]:
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, StringType, ArrayType

# Load the tab-separated orders file, taking column names from its header row.
order = (
    spark.read
    .options(delimiter='\t', header="true")
    .csv(f"{path}/orders.csv")
)

In [None]:
# Display the loaded orders.
order.show()

In [22]:
# Both columns are read as strings; `items` still holds raw JSON text.
order.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- items: string (nullable = true)



In [23]:
# Schema of the JSON held in `items`: an array of {name, id, quantity} objects.
struct = ArrayType(
    StructType()
    .add("name", StringType())
    .add("id", StringType())
    .add("quantity", LongType())
)

In [None]:
# Parse the JSON text in `items` with the schema above and emit one row per
# array element, without keeping the result.
(
    order
    .withColumn("item", explode(from_json("items", struct)))
    .show(truncate=False)
)

In [24]:
# One row per ordered item: parse `items`, explode the resulting array into
# `item`, then drop the raw JSON column.
order_items = (
    order
    .withColumn("item", explode(from_json("items", struct)))
    .drop("items")
)

In [25]:
# Show the first 5 exploded order-item rows.
order_items.show(5)

+------------+--------------------+
|    order_id|                item|
+------------+--------------------+
|860196503764|{DAILY SPF, 18837...|
|860292645076|{DAILY SPF — Bund...|
|860320956628|{DAILY SPF, 18839...|
|860321513684|{DAILY SPF, 18839...|
|862930665684|{DAILY SPF, 18879...|
+------------+--------------------+
only showing top 5 rows



In [26]:
# Check that `item` is now a struct with name, id and quantity fields.
order_items.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- item: struct (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- id: string (nullable = true)
 |    |-- quantity: long (nullable = true)



In [27]:
# Expose the exploded rows to Spark SQL as the temp view "order_items".
order_items.createOrReplaceTempView("order_items")

In [28]:
# Average item quantity per order, computed with the registered `average_udf`
# aggregate and listed from highest to lowest (top 5 shown).
# NOTE(review): CAST(... as decimal) gives no precision/scale; presumably
# Spark's default decimal has scale 0, so fractional averages would lose
# their decimals. Confirm that is intended.
spark.sql("""
SELECT order_id, CAST(average_udf(item.quantity) as decimal) avg_count
FROM order_items
GROUP BY 1
ORDER BY 2 DESC
""").show(5)

+-------------+---------+
|     order_id|avg_count|
+-------------+---------+
|1816674631892|      500|
|1821860430036|      300|
|2186043064532|      208|
|2118824558804|      200|
|2143034474708|      200|
+-------------+---------+
only showing top 5 rows



In [30]:
# Inspect the individual item quantities of a single order by its order_id.
spark.sql("SELECT item.quantity FROM order_items WHERE order_id = '1816674631892'").show()

+--------+
|quantity|
+--------+
|     500|
+--------+



In [31]:
# List the tables and temp views visible through the session catalog.
spark.catalog.listTables()

[Table(name='order_items', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='test', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True)]

In [32]:
# Print the name of every function the catalog knows about, one per line.
for fn in spark.catalog.listFunctions():
    print(fn.name)

!
!=
%
&
*
+
-
/
<
<=
<=>
<>
=
==
>
>=
^
abs
acos
acosh
add_months
aes_decrypt
aes_encrypt
aggregate
and
any
any_value
approx_count_distinct
approx_percentile
array
array_agg
array_append
array_compact
array_contains
array_distinct
array_except
array_insert
array_intersect
array_join
array_max
array_min
array_position
array_prepend
array_remove
array_repeat
array_size
array_sort
array_union
arrays_overlap
arrays_zip
ascii
asin
asinh
assert_true
atan
atan2
atanh
avg
base64
between
bigint
bin
binary
bit_and
bit_count
bit_get
bit_length
bit_or
bit_xor
bitmap_bit_position
bitmap_bucket_number
bitmap_construct_agg
bitmap_count
bitmap_or_agg
bool_and
bool_or
boolean
bround
btrim
cardinality
case
cast
cbrt
ceil
ceiling
char
char_length
character_length
chr
coalesce
collect_list
collect_set
concat
concat_ws
contains
conv
convert_timezone
corr
cos
cosh
cot
count
count_if
count_min_sketch
covar_pop
covar_samp
crc32
csc
cume_dist
curdate
current_catalog
current_database
current_date
current_schem