In [1]:
from pyspark.sql import SparkSession

In [2]:
# Build the SparkSession for this notebook (getOrCreate reuses a session if one already exists)
session_builder = SparkSession.builder.appName("SparkMllibExampleApp")
spark = session_builder.getOrCreate()

23/05/14 17:22:35 WARN Utils: Your hostname, choeyunseoui-MacBookAir.local resolves to a loopback address: 127.0.0.1; using 192.168.0.10 instead (on interface en0)
23/05/14 17:22:35 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/05/14 17:22:36 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### 스파크 설정 변경하기

In [3]:
# Check whether this setting can be changed at runtime in the current session
spark.conf.isModifiable(key="spark.sql.shuffle.partitions")

True

In [4]:
# Read the current number of shuffle partitions (shown before it is changed)
spark.conf.get(key="spark.sql.shuffle.partitions")

'200'

In [5]:
# Set the number of shuffle partitions to 5 for this session
spark.conf.set(key="spark.sql.shuffle.partitions", value=5)

In [6]:
# Read the setting back to confirm the new value
spark.conf.get(key="spark.sql.shuffle.partitions")


'5'

### 캐싱과 영속화

- cache()

In [13]:
from pyspark.sql.functions import col
import time

In [8]:
# Ten million ids plus a derived "square" column (id * id)
df = (
    spark.range(10_000_000)
    .toDF("id")
    .withColumn("square", col("id") * col("id"))
)

In [9]:
# Mark df for caching. cache() is lazy, so the first action below is what
# actually computes the rows and fills the cache.
df.cache()

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() follows the wall clock and can jump if it is adjusted.
start = time.perf_counter()
df.count()
end = time.perf_counter()
print(end - start)

[Stage 0:>                                                          (0 + 8) / 8]

10.67982292175293


                                                                                

In [10]:
# Time the same action again to compare against the first, cache-filling run.
# perf_counter() is monotonic, so the difference is a reliable elapsed time.
start = time.perf_counter()
df.count()
end = time.perf_counter()
print(end - start)

0.7317070960998535


- persist()

In [24]:
from pyspark.storagelevel import StorageLevel

In [33]:
# Release the cache entry left over from the cache() demo before rebuilding df.
# Spark keys its cache on the query plan and ignores a persist() request for a
# plan that is already cached, so on a top-to-bottom run the DISK_ONLY level
# requested in the next cell would otherwise never take effect.
df.unpersist()

# Rebuild the same DataFrame: ten million ids plus a "square" column (id * id)
df = spark.range(1 * 10000000).toDF("id").withColumn("square", col("id") * col("id"))

In [34]:
# Request disk-only storage for df; the first action below materializes it.
df.persist(StorageLevel.DISK_ONLY)

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() follows the wall clock and can jump if it is adjusted.
start = time.perf_counter()
df.count()
end = time.perf_counter()
print(end - start)

[Stage 18:>                                                         (0 + 8) / 8]

3.6994738578796387


                                                                                

In [35]:
# Time the same action again to compare against the first, persist-filling run.
# perf_counter() is monotonic, so the difference is a reliable elapsed time.
start = time.perf_counter()
df.count()
end = time.perf_counter()
print(end - start)

0.4061131477355957


In [36]:
# Drop the persisted data for df without waiting for the blocks to be removed
df.unpersist(blocking=False)

DataFrame[id: bigint, square: bigint]

### Shuffle Sort Merge Join

In [3]:
import random

In [4]:
# Turn off automatic broadcast joins (threshold -1) so the joins below use a sort-merge join
spark.conf.set(key="spark.sql.autoBroadcastJoinThreshold", value="-1")

In [5]:
# Lookup tables keyed 0..5, used to pick a random state / item for each generated row
states_dict = dict(enumerate(["AZ", "CO", "CA", "TX", "NY", "MI"]))
items_dict = {index: "SKU-" + str(index) for index in range(6)}

In [7]:
# Build usersDF: one row per generated id, with a login, an e-mail address and a random state
def _make_user_row(row):
    """Turn one spark.range row into a (uid, login, email, user_state) tuple."""
    uid = str(row[0])
    login = "user_" + uid
    return (uid, login, login + "@databricks.com", states_dict[random.choice(range(6))])


usersDF = (
    spark.range(1 * 10000000)
    .rdd
    .map(_make_user_row)
    .toDF(["uid", "login", "email", "user_state"])
)

                                                                                

In [8]:
# Build ordersDF: one order per generated id, assigned to a random user id in 0..10000
def _make_order_row(row):
    """Turn one spark.range row into an order tuple in the column order used below."""
    order_id = row[0]
    return (
        order_id,                               # transaction_id
        order_id,                               # quantity
        str(random.choice(range(10001))),       # users_id
        10 * order_id * 0.2,                    # amount
        states_dict[random.choice(range(6))],   # state
        items_dict[random.choice(range(6))],    # items
    )


order_columns = ["transaction_id", "quantity", "users_id", "amount", "state", "items"]
ordersDF = spark.range(1 * 10000000).rdd.map(_make_order_row).toDF(order_columns)
#ordersDF.show(6)

In [9]:
# Inner-join every order to its user: orders.users_id must equal users.uid
orders_to_users = ordersDF["users_id"] == usersDF["uid"]
usersOrdersDF = ordersDF.join(usersDF, on=orders_to_users, how="inner")

In [10]:
# Print the first 20 joined rows, truncating long cell values
usersOrdersDF.show(n=20, truncate=True)

                                                                                

+--------------+--------+--------+--------+-----+-----+----+---------+--------------------+----------+
|transaction_id|quantity|users_id|  amount|state|items| uid|    login|               email|user_state|
+--------------+--------+--------+--------+-----+-----+----+---------+--------------------+----------+
|         11549|   11549|    1008| 23098.0|   NY|SKU-4|1008|user_1008|user_1008@databri...|        NY|
|         14093|   14093|    1008| 28186.0|   CA|SKU-2|1008|user_1008|user_1008@databri...|        NY|
|         23123|   23123|    1008| 46246.0|   MI|SKU-2|1008|user_1008|user_1008@databri...|        NY|
|         29264|   29264|    1008| 58528.0|   CA|SKU-3|1008|user_1008|user_1008@databri...|        NY|
|         47249|   47249|    1008| 94498.0|   CO|SKU-4|1008|user_1008|user_1008@databri...|        NY|
|         67306|   67306|    1008|134612.0|   MI|SKU-0|1008|user_1008|user_1008@databri...|        NY|
|         78767|   78767|    1008|157534.0|   AZ|SKU-0|1008|user_1008|use

In [11]:
# Print the physical plan only, to see which join strategy was chosen
usersOrdersDF.explain(extended=False)

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- SortMergeJoin [users_id#16], [uid#4], Inner
   :- Sort [users_id#16 ASC NULLS FIRST], false, 0
   :  +- Exchange hashpartitioning(users_id#16, 200), ENSURE_REQUIREMENTS, [plan_id=148]
   :     +- Filter isnotnull(users_id#16)
   :        +- Scan ExistingRDD[transaction_id#14L,quantity#15L,users_id#16,amount#17,state#18,items#19]
   +- Sort [uid#4 ASC NULLS FIRST], false, 0
      +- Exchange hashpartitioning(uid#4, 200), ENSURE_REQUIREMENTS, [plan_id=149]
         +- Filter isnotnull(uid#4)
            +- Scan ExistingRDD[uid#4,login#5,email#6,user_state#7]




### bucketing을 적용한 SMJ

In [31]:
# Save usersDF as a Parquet table split into 8 buckets on uid (rows ordered by uid first)
users_sorted = usersDF.orderBy(col("uid").asc())
(
    users_sorted.write
    .format("parquet")
    .bucketBy(8, "uid")
    .mode("overwrite")
    .saveAsTable("UsersTbl")
)

# Save ordersDF the same way, bucketed on the join key users_id
orders_sorted = ordersDF.orderBy(col("users_id").asc())
(
    orders_sorted.write
    .format("parquet")
    .bucketBy(8, "users_id")
    .mode("overwrite")
    .saveAsTable("OrdersTbl")
)

# Cache both tables
spark.sql("CACHE TABLE UsersTbl")
spark.sql("CACHE TABLE OrdersTbl")

# Read the bucketed tables back as DataFrames
usersBucketDF = spark.table("UsersTbl")
ordersBucketDF = spark.table("OrdersTbl")

# Join the bucketed tables on the bucketing columns
bucket_join_condition = ordersBucketDF["users_id"] == usersBucketDF["uid"]
joinUsersOrdersBucketDF = ordersBucketDF.join(usersBucketDF, bucket_join_condition)

[Stage 11:>                                                         (0 + 8) / 9]

23/05/14 17:37:56 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers
23/05/14 17:37:56 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 84.47% for 8 writers
23/05/14 17:38:00 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers
23/05/14 17:38:00 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers
23/05/14 17:38:00 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 84.47% for 8 writers
23/05/14 17:38:02 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers
23/05/14 17:38:02 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014

[Stage 15:>                                                         (0 + 8) / 9]

23/05/14 17:39:32 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers
23/05/14 17:39:32 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 84.47% for 8 writers
23/05/14 17:39:35 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers
23/05/14 17:39:35 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers
23/05/14 17:39:35 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 84.47% for 8 writers
23/05/14 17:39:36 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers
23/05/14 17:39:37 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014

[Stage 16:>                                                         (0 + 8) / 8]

23/05/14 17:40:09 WARN MemoryStore: Not enough space to cache rdd_60_1 in memory! (computed 37.0 MiB so far)
23/05/14 17:40:09 WARN MemoryStore: Not enough space to cache rdd_60_6 in memory! (computed 37.0 MiB so far)
23/05/14 17:40:09 WARN MemoryStore: Not enough space to cache rdd_60_4 in memory! (computed 37.0 MiB so far)
23/05/14 17:40:09 WARN BlockManager: Persisting block rdd_60_6 to disk instead.
23/05/14 17:40:09 WARN BlockManager: Persisting block rdd_60_4 to disk instead.
23/05/14 17:40:09 WARN BlockManager: Persisting block rdd_60_1 to disk instead.
23/05/14 17:40:22 WARN MemoryStore: Not enough space to cache rdd_60_7 in memory! (computed 64.4 MiB so far)
23/05/14 17:40:22 WARN MemoryStore: Not enough space to cache rdd_60_0 in memory! (computed 64.4 MiB so far)
23/05/14 17:40:22 WARN BlockManager: Persisting block rdd_60_0 to disk instead.
23/05/14 17:40:22 WARN BlockManager: Persisting block rdd_60_7 to disk instead.
23/05/14 17:40:22 WARN MemoryStore: Not enough space to

                                                                                

In [32]:
# Print the first 20 rows of the bucketed join, truncating long cell values
joinUsersOrdersBucketDF.show(n=20, truncate=True)

[Stage 22:>                                                         (0 + 1) / 1]

+--------------+--------+--------+-----------+-----+-----+---+------+--------------------+----------+
|transaction_id|quantity|users_id|     amount|state|items|uid| login|               email|user_state|
+--------------+--------+--------+-----------+-----+-----+---+------+--------------------+----------+
|       3764678| 3764678|       1|  7529356.0|   MI|SKU-2|  1|user_1|user_1@databricks...|        CO|
|       6264729| 6264729|       1|1.2529458E7|   NY|SKU-5|  1|user_1|user_1@databricks...|        CO|
|       3779537| 3779537|       1|  7559074.0|   CO|SKU-3|  1|user_1|user_1@databricks...|        CO|
|       6265212| 6265212|       1|1.2530424E7|   CO|SKU-3|  1|user_1|user_1@databricks...|        CO|
|       3784657| 3784657|       1|  7569314.0|   NY|SKU-4|  1|user_1|user_1@databricks...|        CO|
|       6277990| 6277990|       1| 1.255598E7|   CA|SKU-2|  1|user_1|user_1@databricks...|        CO|
|       3784702| 3784702|       1|  7569404.0|   NY|SKU-2|  1|user_1|user_1@databr

                                                                                

In [33]:
# Print the physical plan only, to compare with the plan of the non-bucketed join
joinUsersOrdersBucketDF.explain(extended=False)

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=true
+- == Final Plan ==
   *(3) SortMergeJoin [users_id#268], [uid#129], Inner
   :- *(1) Sort [users_id#268 ASC NULLS FIRST], false, 0
   :  +- *(1) Filter isnotnull(users_id#268)
   :     +- Scan In-memory table OrdersTbl [transaction_id#266L, quantity#267L, users_id#268, amount#269, state#270, items#271], [isnotnull(users_id#268)]
   :           +- InMemoryRelation [transaction_id#266L, quantity#267L, users_id#268, amount#269, state#270, items#271], StorageLevel(disk, memory, deserialized, 1 replicas)
   :                 +- *(1) ColumnarToRow
   :                    +- FileScan parquet default.orderstbl[transaction_id#266L,quantity#267L,users_id#268,amount#269,state#270,items#271] Batched: true, Bucketed: true, DataFilters: [], Format: Parquet, Location: InMemoryFileIndex(1 paths)[file:/Users/yschoi/Library/CloudStorage/Dropbox/yunseo/development/BOA..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<transaction_id:big