# Description
----
Besides mitigating costs, we also want to consider how to optimize and tune Spark. In this chapter, we will discuss a set of Spark configurations that enable optimizations, look at Spark’s family of join strategies, and inspect the Spark UI, looking for clues to bad behavior.



# Setup

In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

# Imports

In [2]:
import os
import os.path as path

# Spark

In [66]:
from pyspark.sql import SparkSession

# Create a SparkSession
spark = (SparkSession
         .builder
         .master('local[*]')
         .appName("SparkSQLExampleApp")
         .config('ui.showConsoleProgress', 'false')
         .getOrCreate())

In [118]:
spark

# Functions

In [3]:
def db_fname(fname):
    import os.path as path
    data_dir = '~/dev/github-bv/LearningSparkV2/databricks-datasets/learning-spark-v2/'
    return path.expanduser(path.join(data_dir, fname))

# View Settings

In [7]:
!which pyspark

/Users/bartev/.venvs3/lrnpyspark/bin/pyspark


In [8]:
from pyspark.conf import SparkConf
from pyspark.context import SparkContext

In [79]:
conf = SparkConf()

In [80]:
conf.getAll()

[('spark.app.name', 'SparkSQLExampleApp'),
 ('spark.master', 'local[*]'),
 ('spark.submit.deployMode', 'client'),
 ('spark.ui.showConsoleProgress', 'true')]

In [81]:
spark.sparkContext.master

'local[*]'

In [82]:
sc = spark.sparkContext

In [90]:
spark.sparkContext.getConf().getAll()

[('spark.app.name', 'SparkSQLExampleApp'),
 ('spark.rdd.compress', 'True'),
 ('spark.driver.port', '62816'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.app.id', 'local-1602131661660'),
 ('spark.master', 'local[*]'),
 ('ui.showConsoleProgress', 'false'),
 ('spark.executor.id', 'driver'),
 ('spark.submit.deployMode', 'client'),
 ('spark.driver.host', '192.168.1.17'),
 ('spark.ui.showConsoleProgress', 'true')]

In [84]:
sc.sparkUser()

'bartev'

In [25]:
sc.version

'2.4.5'

In [26]:
type(spark)

pyspark.sql.session.SparkSession

In [35]:
spark

## Print all configs

In [68]:
def print_configs(session: SparkSession):
    """print all the configs for the current SparkSession
    
    Usage:
    > print_configs(spark)
    
    spark.app.name SparkSQLExampleApp
    spark.rdd.compress True
    spark.driver.port 62816
    spark.serializer.objectStreamReset 100
    spark.app.id local-1602131661660
    spark.master local[*]
    spark.executor.id driver
    spark.submit.deployMode client
    spark.driver.host 192.168.1.17
    spark.ui.showConsoleProgress true
    """        
    conf = spark.sparkContext.getConf()
    conf_dict = {k: v for k, v in conf.getAll()}
    
    for k in sorted(conf_dict.keys()):
        print(k, conf_dict[k])

    
print_configs(spark)

spark.app.id local-1602131661660
spark.app.name SparkSQLExampleApp
spark.driver.host 192.168.1.17
spark.driver.port 62816
spark.executor.id driver
spark.master local[*]
spark.rdd.compress True
spark.serializer.objectStreamReset 100
spark.submit.deployMode client
spark.ui.showConsoleProgress true
ui.showConsoleProgress false


## View Spark SQL specific spark configs

In [103]:
(spark.sql("SET -v")
 .select('key', 'value')
#  .filter("key like '%statistics%'")
 .show(10, truncate=False))

+-----------------------------------------------------+---------+
|key                                                  |value    |
+-----------------------------------------------------+---------+
|spark.sql.adaptive.enabled                           |false    |
|spark.sql.adaptive.shuffle.targetPostShuffleInputSize|67108864b|
|spark.sql.autoBroadcastJoinThreshold                 |10485760 |
|spark.sql.avro.compression.codec                     |snappy   |
|spark.sql.avro.deflate.level                         |-1       |
|spark.sql.broadcastTimeout                           |300000ms |
|spark.sql.cbo.enabled                                |false    |
|spark.sql.cbo.joinReorder.dp.star.filter             |false    |
|spark.sql.cbo.joinReorder.dp.threshold               |12       |
|spark.sql.cbo.joinReorder.enabled                    |false    |
+-----------------------------------------------------+---------+
only showing top 10 rows



In [104]:
sc = spark.sparkContext

In [108]:
spark.sparkContext.getConf().get('spark.dynamicAllocation.enabled')
spark.sparkContext.getConf().set('spark.dynamicAllocation.enabled', "true")

<pyspark.conf.SparkConf at 0x120a76438>

In [109]:
print_configs(spark)

spark.app.id local-1602131661660
spark.app.name SparkSQLExampleApp
spark.driver.host 192.168.1.17
spark.driver.port 62816
spark.executor.id driver
spark.master local[*]
spark.rdd.compress True
spark.serializer.objectStreamReset 100
spark.submit.deployMode client
spark.ui.showConsoleProgress true
ui.showConsoleProgress false


In [112]:
spark.sparkContext.getConf().getAll()

[('spark.app.name', 'SparkSQLExampleApp'),
 ('spark.rdd.compress', 'True'),
 ('spark.driver.port', '62816'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.app.id', 'local-1602131661660'),
 ('spark.master', 'local[*]'),
 ('ui.showConsoleProgress', 'false'),
 ('spark.executor.id', 'driver'),
 ('spark.submit.deployMode', 'client'),
 ('spark.driver.host', '192.168.1.17'),
 ('spark.ui.showConsoleProgress', 'true')]

In [116]:
spark.sparkContext.getConf().contains('spark.dynamicAllocation.enabled')

False

In [117]:
spark.sparkContext.getConf().set('spark.dynamicAllocation.enabled', "true")

<pyspark.conf.SparkConf at 0x1209c7080>

# Caching and Persistence of data

Example

In [121]:
from pyspark.sql.functions import col

In [148]:
df1 = (spark.range(1 * 10000000)
      .toDF('id')
      .withColumn('square', col('id') * col('id')))

In [149]:
%%time
df1.count()

CPU times: user 882 µs, sys: 1.57 ms, total: 2.45 ms
Wall time: 78.7 ms


10000000

In [150]:
%%time
df1.cache()

CPU times: user 794 µs, sys: 940 µs, total: 1.73 ms
Wall time: 996 µs


DataFrame[id: bigint, square: bigint]

In [151]:
%%time
df1.count()

CPU times: user 788 µs, sys: 1.33 ms, total: 2.12 ms
Wall time: 277 ms


10000000

In [154]:
%%time
df.count()

CPU times: user 851 µs, sys: 1.38 ms, total: 2.23 ms
Wall time: 76 ms


10000000

In [147]:
df.show()

+---+------+
| id|square|
+---+------+
|  0|     0|
|  1|     1|
|  2|     4|
|  3|     9|
|  4|    16|
|  5|    25|
|  6|    36|
|  7|    49|
|  8|    64|
|  9|    81|
| 10|   100|
| 11|   121|
| 12|   144|
| 13|   169|
| 14|   196|
| 15|   225|
| 16|   256|
| 17|   289|
| 18|   324|
| 19|   361|
+---+------+
only showing top 20 rows



In [155]:
df1.createOrReplaceTempView('dfTable')

In [156]:
spark.sql('cache table dfTable')

DataFrame[]

In [157]:
spark.sql('select count(*) from dfTable').show()

+--------+
|count(1)|
+--------+
|10000000|
+--------+

