# Initialization

In [None]:
# Filesystem location of the Gluten bundle jar.
gluten_target_jar = '/opt/gluten/jars/gluten-velox-bundle.jar'

# Benchmark suite to run: 'tpch' or 'tpcds'.
workload = 'tpch'

# True runs the workload with Gluten; False runs plain Spark.
run_gluten = True

# Directory holding the TPC tables. Alternatives left commented out:
#tpch_tabledir='/opt/spark/database/tpch_sf10_parquet_zstd'
#tpcds_tabledir='/opt/spark/database/tpch_sf10_parquet_zstd'

tabledir = ''

# Database name. When it is set, the database is used instead of loading
# tables from tabledir.
database = 'tpch_sf10_parquet_zstd'

# Directory holding the TPC query files. Alternatives left commented out:
#tpch_query_path='/opt/spark/tpch-queries'
#tpcds_query_path='/opt/spark/tpcds-queries'

tpc_query_path = '/opt/spark/work-dir/tpc-h-300-queries/'

# Parallelism: executors per node and tasks per core.
executors_per_node = 1

#gluten_tpch_task_per_core=2
#gluten_tpcds_task_per_core=4
#spark_tpch_task_per_core=8
#spark_tpcds_task_per_core=8

task_per_core = 2

# Off-heap ratio; 0 disables off-heap memory for Spark.
# onheap:offheap = 1:2
#spark_offheap_ratio=2.0
# onheap:offheap = 1:7
#gluten_offheap_ratio=7.0

offheap_ratio = 7.0

# Value for spark.io.compression.codec
spark_codec = 'lz4'
# Value for spark.gluten.sql.columnar.shuffle.codec
gluten_codec = 'lz4'

In [None]:
# Export PYSPARK_SUBMIT_ARGS so that -Dio.netty.tryReflectionSetAccessible=true
# is passed both as a driver Java option and through
# spark.executor.extraJavaOptions for the executors.
%env PYSPARK_SUBMIT_ARGS=--driver-java-options -Dio.netty.tryReflectionSetAccessible=true --conf spark.executor.extraJavaOptions=-Dio.netty.tryReflectionSetAccessible=true pyspark-shell

In [None]:
# Run the shared initialization notebook.
# NOTE(review): GlutenSparkContext, VanillaSparkContext, TPCHBenchmark and
# TPCDSBenchmark are used later but not defined in this notebook; presumably
# they are provided by this %run -- confirm.
%run /opt/spark/work-dir/ipython/native_sql_initialize.ipynb

# Application Level Configuration

In [None]:
# Create the context object for the selected engine, then set the codec
# option that belongs to that engine.
if not run_gluten:
    sct = VanillaSparkContext(
        executors_per_node, task_per_core, gluten_target_jar, offheap_ratio
    )
    sct.conf.set('spark.io.compression.codec', spark_codec)
else:
    sct = GlutenSparkContext(
        executors_per_node, task_per_core, gluten_target_jar, offheap_ratio
    )
    sct.conf.set('spark.gluten.sql.columnar.shuffle.codec', gluten_codec)

In [None]:
# Pick the benchmark driver by workload name (case-insensitive): only
# "tpch" selects TPC-H, any other value selects TPC-DS.
if workload.lower() != "tpch":
    bm = TPCDSBenchmark(sct, tabledir, 'parquet', tpc_query_path)
else:
    bm = TPCHBenchmark(sct, tabledir, 'parquet', tpc_query_path)

bm.initialize()

# Run Workload

In [None]:
# Decide where the tables come from. A non-empty `database` is selected with
# a SQL "use" statement and table loading is skipped; otherwise tables are
# to be loaded, which requires `tabledir` to be set.
if database != "":
    load_table = False
    bm.sct.spark.sql("use " + database)
else:
    load_table = True
    if tabledir == "":
        # Raise a real exception: raising a plain string is itself a
        # TypeError in Python 3 and would hide this message.
        raise ValueError("Either database or tabledir should be set")

In [None]:
# Start the power run with explain and print_result switched off.
# load_table is the flag chosen by the database/tabledir check, and the
# action collects the DataFrame it is handed.
bm.test_tpc.power_run(
    explain=False,
    print_result=False,
    load_table=load_table,
    action=lambda frame: frame.collect(),
)

In [None]:
# NOTE(review): presumably prints the results recorded by the power run;
# print_result is defined outside this notebook -- confirm.
bm.test_tpc.print_result()

In [None]:
# NOTE(review): presumably gathers profiling data for the finished run;
# collect_profile is defined outside this notebook -- confirm.
bm.collect_profile()