# Initialization

In [None]:
# Run parameters for this benchmark notebook. Edit the values below before
# running the remaining cells.
# NOTE(review): a later cell reads PAPERMILL_OUTPUT_PATH, so this is presumably
# the papermill "parameters" cell — confirm the cell tag before restructuring.

# Local path to gluten project.
gluten_home='/home/sparkuser/gluten'

# Local path to gluten jar.
gluten_target_jar='/home/sparkuser/gluten-velox-bundle-spark3.3_2.12-centos_7_x86_64-1.3.0-SNAPSHOT.jar'

# Spark app master. e.g. 'yarn'
master='yarn'

# List of workers. The literal 'localhost' is replaced in a later cell by the
# `localhost` value provided by the initialization notebook.
clients=['localhost']

# List of block devices. e.g. ['nvme1n1', 'nvme2n1']
disk_dev=[]

# List of network devices. e.g. ['ens787f0']
nic_dev=[]

# Select workload. Can be either 'tpch' or 'tpcds' (compared case-insensitively;
# any other value raises ValueError in the application-level configuration cell).
workload='tpch'

# Run with gluten. If False, run Spark.
run_gluten=True

# TPC tables
# NOTE(review): presumably the table locations for each workload; they are not
# referenced in the visible cells — confirm against native_sql_initialize.ipynb.
tpch_tabledir=''
tpcds_tabledir=''

# Parallelism
executors_per_node=32
cores_per_executor=7

gluten_tpch_task_per_core=2
gluten_tpcds_task_per_core=4
spark_tpch_task_per_core=8
spark_tpcds_task_per_core=8

# Physical memory on each worker node.
memory_per_node='1000g'

# Offheap ratio. 0 to disable offheap for Spark.
# onheap:offheap = 1:2
spark_offheap_ratio=2.0
# onheap:offheap = 1:7
gluten_offheap_ratio=7.0

# spark.io.compression.codec
spark_codec='lz4'
# spark.gluten.sql.columnar.shuffle.codec
gluten_codec='lz4'
# spark.gluten.sql.columnar.shuffle.codecBackend
gluten_codec_backend=''
# spark.gluten.sql.columnar.maxBatchSize
max_batch_size=4096
# spark.app.name, empty to use default name.
app_name=''

# Hostname or IP to server for perf analysis. Able to connect via ssh.
server=''

# Gluten home on server.
server_gluten_home='/home/sparkuser/gluten'

# Specify the directory on perf analysis server. Usually a codename for this run.
base_dir=''

# Proxy used to connect to server for perf analysis.
proxy=''

# Emon event file for `emon -i`. Set to empty string '' if emon is unavailable.
# Supported emon events on platform can be verified via `emon -i emon.list`
emon_list=''

# Whether to run perf analysis scripts. Only takes effect if server is set.
analyze_perf=False

# List of email addresses to receive perf analysis results.
emails = []

# Pull request number. When app_name is empty and run_gluten is True, a numeric
# value yields the app name 'PR<number>' and an empty value yields 'main'.
pr=''

In [None]:
# Resolve the absolute path of the initialization notebook with a shell call
# (IPython `!` capture returns a list of output lines), then execute it with
# `%run`.
# NOTE(review): later cells use names that are not defined in this notebook
# (e.g. `localhost`, `create_cntx`, `config_pagecache`); presumably they come
# from native_sql_initialize.ipynb — confirm.
initialize_ipynb = !realpath native_sql_initialize.ipynb
print(f"Running notebook: {initialize_ipynb[0]}\n")
%run {initialize_ipynb[0]}

In [None]:
# Swap the 'localhost' placeholder for the `localhost` value supplied by the
# initialization notebook, both in the worker list and for the perf analysis
# server. Every other entry is kept as-is and the worker order is preserved.
clients = [localhost if host == 'localhost' else host for host in clients]

if server == 'localhost':
    server = localhost

In [None]:
# Pick a default Spark app name for Gluten runs when none was configured:
#   numeric `pr`  -> 'PR<number>'
#   empty `pr`    -> 'main'
# Any other `pr` value leaves app_name empty, as do vanilla Spark runs.
if run_gluten and not app_name:
    if pr.isdigit():
        app_name = f'PR{pr}'
    elif not pr:
        app_name = 'main'

In [None]:
%%javascript
// Ask the kernel to execute `nb_name = "<notebook file name>"` so Python cells can read the notebook's name.
// NOTE(review): relies on the front-end global `IPython.notebook`; presumably only the classic Notebook UI provides it — confirm.
IPython.notebook.kernel.execute('nb_name = "' + IPython.notebook.notebook_name + '"')

In [None]:
# Overwrite nb_name with the value of PAPERMILL_OUTPUT_PATH.
# NOTE(review): PAPERMILL_OUTPUT_PATH is not assigned in the visible cells;
# presumably papermill injects it (e.g. via --inject-output-path). If nothing
# defines it, this cell fails with NameError — confirm how interactive runs
# are expected to handle this cell.
nb_name=PAPERMILL_OUTPUT_PATH

# Application Level Configuration

In [None]:
# Turn the `workload` parameter into one boolean flag per supported benchmark.
# The comparison is case-insensitive; exactly one flag is True for a valid value.
tpch_workload = workload.lower() == 'tpch'
tpcds_workload = workload.lower() == 'tpcds'

# Anything other than 'tpch' / 'tpcds' is rejected (both flags stay False).
if not (tpch_workload or tpcds_workload):
    raise ValueError(f"Unknown workload: {workload}")

def gluten_conf_overwrite(conf):
    """Apply the Gluten-specific overrides for this run to ``conf``.

    The shuffle codec, codec backend and max batch size are taken from the
    notebook-level parameters ``gluten_codec``, ``gluten_codec_backend`` and
    ``max_batch_size``; the executor JVM options and the over-acquired memory
    ratio are fixed values.

    Args:
        conf: Configuration object exposing a fluent ``set(key, value)``
            method (each call's return value receives the next ``set``).

    Returns:
        The ``conf`` object that was passed in.
    """
    executor_java_options = (
        '-XX:+UseParallelOldGC -XX:ParallelGCThreads=2 -XX:NewRatio=1 '
        '-XX:SurvivorRatio=1 -XX:+UseCompressedOops -verbose:gc '
        '-XX:+PrintGCDetails -XX:+PrintGCTimeStamps '
        '-XX:ErrorFile=/home/sparkuser/logs/java/hs_err_pid%p.log'
    )
    overrides = (
        ('spark.gluten.sql.columnar.shuffle.codec', gluten_codec),
        ('spark.gluten.sql.columnar.shuffle.codecBackend', gluten_codec_backend),
        ('spark.gluten.sql.columnar.maxBatchSize', max_batch_size),
        ('spark.executor.extraJavaOptions', executor_java_options),
        ('spark.gluten.memory.overAcquiredMemoryRatio', '0'),
    )

    # Feed each return value into the next call, exactly like a fluent
    # conf.set(...).set(...) chain would.
    chained = conf
    for key, value in overrides:
        chained = chained.set(key, value)

    # Hooks for workload-specific overrides; neither branch sets anything yet.
    if tpch_workload:
        pass
    elif tpcds_workload:
        pass
    return conf

def spark_conf_overwrite(conf):
    """Apply the vanilla-Spark overrides for this run to ``conf``.

    Sets the I/O compression codec from the notebook-level ``spark_codec``
    parameter and points ``LD_LIBRARY_PATH`` for both the executors and the
    YARN application master at ``$HADOOP_HOME/lib/native/``.

    Args:
        conf: Configuration object exposing a fluent ``set(key, value)``
            method (each call's return value receives the next ``set``).

    Returns:
        The ``conf`` object that was passed in.
    """
    # Same directory is used for executors and the application master.
    hadoop_native_dir = f"{os.getenv('HADOOP_HOME')}/lib/native/"
    overrides = (
        ('spark.io.compression.codec', spark_codec),
        ('spark.executorEnv.LD_LIBRARY_PATH', hadoop_native_dir),
        ('spark.yarn.appMasterEnv.LD_LIBRARY_PATH', hadoop_native_dir),
    )

    # Feed each return value into the next call, exactly like a fluent
    # conf.set(...).set(...) chain would.
    chained = conf
    for key, value in overrides:
        chained = chained.set(key, value)

    # Hooks for workload-specific overrides; neither branch sets anything yet.
    if tpch_workload:
        pass
    elif tpcds_workload:
        pass
    return conf

def app_conf_overwrite(conf):
    """Dispatch to the override function matching the selected engine.

    Uses ``gluten_conf_overwrite`` when the notebook-level ``run_gluten`` flag
    is truthy and ``spark_conf_overwrite`` otherwise.

    Args:
        conf: Configuration object forwarded unchanged to the chosen function.

    Returns:
        Whatever the chosen override function returns.
    """
    overwrite = gluten_conf_overwrite if run_gluten else spark_conf_overwrite
    return overwrite(conf)

# Run Workload

In [None]:
# Config and clean pagecache before each run.
# All three helpers are defined outside this notebook and operate on the
# worker list. config_pagecache also receives run_gluten, so its settings
# presumably differ between Gluten and vanilla Spark runs — confirm.
config_pagecache(clients, run_gluten)
dropcache(clients)
# Presumably prints the workers' kernel parameters for the run record — confirm.
print_kernel_params(clients)

In [None]:
# Create SparkSession
# app_conf_overwrite is passed as a callback (not called here), so create_cntx
# decides when the engine-specific overrides are applied.
# NOTE(review): create_cntx is defined outside this notebook; by their names the
# four results are the SparkContext, SparkSession, application id and the TPC
# test driver used by the cells below — confirm.
sc, spark, appid, test_tpc=create_cntx(run_gluten, workload, app_conf_overwrite, server, base_dir, nb_name, app_name)

In [None]:
# The memory cgroup helper is only invoked for Gluten runs; vanilla Spark runs
# skip it. It receives the worker list.
if run_gluten:
    config_mem_cgroup(clients)

In [None]:
# Start monitoring on the workers before the run. emon_list is the emon event
# file from the parameter cell ('' when emon is unavailable).
test_tpc.start_monitor(clients, emon_list=emon_list)

In [None]:
# Execute the benchmark run.
# NOTE(review): the flag semantics live in the test driver outside this
# notebook; by name — no plan explain, no per-query result printing, load the
# tables first — confirm.
test_tpc.power_run(explain=False, print_result=False, load_table=True)

In [None]:
# Stop the monitoring that was started on the workers before the run.
test_tpc.stop_monitor(clients)

In [None]:
# Optional perf analysis, gated by the analyze_perf parameter. Per the
# parameter cell it only takes effect when `server` is set; that check is not
# visible here, so presumably run_perf_analysis enforces it — confirm.
if analyze_perf:
    test_tpc.run_perf_analysis(server_gluten_home, disk_dev, nic_dev, proxy, emails, pr)

# Show Performance

In [None]:
# Print the results collected by the test driver during the run.
test_tpc.print_result()

In [None]:
# Draw one sar report per worker for this application, restricted to the
# configured block and network devices. test_tpc.result is passed as `qtime`
# (presumably per-query timings used to annotate the charts — confirm).
for client in clients:
    draw_sar(appid, qtime=test_tpc.result, disk_dev=disk_dev, nic_dev=nic_dev, client=client)