In [1]:
from pyspark.sql import SparkSession
import os
import pandas as pd
import time
import string
import pathlib
import random
import threading
import time
from urllib.parse import urlsplit, urlunsplit
import requests
import json
from py4j.protocol import Py4JJavaError, Py4JError
import glob

In [2]:
# Global configuration
SPARK_MEMORY = 900
SPARK_CORES = 60
DBHOST = 'postgres'
group_in_leaves = False
QUERY_TIMEOUT = 60 * 30

In [3]:
def create_spark():
    spark = SparkSession.builder \
        .appName("app") \
        .master(f'local[{SPARK_CORES}]') \
        .config("spark.driver.memory", f'{SPARK_MEMORY}g') \
        .config("spark.executor.memory", f'{SPARK_MEMORY}g') \
        .config("spark.memory.offHeap.enabled",False) \
        .config("spark.jars", "postgresql-42.3.3.jar") \
        .getOrCreate()
    if (group_in_leaves):
        spark.sql("SET spark.sql.yannakakis.countGroupInLeaves = true").show()
    return spark

In [4]:
def extract_metrics(spark, group_id):
    parsed = list(urlsplit(spark.sparkContext.uiWebUrl))
    host_port = parsed[1]
    parsed[1] = 'localhost' + host_port[host_port.find(':'):]
    API_URL = f'{urlunsplit(parsed)}/api/v1'

    app_id = spark.sparkContext.applicationId
    sql_queries = requests.get(API_URL + f'/applications/{app_id}/sql', params={'length': '100000'}).json()
    query_ids = [q['id'] for q in sql_queries if q['description'] == group_id]
    if (len(query_ids) == 0):
        print(f'query with group {group_id} not found')
        return None
    query_id = query_ids[0]
    print(f'query id: {query_id}')
    
    query_details = requests.get(API_URL + f'/applications/{app_id}/sql/{query_id}',
                                 params={'details': 'true', 'planDescription': 'true'}).json()
    
    success_job_ids = query_details['successJobIds']
    running_job_ids = query_details['runningJobIds']
    failed_job_ids = query_details['failedJobIds']
    
    job_ids = success_job_ids + running_job_ids + failed_job_ids
    
    job_details = [requests.get(API_URL + f'/applications/{app_id}/jobs/{jid}').json() for jid in job_ids]
    
    job_stages = {}
    
    for j in job_details:
        stage_ids = j['stageIds']
        
        stage_params = {'details': 'true', 'withSummaries': 'true'}
        stages = [requests.get(API_URL + f'/applications/{app_id}/stages/{sid}', stage_params) for sid in stage_ids]
        
        job_stages[j['jobId']] = [stage.json() for stage in stages if stage.status_code == 200] # can be 404
    
    return query_details, job_details, job_stages

In [5]:
def import_db(spark, dbname):
    
    username = dbname
    password = dbname
    dbname = dbname

    df_tables = spark.read.format("jdbc") \
    .option("url", f'jdbc:postgresql://{DBHOST}:5432/{dbname}') \
    .option("driver", "org.postgresql.Driver") \
    .option("dbtable", "information_schema.tables") \
    .option("user", username) \
    .option("password", password) \
    .load()

    for idx, row in df_tables.toPandas().iterrows():
        if row.table_schema == 'public':
            table_name = row.table_name
            df = spark.read.format("jdbc") \
                .option("url", f'jdbc:postgresql://{DBHOST}:5432/{dbname}') \
                .option("driver", "org.postgresql.Driver") \
                .option("dbtable", table_name) \
                .option("user", username) \
                .option("password", password) \
                .load()
    
            print(table_name)
            #print(df.show())
            df.createOrReplaceTempView(table_name)

def random_str(size=16, chars=string.ascii_uppercase + string.digits):
    return ''.join(random.choice(chars) for _ in range(size))

def set_group_id(spark):
    group_id = random_str()
    spark.sparkContext.setJobGroup(group_id, group_id)
    return group_id

def cancel_query(seconds, group_id):
    time.sleep(seconds)
    print("cancelling jobs with id " + group_id)
    print(spark.sparkContext.cancelJobGroup(group_id))
    print("cancelled job")

def cancel_query_after(spark, seconds):
    group_id = random_str()
    spark.sparkContext.setJobGroup(group_id, group_id)
    threading.Thread(target=cancel_query, args=(seconds,group_id,)).start()
    return group_id
    
def run_query(file):
    with open(file, 'r') as f:
        query = '\n'.join(filter(lambda line: not line.startswith('limit') and not line.startswith('-'), f.readlines()))
        
        print("running query: \n" + query)
        return spark.sql(query)

In [6]:
def benchmark_query(spark, query, respath, run):
    start_time = time.time()

    group_id = cancel_query_after(spark, QUERY_TIMEOUT)
    df1 = run_query(query)
    df1.show()

    end_time = time.time()
    diff_time = end_time - start_time

    execution, jobs, job_stages = extract_metrics(spark, group_id)

    if execution is not None:
            with open(respath + f'/execution-{run}.json', 'w') as f:
                f.write(json.dumps(execution, indent=2))
            with open(respath + f'/jobs-{run}.json', 'w') as f:
                f.write(json.dumps(jobs, indent=2))
            with open(respath + f'/stages-{run}.json', 'w') as f:
                f.write(json.dumps(job_stages, indent=2))
    return diff_time

def benchmark(spark, dbname, query_file, mode, run):
    #spark.sql("SET spark.sql.yannakakis.enabled = false").show()
    # run the query once to warm up Spark (load the relation in memory)
    #df0 = run_query(query)
    #df0.show()
    
    query_name = os.path.basename(query_file)

    respath = f'benchmark-results-{dbname}/' + query_name + "/" + mode
    pathlib.Path(respath).mkdir(parents=True, exist_ok=True)

    if mode == "opt":
        spark.sql("SET spark.sql.yannakakis.enabled = true").show()
    elif mode == "ref":
        spark.sql("SET spark.sql.yannakakis.enabled = false").show()
    else:
        return []

    try:
        runtime = benchmark_query(spark, query_file, respath, run)
        return [query_name, runtime, mode, run]
    except Py4JError as e:
        print('timeout or error: ' + str(e))
        return [query_name, None, mode, run]

## SNAP Benchmark

In [None]:
### benchmark configuration
dbname = 'snap'
tablename = 'wiki'
mode = 'ref'
runs = [1]
###

spark = create_spark()
import_db(spark, dbname)

queries = ['snap-queries/patents/path02.sql',
          'snap-queries/patents/path03.sql',
          'snap-queries/patents/path04.sql',
          'snap-queries/patents/path05.sql',
          'snap-queries/patents/path06.sql',
          'snap-queries/patents/path07.sql',
          'snap-queries/patents/path08.sql',
          'snap-queries/patents/tree01.sql',
          'snap-queries/patents/tree02.sql',
          'snap-queries/patents/tree03.sql']

queries = sorted(glob.glob(f'snap-queries/{tablename}/*'))
#queries = sorted(glob.glob(f'snap-queries/{tablename}/tree*'))
#queries = ['snap-queries/patents/tree01.sql']

print('running queries: ' + str(queries))

results_df = df = pd.DataFrame([], columns = ['query', 'runtime', 'mode', 'run'])
results_file = f'benchmark-results-{dbname}/results-{mode}.csv'
if (os.path.exists(results_file)):
    results_df = pd.read_csv(results_file)
    print(results_file)

for run in runs:
    for q in queries:
        results = [benchmark(spark, dbname, q, mode, run)]
        new_df = pd.DataFrame(results, columns = ['query', 'runtime', 'mode', 'run'])
        results_df = pd.concat([results_df, new_df])
        results_df.to_csv(f'benchmark-results-{dbname}/results-{mode}.csv')
        print(results_df)

23/11/15 13:28:38 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/15 13:28:39 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


patents
wiki
google
dblp
running queries: ['snap-queries/wiki/path02.sql', 'snap-queries/wiki/path03.sql', 'snap-queries/wiki/path04.sql', 'snap-queries/wiki/path05.sql', 'snap-queries/wiki/path06.sql', 'snap-queries/wiki/path07.sql', 'snap-queries/wiki/path08.sql', 'snap-queries/wiki/tree01.sql', 'snap-queries/wiki/tree02.sql', 'snap-queries/wiki/tree03.sql']
benchmark-results-snap/results-ref.csv
+--------------------+-----+
|                 key|value|
+--------------------+-----+
|spark.sql.yannaka...|false|
+--------------------+-----+

running query: 
select count(*) from wiki p1, wiki p2, wiki p3 where p1.toNode = p2.fromNode AND p2.toNode = p3.fromNode



                                                                                

+------------+
|    count(1)|
+------------+
|238984482772|
+------------+

query id: 7
   Unnamed: 0       query     runtime mode  run
0         0.0  tree01.sql  145.612812  ref  NaN
0         NaN  path02.sql  514.217696  ref  1.0
+--------------------+-----+
|                 key|value|
+--------------------+-----+
|spark.sql.yannaka...|false|
+--------------------+-----+

running query: 
select count(*) from wiki p1, wiki p2, wiki p3, wiki p4 where p1.toNode = p2.fromNode AND p2.toNode = p3.fromNode AND p3.toNode = p4.fromNode





cancelling jobs with id HDZ3USFX1NC6GGFC
None
cancelled job


[Stage 33:>                                                      (0 + 60) / 200]

cancelling jobs with id ZN77JI18TJTOR547
None
cancelled job
timeout or error: An error occurred while calling o93.showString.
: org.apache.spark.SparkException: Job 12 cancelled part of cancelled job group ZN77JI18TJTOR547
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2844)
	at org.apache.spark.scheduler.DAGScheduler.handleJobCancellation(DAGScheduler.scala:2719)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleJobGroupCancelled$4(DAGScheduler.scala:1193)
	at scala.runtime.java8.JFunction1$mcVI$sp.apply(JFunction1$mcVI$sp.java:23)
	at scala.collection.mutable.HashSet.foreach(HashSet.scala:79)
	at org.apache.spark.scheduler.DAGScheduler.handleJobGroupCancelled(DAGScheduler.scala:1192)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3004)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2982)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.

23/11/15 14:07:19 WARN TaskSetManager: Lost task 23.0 in stage 33.0 (TID 300) (389ea96bb8b5 executor driver): TaskKilled (Stage cancelled: Job 12 cancelled part of cancelled job group ZN77JI18TJTOR547)
23/11/15 14:07:19 WARN TaskSetManager: Lost task 40.0 in stage 33.0 (TID 317) (389ea96bb8b5 executor driver): TaskKilled (Stage cancelled: Job 12 cancelled part of cancelled job group ZN77JI18TJTOR547)
23/11/15 14:07:19 WARN TaskSetManager: Lost task 50.0 in stage 33.0 (TID 327) (389ea96bb8b5 executor driver): TaskKilled (Stage cancelled: Job 12 cancelled part of cancelled job group ZN77JI18TJTOR547)
23/11/15 14:07:19 WARN TaskSetManager: Lost task 7.0 in stage 33.0 (TID 284) (389ea96bb8b5 executor driver): TaskKilled (Stage cancelled: Job 12 cancelled part of cancelled job group ZN77JI18TJTOR547)
23/11/15 14:07:19 WARN TaskSetManager: Lost task 41.0 in stage 33.0 (TID 318) (389ea96bb8b5 executor driver): TaskKilled (Stage cancelled: Job 12 cancelled part of cancelled job group ZN77JI18T

+--------------------+-----+
|                 key|value|
+--------------------+-----+
|spark.sql.yannaka...|false|
+--------------------+-----+

running query: 
select count(*) from wiki p1, wiki p2, wiki p3, wiki p4, wiki p5 where p1.toNode = p2.fromNode AND p2.toNode = p3.fromNode AND p3.toNode = p4.fromNode AND p4.toNode = p5.fromNode



23/11/15 14:07:23 WARN TaskSetManager: Lost task 35.0 in stage 33.0 (TID 312) (389ea96bb8b5 executor driver): TaskKilled (Stage cancelled: Job 12 cancelled part of cancelled job group ZN77JI18TJTOR547)
ERROR:root:Exception while sending command.                      (27 + 40) / 67]
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
RuntimeError: reentrant call inside <_io.BufferedReader name=58>

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/lib/python3.10/dist-packages/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving
ERROR:root:Exception whi

timeout or error: An error occurred while calling o102.showString
   Unnamed: 0       query     runtime mode  run
0         0.0  tree01.sql  145.612812  ref  NaN
0         NaN  path02.sql  514.217696  ref  1.0
0         NaN  path03.sql         NaN  ref  1.0
0         NaN  path04.sql         NaN  ref  1.0
+--------------------+-----+
|                 key|value|
+--------------------+-----+
|spark.sql.yannaka...|false|
+--------------------+-----+

running query: 
select count(*) from wiki p1, wiki p2, wiki p3, wiki p4, wiki p5, wiki p6 where p1.toNode = p2.fromNode AND p2.toNode = p3.fromNode AND p3.toNode = p4.fromNode AND p4.toNode = p5.fromNode AND p5.toNode = p6.fromNode;



ERROR:root:Exception while sending command.+ 1) / 1][Stage 45:>   (0 + 1) / 1]
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
RuntimeError: reentrant call inside <_io.BufferedReader name=58>

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/lib/python3.10/dist-packages/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving
ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1

timeout or error: An error occurred while calling o108.showString
   Unnamed: 0       query     runtime mode  run
0         0.0  tree01.sql  145.612812  ref  NaN
0         NaN  path02.sql  514.217696  ref  1.0
0         NaN  path03.sql         NaN  ref  1.0
0         NaN  path04.sql         NaN  ref  1.0
0         NaN  path05.sql         NaN  ref  1.0
+--------------------+-----+
|                 key|value|
+--------------------+-----+
|spark.sql.yannaka...|false|
+--------------------+-----+

running query: 
select count(*) from wiki p1, wiki p2, wiki p3, wiki p4, wiki p5, wiki p6, wiki p7 where p1.toNode = p2.fromNode AND p2.toNode = p3.fromNode AND p3.toNode = p4.fromNode AND p4.toNode = p5.fromNode AND p5.toNode = p6.fromNode AND p6.toNode = p7.fromNode;



## LSQB Benchmark

In [None]:
#spark.conf.set("spark.sql.legacy.setCommandRejectsSparkCoreConfs","false")
#spark.conf.set("spark.executor.cores", "1")
#spark.conf.set("spark.executor.instances", "1")
spark.conf.set("spark.sql.shuffle.partitions", "1")