In [1]:
from pyspark.sql import SparkSession
import os
import pandas as pd
import time
import string
import pathlib
import random
import threading
import time
from urllib.parse import urlsplit, urlunsplit
import requests
import json
from py4j.protocol import Py4JJavaError, Py4JError
import glob
import psutil

In [2]:
# Global configuration
#
# Sizing of the Spark session built by create_spark(). The commented-out
# values are an alternative (larger) setup kept for reference.
#SPARK_MEMORY, SPARK_CORES = 900, 60
SPARK_MEMORY, SPARK_CORES = 16, 8  # memory is handed to Spark as f'{SPARK_MEMORY}g'

# Host name used in the PostgreSQL JDBC URL.
DBHOST = 'postgres'

# Seconds to wait before a running query's job group is cancelled (3 hours).
QUERY_TIMEOUT = 180 * 60

In [3]:
def create_spark():
    """Build (or fetch) the local Spark session used by the benchmarks.

    Spark runs in local mode with ``SPARK_CORES`` worker threads; driver and
    executor memory are both set to ``SPARK_MEMORY`` GB, off-heap memory is
    disabled and the PostgreSQL JDBC driver jar is added to ``spark.jars``.

    Returns:
        The session produced by ``getOrCreate()``.
    """
    heap_size = f'{SPARK_MEMORY}g'
    builder = (
        SparkSession.builder
        .appName("app")
        .master(f'local[{SPARK_CORES}]')
        .config("spark.driver.memory", heap_size)
        .config("spark.executor.memory", heap_size)
        .config("spark.memory.offHeap.enabled", False)
        .config("spark.jars", "postgresql-42.3.3.jar")
    )
    return builder.getOrCreate()

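## Cluster variant (disabled): connects to a standalone master instead of local mode.
## NOTE(review): it references OFFHEAP, which is not defined in the visible cells.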
# def create_spark():
#     spark = SparkSession.builder \
#         .appName("app") \
#         .master('spark://10.100.42.35:7078') \
#         .config("spark.driver.memory", f'{SPARK_MEMORY}g') \
#         .config("spark.executor.memory", f'{SPARK_MEMORY}g') \
#         .config("spark.driver.host", "10.100.42.223") \
#         .config("spark.driver.bindAddress", "0.0.0.0") \
#         .config("spark.driver.port", "4060") \
#         .config("spark.memory.offHeap.enabled",OFFHEAP) \
#         .config("spark.jars", "postgresql-42.3.3.jar") \
#         .getOrCreate()
#     return spark

In [4]:
def extract_metrics(spark, group_id, request_timeout=60):
    """Collect execution metrics for one SQL query from Spark's REST API.

    The API is reached on ``localhost`` using the port of
    ``spark.sparkContext.uiWebUrl``. The query is the first SQL execution whose
    ``description`` equals ``group_id``.

    Args:
        spark: Session whose ``sparkContext`` supplies ``uiWebUrl`` and
            ``applicationId``.
        group_id: Description string of the SQL execution to look up.
        request_timeout: Seconds to wait for each HTTP request, so that an
            unresponsive UI cannot block the benchmark forever.

    Returns:
        ``(query_details, job_details, job_stages)``. ``job_stages`` maps each
        job id to the stage payloads that could be fetched (stages answering
        with a non-200 status, e.g. 404, are skipped). When no SQL execution
        matches ``group_id`` the result is ``(None, None, None)``, so callers
        that unpack three values keep working.
    """
    ui = urlsplit(spark.sparkContext.uiWebUrl)
    # Keep scheme and port of the UI URL but always talk to localhost.
    netloc = 'localhost' if ui.port is None else f'localhost:{ui.port}'
    api_url = f'{urlunsplit(ui._replace(netloc=netloc))}/api/v1'
    app_url = f'{api_url}/applications/{spark.sparkContext.applicationId}'

    def fetch(path, params=None):
        """GET one endpoint below the application's REST root."""
        return requests.get(app_url + path, params=params, timeout=request_timeout)

    sql_queries = fetch('/sql', {'length': '100000'}).json()
    query_ids = [q['id'] for q in sql_queries if q.get('description') == group_id]
    if not query_ids:
        print(f'query with group {group_id} not found')
        return None, None, None
    query_id = query_ids[0]
    print(f'query id: {query_id}')

    query_details = fetch(f'/sql/{query_id}',
                          {'details': 'true', 'planDescription': 'true'}).json()

    job_ids = (query_details['successJobIds']
               + query_details['runningJobIds']
               + query_details['failedJobIds'])
    job_details = [fetch(f'/jobs/{jid}').json() for jid in job_ids]

    stage_params = {'details': 'true', 'withSummaries': 'true'}
    job_stages = {}
    for job in job_details:
        responses = [fetch(f'/stages/{sid}', stage_params) for sid in job['stageIds']]
        # A stage lookup can answer 404; keep only the successful ones.
        job_stages[job['jobId']] = [r.json() for r in responses if r.status_code == 200]

    return query_details, job_details, job_stages

In [5]:
def import_db(spark, dbname):
    """Register every table of the PostgreSQL database ``dbname`` as a temp view.

    The database is read over JDBC from ``DBHOST`` on port 5432; ``dbname`` is
    also used as user name and password. ``information_schema.tables`` is
    loaded first, and each table in the ``public`` schema is then loaded,
    printed by name and registered under its own name.
    """
    jdbc_url = f'jdbc:postgresql://{DBHOST}:5432/{dbname}'
    user = secret = dbname

    def load_table(table):
        """Return a JDBC-backed DataFrame for one table of the database."""
        reader = spark.read.format("jdbc")
        for key, value in (("url", jdbc_url),
                           ("driver", "org.postgresql.Driver"),
                           ("dbtable", table),
                           ("user", user),
                           ("password", secret)):
            reader = reader.option(key, value)
        return reader.load()

    catalog = load_table("information_schema.tables").toPandas()
    for _, entry in catalog.iterrows():
        if entry.table_schema != 'public':
            continue
        view_name = entry.table_name
        table_df = load_table(view_name)
        print(view_name)
        table_df.createOrReplaceTempView(view_name)

def random_str(size=16, chars=string.ascii_uppercase + string.digits):
    """Return ``size`` characters drawn at random (with repetition) from ``chars``."""
    picked = []
    for _ in range(size):
        picked.append(random.choice(chars))
    return ''.join(picked)

def set_group_id(spark):
    """Assign a fresh random job group to the Spark context and return its id.

    The random string is used both as group id and as group description.
    """
    new_id = random_str()
    context = spark.sparkContext
    context.setJobGroup(new_id, new_id)
    return new_id

def cancel_query(spark, seconds, group_id):
    """Sleep for ``seconds`` and then cancel the Spark job group ``group_id``.

    Progress messages and the value returned by ``cancelJobGroup`` are printed.
    """
    time.sleep(seconds)
    announcement = "cancelling jobs with id " + group_id
    print(announcement)
    outcome = spark.sparkContext.cancelJobGroup(group_id)
    print(outcome)
    print("cancelled job")

def cancel_query_after(spark, seconds):
    """Start a job group that is cancelled automatically after ``seconds``.

    A fresh random group id is set on the Spark context (as both id and
    description) and a background thread running ``cancel_query`` is started.

    The watchdog is a daemon thread: it spends the whole timeout sleeping, and
    a non-daemon thread would keep the Python process alive for that long even
    after all benchmarks have finished.

    Args:
        spark: Session whose ``sparkContext`` receives the job group.
        seconds: Delay before the job group is cancelled.

    Returns:
        The generated group id.
    """
    group_id = random_str()
    spark.sparkContext.setJobGroup(group_id, group_id)
    watchdog = threading.Thread(
        target=cancel_query,
        args=(spark, seconds, group_id),
        name=f'cancel-{group_id}',
        daemon=True,
    )
    watchdog.start()
    return group_id
    
def run_query(spark, file):
    """Read a SQL file, drop some lines and submit the rest via ``spark.sql``.

    Lines starting with ``limit`` or with ``-`` are removed. The remaining
    lines (which keep their own line endings) are joined with newlines, the
    resulting text is printed and then executed.

    Returns:
        Whatever ``spark.sql`` returns for the assembled query text.
    """
    with open(file, 'r') as f:
        kept_lines = [line for line in f.readlines()
                      if not line.startswith(('limit', '-'))]

    query = '\n'.join(kept_lines)
    print("running query: \n" + query)
    return spark.sql(query)

def get_resource_usage(t):
    """Take one psutil snapshot of memory and CPU usage, labelled with ``t``.

    Returns:
        A dict with the keys ``time`` (the given ``t``), ``memory``
        (``psutil.virtual_memory()``), ``cpu`` (per-CPU percentages) and
        ``cpu_total`` (overall percentage).
    """
    memory = psutil.virtual_memory()
    per_cpu = psutil.cpu_percent(interval=None, percpu=True)
    overall = psutil.cpu_percent(interval=None, percpu=False)
    return {'time': t, 'memory': memory, 'cpu': per_cpu, 'cpu_total': overall}
def explain_str(df):
    """Return the 'extended' explain text of ``df`` as a string.

    Calls the JVM helper ``PythonSQLUtils.explainString`` on the DataFrame's
    query execution.
    """
    explain = df._sc._jvm.PythonSQLUtils.explainString
    execution = df._jdf.queryExecution()
    return explain(execution, 'extended')

In [6]:
# Module-level sample buffer; benchmark_query creates its own local list.
resource_usage = []

def measure_resource_usage(resource_usage):
    """Append one resource snapshot per loop iteration to ``resource_usage``.

    Intended as a thread target: the loop runs until the attribute ``do_run``
    of the executing thread object is set to a false value. Each iteration
    stores ``get_resource_usage(tick)`` and then sleeps one second, where
    ``tick`` counts iterations starting at 0.
    """
    worker = threading.current_thread()
    tick = 0
    while True:
        if not getattr(worker, "do_run", True):
            return
        resource_usage.append(get_resource_usage(tick))
        tick += 1
        time.sleep(1)

def benchmark_query(spark, query, respath, run):
    """Run one query file, sample resource usage meanwhile and store the metrics.

    Steps: trigger a JVM garbage collection, start the resource sampler, set a
    fresh job group, arm a timeout that cancels that job group after
    ``QUERY_TIMEOUT`` seconds, run the query and ``show()`` its result, then
    write the collected data into ``respath``.

    The sampler and the timeout are always shut down, also when the query
    fails or is cancelled; otherwise every failed query would leave a sampler
    thread running forever and every finished query a sleeping timeout thread.

    Args:
        spark: Spark session to run the query on.
        query: Path of the SQL file (handed to ``run_query``).
        respath: Directory for the result files; it must already exist.
        run: Label of this repetition, used in the result file names.

    Returns:
        ``(runtime_seconds, peak_memory_gb)``; the peak is 0 if no sample was
        taken.

    Raises:
        Any exception raised while running or showing the query is propagated.
    """
    bytes_per_gb = 1000 * 1000 * 1000

    spark.sparkContext._jvm.System.gc()
    start_time = time.time()

    resource_usage = []
    measure_thread = threading.Thread(target=measure_resource_usage,
                                      args=(resource_usage,), daemon=True)
    measure_thread.start()

    watchdog = None
    try:
        group_id = set_group_id(spark)
        # Cancellable replacement for cancel_query_after(): the timer waits
        # QUERY_TIMEOUT seconds and then cancels the job group right away.
        watchdog = threading.Timer(QUERY_TIMEOUT, cancel_query, args=(spark, 0, group_id))
        watchdog.daemon = True
        watchdog.start()

        df1 = run_query(spark, query)
        df1.show()
    finally:
        if watchdog is not None:
            watchdog.cancel()
        measure_thread.do_run = False

    diff_time = time.time() - start_time

    # Wait for the sampler to finish (outside the timed section) so the sample
    # list is no longer modified while it is being written out.
    measure_thread.join(timeout=5)
    samples = list(resource_usage)

    # Tolerate both "not found" conventions: a bare None or a tuple of Nones.
    metrics = extract_metrics(spark, group_id)
    execution, jobs, job_stages = metrics if metrics is not None else (None, None, None)

    with open(respath + f'/resource-usage-{run}.json', 'w') as f:
        f.write(json.dumps(samples, indent=2))
    with open(respath + f'/explain-{run}.txt', 'w') as f:
        f.write(explain_str(df1))

    resource_rows = [[s['time'], s['memory'].used, s['cpu_total']] for s in samples]
    resource_df = pd.DataFrame(resource_rows, columns=['time', 'memory_used', 'cpu_used'])
    resource_df.to_csv(respath + f'/resource-usage-{run}.csv')

    peak_memory = max((s['memory'].used for s in samples), default=0) / bytes_per_gb  # GB

    if execution is not None:
        with open(respath + f'/execution-{run}.json', 'w') as f:
            f.write(json.dumps(execution, indent=2))
        with open(respath + f'/jobs-{run}.json', 'w') as f:
            f.write(json.dumps(jobs, indent=2))
        with open(respath + f'/stages-{run}.json', 'w') as f:
            f.write(json.dumps(job_stages, indent=2))
    return (diff_time, peak_memory)

def benchmark(spark, dbname, query_file, mode, run):
    """Benchmark one query file in the given mode and return one result row.

    The directory ``benchmark-results-<dbname>/<query file name>/<mode>`` is
    created first. ``mode`` decides the value of
    ``spark.sql.yannakakis.enabled``: ``"opt"`` switches it on, ``"ref"``
    switches it off; for any other mode nothing is run and ``[]`` is returned.

    Returns:
        ``[query_name, runtime, peak_memory, mode, run]``. Runtime and peak
        memory are ``None`` when the query raised a ``Py4JError`` (which is
        printed as "timeout or error").
    """
    mode_statements = {
        "opt": "SET spark.sql.yannakakis.enabled = true",
        "ref": "SET spark.sql.yannakakis.enabled = false",
    }

    query_name = os.path.basename(query_file)
    respath = f'benchmark-results-{dbname}/' + query_name + "/" + mode
    pathlib.Path(respath).mkdir(parents=True, exist_ok=True)

    if mode not in mode_statements:
        return []
    spark.sql(mode_statements[mode]).show()

    try:
        runtime, peak_memory = benchmark_query(spark, query_file, respath, run)
    except Py4JError as e:
        print('timeout or error: ' + str(e))
        return [query_name, None, None, mode, run]
    return [query_name, runtime, peak_memory, mode, run]

def benchmark_all(dbname, mode, runs, queries, group_in_leaves=False, physical_cj=False, enable_unguarded=False):
    """Benchmark every query for every run and keep a cumulative results CSV.

    A Spark session is created, the database ``dbname`` is imported and four
    session settings are applied: whole-stage codegen (always ``true``) and
    the three yannakakis switches taken from the keyword arguments. Results
    are appended to ``benchmark-results-<dbname>/results-<mode>.csv`` (loaded
    first if it already exists); the file is rewritten and the table printed
    after every single query.

    Args:
        dbname: Database to import; also part of the result paths.
        mode: Passed on to ``benchmark`` and used in the results file name.
        runs: Run labels; all queries are executed once per label.
        queries: Paths of the SQL files to benchmark.
        group_in_leaves: Value for ``spark.sql.yannakakis.countGroupInLeaves``.
        physical_cj: Value for ``spark.sql.yannakakis.physicalCountJoinEnabled``.
        enable_unguarded: Value for ``spark.sql.yannakakis.unguardedEnabled``;
            unlike the other two switches it is not stored in the results.
    """
    result_columns = ['query', 'runtime', 'peak_memory', 'mode', 'run', 'group_leaves', 'physical_cj']

    def as_sql_bool(flag):
        """Render a Python truth value as SQL ``true``/``false`` text."""
        return "true" if flag else "false"

    spark = create_spark()
    import_db(spark, dbname)

    session_settings = (
        "SET spark.sql.codegen.wholeStage = true",
        "SET spark.sql.yannakakis.physicalCountJoinEnabled = " + as_sql_bool(physical_cj),
        "SET spark.sql.yannakakis.countGroupInLeaves = " + as_sql_bool(group_in_leaves),
        "SET spark.sql.yannakakis.unguardedEnabled = " + as_sql_bool(enable_unguarded),
    )
    for statement in session_settings:
        spark.sql(statement).show()

    results_file = f'benchmark-results-{dbname}/results-{mode}.csv'
    if os.path.exists(results_file):
        results_df = pd.read_csv(results_file, index_col=0)
    else:
        results_df = pd.DataFrame([], columns=result_columns)

    for run in runs:
        for q in queries:
            row = benchmark(spark, dbname, q, mode, run) + [group_in_leaves, physical_cj]
            new_df = pd.DataFrame([row], columns=result_columns)
            results_df = pd.concat([results_df, new_df], ignore_index=True)
            # Persist after every query so partial results survive a crash.
            results_df.to_csv(results_file)
            print(results_df)
    

In [10]:
# Session handle used by the inspection cells that follow.
spark = create_spark()

def show_all_table_columns(spark=None):
    """Print name, data type and comment of every column of every table.

    Tables are taken from ``SHOW TABLES``; each one is described with
    ``DESCRIBE TABLE``. A table that cannot be described is reported with an
    error line and the listing continues with the next table.

    Args:
        spark: Session to inspect. When omitted, the session is obtained from
            ``SparkSession.builder`` as before. Passing the existing session
            avoids re-running the builder, whose app name cannot take effect
            on a session that already exists.
    """
    if spark is None:
        spark = SparkSession.builder \
            .appName("ShowTableColumns") \
            .getOrCreate()

    # Get all tables visible to the session
    tables = spark.sql("SHOW TABLES").collect()

    for table_row in tables:
        table_name = table_row['tableName']

        print(f"\nTable: {table_name}")
        print("-" * 50)

        try:
            # Backtick-quote the name (doubling embedded backticks) so that
            # names with special characters can be described as well.
            quoted_name = '`' + table_name.replace('`', '``') + '`'
            columns = spark.sql(f"DESCRIBE TABLE {quoted_name}").collect()

            for col in columns:
                comment = col['comment'] or ''
                print(f"  {col['col_name']:<30} {col['data_type']:<20} {comment}")

        except Exception as e:
            print(f"  Error describing table: {e}")

# Print the columns of every table listed by SHOW TABLES.
show_all_table_columns()

25/09/19 19:27:50 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.



Table: aka_name
--------------------------------------------------
  id                             int                  
  person_id                      int                  
  name                           string               
  imdb_index                     string               
  name_pcode_cf                  string               
  name_pcode_nf                  string               
  surname_pcode                  string               
  md5sum                         string               

Table: aka_title
--------------------------------------------------
  id                             int                  
  movie_id                       int                  
  title                          string               
  imdb_index                     string               
  kind_id                        int                  
  production_year                int                  
  phonetic_code                  string               
  episode_of_id                  int  

In [12]:
# List the tables/views currently known to the session.
spark.sql("SHOW TABLES").show()

+---------+---------------+-----------+
|namespace|      tableName|isTemporary|
+---------+---------------+-----------+
|         |       aka_name|       true|
|         |      aka_title|       true|
|         |      cast_info|       true|
|         |      char_name|       true|
|         | comp_cast_type|       true|
|         |   company_name|       true|
|         |   company_type|       true|
|         |  complete_cast|       true|
|         |      info_type|       true|
|         |        keyword|       true|
|         |      kind_type|       true|
|         |      link_type|       true|
|         |movie_companies|       true|
|         |     movie_info|       true|
|         |  movie_keyword|       true|
|         |     movie_link|       true|
|         |           name|       true|
|         |    person_info|       true|
|         |      role_type|       true|
|         |          title|       true|
+---------+---------------+-----------+



## SNAP Benchmark

### Optimized execution

In [None]:
# SNAP benchmark, mode 'opt': for every data set, collect its query files with
# glob and run them through benchmark_all with physical_cj=True.
# NOTE(review): group_in_leaves is assigned here but not passed to
# benchmark_all, which therefore uses its default (False).
#### benchmark configuration
group_in_leaves = False
dbname = 'snap'
mode = 'opt'
runs = ['1', '2', '3', '4', '5', '6']
#runs = ['1']
####

tables = ['patents', 'wiki', 'google', 'dblp']
#tables = ['wiki']


# One benchmark_all call per data set; each call creates/fetches the session
# and imports the database again.
for tablename in tables:
    queries = sorted(glob.glob(f'snap-queries/all/{tablename}-*'))
    print('running queries: ' + str(queries))
    benchmark_all(dbname, mode, runs, queries, physical_cj=True)



### Reference execution

In [None]:
# SNAP benchmark, mode 'ref': runs an explicit list of query files with the
# default benchmark_all switches (all three keyword flags False).
# NOTE(review): group_in_leaves is assigned but not passed to benchmark_all.
#### benchmark configuration
group_in_leaves = False
dbname = 'snap'
mode = 'ref'
runs = ['1', '2', '3', '4', '5', '6']
####

# Hand-picked subset of the files under snap-queries/all/.
queries = ['snap-queries/all/patents-path02.sql',
          'snap-queries/all/patents-path03.sql',
          'snap-queries/all/patents-path04.sql',
          'snap-queries/all/patents-path05.sql',
          'snap-queries/all/patents-tree01.sql',
          'snap-queries/all/wiki-path02.sql',
           'snap-queries/all/google-path02.sql',
           'snap-queries/all/google-path03.sql',
           'snap-queries/all/google-path04.sql',
           'snap-queries/all/dblp-path02.sql',
           'snap-queries/all/dblp-path03.sql',
           'snap-queries/all/dblp-path04.sql',
           'snap-queries/all/dblp-path05.sql',
           'snap-queries/all/dblp-tree01.sql',
           'snap-queries/all/dblp-tree02.sql'
          ]


print('running queries: ' + str(queries))

benchmark_all(dbname, mode, runs, queries)

## LSQB Benchmark

In [None]:
# LSQB benchmark. Only one benchmark_all call is active; the commented-out
# calls are earlier run configurations.
# NOTE(review): group_in_leaves, physical_cj, runs, queries and queries_hints
# are assigned, but the active call below passes literal arguments, so none of
# them influences what is executed. `runs` is also assigned twice; only the
# second value survives.
#### benchmark configuration
dbname = 'lsqb'
group_in_leaves = False
physical_cj = True
#mode = 'opt'
runs = ['1', '2', '3', '4', '5', '6']
runs = ['1', '2']
####

queries = ['lsqb/sql/q1.sql', 'lsqb/sql/q4.sql']
queries_hints = ['lsqb/sql/q1-hint.sql', 'lsqb/sql/q4-hint.sql']

# NOTE(review): this prints `queries`, which differs from the list passed to
# the active benchmark_all call further down.
print('running queries: ' + str(queries))
#benchmark_all(dbname, 'opt', runs, queries, group_in_leaves=False, physical_cj=True)
#benchmark_all(dbname, 'opt', runs, queries_hints, group_in_leaves=False, physical_cj=True)

#benchmark_all(dbname, 'opt', ['1'], ['lsqb/sql/q4.sql', 'lsqb/sql/q4-hint.sql'], group_in_leaves=False, physical_cj=True)
#benchmark_all(dbname, 'opt', ['3', '4', '5', '6'], ['lsqb/sql/q4.sql', 'lsqb/sql/q4-hint.sql'], group_in_leaves=False, physical_cj=True)
#benchmark_all(dbname, 'opt', ['1', '2', '3', '4', '5', '6'], ['lsqb/sql/q4.sql', 'lsqb/sql/q4-hint.sql'], group_in_leaves=False, physical_cj=False)

#benchmark_all(dbname, 'opt', ['1', '2', '3', '4', '5', '6'], ['lsqb/sql/q1-hint.sql'], group_in_leaves=False, physical_cj=False)
#benchmark_all(dbname, 'opt', ['1', '2', '3', '4', '5', '6'], ['lsqb/sql/q1-hint.sql'], group_in_leaves=False, physical_cj=True)
#benchmark_all(dbname, 'opt', ['3', '4', '5', '6'], ['lsqb/sql/q1.sql'], group_in_leaves=False, physical_cj=False)
#benchmark_all(dbname, 'opt', ['3', '4', '5', '6'], ['lsqb/sql/q1.sql'], group_in_leaves=False, physical_cj=True)
# Active run: q1 in 'ref' mode for runs 4-6.
benchmark_all(dbname, 'ref', ['4', '5', '6'], ['lsqb/sql/q1.sql'])


#benchmark_all(dbname, 'opt', runs, queries_hints, group_in_leaves=False, physical_cj=True)
#benchmark_all(dbname, 'ref', ['3', '4', '5', '6'], ['lsqb/sql/q4.sql'])

## TPC-H Benchmark

In [None]:
# TPC-H benchmark. Only one benchmark_all call is active; the commented-out
# lines are earlier query selections and run configurations.
# NOTE(review): `runs` is assigned but the active call passes ['x'] instead.
#### benchmark configuration
group_in_leaves = False
dbname = 'tpch'
runs = ['1', '2', '3', '4', '5', '6']
#runs = ['x1', 'x2']
####

# NOTE(review): this list (and the += below) is replaced by the later
# `queries = [...]` assignment before it is ever used.
queries = ['tpch-kit/dbgen/queries/postgres/2.sql',
           'tpch-kit/dbgen/queries/postgres/11.sql', 
           'tpch-kit/dbgen/queries/postgres/11-hint.sql']
queries += ['tpch-queries/median-1.sql', 'tpch-queries/median-1-hint.sql']

#queries = ['tpch-queries/median-1.sql', 'tpch-queries/median-1-hint.sql' ]
#queries = ['tpch-kit/dbgen/queries/postgres/11.sql', 
#           'tpch-kit/dbgen/queries/postgres/11-hint.sql']
#queries = ['tpch-queries/2-subq.sql'] #, 'tpch-queries/2-subq-hint.sql']

#queries = sorted(glob.glob('tpch-kit/dbgen/queries/*.sql'))

# Effective selection: only query 18.
queries = ['tpch-kit/dbgen/queries/postgres/18.sql']

print('running queries: ' + str(queries))
#benchmark_all(dbname, 'ref', ['x'], queries)
# Active run: 'opt' mode, run label 'x', physical count join and unguarded
# rewriting enabled.
benchmark_all(dbname, 'opt', ['x'], queries, group_in_leaves = group_in_leaves, physical_cj=True, enable_unguarded=True)
#benchmark_all(dbname, 'ref', ['3', '4', '5', '6'], queries)
#benchmark_all(dbname, 'opt', ['3', '4', '5', '6'], queries, group_in_leaves = group_in_leaves, physical_cj=True)
#benchmark_all(dbname, 'opt', ['1', '2', '3', '4', '5', '6'], queries, group_in_leaves = group_in_leaves, physical_cj=False)
#benchmark_all(dbname, 'opt', runs, queries, group_in_leaves = group_in_leaves, physical_cj=False)


In [None]:
## JOB (IMDB) Benchmark

In [9]:
# NOTE(review): this cell carries execution count 9, lower than the count (10)
# of the cell that assigns `spark`, and its recorded output is a NameError
# for `spark`.
spark.sql("SHOW TABLES")

NameError: name 'spark' is not defined

In [None]:
# JOB (IMDB) benchmark: runs the selected queries first in 'opt' mode (with
# physical count join and unguarded rewriting enabled) and then in 'ref' mode.
# NOTE(review): group_in_leaves and runs are assigned but not used by the
# active calls, which pass ["1", "2"] as run labels.
#### benchmark configuration
group_in_leaves = False
dbname = 'imdb'
runs = ['1', '2', '3', '4', '5', '6']
####

# NOTE(review): this explicit list is replaced by the glob result below
# before it is used.
queries = ['job/2a.sql', 'job/2b.sql', 'job/2c.sql', 'job/2d.sql',
           'job/3a.sql', 'job/3b.sql', 'job/3c.sql',
           'job/5a.sql', 'job/5b.sql', 'job/5c.sql',
           'job/17a.sql', 'job/17b.sql', 'job/17c.sql', 'job/17d.sql', 'job/17e.sql', 'job/17f.sql',
           'job/20a.sql', 'job/20b.sql', 'job/20c.sql',
          ]

# Effective selection: all files matching job/unguarded/*-unguarded-7.sql.
queries = sorted(glob.glob('job/unguarded/*-unguarded-7.sql'))

#queries = ['job/29a-unguarded.sql']
#queries = ['job/21a-unguarded.sql']

print('running queries: ' + str(queries))
benchmark_all(dbname, 'opt', ["1", "2"], queries, physical_cj=True, enable_unguarded=True)
#benchmark_all(dbname, 'opt', ['1'], queries, physical_cj=True, enable_unguarded=True)

#benchmark_all(dbname, 'ref', runs, queries)
benchmark_all(dbname, 'ref', ["1", "2"], queries)

running queries: ['job/unguarded/11a-unguarded-7.sql', 'job/unguarded/12a-unguarded-7.sql', 'job/unguarded/13a-unguarded-7.sql', 'job/unguarded/14a-unguarded-7.sql', 'job/unguarded/15a-unguarded-7.sql', 'job/unguarded/16a-unguarded-7.sql', 'job/unguarded/19a-unguarded-7.sql', 'job/unguarded/20a-unguarded-7.sql', 'job/unguarded/21a-unguarded-7.sql', 'job/unguarded/22a-unguarded-7.sql', 'job/unguarded/23a-unguarded-7.sql', 'job/unguarded/24a-unguarded-7.sql', 'job/unguarded/25a-unguarded-7.sql', 'job/unguarded/26a-unguarded-7.sql', 'job/unguarded/27a-unguarded-7.sql', 'job/unguarded/28a-unguarded-7.sql', 'job/unguarded/29a-unguarded-7.sql', 'job/unguarded/30a-unguarded-7.sql', 'job/unguarded/31a-unguarded-7.sql', 'job/unguarded/33a-unguarded-7.sql', 'job/unguarded/7a-unguarded-7.sql', 'job/unguarded/9a-unguarded-7.sql']
aka_name
aka_title
cast_info
char_name
comp_cast_type
company_name
company_type
complete_cast
info_type
keyword
kind_type
link_type
movie_companies
movie_info
movie_keywo

25/09/22 15:10:24 WARN RewriteJoinsAsSemijoins: applying rewriting to join: Aggregate [name_pcode_nf#80933, kind#80944, phonetic_code#80961, id#80973, id#80993, id#80999, md5sum#81050], [toprettystring(min(name#80930), Some(GMT)) AS toprettystring(from_company)#81216, toprettystring(min(link#80970), Some(GMT)) AS toprettystring(movie_link_type)#81217, toprettystring(min(title#81040), Some(GMT)) AS toprettystring(non_polish_sequel_movie)#81218, toprettystring(name_pcode_nf#80933, Some(GMT)) AS toprettystring(name_pcode_nf)#81219, toprettystring(kind#80944, Some(GMT)) AS toprettystring(kind)#81220, toprettystring(phonetic_code#80961, Some(GMT)) AS toprettystring(phonetic_code)#81221, toprettystring(id#80973, Some(GMT)) AS toprettystring(id)#81222, toprettystring(id#80993, Some(GMT)) AS toprettystring(id)#81223, toprettystring(id#80999, Some(GMT)) AS toprettystring(id)#81224, toprettystring(md5sum#81050, Some(GMT)) AS toprettystring(md5sum)#81225]
+- Project [name#80930, name_pcode_nf#809

+--------------------+---------------+-----------------------+-------------+--------------------+-------------+-------+-------+------+--------------------+
|        from_company|movie_link_type|non_polish_sequel_movie|name_pcode_nf|                kind|phonetic_code|     id|     id|    id|              md5sum|
+--------------------+---------------+-----------------------+-------------+--------------------+-------------+-------+-------+------+--------------------+
|    AIC Animate Film|    followed by|           Tenchi Muyô!|        A2531|production companies|          S24|3293196|1515689|474231|55d5a3904f433c2ac...|
|    AIC Animate Film|    followed by|           Tenchi Muyô!|        A2531|production companies|          S24|3293196|1515689|474232|55d5a3904f433c2ac...|
|    AIC Animate Film|    followed by|           Tenchi Muyô!|        A2531|production companies|          S24|3293196|1515689|474233|55d5a3904f433c2ac...|
|      Atlantis Films|    followed by|      The Twilight Zone|  

25/09/22 15:10:34 WARN RewriteJoinsAsSemijoins: applying rewriting to join: Aggregate [name_pcode_nf#80933, kind#80944, phonetic_code#80961, id#80973, id#80993, id#80999, md5sum#81050], [min(name#80930) AS from_company#81190, min(link#80970) AS movie_link_type#81191, min(title#81040) AS non_polish_sequel_movie#81192, name_pcode_nf#80933, kind#80944, phonetic_code#80961, id#80973, id#80993, id#80999, md5sum#81050]
+- Project [name#80930, name_pcode_nf#80933, kind#80944, phonetic_code#80961, link#80970, id#80973, id#80993, id#80999, title#81040, md5sum#81050]
   +- Join Inner, (((movie_id#81000 = id#81039) AND (id#81039 = movie_id#80994)) AND (id#81039 = movie_id#80974))
      :- Project [name#80930, name_pcode_nf#80933, id#80973, movie_id#80974, kind#80944, id#80993, movie_id#80994, phonetic_code#80961, id#80999, movie_id#81000, link#80970]
      :  +- Join Inner, (id#80969 = link_type_id#81002)
      :     :- Join Inner, ((movie_id#81000 = movie_id#80994) AND (movie_id#81000 = movie_id

                   query    runtime  peak_memory mode run  group_leaves  \
0    10a-unguarded-2.sql   7.204246    16.099049  opt   1         False   
1    11a-unguarded-2.sql   5.739406    16.705421  opt   1         False   
2    12a-unguarded-2.sql  22.504743    20.701471  opt   1         False   
3    13a-unguarded-2.sql  38.679313    27.703075  opt   1         False   
4    14a-unguarded-2.sql   8.781767    26.621153  opt   1         False   
..                   ...        ...          ...  ...  ..           ...   
285  31a-unguarded-6.sql  30.779052    20.115042  opt   2         False   
286  33a-unguarded-6.sql  32.043667    22.829953  opt   2         False   
287   7a-unguarded-6.sql  40.177852    23.151915  opt   2         False   
288   9a-unguarded-6.sql  12.481005    23.160967  opt   2         False   
289  11a-unguarded-7.sql   9.048557    15.866085  opt   1         False   

     physical_cj  
0           True  
1           True  
2           True  
3           True  
4   

25/09/22 15:10:35 WARN RewriteJoinsAsSemijoins: applying rewriting to join: Aggregate [name_pcode_nf#80933, kind#80944, info#80956, info#81432, id#80973, id#80983, id#81433], [toprettystring(min(name#80930), Some(GMT)) AS toprettystring(movie_company)#81461, toprettystring(min(info#81436), Some(GMT)) AS toprettystring(rating)#81462, toprettystring(min(title#81040), Some(GMT)) AS toprettystring(drama_horror_movie)#81463, toprettystring(name_pcode_nf#80933, Some(GMT)) AS toprettystring(name_pcode_nf)#81464, toprettystring(kind#80944, Some(GMT)) AS toprettystring(kind)#81465, toprettystring(info#80956, Some(GMT)) AS toprettystring(info)#81466, toprettystring(info#81432, Some(GMT)) AS toprettystring(info)#81467, toprettystring(id#80973, Some(GMT)) AS toprettystring(id)#81468, toprettystring(id#80983, Some(GMT)) AS toprettystring(id)#81469, toprettystring(id#81433, Some(GMT)) AS toprettystring(id)#81470]
+- Project [name#80930, name_pcode_nf#80933, kind#80944, info#80956, info#81432, id#809

+--------------------+------+--------------------+-------------+--------------------+------+------+-------+--------+--------+
|       movie_company|rating|  drama_horror_movie|name_pcode_nf|                kind|  info|  info|     id|      id|      id|
+--------------------+------+--------------------+-------------+--------------------+------+------+-------+--------+--------+
|          ABC Family|   8.2|       The Middleman|        A1215|production companies|genres|rating|3427406|10157465|26576360|
|         ABC Studios|   8.1|      Criminal Minds|        A1232|production companies|genres|rating|2318567|10008912|25796669|
|         ABC Studios|   8.1|Lost: Missing Pieces|        A1232|production companies|genres|rating|2840572|10078997|26142395|
|Abeselom Productions|   8.2|13 Months of Suns...|        A1245|production companies|genres|rating|3702217|10199522|26753933|
|AspireHigher Prod...|   8.1|And Then She Was ...|        A2162|production companies|genres|rating|3770499|10340398|26

25/09/22 15:11:06 WARN RewriteJoinsAsSemijoins: applying rewriting to join: Aggregate [name_pcode_nf#80933, kind#80944, info#80956, info#81432, id#80973, id#80983, id#81433], [min(name#80930) AS movie_company#81428, min(info#81436) AS rating#81429, min(title#81040) AS drama_horror_movie#81430, name_pcode_nf#80933, kind#80944, info#80956, info#81432, id#80973, id#80983, id#81433]
+- Project [name#80930, name_pcode_nf#80933, kind#80944, info#80956, info#81432, id#80973, id#80983, id#81433, info#81436, title#81040]
   +- Join Inner, (((id#81039 = movie_id#80984) AND (id#81039 = movie_id#81434)) AND (id#81039 = movie_id#80974))
      :- Project [name#80930, name_pcode_nf#80933, id#80973, movie_id#80974, kind#80944, id#80983, movie_id#80984, info#80956, id#81433, movie_id#81434, info#81436, info#81432]
      :  +- Join Inner, (info_type_id#81435 = id#81431)
      :     :- Join Inner, ((movie_id#80974 = movie_id#81434) AND (movie_id#80984 = movie_id#81434))
      :     :  :- Project [name#80

                   query    runtime  peak_memory mode run  group_leaves  \
0    10a-unguarded-2.sql   7.204246    16.099049  opt   1         False   
1    11a-unguarded-2.sql   5.739406    16.705421  opt   1         False   
2    12a-unguarded-2.sql  22.504743    20.701471  opt   1         False   
3    13a-unguarded-2.sql  38.679313    27.703075  opt   1         False   
4    14a-unguarded-2.sql   8.781767    26.621153  opt   1         False   
..                   ...        ...          ...  ...  ..           ...   
286  33a-unguarded-6.sql  32.043667    22.829953  opt   2         False   
287   7a-unguarded-6.sql  40.177852    23.151915  opt   2         False   
288   9a-unguarded-6.sql  12.481005    23.160967  opt   2         False   
289  11a-unguarded-7.sql   9.048557    15.866085  opt   1         False   
290  12a-unguarded-7.sql  30.399519    21.604553  opt   1         False   

     physical_cj  
0           True  
1           True  
2           True  
3           True  
4   

25/09/22 15:11:06 WARN RewriteJoinsAsSemijoins: applying rewriting to join: Aggregate [name_pcode_nf#80933, kind#80944, info#80956, info#81670, kind#80966, id#80973, id#80983], [toprettystring(min(info#80986), Some(GMT)) AS toprettystring(release_date)#81699, toprettystring(min(info#81674), Some(GMT)) AS toprettystring(rating)#81700, toprettystring(min(title#81040), Some(GMT)) AS toprettystring(german_movie)#81701, toprettystring(name_pcode_nf#80933, Some(GMT)) AS toprettystring(name_pcode_nf)#81702, toprettystring(kind#80944, Some(GMT)) AS toprettystring(kind)#81703, toprettystring(info#80956, Some(GMT)) AS toprettystring(info)#81704, toprettystring(info#81670, Some(GMT)) AS toprettystring(info)#81705, toprettystring(kind#80966, Some(GMT)) AS toprettystring(kind)#81706, toprettystring(id#80973, Some(GMT)) AS toprettystring(id)#81707, toprettystring(id#80983, Some(GMT)) AS toprettystring(id)#81708]
+- Project [name_pcode_nf#80933, kind#80944, info#80956, info#81670, kind#80966, id#8097

+--------------------+------+--------------------+-------------+--------------------+------+-------------+-----+-------+--------+
|        release_date|rating|        german_movie|name_pcode_nf|                kind|  info|         info| kind|     id|      id|
+--------------------+------+--------------------+-------------+--------------------+------+-------------+-----+-------+--------+
|West Germany:22 A...|   4.9|Am Ufer der Dämme...|        A1213|production companies|rating|release dates|movie|3761058|23622107|
|West Germany:26 A...|   7.6|       Abrahams Gold|        A1231|production companies|rating|release dates|movie|3739178|23586228|
|Germany:12 Septem...|   7.6|       Abrahams Gold|        A1231|production companies|rating|release dates|movie|3739178|23586229|
|Canada:30 August ...|   6.6|              Anansi|        A1231|production companies|rating|release dates|movie|3769747|23636132|
|USA:16 November 2002|   6.6|              Anansi|        A1231|production companies|ratin

25/09/22 15:11:56 WARN RewriteJoinsAsSemijoins: applying rewriting to join: Aggregate [name_pcode_nf#80933, kind#80944, info#80956, info#81670, kind#80966, id#80973, id#80983], [min(info#80986) AS release_date#81666, min(info#81674) AS rating#81667, min(title#81040) AS german_movie#81668, name_pcode_nf#80933, kind#80944, info#80956, info#81670, kind#80966, id#80973, id#80983]
+- Project [name_pcode_nf#80933, kind#80944, info#80956, info#81670, kind#80966, id#80973, id#80983, info#80986, info#81674, title#81040]
   +- Join Inner, (id#80965 = kind_id#81042)
      :- Project [name_pcode_nf#80933, id#80973, kind#80944, id#80983, info#80986, info#81670, info#81674, info#80956, title#81040, kind_id#81042]
      :  +- Join Inner, (((movie_id#80984 = id#81039) AND (movie_id#80974 = id#81039)) AND (movie_id#81672 = id#81039))
      :     :- Project [name_pcode_nf#80933, id#80973, movie_id#80974, kind#80944, id#80983, movie_id#80984, info#80986, info#81670, movie_id#81672, info#81674, info#80956

                   query    runtime  peak_memory mode run  group_leaves  \
0    10a-unguarded-2.sql   7.204246    16.099049  opt   1         False   
1    11a-unguarded-2.sql   5.739406    16.705421  opt   1         False   
2    12a-unguarded-2.sql  22.504743    20.701471  opt   1         False   
3    13a-unguarded-2.sql  38.679313    27.703075  opt   1         False   
4    14a-unguarded-2.sql   8.781767    26.621153  opt   1         False   
..                   ...        ...          ...  ...  ..           ...   
287   7a-unguarded-6.sql  40.177852    23.151915  opt   2         False   
288   9a-unguarded-6.sql  12.481005    23.160967  opt   2         False   
289  11a-unguarded-7.sql   9.048557    15.866085  opt   1         False   
290  12a-unguarded-7.sql  30.399519    21.604553  opt   1         False   
291  13a-unguarded-7.sql  49.207962    27.261850  opt   1         False   

     physical_cj  
0           True  
1           True  
2           True  
3           True  
4   

25/09/22 15:11:56 WARN RewriteJoinsAsSemijoins: applying rewriting to join: Aggregate [info#80956, info#81913, phonetic_code#80961, kind#80966, id#80983, id#81914, id#80993], [toprettystring(min(info#81917), Some(GMT)) AS toprettystring(rating)#81939, toprettystring(min(title#81040), Some(GMT)) AS toprettystring(northern_dark_movie)#81940, toprettystring(info#80956, Some(GMT)) AS toprettystring(info)#81941, toprettystring(info#81913, Some(GMT)) AS toprettystring(info)#81942, toprettystring(phonetic_code#80961, Some(GMT)) AS toprettystring(phonetic_code)#81943, toprettystring(kind#80966, Some(GMT)) AS toprettystring(kind)#81944, toprettystring(id#80983, Some(GMT)) AS toprettystring(id)#81945, toprettystring(id#81914, Some(GMT)) AS toprettystring(id)#81946, toprettystring(id#80993, Some(GMT)) AS toprettystring(id)#81947]
+- Project [info#80956, info#81913, phonetic_code#80961, kind#80966, id#80983, id#81914, info#81917, id#80993, title#81040]
   +- Join Inner, (id#80965 = kind_id#81042)


+------+--------------------+---------+------+-------------+-----+-------+--------+-------+
|rating| northern_dark_movie|     info|  info|phonetic_code| kind|     id|      id|     id|
+------+--------------------+---------+------+-------------+-----+-------+--------+-------+
|   3.1|             #Horror|countries|rating|          B43|movie|8477603|26748995|1961481|
|   7.1|     #halloweenparty|countries|rating|          B43|movie|8477957|26749154|1962130|
|   5.5|                  +1|countries|rating|          B43|movie|8479416|26750741|1968686|
|   7.2| 10 Cloverfield Lane|countries|rating|          B43|movie|8480705|26751989|1974117|
|   4.0|100 Ghost Street:...|countries|rating|          B43|movie|8481176|26752379|1977580|
|   8.1|    12 Years a Slave|countries|rating|          B43|movie|8482553|26753615|1982667|
|   7.3|            13 Hours|countries|rating|          B43|movie|8482866|26753918|1984227|
|   8.2|                13th|countries|rating|          B43|movie|8483071|267541

25/09/22 15:12:08 WARN RewriteJoinsAsSemijoins: applying rewriting to join: Aggregate [info#80956, info#81913, phonetic_code#80961, kind#80966, id#80983, id#81914, id#80993], [min(info#81917) AS rating#81910, min(title#81040) AS northern_dark_movie#81911, info#80956, info#81913, phonetic_code#80961, kind#80966, id#80983, id#81914, id#80993]
+- Project [info#80956, info#81913, phonetic_code#80961, kind#80966, id#80983, id#81914, info#81917, id#80993, title#81040]
   +- Join Inner, (id#80965 = kind_id#81042)
      :- Project [info#80956, id#80983, id#81914, info#81917, info#81913, id#80993, phonetic_code#80961, title#81040, kind_id#81042]
      :  +- Join Inner, (((id#81039 = movie_id#80984) AND (id#81039 = movie_id#80994)) AND (id#81039 = movie_id#81915))
      :     :- Project [info#80956, id#80983, movie_id#80984, id#81914, movie_id#81915, info#81917, info#81913, id#80993, movie_id#80994, phonetic_code#80961]
      :     :  +- Join Inner, (id#80959 = keyword_id#80995)
      :     :   

                   query    runtime  peak_memory mode run  group_leaves  \
0    10a-unguarded-2.sql   7.204246    16.099049  opt   1         False   
1    11a-unguarded-2.sql   5.739406    16.705421  opt   1         False   
2    12a-unguarded-2.sql  22.504743    20.701471  opt   1         False   
3    13a-unguarded-2.sql  38.679313    27.703075  opt   1         False   
4    14a-unguarded-2.sql   8.781767    26.621153  opt   1         False   
..                   ...        ...          ...  ...  ..           ...   
288   9a-unguarded-6.sql  12.481005    23.160967  opt   2         False   
289  11a-unguarded-7.sql   9.048557    15.866085  opt   1         False   
290  12a-unguarded-7.sql  30.399519    21.604553  opt   1         False   
291  13a-unguarded-7.sql  49.207962    27.261850  opt   1         False   
292  14a-unguarded-7.sql  11.404743    27.180163  opt   1         False   

     physical_cj  
0           True  
1           True  
2           True  
3           True  
4   

25/09/22 15:12:09 WARN RewriteJoinsAsSemijoins: applying rewriting to join: Aggregate [id#80873, name_pcode_nf#80933, kind#80944, info#80956, phonetic_code#80961, id#80973, id#80983], [toprettystring(min(info#80986), Some(GMT)) AS toprettystring(release_date)#82134, toprettystring(min(title#81040), Some(GMT)) AS toprettystring(internet_movie)#82135, toprettystring(id#80873, Some(GMT)) AS toprettystring(id)#82136, toprettystring(name_pcode_nf#80933, Some(GMT)) AS toprettystring(name_pcode_nf)#82137, toprettystring(kind#80944, Some(GMT)) AS toprettystring(kind)#82138, toprettystring(info#80956, Some(GMT)) AS toprettystring(info)#82139, toprettystring(phonetic_code#80961, Some(GMT)) AS toprettystring(phonetic_code)#82140, toprettystring(id#80973, Some(GMT)) AS toprettystring(id)#82141, toprettystring(id#80983, Some(GMT)) AS toprettystring(id)#82142]
+- Project [id#80873, name_pcode_nf#80933, kind#80944, info#80956, phonetic_code#80961, id#80973, id#80983, info#80986, title#81040]
   +- Jo

+---------------+--------------+----+-------------+------------+-------------+-------------+-----+--------+
|   release_date|internet_movie|  id|name_pcode_nf|        kind|         info|phonetic_code|   id|      id|
+---------------+--------------+----+-------------+------------+-------------+-------------+-----+--------+
|USA:17 May 2004|    Ass Parade|3507|        B5216|distributors|release dates|         A542|48374|20389608|
|USA:17 May 2004|    Ass Parade|3507|        B5216|distributors|release dates|        A5452|48374|20389608|
|USA:17 May 2004|    Ass Parade|3507|        B5216|distributors|release dates|         B421|48374|20389608|
|USA:17 May 2004|    Ass Parade|3507|        B5216|distributors|release dates|        B6232|48374|20389608|
|USA:17 May 2004|    Ass Parade|3507|        B5216|distributors|release dates|        C4362|48374|20389608|
|USA:17 May 2004|    Ass Parade|3507|        B5216|distributors|release dates|         C523|48374|20389608|
|USA:17 May 2004|    Ass Par

25/09/22 15:12:19 WARN RewriteJoinsAsSemijoins: applying rewriting to join: Aggregate [id#80873, name_pcode_nf#80933, kind#80944, info#80956, phonetic_code#80961, id#80973, id#80983], [min(info#80986) AS release_date#82112, min(title#81040) AS internet_movie#82113, id#80873, name_pcode_nf#80933, kind#80944, info#80956, phonetic_code#80961, id#80973, id#80983]
+- Project [id#80873, name_pcode_nf#80933, kind#80944, info#80956, phonetic_code#80961, id#80973, id#80983, info#80986, title#81040]
   +- Join Inner, ((((id#81039 = movie_id#80874) AND (id#81039 = movie_id#80984)) AND (id#81039 = movie_id#80994)) AND (id#81039 = movie_id#80974))
      :- Project [id#80873, movie_id#80874, id#80973, movie_id#80974, name_pcode_nf#80933, kind#80944, id#80983, movie_id#80984, info#80986, info#80956, movie_id#80994, phonetic_code#80961]
      :  +- Join Inner, (id#80959 = keyword_id#80995)
      :     :- Join Inner, (((movie_id#80994 = movie_id#80984) AND (movie_id#80994 = movie_id#80974)) AND (movie_

                   query    runtime  peak_memory mode run  group_leaves  \
0    10a-unguarded-2.sql   7.204246    16.099049  opt   1         False   
1    11a-unguarded-2.sql   5.739406    16.705421  opt   1         False   
2    12a-unguarded-2.sql  22.504743    20.701471  opt   1         False   
3    13a-unguarded-2.sql  38.679313    27.703075  opt   1         False   
4    14a-unguarded-2.sql   8.781767    26.621153  opt   1         False   
..                   ...        ...          ...  ...  ..           ...   
289  11a-unguarded-7.sql   9.048557    15.866085  opt   1         False   
290  12a-unguarded-7.sql  30.399519    21.604553  opt   1         False   
291  13a-unguarded-7.sql  49.207962    27.261850  opt   1         False   
292  14a-unguarded-7.sql  11.404743    27.180163  opt   1         False   
293  15a-unguarded-7.sql   9.031199    15.464518  opt   1         False   

     physical_cj  
0           True  
1           True  
2           True  
3           True  
4   

25/09/22 15:12:19 WARN RewriteJoinsAsSemijoins: applying rewriting to join: Aggregate [name_pcode_cf#80861, nr_order#80902, name_pcode_nf#80933, phonetic_code#80961, id#80973, id#80993, md5sum#81015], [toprettystring(min(name#80859), Some(GMT)) AS toprettystring(cool_actor_pseudonym)#82337, toprettystring(min(title#81040), Some(GMT)) AS toprettystring(series_named_after_char)#82338, toprettystring(name_pcode_cf#80861, Some(GMT)) AS toprettystring(name_pcode_cf)#82339, toprettystring(nr_order#80902, Some(GMT)) AS toprettystring(nr_order)#82340, toprettystring(name_pcode_nf#80933, Some(GMT)) AS toprettystring(name_pcode_nf)#82341, toprettystring(phonetic_code#80961, Some(GMT)) AS toprettystring(phonetic_code)#82342, toprettystring(id#80973, Some(GMT)) AS toprettystring(id)#82343, toprettystring(id#80993, Some(GMT)) AS toprettystring(id)#82344, toprettystring(md5sum#81015, Some(GMT)) AS toprettystring(md5sum)#82345]
+- Project [name#80859, name_pcode_cf#80861, nr_order#80902, name_pcode_n

## STATS Benchmark

In [None]:
#### benchmark configuration
# Database name passed to benchmark_all for the STATS workload.
dbname = 'stats'
#mode = 'opt'
# Run labels passed to benchmark_all as a list.
runs = ['1', '2', '3', '4', '5', '6']
#runs = ['04']
#runs = ['01']
####

# Plain queries and their hinted variants. glob returns paths in arbitrary
# order, so both lists are sorted to keep the execution order stable.
queries = sorted(glob.glob('stats-queries/*.sql'))
queries_hint = sorted(glob.glob('stats-queries/hints/*.sql'))

# Log the list that is actually benchmarked below. The previous version
# printed `queries` although only `queries_hint` is executed.
print('running queries: ' + str(queries_hint))

# Inactive alternatives (plain queries in 'opt' mode / reference run):
#benchmark_all(dbname, 'opt', runs, queries, physical_cj=True)
#benchmark_all(dbname, 'ref', runs, queries)
benchmark_all(dbname, 'opt', runs, queries_hint, group_in_leaves=True, physical_cj=True)

## Hetionet Benchmark

In [None]:
# ---- Hetionet benchmark configuration ----
dbname = 'hetio'
# Run labels '1' .. '6', passed to benchmark_all as a list of strings.
runs = [str(run_no) for run_no in range(1, 7)]
# Alternatives kept for manual experiments:
#mode = 'opt'
#runs = ['04']
#runs = ['01']
# -------------------------------------------

# Collect every query file of the hetio workload in a stable (sorted) order.
queries = glob.glob('hetio/*.sql')
queries.sort()
# Single-query variant for debugging:
#queries = ['hetio/CtDpSpD.sql']

print(f'running queries: {queries}')

# 'opt' mode without grouping in leaves, once with physical_cj on and once off,
# followed by the 'ref' run with benchmark_all's default options.
#benchmark_all(dbname, 'opt', runs, queries, physical_cj=True)
benchmark_all(dbname, 'opt', runs, queries, group_in_leaves=False, physical_cj=True)
benchmark_all(dbname, 'opt', runs, queries, group_in_leaves=False, physical_cj=False)
benchmark_all(dbname, 'ref', runs, queries)

In [None]:
# Obtain the Spark session from create_spark(), which ends in getOrCreate()
# and therefore reuses an already running session if there is one.
spark = create_spark()
# Inactive alternative: import the 'stats' database instead.
#import_db(spark, 'stats')
# Import the 'lsqb' database into this session via the notebook's import_db
# helper (defined in an earlier cell; presumably registers its tables so the
# SQL cells below can query them - TODO confirm).
import_db(spark, 'lsqb')

In [None]:
# Manual single-query run for debugging: adjust logging, set the optimizer
# flag, execute one query and print its plan.

# Set the SparkContext log level to INFO.
spark.sparkContext.setLogLevel("INFO")
# Set spark.sql.yannakakis.enabled to false and display the SET command's result.
spark.sql("SET spark.sql.yannakakis.enabled = false").show()
# NOTE(review): the commented-out line below is identical to the active one;
# it was presumably meant to be the `= true` toggle - confirm.
#spark.sql("SET spark.sql.yannakakis.enabled = false").show()
# Inactive alternative queries:
#df = run_query(spark, 'stats-queries/142-135.sql')
#df = run_query(spark, 'stats-queries/hints/142-135-hint.sql')
#df = run_query(spark, 'stats-queries/hints/141-068-hint.sql')
# Run the selected query through the notebook's run_query helper and show its rows.
df = run_query(spark, 'lsqb/sql/q1.sql')
df.show()
# Print the plan text produced by the notebook's explain_str helper for this DataFrame.
print(explain_str(df))

In [None]:

# Count the rows of the `comments` table and display the result.
# Rebinds the notebook-level name `df`.
# NOTE(review): assumes a `comments` table is already registered in this
# session (presumably by import_db) - confirm which database provides it.
df = spark.sql('select count(*) from comments as c')
df.show()

In [None]:
# Issue a Spark SQL CACHE TABLE statement for `comments` and show the command's result.
spark.sql('cache table comments').show()

In [None]:
# Scratch check of set semantics: a1 and a2 hold equal strings, so they are the
# same set element and the difference below leaves only 'b'.
a1 = a2 = 'a'

{a1, 'b'} - {a2}