In [2]:
from pyspark.sql import SparkSession
import os


In [3]:
# Build (or reuse) the Spark session shared by every cell below.
# master "local[60]" runs Spark in local mode; the PostgreSQL JDBC driver jar
# is put on the classpath so the JDBC reads further down can connect.
spark = (
    SparkSession.builder
    .master("local[60]")
    .appName("app")
    .config("spark.driver.memory", "900g")
    .config("spark.executor.memory", "900g")
    .config("spark.memory.offHeap.enabled", False)
    .config("spark.jars", "postgresql-42.3.3.jar")
    .getOrCreate()
)
# Uncomment for verbose logging while debugging:
# spark.sparkContext.setLogLevel("DEBUG")

23/11/10 15:24:58 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [None]:
# Set the Spark log level to WARN; the commented line below switches to ALL.
spark.sparkContext.setLogLevel("WARN") # ALL, DEBUG, WARN,
#spark.sparkContext.setLogLevel("ALL") # ALL, DEBUG, WARN,

In [3]:
# Connection settings are read from the environment; each falls back to the
# literal default given here when the variable is unset.
username = os.environ.get('USERNAME', 'tpch')
password = os.environ.get('PASSWORD', 'tpch')
dbname = os.environ.get('DBNAME', 'tpch')
dbhost = os.environ.get('DBHOST', 'postgres')


def read_jdbc_table(table):
    """Return a DataFrame that reads ``table`` from PostgreSQL over JDBC.

    Uses the module-level ``dbhost``, ``dbname``, ``username`` and ``password``
    so the connection options are defined in exactly one place.
    """
    return (
        spark.read.format("jdbc")
        .option("url", f'jdbc:postgresql://{dbhost}:5432/{dbname}')
        .option("driver", "org.postgresql.Driver")
        .option("dbtable", table)
        .option("user", username)
        .option("password", password)
        .load()
    )


# Catalog listing of every table visible to the connection.
df_tables = read_jdbc_table("information_schema.tables")

# Filter in Spark and collect only the names, instead of converting the whole
# catalog to pandas and iterating over it row by row.
public_table_rows = (
    df_tables
    .filter(df_tables.table_schema == 'public')
    .select("table_name")
    .collect()
)

# Register each table of the 'public' schema as a temp view and mark it cached.
for table_row in public_table_rows:
    table_name = table_row.table_name
    df = read_jdbc_table(table_name)

    print(table_name)
    df.createOrReplaceTempView(table_name)
    spark.catalog.cacheTable(table_name)

part
supplier
partsupp
customer
orders
lineitem
nation
region


In [None]:
# Set spark.sql.yannakakis.countGroupInLeaves to false and display the resulting key/value.
# NOTE(review): the spark.sql.yannakakis.* keys are not stock Spark settings; presumably a custom build — confirm.
spark.sql("SET spark.sql.yannakakis.countGroupInLeaves = false").show()

In [None]:
# Turn the Yannakakis join rewriting off for subsequent queries and display the setting.
spark.sql("SET spark.sql.yannakakis.enabled = false").show()

In [None]:
# Turn the Yannakakis join rewriting on, keep countGroupInLeaves off, and display both settings.
spark.sql("SET spark.sql.yannakakis.enabled = true").show()
spark.sql("SET spark.sql.yannakakis.countGroupInLeaves = false").show()

In [None]:
# SET with a key but no value: displays the current value of spark.local.dir.
spark.sql("SET spark.local.dir").show()

In [None]:
# Ask Spark to compute table-level statistics for the `part` view.
spark.sql("ANALYZE TABLE part COMPUTE STATISTICS;").show()

In [None]:
# Median of p_size over the part/partsupp/supplier/nation/region join.
# The /*+ FK(...) */ hint comment declares two foreign-key relationships for the planner.
df = spark.sql("""select
        /*+ FK(ps_partkey, p_partkey), FK(n_regionkey, r_regionkey) */
		MEDIAN(p_size)
		from
            part,
			partsupp,
			supplier,
			nation,
			region
		where
			p_partkey = ps_partkey
			and s_suppkey = ps_suppkey
			and s_nationkey = n_nationkey
			and n_regionkey = r_regionkey""")

# Display up to 500 result rows.
df.show(500)

# Print the extended plan output for the query.
df.explain(True)

In [5]:
def run_query(file):
    """Load a SQL file, drop comment/limit lines, and run it through Spark.

    Lines that start with ``limit`` or with ``-`` (e.g. ``--`` comments) are
    removed before the query is submitted.

    Args:
        file: Path of the SQL file to execute.

    Returns:
        The DataFrame produced by ``spark.sql`` for the filtered query text.
    """
    with open(file, 'r') as f:
        kept_lines = [
            line for line in f
            if not line.startswith('limit') and not line.startswith('-')
        ]

    # Each line still carries its own newline, so join with '' rather than
    # '\n'; joining with '\n' double-spaced every query.
    query = ''.join(kept_lines)

    print("running query: \n" + query)
    return spark.sql(query)

In [None]:
# Count the joined part/partsupp rows per ps_partkey.
df = spark.sql("""
SELECT ps_partkey, count(*) from part, partsupp
WHERE p_partkey = ps_partkey
GROUP BY ps_partkey
""")

# Display the first rows of the result.
df.show()

# Print the extended plan output for the query.
df.explain(True)

In [None]:
# Average number of rows per distinct value, for two columns of `part`.
for part_column in ("p_size", "p_retailprice"):
    df = spark.sql(f"SELECT COUNT({part_column}) / COUNT(DISTINCT {part_column}) FROM part")
    df.show()

In [None]:
# Three small two-column tables registered as temp views t1(a, b), t2(c, d), t3(e, f).
df_t1 = spark.createDataFrame(
    [(1, 1), (2, 1), (2, 2), (3, 2), (3, 3), (4, 3), (4, 3), (5, 2), (5, 1), (6, 4)],
    schema=("a", "b"),
)
df_t1.createOrReplaceTempView("t1")

df_t2 = spark.createDataFrame(
    [(1, 1), (2, 1), (3, 2), (3, 2), (3, 3), (3, 3), (4, 3), (4, 2), (5, 1), (6, 4)],
    schema=("c", "d"),
)
df_t2.createOrReplaceTempView("t2")

df_t3 = spark.createDataFrame(
    [(1, 1), (2, 1), (3, 2), (3, 2), (3, 3), (3, 3), (4, 3), (4, 2), (5, 1), (6, 4)],
    schema=("e", "f"),
)
df_t3.createOrReplaceTempView("t3")

# Query under test; the commented lines are alternatives to swap in.
query = "select median(a) from t1, t2 where b = c"
#query = "select percentile(a, 0.5, b) from t1, t2 where b = c"
#query = "select median(a) from t1 where EXISTS (SELECT 1 FROM t2 WHERE b = c)"
#query = "select count(*) from t1, t2 where b = c"
#query = "select *a from t1 where EXISTS (SELECT 1 FROM t2 WHERE b = c)"

# Run the same query with the rewriting disabled first, then enabled, so the
# two displayed results can be compared.
for yannakakis_flag in ("false", "true"):
    spark.sql(f"SET spark.sql.yannakakis.enabled = {yannakakis_flag}").show()

    df = spark.sql(query)
    df.show()

In [None]:
# Set the legacy flag setCommandRejectsSparkCoreConfs to "false".
# NOTE(review): presumably so the SET command accepts Spark core confs such as the commented ones below — confirm.
spark.conf.set("spark.sql.legacy.setCommandRejectsSparkCoreConfs","false")
#spark.conf.set("spark.executor.cores", "6")
#spark.conf.set("spark.executor.instances", "6")
# Use 6 shuffle partitions for subsequent queries.
spark.conf.set("spark.sql.shuffle.partitions", "6")

In [None]:
import pandas as pd
import time

def _timed_run(query, yannakakis_enabled):
    """Toggle the Yannakakis rewriting, run ``query`` once, and return elapsed seconds."""
    flag = "true" if yannakakis_enabled else "false"
    spark.sql(f"SET spark.sql.yannakakis.enabled = {flag}").show()

    # perf_counter is monotonic, so the interval cannot be skewed by
    # wall-clock adjustments the way time.time() differences can.
    started = time.perf_counter()
    run_query(query).show()
    return time.perf_counter() - started


def benchmark(query):
    """Time one query file with the Yannakakis rewriting enabled and disabled.

    The query is first executed once untimed (presumably as a warm-up), then
    timed with the rewriting enabled, then timed with it disabled. The
    rewriting is left disabled when the function returns.

    Args:
        query: Path of the SQL file, passed to ``run_query``.

    Returns:
        A list ``[query, ref_time, yannakakis_time]`` with both times in seconds.
    """
    # Untimed first execution under whatever setting is currently active.
    run_query(query).show()

    yannakakis_time = _timed_run(query, True)
    ref_time = _timed_run(query, False)

    return [query, ref_time, yannakakis_time]


# Query files to benchmark: three TPC-H files from the tpch-kit checkout,
# then median-1..5 and their "-hint" variants from the working directory.
queries = (
    [f'tpch-kit/dbgen/queries/postgres/{stem}.sql' for stem in ('2', '11', '11-hint')]
    + [f'median-{number}.sql' for number in range(1, 6)]
    + [f'median-{number}-hint.sql' for number in range(1, 6)]
)

# One [query, ref_time, yannakakis_time] row per file, in list order.
results = list(map(benchmark, queries))

df = pd.DataFrame(results, columns=['query', 'ref_time', 'yannakakis_time'])

print(df)

# Persist the timings next to the notebook.
df.to_csv("results.csv")
    

#print(f'row count: {df1.count()} vs. {df2.count()}' )
    #print(f'time ref: {ref_time}\ntime yannakakis: {yannakakis_time}')

In [None]:
# Use a single shuffle partition for subsequent queries; the commented lines
# are alternative settings that are currently disabled.
#spark.conf.set("spark.sql.legacy.setCommandRejectsSparkCoreConfs","false")
#spark.conf.set("spark.executor.cores", "1")
#spark.conf.set("spark.executor.instances", "1")
spark.conf.set("spark.sql.shuffle.partitions", "1")

In [7]:
## Compare result
import time
# Query file to compare. Exactly one assignment is active; the rest are
# alternatives to swap in. (The 2.sql assignment used to be live but was
# always overwritten a few lines later, so it is commented out like the others.)
#query = 'tpch-kit/dbgen/queries/postgres/2.sql'
#query = 'tpch-kit/dbgen/queries/postgres/13.sql'
#query = 'count-3.sql'
#query = 'tpch-kit/dbgen/queries/postgres/11.sql'
#query = '11-simple.sql'
query = 'median-1-hint.sql'
#query = 'median-1.sql'
#query = 'tpch-kit/dbgen/queries/postgres/7.sql'
#query = '13-simple.sql'
#query = 'subselect-exists.sql'
#query = 'min-1.sql'

# --- Run with the Yannakakis rewriting enabled ---
spark.sql("SET spark.sql.yannakakis.enabled = true").show()

# perf_counter is monotonic, unlike time.time(), so it is the right clock
# for measuring an interval.
start_time = time.perf_counter()

df1 = run_query(query)
df1.show()

end_time = time.perf_counter()
yannakakis_time = end_time - start_time

# Plan printing is kept outside the timed window so it does not inflate the measurement.
df1.explain(mode="extended")

# --- Reference run with the rewriting disabled ---
spark.sql("SET spark.sql.yannakakis.enabled = false").show()

start_time = time.perf_counter()

df2 = run_query(query)
df2.show()

end_time = time.perf_counter()
ref_time = end_time - start_time

df2.explain(mode="extended")

#print(f'row count: {df1.count()} vs. {df2.count()}' )
print(f'time ref: {ref_time}\ntime yannakakis: {yannakakis_time}')

+--------------------+-----+
|                 key|value|
+--------------------+-----+
|spark.sql.yannaka...| true|
+--------------------+-----+

running query: 
select

/*+ FK(ps_partkey, p_partkey), FK(n_regionkey, r_regionkey), FK(ps_suppkey, s_suppkey), FK(s_nationkey, n_nationkey), PK(ps_partkey, ps_suppkey) */

        median(s_acctbal)

		from

            part,

			partsupp,

			supplier,

			nation,

			region

		where

			p_partkey = ps_partkey

			and s_suppkey = ps_suppkey

			and s_nationkey = n_nationkey

			and n_regionkey = r_regionkey


23/11/02 14:44:33 WARN RewriteJoinsAsSemijoins: applying yannakakis rewriting to join: Aggregate [toprettystring(percentile(s_acctbal#104, 0.5, 1, 0, 0, false), Some(Etc/UTC)) AS toprettystring(median(s_acctbal))#3033]
+- Project [s_acctbal#104]
   +- FKHint [[ps_partkey#157L, p_partkey#24], [ps_suppkey#158L, s_suppkey#99], [n_regionkey#472L, r_regionkey#503], [s_nationkey#102L, n_nationkey#470]], [[ps_partkey#157L, ps_suppkey#158L]]
      +- Join Inner, (n_regionkey#472L = cast(r_regionkey#503 as bigint))
         :- Join Inner, (s_nationkey#102L = cast(n_nationkey#470 as bigint))
         :  :- Join Inner, (cast(s_suppkey#99 as bigint) = ps_suppkey#158L)
         :  :  :- Join Inner, (cast(p_partkey#24 as bigint) = ps_partkey#157L)
         :  :  :  :- Project [p_partkey#24]
         :  :  :  :  +- Filter isnotnull(p_partkey#24)
         :  :  :  :     +- InMemoryRelation [p_partkey#24, p_name#25, p_mfgr#33, p_brand#34, p_type#28, p_size#29, p_container#35, p_retailprice#31, p_commen

23/11/02 14:44:44 WARN RewriteJoinsAsSemijoins: applying yannakakis rewriting to join: Aggregate [percentile(s_acctbal#104, 0.5, 1, 0, 0, false) AS median(s_acctbal)#3030]
+- Project [s_acctbal#104]
   +- FKHint [[ps_partkey#157L, p_partkey#24], [ps_suppkey#158L, s_suppkey#99], [n_regionkey#472L, r_regionkey#503], [s_nationkey#102L, n_nationkey#470]], [[ps_partkey#157L, ps_suppkey#158L]]
      +- Join Inner, (n_regionkey#472L = cast(r_regionkey#503 as bigint))
         :- Join Inner, (s_nationkey#102L = cast(n_nationkey#470 as bigint))
         :  :- Join Inner, (cast(s_suppkey#99 as bigint) = ps_suppkey#158L)
         :  :  :- Join Inner, (cast(p_partkey#24 as bigint) = ps_partkey#157L)
         :  :  :  :- Project [p_partkey#24]
         :  :  :  :  +- Filter isnotnull(p_partkey#24)
         :  :  :  :     +- InMemoryRelation [p_partkey#24, p_name#25, p_mfgr#33, p_brand#34, p_type#28, p_size#29, p_container#35, p_retailprice#31, p_comment#32], StorageLevel(disk, memory, deserialized,

+-----------------+
|median(s_acctbal)|
+-----------------+
|4499.360000000001|
+-----------------+

== Parsed Logical Plan ==
'UnresolvedHint FK, ['ps_partkey, 'p_partkey]
+- 'UnresolvedHint FK, ['n_regionkey, 'r_regionkey]
   +- 'UnresolvedHint FK, ['ps_suppkey, 's_suppkey]
      +- 'UnresolvedHint FK, ['s_nationkey, 'n_nationkey]
         +- 'UnresolvedHint PK, ['ps_partkey, 'ps_suppkey]
            +- 'Project [unresolvedalias('median('s_acctbal), None)]
               +- 'Filter ((('p_partkey = 'ps_partkey) AND ('s_suppkey = 'ps_suppkey)) AND (('s_nationkey = 'n_nationkey) AND ('n_regionkey = 'r_regionkey)))
                  +- 'Join Inner
                     :- 'Join Inner
                     :  :- 'Join Inner
                     :  :  :- 'Join Inner
                     :  :  :  :- 'UnresolvedRelation [part], [], false
                     :  :  :  +- 'UnresolvedRelation [partsupp], [], false
                     :  :  +- 'UnresolvedRelation [supplier], [], false
           

[Stage 87:>                                                         (0 + 1) / 1]

+-----------------+
|median(s_acctbal)|
+-----------------+
|4499.360000000001|
+-----------------+

== Parsed Logical Plan ==
'UnresolvedHint FK, ['ps_partkey, 'p_partkey]
+- 'UnresolvedHint FK, ['n_regionkey, 'r_regionkey]
   +- 'UnresolvedHint FK, ['ps_suppkey, 's_suppkey]
      +- 'UnresolvedHint FK, ['s_nationkey, 'n_nationkey]
         +- 'UnresolvedHint PK, ['ps_partkey, 'ps_suppkey]
            +- 'Project [unresolvedalias('median('s_acctbal), None)]
               +- 'Filter ((('p_partkey = 'ps_partkey) AND ('s_suppkey = 'ps_suppkey)) AND (('s_nationkey = 'n_nationkey) AND ('n_regionkey = 'r_regionkey)))
                  +- 'Join Inner
                     :- 'Join Inner
                     :  :- 'Join Inner
                     :  :  :- 'Join Inner
                     :  :  :  :- 'UnresolvedRelation [part], [], false
                     :  :  :  +- 'UnresolvedRelation [partsupp], [], false
                     :  :  +- 'UnresolvedRelation [supplier], [], false
           

                                                                                