# JDBC Predicate Pushdown

In [0]:
# Create Spark Session

from pyspark.sql import SparkSession

# Build (or reuse) a local SparkSession. The SQLite JDBC driver is requested
# through spark.jars.packages so the "jdbc" reader can use org.sqlite.JDBC.
spark = (
    SparkSession.builder
    .appName("Predicate Pushdown")
    .config("spark.jars.packages", "org.xerial:sqlite-jdbc:3.39.3.0")
    .master("local[*]")
    .getOrCreate()
)

# Last expression of the cell: displays the session summary in the notebook.
spark

In [0]:
# Let's create a simple Python decorator - {get_time} - to get the execution timings
# If you don't know about Python decorators, check out: https://www.geeksforgeeks.org/decorators-in-python/
import time

def get_time(func):
    """Run ``func`` once, immediately, and print how long it took.

    Unlike a conventional decorator, this does not return a wrapper:
    ``func`` is executed at decoration time and, because nothing is
    returned, the decorated name ends up bound to ``None``.

    Args:
        func: Zero-argument callable to execute and time.

    Returns:
        None. The timing message is printed to stdout.
    """
    def inner_get_time() -> str:
        # perf_counter() is a monotonic, high-resolution clock, so the
        # measurement cannot be skewed by system clock adjustments the way
        # a time.time() difference can.
        start_time = time.perf_counter()
        func()
        end_time = time.perf_counter()
        return f"Execution time: {(end_time - start_time)*1000} ms"
    print(inner_get_time())

In [0]:
# Connection settings for reading the demo SQLite database over JDBC
driver: str = "org.sqlite.JDBC"
table_name: str = "sales_csv"

# The JDBC URL is the "jdbc:sqlite:" scheme followed by the database file path
db_path: str = "dataset/jdbc/demo-sqlite.db"
jdbc_url: str = f"jdbc:sqlite:{db_path}"

In [0]:
# Let's read the SQLite table using the JDBC driver and validate the data
# Load the whole table through the JDBC data source
df = (
    spark.read.format("jdbc")
    .option("driver", driver)
    .option("url", jdbc_url)
    .option("dbtable", table_name)
    .load()
)

# Sanity check: print the schema and the first 10 rows without truncating values
df.printSchema()
df.show(10, truncate=False)

In [0]:
# Print the extended explain plan for the plain table read
df.explain(extended=True)

In [0]:
# Checking the performance for Full read without any Predicate Pushdown
@get_time
def x():
    """Time a read of the whole table with no filter applied."""
    jdbc_reader = (
        spark.read.format("jdbc")
        .option("driver", driver)
        .option("url", jdbc_url)
        .option("dbtable", table_name)
    )
    full_table = jdbc_reader.load()

    # Writing to the "noop" format forces the read to execute; nothing is persisted
    full_table.write.format("noop").mode("overwrite").save()
    full_table.explain(True)

In [0]:
# Checking the performance for Predicate Pushdown
@get_time
def x():
    """Time a read where the filter is applied directly on the JDBC source."""
    jdbc_table = (
        spark.read.format("jdbc")
        .option("driver", driver)
        .option("url", jdbc_url)
        .option("dbtable", table_name)
        .load()
    )
    # The filter sits directly on top of the JDBC relation
    single_city = jdbc_table.filter("city_id = 216510442")

    # Writing to the "noop" format forces the read to execute; nothing is persisted
    single_city.write.format("noop").mode("overwrite").save()
    single_city.explain(True)

In [0]:
# Cripple the performance of Predicate Pushdown by caching before the filter
@get_time
def x():
    """Time a read where the table is cached before the filter is applied."""
    jdbc_table = (
        spark.read.format("jdbc")
        .option("driver", driver)
        .option("url", jdbc_url)
        .option("dbtable", table_name)
        .load()
    )
    # cache() is placed between the JDBC read and the filter, so the filter
    # is applied to the cached data instead of the JDBC relation itself
    cached_table = jdbc_table.cache()
    single_city = cached_table.filter("city_id = 216510442")

    # Writing to the "noop" format forces the read to execute; nothing is persisted
    single_city.write.format("noop").mode("overwrite").save()
    single_city.explain(True)

In [0]:
# We can even push down queries for performance benefits
# Aliased subquery handed to the JDBC reader in place of a table name
pushDownQuery = """(select city_id, count(1) as cnt from sales_csv group by city_id) as sales_csv"""


@get_time
def x():
    """Time a read whose "dbtable" option is the aggregate subquery above."""
    jdbc_reader = (
        spark.read.format("jdbc")
        .option("driver", driver)
        .option("url", jdbc_url)
        .option("dbtable", pushDownQuery)
    )
    city_counts = jdbc_reader.load()

    # Writing to the "noop" format forces the read to execute; nothing is persisted
    city_counts.write.format("noop").mode("overwrite").save()
    city_counts.explain(True)