In [24]:
from pyspark.sql import SparkSession, DataFrame, Column, Row
from pyspark.sql.types import IntegerType

In [75]:
# Build a local Spark session (one worker thread per core) with Hive support,
# so that saveAsTable / ANALYZE TABLE work against a persistent catalog.
builder = (
    SparkSession.builder
    .appName("Basic Cardinality Estimation")
    .config("spark.master", "local[*]")
    # Cost-based optimizer and its join-reordering settings.
    .config("spark.sql.cbo.enabled", True)
    .config("spark.sql.cbo.joinReorder.enabled", True)
    .config("spark.sql.cbo.joinReorder.dp.threshold", 16)
    # Collect per-column histograms (25 bins) when column statistics are computed.
    .config("spark.sql.statistics.histogram.enabled", True)
    .config("spark.sql.statistics.histogram.numBins", 25)
    .enableHiveSupport()
)

# Reuses an already-running session if one exists in this kernel.
spark = builder.getOrCreate()

In [76]:
import pandas as pd
import numpy as np

In [77]:
def get_uniform_ints_df(low=0, high=50, size=500, seed=None):
    """Build a single-column Spark DataFrame of uniformly random integers.

    Called with no arguments it behaves as before: 500 integers drawn
    uniformly from [0, 50), exposed as a column named ``x``.

    Parameters
    ----------
    low : int
        Inclusive lower bound of the sampled values.
    high : int
        Exclusive upper bound of the sampled values.
    size : int
        Number of rows to generate.
    seed : int or None
        When given, values are drawn from a dedicated ``RandomState(seed)`` so
        the result is reproducible; when None, NumPy's global random state is
        used (the original behavior).

    Returns
    -------
    pyspark.sql.DataFrame
        One column, ``x``, holding the sampled 64-bit integers.
    """
    # The np.int alias was deprecated in NumPy 1.20 and removed in 1.24, so
    # request an explicit 64-bit dtype instead of relying on it.
    rng = np.random if seed is None else np.random.RandomState(seed)
    values = rng.randint(low, high, size=size, dtype=np.int64)
    pd_df = pd.DataFrame(values)
    # Relies on the notebook-level `spark` session created in an earlier cell.
    df = spark.createDataFrame(pd_df, ["x"])
    return df

def create_uniform_tables(table_names=("A", "B", "C", "D")):
    """Create (or overwrite) one table of uniform random ints per name and analyze it.

    For every name in ``table_names`` this:
      1. generates a fresh DataFrame via ``get_uniform_ints_df()``,
      2. saves it as a table with that name, overwriting any existing table,
      3. runs ANALYZE TABLE for table-level statistics and then for column ``x``,
      4. prints the ``DESCRIBE EXTENDED`` output for the table.

    Parameters
    ----------
    table_names : iterable of str
        Names of the tables to create. Defaults to the four tables
        ``A``, ``B``, ``C`` and ``D`` used throughout the notebook.
        Names are concatenated directly into SQL text, so pass only trusted
        identifiers.
    """
    for table_name in table_names:
        df = get_uniform_ints_df()
        df.write.mode("overwrite").saveAsTable(table_name)
        # Table-level statistics first, then column-level statistics for x.
        spark.sql("Analyze Table " + table_name + " compute statistics")
        spark.sql("Analyze Table " + table_name + " compute statistics for columns " + "x")
        spark.sql("Describe extended " + table_name).show()
    

In [78]:
create_uniform_tables()

+--------------------+--------------------+-------+
|            col_name|           data_type|comment|
+--------------------+--------------------+-------+
|                   x|              bigint|   null|
|                    |                    |       |
|# Detailed Table ...|                    |       |
|            Database|             default|       |
|               Table|                   a|       |
|               Owner|        amogkamsetty|       |
|        Created Time|Mon Nov 12 22:45:...|       |
|         Last Access|Thu Jan 01 00:00:...|       |
|          Created By|         Spark 2.4.0|       |
|                Type|             MANAGED|       |
|            Provider|             parquet|       |
|    Table Properties|[transient_lastDd...|       |
|          Statistics|2986 bytes, 500 rows|       |
|            Location|file:/mnt/c/Users...|       |
|       Serde Library|org.apache.hadoop...|       |
|         InputFormat|org.apache.hadoop...|       |
|        Out

In [109]:
# Column-level statistics for column x of each table, keyed by table name.
# "describe extended <table> x" yields (name, value) rows, which collectAsMap
# turns into a plain dict per table (e.g. 'avg_col_len', 'bin_0', ...).
stats = {
    table: spark.sql("describe extended " + table + " x").rdd.collectAsMap()
    for table in ["A", "B", "C", "D"]
}

In [111]:
# Range predicate on table A whose cardinality is being examined.
result = spark.sql("Select * from A where A.x > 25")
# Runs the query, but the count is neither stored nor the cell's last
# expression, so it is not displayed.
result.count()
# NOTE: despite the name, this is the plain dict built by collectAsMap() for
# table A (column-statistic name -> value string), not a pandas DataFrame.
df_pandas = stats["A"]
df_pandas

{'avg_col_len': '8',
 'bin_0': 'lower_bound: 0.0, upper_bound: 1.0, distinct_count: 2',
 'bin_1': 'lower_bound: 1.0, upper_bound: 3.0, distinct_count: 2',
 'bin_10': 'lower_bound: 19.0, upper_bound: 21.0, distinct_count: 2',
 'bin_11': 'lower_bound: 21.0, upper_bound: 24.0, distinct_count: 3',
 'bin_12': 'lower_bound: 24.0, upper_bound: 26.0, distinct_count: 2',
 'bin_13': 'lower_bound: 26.0, upper_bound: 28.0, distinct_count: 2',
 'bin_14': 'lower_bound: 28.0, upper_bound: 30.0, distinct_count: 2',
 'bin_15': 'lower_bound: 30.0, upper_bound: 32.0, distinct_count: 2',
 'bin_16': 'lower_bound: 32.0, upper_bound: 34.0, distinct_count: 2',
 'bin_17': 'lower_bound: 34.0, upper_bound: 37.0, distinct_count: 3',
 'bin_18': 'lower_bound: 37.0, upper_bound: 39.0, distinct_count: 2',
 'bin_19': 'lower_bound: 39.0, upper_bound: 40.0, distinct_count: 1',
 'bin_2': 'lower_bound: 3.0, upper_bound: 6.0, distinct_count: 3',
 'bin_20': 'lower_bound: 40.0, upper_bound: 42.0, distinct_count: 2',
 'bin_21