In [86]:
import pandas as pd
import numpy as np
from functools import reduce
from datetime import datetime
from collections import Counter
import itertools
import math
import operator
import json

In [87]:
# Config

BENCHMARK = "TPCDS"
CHUNK_SIZE = 65535

if BENCHMARK == "TPCH":
    SCALE_FACTOR = 1
    RUNS = -1
    TIME = 60
    
    STATISTICS_PATH = f"~/Dokumente/repos/example_plugin/TPC-H__SF_{SCALE_FACTOR}.000000__RUNS_{RUNS}__TIME_{TIME}"
    
    
elif BENCHMARK == "TPCDS":
    SCALE_FACTOR = 1
    RUNS = -1
    TIME = 60
    
    STATISTICS_PATH = f"~/Dokumente/repos/example_plugin/TPC-DS__SF_{SCALE_FACTOR}.000000__RUNS_{RUNS}__TIME_{TIME}"
else:
    raise Exception("Unknown benchmark: " + BENCHMARK)

print(f"Model is configured for {BENCHMARK} (chunk size {CHUNK_SIZE}) with scale factor {SCALE_FACTOR}, {TIME} seconds runtime, and at most {RUNS} runs per query")

Model is configured for TPCDS (chunk size 65535) with scale factor 1, 60 seconds runtime, and at most -1 runs per query


In [88]:
# Load table scan statistics

path = f"{STATISTICS_PATH}/table_scans.csv"
scans = pd.read_csv(path, sep='|')
EXPECTED_SCAN_COUNT = len(scans)
LOADED_CHUNK_SIZE = CHUNK_SIZE
LOADED_BENCHMARK = BENCHMARK
LOADED_SCALE_FACTOR = SCALE_FACTOR
LOADED_RUNS = RUNS
LOADED_TIME = TIME

print(f"Successfully loaded {path}")

def assert_correct_statistics_loaded():
    assert BENCHMARK == LOADED_BENCHMARK, f"The model is configured to use {BENCHMARK}, but {LOADED_BENCHMARK} is currently loaded.\nEither change the benchmark or re-run all cells"
    assert SCALE_FACTOR == LOADED_SCALE_FACTOR, f"The model is configured to use {SCALE_FACTOR} as scale factor, but data for a scale factor of {LOADED_SCALE_FACTOR} is currently loaded.\nEither change the benchmark or re-run all cells"
    assert RUNS == LOADED_RUNS, f"The model is configured to perform at most {RUNS} runs, but the currently loaded data had at most {LOADED_RUNS} runs.\nEither change the benchmark or re-run all cells"
    assert TIME == LOADED_TIME, f"The model is configured to run for {TIME} seconds, but the currently data had a runtime of {LOADED_TIME} seconds.\nEither change the benchmark or re-run all cells"
    assert CHUNK_SIZE == LOADED_CHUNK_SIZE, f"The model is configured to use {CHUNK_SIZE} as chunk_size, but data for a chunk size of {LOADED_CHUNK_SIZE} is currently loaded.\nEither change the benchmark or re-run all cells"
    assert EXPECTED_SCAN_COUNT == len(scans), f"There should be {EXPECTED_SCAN_COUNT} table scans, but there are only {len(scans)}\nProbably one of the last commands reassigned it unintentionally"
    
    assert 'OPERATOR_POINTER' in scans.columns, f"the statistics in {STATISTICS_PATH} are outdated (column 'OPERATOR_POINTER' in table_scans.csv is missing). Please create them again."

Successfully loaded ~/Dokumente/repos/example_plugin/TPC-DS__SF_1.000000__RUNS_-1__TIME_60/table_scans.csv


In [89]:
# Validate table scans
assert_correct_statistics_loaded()

# To make sure pruning was not active,
# first fetch table sizes,
table_statistics = pd.read_csv(f"{STATISTICS_PATH}/table_meta_data.csv", sep='|')
table_sizes = dict(zip(table_statistics.table_name, table_statistics.row_count))

# then make sure INPUT_ROWS == table_size
def input_size_matches(row):
    #print(row)
    
    actual_row_count = row['INPUT_ROWS']
    table = row['TABLE_NAME']
    expected_row_count = table_sizes[table]
    return expected_row_count == actual_row_count

data_scans = scans[scans['COLUMN_TYPE'] == 'DATA']
input_size_matches = data_scans.apply(input_size_matches, axis=1)
all_sizes_match = reduce(np.logical_and, input_size_matches) #input_size_matches.apply()

if not all_sizes_match:
    raise Exception("The given statistics were probably created while pruning was active")
else:
    print("OK - looks like pruning was deactivated while the statistics were created")

OK - looks like pruning was deactivated while the statistics were created


In [90]:
# Append additional information to the table scans
assert_correct_statistics_loaded()

print(f"Statistics for {BENCHMARK} contain {len(scans)} table scans")


# Add statistics about selectivity and speed for each operator
scans['selectivity'] = scans['OUTPUT_ROWS'] / scans['INPUT_ROWS']

# TODO: Assumption that reading and writing a row have the same cost
scans['time_per_row'] = scans['RUNTIME_NS'] / (scans['INPUT_ROWS'] + scans['OUTPUT_ROWS'])
scans['time_per_input_row'] = scans['time_per_row']
scans['time_per_output_row'] = scans['time_per_row']


def determine_or_chains(table_scans):
    table_scans['part_of_or_chain'] = False
    
    single_table_scans = table_scans.groupby(['QUERY_HASH', 'TABLE_NAME', 'OPERATOR_POINTER'])
    
    for _, scans in single_table_scans:            
        input_row_frequencies = Counter(scans.INPUT_ROWS)
        or_input_sizes = set([input_size for input_size, frequency in input_row_frequencies.items() if frequency > 1])

        df = pd.DataFrame()
        df['INPUT_ROWS'] = scans['INPUT_ROWS']
        df['OUTPUT_ROWS'] = scans['OUTPUT_ROWS']
        df['part_of_or_chain'] = scans.apply(lambda row: row['INPUT_ROWS'] in or_input_sizes, axis=1)

        for _ in range(len(scans)):
            or_input_sizes |= set(df[df['part_of_or_chain']].OUTPUT_ROWS.unique())
            df['part_of_or_chain'] = df.apply(lambda row: row['INPUT_ROWS'] in or_input_sizes, axis=1)

        or_chains = list(df[df['part_of_or_chain']].index)
        table_scans.iloc[or_chains, table_scans.columns.get_loc('part_of_or_chain')] = True
    
    return table_scans

# Hyrise does not use scans that are part of an OR-chain for pruning
scans = determine_or_chains(scans)


# Like scans are not useful if they start with %
# TODO what if they dont start with % and contain more than one % ? -> up to first % prunable, but is it used?
def benefits_from_sorting(row):    
    description = row['DESCRIPTION']
    if "ColumnLike" in description:
        words = description.split()
        like_criteria = words[-1]
        assert "%" in like_criteria, f"LIKE operators should have an %, but found none in {like_criteria}"
        return like_criteria[1] != '%'
    elif "ExpressionEvaluator" in description and " IN " in description:
        return False
    else:
        return True

scans['benefits_from_sorting'] = scans.apply(benefits_from_sorting, axis=1)
# TODO: valid atm, but feels a bit hacky to assume not benefitting from sorted segments -> not benefitting from pruning
scans['useful_for_pruning'] = scans.apply(lambda row: not row['part_of_or_chain'] and row['benefits_from_sorting'] , axis=1)
EXPECTED_SCAN_COUNT = len(scans)
print(f"Of those, only {len(scans[scans['useful_for_pruning']])} are useful for pruning")

print("TODO: For now, filtering on scans is deactivated. This is because all scans are needed to recognize OR-Chains. Models have to take care themselves whether a scan can contribute to pruning or not")

Statistics for TPCDS contain 217 table scans
Of those, only 71 are useful for pruning
TODO: For now, filtering on scans is deactivated. This is because all scans are needed to recognize OR-Chains. Models have to take care themselves whether a scan can contribute to pruning or not


In [91]:
def test_determine_or_chains():
    test = pd.DataFrame()
    test['QUERY_HASH'] = pd.Series(['1']*3  + ['2']*4)
    test['TABLE_NAME'] = pd.Series(['lineitem']*3  + ['part']*4)
    test['OPERATOR_POINTER'] = pd.Series(['0x1'] + ['0x2']*2 + ['0x3']*4)
    test['COLUMN_NAME'] = pd.Series(['l_shipdate', 'l_shipdate', 'l_discount', 'p_brand', 'p_type', 'p_type', 'p_size'])
    test['INPUT_ROWS'] = pd.Series( [6001215, 6001215, 200000, 200000, 199000, 199000, 50000])
    test['OUTPUT_ROWS'] = pd.Series([ 400000,  300000, 200000, 199000,      0,  50000, 20000])
    test_result = determine_or_chains(test)
    assert len(test_result) == 7, "should not filter out any rows"    
    assert len(test_result[test_result['part_of_or_chain']]) == 3, "expected 3 scans, got\n" + str(test_result)
    assert list(test_result['part_of_or_chain']) == [False]*4 + [True]*3
    print("Test OK")

test_determine_or_chains()

Test OK


In [92]:
(scans['RUNTIME_NS'] - scans['SINGLE_RUNTIME_NS']).max()
# TODO can the actual runtime be that much greater than the runtime on the original table?

3147389

In [93]:
# Load query frequency information
assert_correct_statistics_loaded()

def get_query_frequencies():
    plan_cache = pd.read_csv(f"{STATISTICS_PATH}/plan_cache.csv", sep='|')
    return dict(zip(plan_cache.QUERY_HASH, plan_cache.EXECUTION_COUNT))

In [94]:
# Load column statistics - especially interesting: number of distinct values, and columns sorted during statistics creation

# Returns a 2-level-dictionary: distinct_values[TABLE][COLUMN] = number_of_distinct_values
def get_distinct_values_count():        
    # Code
    column_statistics_df = pd.read_csv(f"{STATISTICS_PATH}/column_meta_data.csv", sep='|')
    column_statistics_df['distinct_values'] = np.int32(column_statistics_df['distinct_values'])
    tables_and_columns = column_statistics_df.groupby('table_name')
    distinct_values = {table: dict(zip(column_df.column_name, column_df.distinct_values)) for table, column_df in tables_and_columns }

    
    # Test
    num_tables = len(distinct_values)
    if BENCHMARK == "TPCH":
        assert num_tables == 8, f"TPCH has 8 tables, but got {num_tables}"
    elif BENCHMARK == "TPCDS":
        assert num_tables == 24, f"TPCDS has 24 tables, but got {num_tables}"
    else:
        assert False, "Insert a benchmark specific check here"
    
    return distinct_values

# Returns a dictionary: sorted_columns_during_creation[TABLE] = [column1, column2, ...]
def get_sorted_columns_during_creation():
    # Code
    column_statistics_df = pd.read_csv(f"{STATISTICS_PATH}/column_meta_data.csv", sep='|')
    globally_sorted_columns = column_statistics_df[column_statistics_df['is_globally_sorted'] == 1]
    
    tables_and_columns = globally_sorted_columns.groupby('table_name')
    globally_sorted_columns = {table: list(column_df.column_name) for table, column_df in tables_and_columns }
    
    return globally_sorted_columns

In [95]:
### JOINS ###

assert_correct_statistics_loaded()

def load_join_statistics():
    def line_looks_suspicious(row):
        right_table_name = row['RIGHT_TABLE_NAME']    
        if pd.isnull(right_table_name):
            pass
        elif row['RIGHT_TABLE_ROW_COUNT'] > table_sizes[row['RIGHT_TABLE_NAME']]:
            return True

        left_table_name = row['LEFT_TABLE_NAME']
        if pd.isnull(left_table_name):
            pass
        elif row['LEFT_TABLE_ROW_COUNT'] > table_sizes[row['LEFT_TABLE_NAME']]:
            return True

        return False
    
    def validate_joins(joins):
        is_suspicious = joins.apply(line_looks_suspicious, axis=1)
        suspicious_joins = joins[is_suspicious]
        assert len(suspicious_joins) < 3, f"there are {len(suspicious_joins)} suspicious joins:\n{suspicious_joins[['JOIN_MODE', 'LEFT_TABLE_NAME', 'LEFT_COLUMN_NAME', 'LEFT_TABLE_ROW_COUNT', 'RIGHT_TABLE_NAME', 'RIGHT_COLUMN_NAME', 'RIGHT_TABLE_ROW_COUNT', 'PROBE_TABLE', 'PROBE_COLUMN', 'OUTPUT_ROWS']]}"
    
    joins = pd.read_csv(f"{STATISTICS_PATH}/joins.csv", sep=',')
    validate_joins(joins)
                                                                                           
    return joins

load_join_statistics().iloc[9:10][['JOIN_MODE', 'LEFT_TABLE_NAME', 'LEFT_COLUMN_NAME', 'LEFT_TABLE_ROW_COUNT', 'RIGHT_TABLE_NAME', 'RIGHT_COLUMN_NAME', 'RIGHT_TABLE_ROW_COUNT', 'PROBE_TABLE', 'PROBE_COLUMN', 'OUTPUT_ROWS']]

Unnamed: 0,JOIN_MODE,LEFT_TABLE_NAME,LEFT_COLUMN_NAME,LEFT_TABLE_ROW_COUNT,RIGHT_TABLE_NAME,RIGHT_COLUMN_NAME,RIGHT_TABLE_ROW_COUNT,PROBE_TABLE,PROBE_COLUMN,OUTPUT_ROWS
9,FullOuter,store_sales,ss_customer_sk,547500,catalog_sales,cs_bill_customer_sk,287466,,,834792


In [96]:
class AbstractModel:
    
    def __init__(self, query_frequencies, table_name, table_scans, correlations={}):
        self.query_frequencies = query_frequencies
        self.table_name = table_name
        self.table_scans = table_scans
        self.correlations = correlations
        
    def query_frequency(self, query_hash):
        return self.query_frequencies[query_hash]
        
    def extract_interesting_columns(self):
        useful_scans = self.table_scans[self.table_scans['useful_for_pruning']]
        interesting_scan_columns = list(useful_scans['COLUMN_NAME'].unique())
        interesting_join_probe_columns = list(self.joins[self.joins['PROBE_TABLE'] == self.table_name]['PROBE_COLUMN'].unique())
        interesting_join_build_columns = list(self.joins[self.joins['BUILD_TABLE'] == self.table_name]['BUILD_COLUMN'].unique())
        
        return self.uniquify(interesting_scan_columns + interesting_join_probe_columns + interesting_join_build_columns)
    
    def round_up_to_next_multiple(self, number_to_round, base_for_multiple):
        quotient = number_to_round // base_for_multiple
        if number_to_round % base_for_multiple != 0:
            quotient += 1
        return quotient * base_for_multiple        

    def uniquify(self, seq):
            seen = set()
            return [x for x in seq if not (x in seen or seen.add(x))]    
    
    # return a list of possible clusterings
    def suggest_clustering(self, first_k=1):
        raise NotImplemented()

In [97]:
class SingleTableMdcModel(AbstractModel):
    
    def __init__(self, query_frequencies, table_name, table_scans, table_size, distinct_values, target_chunksize, correlations, joins, sorted_columns_during_creation):
        super().__init__(query_frequencies, table_name, table_scans, correlations)
        self.table_size = table_size
        self.distinct_values = distinct_values
        self.target_chunksize = target_chunksize
        self.joins = joins
        self.sorted_columns_during_creation = sorted_columns_during_creation
    
    def suggest_clustering(self, first_k=1):
        interesting_columns = self.extract_interesting_columns()

        print(interesting_columns)
        
        clustering_columns = itertools.product(interesting_columns, interesting_columns)
        #clustering_columns = itertools.product(interesting_columns, interesting_columns, interesting_columns)
        clustering_columns = filter(lambda x: x[0] <= x[1], clustering_columns)
        #clustering_columns = filter(lambda x: x[1] <= x[2], clustering_columns)
        clustering_columns = [self.uniquify(clustering) for clustering in clustering_columns]
        sort_columns = interesting_columns        
        clusterings_with_runtimes = reduce(lambda x,y: x+y,[self.estimate_total_runtime(clustering_cols, sort_columns) for clustering_cols in clustering_columns])
        clusterings_with_runtimes.sort(key=lambda x: x[2], reverse=False)
        
        return clusterings_with_runtimes[0:first_k]
        
    def estimate_table_scan_runtimes(self, clustering_columns, sorting_columns, split_factors, total_runtimes):        
        def compute_unprunable_parts(row, split_factors):
            def clustering_columns_correlated_to(column):
                return [clustering_column for clustering_column in clustering_columns if column in self.correlations.get(clustering_column, {})]
            
            def correlates_to_clustering_column(column):
                return len(clustering_columns_correlated_to(column)) > 0

            column_name = row['COLUMN_NAME']

            if not row['useful_for_pruning']:
                selectivity = 1
            elif column_name in clustering_columns:
                scan_selectivity = row['selectivity']
                split_factor = split_factors[clustering_columns.index(column_name)]
                selectivity =  self.round_up_to_next_multiple(scan_selectivity, 1 / split_factor)
            elif correlates_to_clustering_column(column_name):
                scan_selectivity = row['selectivity']
                correlated_clustering_columns = clustering_columns_correlated_to(column_name)
                
                # ToDo this is hacky, but for now assume there is just one correlated column
                assert len(correlated_clustering_columns) == 1, f"expected just 1 correlated clustering column, but got {len(correlated_clustering_columns)}"
                
                split_factor = split_factors[clustering_columns.index(correlated_clustering_columns[0])]
                selectivity = min(1, 1.2 * self.round_up_to_next_multiple(scan_selectivity, 1 / split_factor))
            else:
                selectivity = 1
            
            return selectivity
        
        def compute_runtimes(row, sorting_column):
            assert row['estimated_input_rows'] > 1, row
            assert row['runtime_per_input_row'] > 0, row
            assert row['runtime_per_output_row'] > 0, row
            input_row_count = row['estimated_input_rows']
            
            if row['COLUMN_NAME'] == sorting_column and row['benefits_from_sorting']:
                # TODO is this the best way to simulate sorted access?
                input_row_count = np.log2(input_row_count)

            runtime = input_row_count * row['runtime_per_input_row'] + row['OUTPUT_ROWS'] * row['runtime_per_output_row']
            return runtime * self.query_frequency(row['QUERY_HASH'])
        
        scans_per_query = self.table_scans.sort_values(['INPUT_ROWS'], ascending=False).groupby(['QUERY_HASH', 'OPERATOR_POINTER'])
        for _, scans in scans_per_query:
            number_of_scans = len(scans)
            assert number_of_scans > 0 and number_of_scans < 25, f"weird scan length: {number_of_scans}\nScans:\n{scans}"
            # TODO: kinda unrealistic assumption: everything not in the table scan result can be pruned
                          
            unprunable_parts = scans.apply(compute_unprunable_parts, axis=1, args=(split_factors,))            
            unprunable_part = unprunable_parts.product()
            assert unprunable_part > 0, "no unprunable part"
            
            estimated_pruned_table_size = self.round_up_to_next_multiple(unprunable_part * self.table_size, CHUNK_SIZE)
            
            runtimes = pd.DataFrame()
            runtimes['QUERY_HASH'] = scans['QUERY_HASH']
            runtimes['runtime_per_input_row'] = scans['time_per_input_row']
            runtimes['runtime_per_output_row'] = scans['time_per_output_row']
            runtimes['COLUMN_NAME'] = scans['COLUMN_NAME']
            runtimes['benefits_from_sorting'] = scans['benefits_from_sorting']
            # the pruned table inputs should be reflected in 'estimated_input_rows'
            runtimes['estimated_input_rows'] = scans.apply(lambda x: x['INPUT_ROWS'], axis=1)
            runtimes['OUTPUT_ROWS'] = scans['OUTPUT_ROWS']

            runtimes.iloc[0, runtimes.columns.get_loc('estimated_input_rows')] = estimated_pruned_table_size                                    
            assert runtimes['estimated_input_rows'].iloc[0] == estimated_pruned_table_size, f"value is {runtimes.iloc[0]['estimated_input_rows']}, but should be {estimated_pruned_table_size}"
            # TODO modify input sizes of subsequent scans
            
            for sorting_column in sorting_columns:
                scan_runtimes = runtimes.apply(compute_runtimes, axis=1, args=(sorting_column,))
                total_runtimes[sorting_column] += scan_runtimes.sum()

    def estimate_join_runtimes(self, clustering_columns, sorting_columns, total_runtimes):                
        def estimate_join_runtime(row, sorting_column):
                        
            if "JoinHash" in row['DESCRIPTION']:
                probe_column = row['PROBE_COLUMN']
                if row['PROBE_TABLE'] == self.table_name:
                    probe_column_was_sorted = row['PROBE_SORTED'] and probe_column in self.sorted_columns_during_creation.get(self.table_name, {})
                    probe_column_is_sorted = row['PROBE_SORTED'] and probe_column == sorting_column
                    probe_column_is_clustered = row['PROBE_SORTED'] and probe_column in clustering_columns
                else:
                    probe_column_was_sorted = row['PROBE_SORTED'] and probe_column in self.sorted_columns_during_creation.get(row['PROBE_TABLE'], {})
                    probe_column_is_sorted = probe_column_was_sorted
                    probe_column_is_clustered = probe_column_was_sorted
                    
                build_column = row['BUILD_COLUMN']
                if row['BUILD_TABLE'] == self.table_name:
                    build_column_was_sorted = row['BUILD_SORTED'] and build_column in self.sorted_columns_during_creation.get(self.table_name, {})
                    build_column_is_sorted = row['BUILD_SORTED'] and build_column == sorting_column
                    build_column_is_clustered = row['BUILD_SORTED'] and build_column in clustering_columns
                else:
                    build_column_was_sorted = row['BUILD_SORTED'] and build_column in self.sorted_columns_during_creation.get(row['BUILD_TABLE'], {})
                    build_column_is_sorted = build_column_was_sorted
                    build_column_is_clustered = build_column_was_sorted

                time_materialize = row['MATERIALIZE']
                
                probe_weight = 2
                build_weight = 2
                if probe_column_was_sorted:
                    probe_weight = 1
                if build_column_was_sorted:
                    build_weight = 1
                
                
                probe_table_size = row['PROBE_TABLE_SIZE']
                build_table_size = row['BUILD_TABLE_SIZE']
                total_table_size = probe_weight * probe_table_size + build_weight * build_table_size
                
                time_materialize_probe = time_materialize * (probe_weight * probe_table_size / total_table_size)
                time_materialize_build = time_materialize - time_materialize_probe
                
                
                def get_materialize_factor(was_sorted, is_sorted, is_clustered):
                    materialize_factor = 1
                    if is_sorted and is_clustered:
                        if not was_sorted:
                            materialize_factor = 0.5
                        else:
                            materialize_factor = 1
                    elif is_sorted or is_clustered:
                        if not was_sorted:
                            materialize_factor = 0.55
                        else:
                            materialize_factor = 1.1
                    elif was_sorted:
                        # probe column is now neither sorted nor clustered
                        materialize_factor = 2
                    else:
                        # default case: was not sorted before, and is neither sorted nor clustered now. No change
                        materialize_factor = 1
                        
                    return materialize_factor
                
                materialize_probe_factor = get_materialize_factor(probe_column_was_sorted, probe_column_is_sorted, probe_column_is_clustered)
                materialize_build_factor = get_materialize_factor(build_column_was_sorted, build_column_is_sorted, build_column_is_clustered)
                
                time_materialize = time_materialize_probe * materialize_probe_factor + time_materialize_build *  materialize_build_factor
                

                # unchanged
                time_cluster = row['CLUSTER']
                
                # unchanged
                time_build = row['BUILD']
                
                            
                time_probe = row['PROBE']
                probe_factor = 1
                if probe_column_is_sorted and probe_column_is_clustered:
                    if not probe_column_was_sorted:
                        probe_factor = 0.7
                    else:
                        probe_factor = 1
                elif probe_column_is_sorted or probe_column_is_clustered:
                    if not probe_column_was_sorted:
                        probe_factor = 0.9
                    else:
                        probe_factor = 1.1
                elif probe_column_was_sorted:
                    # probe column is now neither sorted nor clustered
                    probe_factor = 1.4
                
                time_probe *= probe_factor                
                
                # unchanged
                time_write_output = row['WRITE_OUTPUT']
                
                
                
                # TODO: how to deal with the difference between RUNTIME_NS and sum(stage_runtimes)?
                runtime = time_materialize + time_cluster + time_build + time_probe + time_write_output
            else:
                runtime = row['RUNTIME_NS']
                
            return runtime * self.query_frequency(row['QUERY_HASH'])
        
        for sorting_column in sorting_columns:
            join_runtimes = self.joins.apply(estimate_join_runtime, axis=1, args=(sorting_column,))
            total_runtimes[sorting_column] += join_runtimes.sum()
                
    def estimate_total_runtime(self, clustering_columns, sorting_columns):
        #print(f"testing clustering {clustering_columns} with sorting columns {sorting_columns}")
        split_factors = self.determine_split_factors(clustering_columns)            
        total_runtimes = {sorting_column: 0 for sorting_column in sorting_columns}
        self.estimate_table_scan_runtimes(clustering_columns, sorting_columns, split_factors, total_runtimes)
        self.estimate_join_runtimes(clustering_columns, sorting_columns, total_runtimes)
        
        clusterings = [[list(zip(clustering_columns, split_factors)), sorting_column, np.int64(total_runtimes[sorting_column])] for sorting_column in sorting_columns]
        return clusterings
    
    def determine_split_factors(self, clustering_columns):
        approximate_split_factor = self.table_size / self.target_chunksize
        individual_distinct_values = [self.distinct_values[column] for column in clustering_columns]        
        log_distinct_values = [math.ceil(0.5+np.log2(x)) for x in individual_distinct_values]
        log_distinct_values_product = reduce(operator.mul, log_distinct_values, 1)
        assert log_distinct_values_product > 0, "cannot have a distinct value count of 0"
        
        global_modification_factor = approximate_split_factor / log_distinct_values_product
        num_dimensions = len(clustering_columns)
        individual_modification_factor = np.power(global_modification_factor, 1.0 / num_dimensions)    
        split_factors = [math.ceil(x * individual_modification_factor) for x in log_distinct_values]
        
        # testing
        actual_split_factor = reduce(operator.mul, split_factors, 1)
        assert actual_split_factor > 0, "there was a split up factor of 0"
        estimated_chunksize = self.table_size / actual_split_factor
        assert estimated_chunksize <= self.target_chunksize, "chunks should be smaller, not larger than target_chunksize"
        allowed_percentage = 0.55
        if estimated_chunksize < allowed_percentage * self.target_chunksize:
            print(f"Warning: chunks should not be too much smaller than target_chunksize: {estimated_chunksize} < {allowed_percentage} * {self.target_chunksize}")
        #assert estimated_chunksize >= allowed_percentage * self.target_chunksize, f"chunks should not be too much smaller than target_chunksize: {estimated_chunksize} < {allowed_percentage} * {self.target_chunksize}"
        
        return split_factors    

In [98]:
assert_correct_statistics_loaded()

def extract_single_table(table_scans, table_name):
    return table_scans[table_scans['TABLE_NAME'] == table_name]

def get_table_names(table_scans):
    return table_scans['TABLE_NAME'].unique()

def extract_probe_side_joins(joins, table_name):
    return joins[joins['PROBE_TABLE'] == table_name]


def default_benchmark_config():    
    if BENCHMARK == "TPCH":
        config = {
            'lineitem': [['l_shipdate', 92 * SCALE_FACTOR]],
            'orders': [['o_orderdate', 23 * SCALE_FACTOR]]
        }
    elif BENCHMARK == "TPCDS":
        config = dict()
    else:        
        raise Exception("unknown benchmark, please provide a default config")
    return config

def get_correlations():
    if BENCHMARK == "TPCH":
        correlations = {
            'lineitem': {
                'l_shipdate': ['l_receiptdate', 'l_commitdate'],
                'l_receiptdate': ['l_shipdate', 'l_commitdate'],
            }
        }
    elif BENCHMARK == "TPCDS":
        correlations = dict()
    else:
        raise Exception("unknown benchmark, please provide correlation information")
        
    return correlations


def format_table_clustering(clustering_config):
    # input format: List of [ [(column, split)+ ], sorting_column, runtime ]
    # output format: List of [ (column, split)+ ] - sorting column integrated if necessary
    
    assert len(clustering_config) == 3, "config should have exactly three entries: clustering columns, sort column, runtime"
    clustering_columns = clustering_config[0]
    assert len(clustering_columns) <= 3, "atm the model is at most 3-dimensional"
    #print(f"clustering columns are {clustering_columns}")
    last_clustering_column = clustering_columns[-1]
    last_clustering_column_name = last_clustering_column[0]
    #print(f"last column is {last_clustering_column_name}")
    sorting_column = clustering_config[1]
    #print(f"sort column is {sorting_column}")
    
    result = clustering_columns
    if last_clustering_column_name != sorting_column:
        result = clustering_columns + [(sorting_column, 1)]
        
    #print(f"in: {clustering_config}")
    #print(f"out: {result}")
    
    return result

def get_config_name(clustering_config):
    # Input: config-dict
    
    # List of lists. Each secondary list contains clustering information for a table
    table_configs = [clustering_config[table] for table in clustering_config]
    config_entries = [[f"{config_entry[0]}-{config_entry[1]}" for config_entry in config] for config in table_configs]
    table_entries = ["_".join(config) for config in config_entries]
    return "_".join(table_entries)


def create_benchmark_configs():
    
    start_time = datetime.now()
    clusterings = {"default" : default_benchmark_config()}
    query_frequencies = get_query_frequencies()
    
    distinct_values = get_distinct_values_count()
    joins = load_join_statistics()    
    sorted_columns_during_creation = get_sorted_columns_during_creation()
    correlations = get_correlations()
    table_names = get_table_names(scans)
    for table_name in table_names:
        start_time_table = datetime.now()
        single_table_scans = extract_single_table(scans, table_name)
        probe_side_joins = joins#extract_probe_side_joins(joins, table_name)
        table_size = table_sizes[table_name]
        if table_size <= 3 * CHUNK_SIZE:
            print(f"Not computing clustering for {table_name}, as it has only {table_size} rows")
            continue

        model = SingleTableMdcModel(query_frequencies, table_name, single_table_scans, table_size, distinct_values[table_name], CHUNK_SIZE, correlations.get(table_name, {}), probe_side_joins, sorted_columns_during_creation)
        table_clusterings = model.suggest_clustering(3)
        for table_clustering in table_clusterings:
            config = default_benchmark_config()
            config[table_name] = format_table_clustering(table_clustering)
            config_name = get_config_name(config)
            clusterings[config_name] = config
        end_time_table = datetime.now()
        print(f"Done computing clustering for {table_name} ({end_time_table - start_time_table})")

            
    end_time = datetime.now()
    print(f"Computed all clusterings in {end_time - start_time}")
    
    return clusterings

create_benchmark_configs()

# TODO:
#  joins costs are multiplied with 0
#  still, the model suggests some join columns - why? are they useful for pruning?

Not computing clustering for date_dim, as it has only 73049 rows
Not computing clustering for reason, as it has only 35 rows
['cd_gender', 'cd_marital_status', 'cd_education_status', 'cd_demo_sk']
Done computing clustering for customer_demographics (0:00:02.177858)
Not computing clustering for customer_address, as it has only 50000 rows
Not computing clustering for household_demographics, as it has only 7200 rows
Not computing clustering for store, as it has only 12 rows
Not computing clustering for item, as it has only 18000 rows
Not computing clustering for time_dim, as it has only 86400 rows
Not computing clustering for promotion, as it has only 300 rows
['ss_sold_date_sk', 'ss_ticket_number', 'ss_hdemo_sk', 'ss_store_sk', 'ss_customer_sk', 'ss_item_sk', 'ss_addr_sk', 'ss_cdemo_sk', 'ss_sold_time_sk', 'ss_promo_sk']
Done computing clustering for store_sales (0:00:23.430186)
Computed all clusterings in 0:00:25.879292


{'default': {},
 'cd_demo_sk-13_cd_education_status-3': {'customer_demographics': [('cd_demo_sk',
    13),
   ('cd_education_status', 3)]},
 'cd_demo_sk-18_cd_gender-2_cd_education_status-1': {'customer_demographics': [('cd_demo_sk',
    18),
   ('cd_gender', 2),
   ('cd_education_status', 1)]},
 'cd_demo_sk-15_cd_marital_status-2_cd_education_status-1': {'customer_demographics': [('cd_demo_sk',
    15),
   ('cd_marital_status', 2),
   ('cd_education_status', 1)]},
 'ss_sold_date_sk-6_ss_ticket_number-9_ss_item_sk-1': {'store_sales': [('ss_sold_date_sk',
    6),
   ('ss_ticket_number', 9),
   ('ss_item_sk', 1)]},
 'ss_item_sk-8_ss_sold_date_sk-6_ss_ticket_number-1': {'store_sales': [('ss_item_sk',
    8),
   ('ss_sold_date_sk', 6),
   ('ss_ticket_number', 1)]},
 'ss_item_sk-6_ss_ticket_number-8_ss_sold_date_sk-1': {'store_sales': [('ss_item_sk',
    6),
   ('ss_ticket_number', 8),
   ('ss_sold_date_sk', 1)]}}

In [99]:
len(scans[scans['TABLE_NAME'] == 'orders'])

0

Outdated code fragments (older model versions) are kept below.

In [100]:
class SimpleModel(AbstractModel):
    
    def __init__(self, table_scans, correlations = {}):
        super().__init__(table_scans, correlations)        
    
    def suggest_clustering(self, first_k=1):
        interesting_columns = self.extract_interesting_columns()

        pairs = itertools.product(interesting_columns, interesting_columns)                
        total_runtimes = [self.estimate_total_runtime(self.table_scans, clustering_columns) for clustering_columns in pairs]
        total_runtimes.sort(key=lambda x: x[1], reverse=False)
        
        return total_runtimes[0:first_k]
        
    
    def estimate_total_runtime(self, single_table, clustering_columns):
        total_runtime = 0
        
        pruning_col = clustering_columns[0]
        sorted_col = clustering_columns[1]
        def compute_runtime(row):
            col_name = row['COLUMN_NAME']
            if pruning_col == sorted_col:
                if col_name == pruning_col:
                    return row['optimal_log_runtime']
                else:
                    if col_name in self.correlations.get(pruning_col, []):
                        # correlated to pruning column -> a lot of pruning, no sortedness
                        # TODO: better measure correlation
                        return 1.2 * row['optimal_runtime']
                    else:
                        return row['RUNTIME_NS']

            else:
                if col_name == pruning_col:
                    return row['optimal_runtime']
                elif col_name == sorted_col:
                    # TODO: should this be affected by correlation?
                    # we will get less chunks, so a linear scan should be close to optimal_runtime,
                    # but log time should beat it anyway
                    return row['log_runtime']
                else:
                    if col_name in self.correlations.get(pruning_col, []):
                        # correlated to pruning column -> a lot of pruning, no sortedness
                        # TODO: better measure correlation
                        return 1.2 * row['optimal_runtime']
                    else:
                        return row['RUNTIME_NS']
                    
        effective_runtime = single_table.apply(compute_runtime, axis=1)
        return [clustering_columns, effective_runtime.sum()]

In [101]:
# Store additional statistics
# TODO keep?

assert_correct_statistics_loaded()

def round_up_to_chunksize(row):
    if row['OUTPUT_ROWS'] % CHUNK_SIZE == 0:
        return row['OUTPUT_ROWS']
    else:
        return row['OUTPUT_ROWS'] + (CHUNK_SIZE - (row['OUTPUT_ROWS'] % CHUNK_SIZE))

scans['pruned_minimum_input_rows'] = scans.apply(round_up_to_chunksize, axis=1)

scans['selectivity'] = scans['OUTPUT_ROWS'] / scans['INPUT_ROWS']
scans['actual_selectivity'] = scans['SINGLE_OUTPUT_ROWS'] / scans['SINGLE_INPUT_ROWS']

scans['time_per_ir'] = scans['RUNTIME_NS'] / scans['INPUT_ROWS']
scans['time_per_or'] = scans['RUNTIME_NS'] / scans['OUTPUT_ROWS']

# optimal runtime assuming perfect pruning, but not sortedness
scans['optimal_runtime'] = scans['time_per_ir'] * scans['pruned_minimum_input_rows']
scans['runtime_gain'] = scans['RUNTIME_NS'] - scans['optimal_runtime']


# log runtime for sorted columns
scans['log_runtime'] = np.log2(scans['RUNTIME_NS'])
scans['optimal_log_runtime'] = np.log2(1+scans['optimal_runtime'])
scans

Unnamed: 0,QUERY_HASH,COLUMN_TYPE,TABLE_NAME,COLUMN_NAME,INPUT_ROWS,OUTPUT_ROWS,RUNTIME_NS,DESCRIPTION,SINGLE_INPUT_ROWS,SINGLE_OUTPUT_ROWS,...,benefits_from_sorting,useful_for_pruning,pruned_minimum_input_rows,actual_selectivity,time_per_ir,time_per_or,optimal_runtime,runtime_gain,log_runtime,optimal_log_runtime
0,8a500b8350e13fe5,DATA,date_dim,d_month_seq,73049,366,152194,TableScan Impl: ColumnBetween d_month_seq BETW...,73049,366,...,True,True,65535,0.005010,2.083451,415.830601,1.365390e+05,1.565505e+04,17.215552,17.058964
1,e045c72cfee071ed,DATA,date_dim,d_month_seq,73049,366,140251,TableScan Impl: ColumnBetween d_month_seq BETW...,73049,366,...,True,True,65535,0.005010,1.919958,383.199454,1.258244e+05,1.442656e+04,17.097652,16.941064
2,65cf99f6be86c7f4,DATA,reason,r_reason_desc,35,1,49172,TableScan Impl: ColumnVsValue r_reason_desc = ...,35,1,...,True,True,65535,0.028571,1404.914286,49172.000000,9.207106e+07,-9.202189e+07,15.585549,26.456244
3,c86892b6d2937a1c,REFERENCE,date_dim,d_moy,365,30,15576,TableScan Impl: ColumnVsValue d_moy = 11,73049,6000,...,True,True,65535,0.082137,42.673973,519.200000,2.796639e+06,-2.781063e+06,13.927037,21.415263
4,c86892b6d2937a1c,DATA,date_dim,d_year,73049,365,108750,TableScan Impl: ColumnVsValue d_year = 1998,73049,365,...,True,True,65535,0.004997,1.488727,297.945205,9.756371e+04,1.118629e+04,16.730656,16.574072
5,c86892b6d2937a1c,REFERENCE,customer_demographics,cd_marital_status,274400,54880,1477447,TableScan Impl: ColumnVsValue cd_marital_statu...,1920800,384160,...,True,False,65535,0.200000,5.384282,26.921410,3.528589e+05,1.124588e+06,20.494675,18.428736
6,c86892b6d2937a1c,REFERENCE,customer_demographics,cd_education_status,1920800,274400,4735751,TableScan Impl: ColumnVsValue cd_education_sta...,1920800,274400,...,True,False,327675,0.142857,2.465510,17.258568,8.078859e+05,3.927865e+06,22.175162,19.623794
7,c86892b6d2937a1c,REFERENCE,customer_demographics,cd_marital_status,274400,54880,969948,TableScan Impl: ColumnVsValue cd_marital_statu...,1920800,384160,...,True,False,65535,0.200000,3.534796,17.673980,2.316529e+05,7.382951e+05,19.887548,17.821611
8,c86892b6d2937a1c,REFERENCE,customer_demographics,cd_education_status,1920800,274400,4318423,TableScan Impl: ColumnVsValue cd_education_sta...,1920800,274400,...,True,False,327675,0.142857,2.248242,15.737693,7.366927e+05,3.581730e+06,22.042073,19.490705
9,c86892b6d2937a1c,DATA,customer_address,ca_gmt_offset,50000,5449,127748,TableScan Impl: ColumnVsValue ca_gmt_offset = -7,50000,5449,...,True,True,65535,0.108980,2.554960,23.444302,1.674393e+05,-3.969130e+04,16.962941,17.353287


In [102]:
GAIN_COLUMN = 'runtime_gain'

scans_groupby_columnname = scans.groupby(['TABLE_NAME', 'COLUMN_NAME'])
sum_of_gains = pd.DataFrame(scans_groupby_columnname[GAIN_COLUMN].sum())
sum_of_gains.sort_values(by=['TABLE_NAME', GAIN_COLUMN], ascending=[True, False])

Unnamed: 0_level_0,Unnamed: 1_level_0,runtime_gain
TABLE_NAME,COLUMN_NAME,Unnamed: 2_level_1
customer_address,ca_gmt_offset,-39691.3
customer_address,ca_country,-305703.0
customer_address,ca_state,-856865.6
customer_address,ca_county,-2504912.0
customer_demographics,cd_education_status,47690110.0
customer_demographics,cd_marital_status,12014160.0
customer_demographics,cd_gender,-396349.7
date_dim,d_year,434619.2
date_dim,d_date,64552.78
date_dim,d_month_seq,58776.54


In [103]:
assert_correct_statistics_loaded()

if BENCHMARK == "TPCH":
    TABLE = "lineitem"
else:    
    TABLE = "customer_demographics"

import itertools

def extract_single_table(table_name):
    return scans[scans['TABLE_NAME'] == table_name]

def extract_interesting_columns(df):
    return list(df['COLUMN_NAME'].unique())


correlations = {
    'l_shipdate': ['l_receiptdate', 'l_commitdate'],
    'l_receiptdate': ['l_shipdate', 'l_commitdate'],
    'l_commitdate': ['l_receiptdate', 'l_shipdate']
}
#correlations = {}
def table_sorting_options(table_name):
    single_table = extract_single_table(table_name)
    interesting_cols = extract_interesting_columns(single_table)
    pairs = itertools.product(interesting_cols, interesting_cols)
    
    total_times = []
    for pair in pairs:
        pruning_col = pair[0]
        sorted_col = pair[1]

        def compute_runtime(row):
            col_name = row['COLUMN_NAME']
            if pruning_col == sorted_col:
                if col_name == pruning_col:
                    return row['optimal_log_runtime']
                else:
                    if col_name in correlations.get(pruning_col, []):
                        # correlated to pruning column -> a lot of pruning, no sortedness
                        # TODO: better measure correlation
                        return 1.2 * row['optimal_runtime']
                    else:
                        return row['RUNTIME_NS']

            else:
                if col_name == pruning_col:
                    return row['optimal_runtime']
                elif col_name == sorted_col:
                    # TODO: should this be affected by correlation?
                    # we will get less chunks, so a linear scan should be close to optimal_runtime,
                    # but log time should beat it anyway
                    return row['log_runtime']
                else:
                    if col_name in correlations.get(pruning_col, []):
                        # correlated to pruning column -> a lot of pruning, no sortedness
                        # TODO: better measure correlation
                        return 1.2 * row['optimal_runtime']
                    else:
                        return row['RUNTIME_NS']

        effective_runtime = single_table.apply(compute_runtime, axis=1)
        total_times.append([pair, effective_runtime.sum()])    
    total_times = pd.DataFrame(total_times, columns=['columns', 'time'])    
    return total_times

options = table_sorting_options(TABLE)
options.sort_values(by=['time'], ascending=True)

Unnamed: 0,columns,time
1,"(cd_marital_status, cd_education_status)",5811389.0
3,"(cd_education_status, cd_marital_status)",11850650.0
4,"(cd_education_status, cd_education_status)",17825520.0
7,"(cd_gender, cd_education_status)",18221900.0
5,"(cd_education_status, cd_gender)",25592790.0
0,"(cd_marital_status, cd_marital_status)",59540730.0
6,"(cd_gender, cd_marital_status)",59937110.0
2,"(cd_marital_status, cd_gender)",61268730.0
8,"(cd_gender, cd_gender)",73282900.0


In [104]:
aggregates = pd.read_csv(f"{STATISTICS_PATH}/aggregates.csv", sep=',')

# it looks like column names are mixed up.
# COLUMN_NAME -> actually GROUP_BY_COLUMN_COUNT
# GROUP_BY_COLUMN_COUNT -> actually AGGREGATE_COLUMN_COUNT
# AGGREGATE_COLUMN_COUNT -> actually COLUMN_NAME

COL_NAME = 'AGGREGATE_COLUMN_COUNT'
GROUPBY_COL = 'COLUMN_NAME'
AGG_COL = 'GROUP_BY_COLUMN_COUNT'

# All aggregates have to read the entire table, so we cannot skip chunks.
# But getting all groups consecutive could provide a speedup
# As a result, we care only about aggregates with group by columns

interesting_aggregates = aggregates[aggregates[GROUPBY_COL] > 0]
stats = interesting_aggregates.groupby(['TABLE_NAME', COL_NAME])
out_columns = pd.DataFrame(stats['OUTPUT_ROWS'].max())
out_columns.sort_values(by=['TABLE_NAME', 'OUTPUT_ROWS'], ascending=[True, False])
aggregates[aggregates['COLUMN_TYPE'] == 'DATA']

Unnamed: 0,QUERY_HASH,AGGREGATE_HASH,COLUMN_TYPE,TABLE_NAME,COLUMN_NAME,GROUP_BY_COLUMN_COUNT,AGGREGATE_COLUMN_COUNT,INPUT_ROWS,OUTPUT_ROWS,RUNTIME_NS,DESCRIPTION


In [105]:
scan_time_per_column = scans.groupby(['COLUMN_NAME'])
accumulated_scan_times = pd.DataFrame(scan_time_per_column['RUNTIME_NS'].sum())
total_scan_runtime = accumulated_scan_times['RUNTIME_NS'].sum()
assert total_scan_runtime == scans['RUNTIME_NS'].sum(), f"{total_scan_runtime}, {scans['RUNTIME_NS'].sum()}"
print(f"total scan runtime: {total_scan_runtime}")

scan_time_per_column_prunable = scans[scans['useful_for_pruning']].groupby(['COLUMN_NAME'])
accumulated_prunable_scan_times = pd.DataFrame(scan_time_per_column_prunable['RUNTIME_NS'].sum())
total_prunable_scan_runtime = accumulated_prunable_scan_times['RUNTIME_NS'].sum()
print(f"total prunable scan runtime: {total_prunable_scan_runtime}")
print(f"{100*total_prunable_scan_runtime/total_scan_runtime}% of scan runtime amount to prunable scans")

accumulated_scan_times.sort_values(['RUNTIME_NS'], ascending=False)

total scan runtime: 173950691
total prunable scan runtime: 16193806
9.309423208902343% of scan runtime amount to prunable scans


Unnamed: 0_level_0,RUNTIME_NS
COLUMN_NAME,Unnamed: 1_level_1
cd_education_status,57499044
ss_quantity,31666385
ss_net_profit,20790286
cd_marital_status,15783812
ss_wholesale_cost,9447825
ss_coupon_amt,8500996
ca_county,8062155
ss_list_price,6831822
d_year,4225246
ca_state,2473654


In [106]:
joins = load_join_statistics()

print(joins['PROBE_COLUMN'].unique())

join_time_per_column = joins.groupby(['PROBE_COLUMN'])

accumulated_join_times = pd.DataFrame(join_time_per_column['RUNTIME_NS'].sum())
print(len(accumulated_join_times))
total_join_runtime = accumulated_join_times['RUNTIME_NS'].sum()
#assert total_join_runtime == joins['RUNTIME_NS'].sum(), f"{total_join_runtime},{joins['RUNTIME_NS'].sum()}"
print(f"total join runtime: {total_join_runtime}")

joins[joins.apply(lambda x : x['PROBE_COLUMN'] not in ['o_custkey' ,'n_nationkey' ,'s_nationkey' ,'l_suppkey', 's_suppkey',
 'l_orderkey', 'o_orderkey', 'p_partkey' ,'l_partkey' ,'ps_suppkey',
 'c_nationkey' ,'r_regionkey' ,'c_custkey' ,'ps_partkey'] ,axis=1)]

print(f"for {BENCHMARK}, joins take about {total_join_runtime / total_scan_runtime} times longer than table scans")
accumulated_join_times.sort_values(['RUNTIME_NS'], ascending=False)

['cs_ship_date_sk' 'cs_warehouse_sk' 'cs_ship_mode_sk' 'cs_call_center_sk'
 'ss_sold_date_sk' 'cs_sold_date_sk' nan 'ss_ticket_number' 'sr_reason_sk'
 'cr_returned_date_sk' 'c_current_addr_sk' 'c_current_hdemo_sk'
 'ca_address_sk' 'cd_demo_sk' 'c_current_cdemo_sk'
 'cr_returning_customer_sk' 'c_customer_sk' 'd_date_sk'
 'cc_call_center_sk' 'wr_refunded_addr_sk' 'wr_reason_sk'
 'ws_sold_date_sk' 'ws_web_page_sk' 'wr_order_number' 'ws_order_number'
 'cr_returning_addr_sk' 'ca_state' 'ss_hdemo_sk' 'ss_store_sk'
 'ss_customer_sk' 'ss_item_sk' 'sr_returned_date_sk' 'ss_addr_sk'
 'ss_cdemo_sk' 'cs_bill_customer_sk' 'i_item_sk' 'cs_ship_customer_sk'
 'ws_ship_date_sk' 'ws_warehouse_sk' 'ws_web_site_sk' 'ws_ship_mode_sk'
 'ws_bill_customer_sk' 'ws_item_sk' 'i_item_id' 'ss_sold_time_sk'
 't_time_sk' 'cs_promo_sk' 'cs_bill_cdemo_sk' 'hd_demo_sk' 'sr_store_sk'
 'ss_promo_sk' 's_store_sk' 'ca_county' 'ws_bill_addr_sk' 'd_week_seq'
 'd_date' 'wr_returned_date_sk' 'd_month_seq' 'i_category' 'inv_dat

Unnamed: 0_level_0,RUNTIME_NS
PROBE_COLUMN,Unnamed: 1_level_1
ss_sold_date_sk,616827686
ss_ticket_number,544841337
inv_date_sk,217672730
ss_sold_time_sk,167653057
cs_sold_date_sk,118667504
cd_demo_sk,107854544
ss_store_sk,96261621
c_customer_sk,78990087
inv_item_sk,50915873
ca_address_sk,49668151
