In [1]:
# Config

BENCHMARK = "TPCH"

# Reject unsupported benchmarks before any benchmark-specific setting is derived.
if BENCHMARK not in ("TPCH", "TPCDS"):
    raise Exception("Unknown benchmark: " + BENCHMARK)

# Both supported benchmarks are evaluated at the same scale factor.
SCALE_FACTOR = 1

# The statistics directory and the chunk size differ per benchmark.
if BENCHMARK == "TPCDS":
    STATISTICS_PATH = f"TPC-DS__SF_{SCALE_FACTOR}.000000__RUNS_1"
    CHUNK_SIZE = 65000
else:
    STATISTICS_PATH = f"TPC-H__SF_{SCALE_FACTOR}.000000__RUNS_10"
    CHUNK_SIZE = 25000
    
import pandas as pd
import numpy as np
from functools import reduce
from collections import Counter
import itertools
import math
import operator
import json

print(f"Model is configured for {BENCHMARK} with scale factor {SCALE_FACTOR} and a chunk size of {CHUNK_SIZE}")

Model is configured for TPCH with scale factor 1 and a chunk size of 25000


In [2]:
# Load table scan statistics

# The table scan statistics are a '|'-separated CSV inside the statistics directory.
path = f"{STATISTICS_PATH}/table_scans.csv"
scans = pd.read_csv(path, sep='|')
# Snapshot the row count and the configuration that was active while loading, so that
# assert_correct_statistics_loaded() can detect a later config change or an accidental
# reassignment of `scans`.
EXPECTED_SCAN_COUNT = len(scans)
LOADED_BENCHMARK = BENCHMARK
LOADED_SCALE_FACTOR = SCALE_FACTOR
LOADED_CHUNK_SIZE = CHUNK_SIZE
print(f"Successfully loaded {path}")

def assert_correct_statistics_loaded():
    """Guard against hidden notebook state before a cell works on `scans`.

    Compares the current configuration globals (BENCHMARK, SCALE_FACTOR, CHUNK_SIZE)
    with the values snapshotted when the statistics were loaded, checks that `scans`
    still has EXPECTED_SCAN_COUNT rows, and checks that the loaded statistics contain
    the 'OPERATOR_POINTER' column.

    Raises:
        AssertionError: if any of these checks fails; the message names the mismatch.
    """
    assert BENCHMARK == LOADED_BENCHMARK, f"The model is configured to use {BENCHMARK}, but {LOADED_BENCHMARK} is currently loaded.\nEither change the benchmark or re-run all cells"
    assert SCALE_FACTOR == LOADED_SCALE_FACTOR, f"The model is configured to use {SCALE_FACTOR} as scale factor, but data for a scale factor of {LOADED_SCALE_FACTOR} is currently loaded.\nEither change the benchmark or re-run all cells"
    assert CHUNK_SIZE == LOADED_CHUNK_SIZE, f"The model is configured to use {CHUNK_SIZE} as chunk_size, but data for a chunk size of {LOADED_CHUNK_SIZE} is currently loaded.\nEither change the benchmark or re-run all cells"
    # Any change in the row count counts as a failure (the check is `==`), not only a shrinking frame.
    assert EXPECTED_SCAN_COUNT == len(scans), f"There should be {EXPECTED_SCAN_COUNT} table scans, but there are only {len(scans)}\nProbably one of the last commands reassigned it unintentionally"
    
    # Later cells group the scans by OPERATOR_POINTER, so statistics without that column cannot be used.
    assert 'OPERATOR_POINTER' in scans.columns, f"the statistics in {STATISTICS_PATH} are outdated (column 'OPERATOR_POINTER' in table_scans.csv is missing). Please create them again."

Successfully loaded TPC-H__SF_1.000000__RUNS_10/table_scans.csv


In [3]:
# Validate table scans
assert_correct_statistics_loaded()

# To make sure pruning was not active,
# first fetch table sizes,
# Table meta data is a '|'-separated CSV; its `table_name` and `row_count` columns are used here.
table_statistics = pd.read_csv(f"{STATISTICS_PATH}/table_meta_data.csv", sep='|')
# Lookup: table name -> row count of that table.
table_sizes = dict(zip(table_statistics.table_name, table_statistics.row_count))

# then make sure INPUT_ROWS == table_size
def input_size_matches(row):
    """Tell whether a scan read exactly as many rows as its table contains.

    `row` must provide 'INPUT_ROWS' and 'TABLE_NAME'; the expected row count is
    looked up in the module-level `table_sizes` dictionary.
    """
    scanned_rows = row['INPUT_ROWS']
    return table_sizes[row['TABLE_NAME']] == scanned_rows

# Only scans on DATA columns are compared against the table size.
data_scans = scans[scans['COLUMN_TYPE'] == 'DATA']

# Store the per-scan result under its own name: rebinding `input_size_matches` would
# replace the helper function with its result and leave the name pointing at data.
sizes_match = data_scans.apply(input_size_matches, axis=1)

# np.all() is also defined for zero scans (vacuously True), whereas reduce() without an
# initial value raises a TypeError on an empty sequence.
all_sizes_match = bool(np.all(sizes_match.to_numpy()))

if not all_sizes_match:
    raise Exception("The given statistics were probably created while pruning was active")
else:
    print("OK - looks like pruning was deactivated while the statistics were created")

OK - looks like pruning was deactivated while the statistics were created


In [4]:
# Cleanse table scans
assert_correct_statistics_loaded()

print(f"Statistics for {BENCHMARK} contain {len(scans)} table scans")

# If there is only one chunk, we cannot really prune
# NOTE: this check was moved to the caller of the model
#scans = scans[scans['INPUT_ROWS'] > CHUNK_SIZE]
#print(f"Of those, only {len(scans)} operate on more than {CHUNK_SIZE} tuples")



def determine_or_chains(table_scans):
    """Flag the scans that are part of an OR-chain.

    Scans are grouped by (QUERY_HASH, TABLE_NAME, OPERATOR_POINTER). Within a group,
    scans that share an INPUT_ROWS value with another scan of the group seed the chain.
    The chain then grows transitively: every scan whose INPUT_ROWS equals the
    OUTPUT_ROWS of a scan already in the chain is added, until nothing changes.

    Args:
        table_scans: DataFrame with the columns QUERY_HASH, TABLE_NAME,
            OPERATOR_POINTER, INPUT_ROWS and OUTPUT_ROWS.

    Returns:
        The same DataFrame object; a boolean column 'part_of_or_chain' is added
        (or overwritten) in place.
    """
    table_scans['part_of_or_chain'] = False

    single_table_scans = table_scans.groupby(['QUERY_HASH', 'TABLE_NAME', 'OPERATOR_POINTER'])

    for _, operator_scans in single_table_scans:
        input_rows = operator_scans['INPUT_ROWS']
        output_rows = operator_scans['OUTPUT_ROWS']

        # Seed: input sizes that occur more than once within this operator's scans.
        input_row_frequencies = Counter(input_rows)
        or_input_sizes = {input_size for input_size, frequency in input_row_frequencies.items() if frequency > 1}
        in_chain = input_rows.isin(or_input_sizes)

        # Propagate until a fixpoint is reached. The set of sizes only grows, so the
        # mask grows monotonically and the loop ends after at most len(operator_scans) rounds.
        while True:
            or_input_sizes |= set(output_rows[in_chain].unique())
            grown = input_rows.isin(or_input_sizes)
            if grown.equals(in_chain):
                break
            in_chain = grown

        # The group keeps the index *labels* of table_scans, so the flagged rows must be
        # addressed with .loc; .iloc would treat the labels as positions, which is only
        # correct for a default RangeIndex.
        table_scans.loc[in_chain.index[in_chain], 'part_of_or_chain'] = True

    return table_scans

scans = determine_or_chains(scans)


# Like scans are not useful if they start with %
# TODO what if they dont start with % and contain more than one % ? -> up to first % prunable, but is it used?
def is_useful(row):
    """Decide whether a scan can contribute to pruning.

    A scan is not useful if it is part of an OR-chain, if it is a LIKE scan whose
    pattern has '%' at index 1 of the last whitespace-separated token of the
    description, or if it is an ExpressionEvaluator scan containing " IN ".
    Every other scan is considered useful.
    """
    description = row['DESCRIPTION']
    if row['part_of_or_chain']:
        return False

    if "ColumnLike" in description:
        # The pattern is taken to be the last token of the description.
        like_criteria = description.split()[-1]
        assert "%" in like_criteria, f"LIKE operators should have an %, but found none in {like_criteria}"
        # Index 1 is inspected (not index 0); presumably index 0 is the opening quote - TODO confirm.
        return like_criteria[1] != '%'

    is_in_expression = "ExpressionEvaluator" in description and " IN " in description
    return not is_in_expression
    
# Scans are only flagged, not filtered out (former filter: scans = scans[scans.apply(is_useful, axis=1)]),
# because all scans are needed to recognize OR-chains.
scans['useful_for_pruning'] = scans.apply(is_useful, axis=1)
# Refresh the expected row count that assert_correct_statistics_loaded() compares against.
EXPECTED_SCAN_COUNT = len(scans)
print(f"Of those, only {len(scans[scans['useful_for_pruning']])} are useful for pruning")

print("TODO: For now, filtering on scans is deactivated. This is because all scans are needed to recognize OR-Chains. Models have to take care themselves whether a scan can contribute to pruning or not")

Statistics for TPCH contain 436 table scans
Of those, only 228 are useful for pruning
TODO: For now, filtering on scans is deactivated. This is because all scans are needed to recognize OR-Chains. Models have to take care themselves whether a scan can contribute to pruning or not


In [5]:
def test_determine_or_chains():
    """Check determine_or_chains() on a hand-made frame with three operators.

    Only the last operator ('0x3') contains a repeated input size, so exactly its
    last three scans are expected to be flagged as part of an OR-chain.
    """
    test = pd.DataFrame({
        'QUERY_HASH': pd.Series(['1']*3  + ['2']*4),
        'TABLE_NAME': pd.Series(['lineitem']*3  + ['part']*4),
        'OPERATOR_POINTER': pd.Series(['0x1'] + ['0x2']*2 + ['0x3']*4),
        'COLUMN_NAME': pd.Series(['l_shipdate', 'l_shipdate', 'l_discount', 'p_brand', 'p_type', 'p_type', 'p_size']),
        'INPUT_ROWS': pd.Series([6001215, 6001215, 200000, 200000, 199000, 199000, 50000]),
        'OUTPUT_ROWS': pd.Series([400000, 300000, 200000, 199000, 0, 50000, 20000]),
    })

    test_result = determine_or_chains(test)
    flagged = test_result[test_result['part_of_or_chain']]

    assert len(test_result) == 7, "should not filter out any rows"
    assert len(flagged) == 3, "expected 3 scans, got\n" + str(test_result)
    assert list(test_result['part_of_or_chain']) == [False]*4 + [True] * 3
    print("Test OK")

test_determine_or_chains()

Test OK


In [6]:
(scans['RUNTIME_NS'] - scans['SINGLE_RUNTIME_NS']).max()
# TODO can the actual runtime be that much greater than the runtime on the original table?

17026925

In [7]:
# Store additional statistics
# TODO keep?

assert_correct_statistics_loaded()

def round_up_to_chunksize(row):
    """Round the scan's OUTPUT_ROWS up to the next multiple of CHUNK_SIZE.

    Values that already are a multiple of CHUNK_SIZE (including 0) are returned unchanged.
    """
    output_rows = row['OUTPUT_ROWS']
    remainder = output_rows % CHUNK_SIZE
    if remainder == 0:
        return output_rows
    return output_rows + (CHUNK_SIZE - remainder)

# Smallest input a scan could have under perfect pruning: its output, rounded up to whole chunks.
# For tables smaller than one chunk this exceeds INPUT_ROWS (see the `nation` row in the
# output below), which makes `runtime_gain` negative for those scans.
scans['pruned_minimum_input_rows'] = scans.apply(round_up_to_chunksize, axis=1)

# Selectivity of the scan as executed in the query vs. the same scan on the original table.
scans['selectivity'] = scans['OUTPUT_ROWS'] / scans['INPUT_ROWS']
scans['actual_selectivity'] = scans['SINGLE_OUTPUT_ROWS'] / scans['SINGLE_INPUT_ROWS']

# Runtime per input row / per output row.
# NOTE(review): OUTPUT_ROWS can be 0 (the test frame above contains such a scan); the
# division then does not raise but yields inf in 'time_per_or' - confirm this is acceptable.
scans['time_per_ir'] = scans['RUNTIME_NS'] / scans['INPUT_ROWS']
scans['time_per_or'] = scans['RUNTIME_NS'] / scans['OUTPUT_ROWS']

# optimal runtime assuming perfect pruning, but not sortedness
scans['optimal_runtime'] = scans['time_per_ir'] * scans['pruned_minimum_input_rows']
scans['runtime_gain'] = scans['RUNTIME_NS'] - scans['optimal_runtime']


# log runtime for sorted columns
scans['log_runtime'] = np.log2(scans['RUNTIME_NS'])
# 1 is added before taking the logarithm so that an optimal runtime of 0 maps to 0 instead of -inf.
scans['optimal_log_runtime'] = np.log2(1+scans['optimal_runtime'])
scans

Unnamed: 0,QUERY_HASH,COLUMN_TYPE,TABLE_NAME,COLUMN_NAME,INPUT_ROWS,OUTPUT_ROWS,RUNTIME_NS,DESCRIPTION,SINGLE_INPUT_ROWS,SINGLE_OUTPUT_ROWS,...,useful_for_pruning,pruned_minimum_input_rows,selectivity,actual_selectivity,time_per_ir,time_per_or,optimal_runtime,runtime_gain,log_runtime,optimal_log_runtime
0,bac00c0bdbf62ea,REFERENCE,customer,c_phone,68045,19042,12095326,TableScan Impl: ExpressionEvaluator (SUBSTR(c_...,150000,41912,...,False,25000,0.279844,0.279413,177.754809,635.191997,4.443870e+06,7.651456e+06,23.527946,22.083386
1,2687bf4da454552b,REFERENCE,customer,c_phone,68036,18961,12076538,TableScan Impl: ExpressionEvaluator (SUBSTR(c_...,150000,41865,...,False,25000,0.278691,0.279100,177.502175,636.914614,4.437554e+06,7.638984e+06,23.525704,22.081334
2,2bd757c748d34189,REFERENCE,customer,c_phone,67946,19133,12057470,TableScan Impl: ExpressionEvaluator (SUBSTR(c_...,150000,41972,...,False,25000,0.281591,0.279813,177.456657,630.192338,4.436416e+06,7.621054e+06,23.523424,22.080964
3,7ce8aa4cc8eabfd8,REFERENCE,customer,c_phone,68204,18942,12351032,TableScan Impl: ExpressionEvaluator (SUBSTR(c_...,150000,41801,...,False,25000,0.277726,0.278673,181.089555,652.044768,4.527239e+06,7.823793e+06,23.558128,22.110200
4,3bf533ddc6f54ed4,REFERENCE,customer,c_phone,67817,19151,12183192,TableScan Impl: ExpressionEvaluator (SUBSTR(c_...,150000,42096,...,False,25000,0.282392,0.280640,179.648053,636.164796,4.491201e+06,7.691991e+06,23.538389,22.098670
5,74335a369db42f54,REFERENCE,customer,c_phone,68049,19164,12594933,TableScan Impl: ExpressionEvaluator (SUBSTR(c_...,150000,42083,...,False,25000,0.281621,0.280553,185.086232,657.218378,4.627156e+06,7.967777e+06,23.586340,22.141695
6,aeebd3c094273d3e,REFERENCE,customer,c_phone,68034,19072,12190876,TableScan Impl: ExpressionEvaluator (SUBSTR(c_...,150000,42281,...,False,25000,0.280330,0.281873,179.187994,639.202810,4.479700e+06,7.711176e+06,23.539298,22.094971
7,6730c267d3eac48a,DATA,orders,o_orderstatus,1500000,729413,8259119,TableScan Impl: ColumnVsValue o_orderstatus = 'F',1500000,729413,...,True,750000,0.486275,0.486275,5.506079,11.322967,4.129560e+06,4.129560e+06,22.977556,21.977557
8,6730c267d3eac48a,DATA,nation,n_name,25,1,13829,TableScan Impl: ColumnVsValue n_name = 'IRAN',25,1,...,True,25000,0.040000,0.040000,553.160000,13829.000000,1.382900e+07,-1.381517e+07,13.755409,23.721194
9,6ec3126b032024be,DATA,orders,o_orderstatus,1500000,729413,8211345,TableScan Impl: ColumnVsValue o_orderstatus = 'F',1500000,729413,...,True,750000,0.486275,0.486275,5.474230,11.257470,4.105673e+06,4.105672e+06,22.969187,21.969187


In [8]:
# Load column statistics - especially interesting: number of distinct values

# Returns a 2-level-dictionary: distinct_values[TABLE][COLUMN] = number_of_distinct_values
def get_distinct_values_count():
    """Load the number of distinct values per column from column_meta_data.csv.

    Returns:
        A two-level dictionary: distinct_values[table][column] = number of distinct values.

    Raises:
        AssertionError: if the number of tables does not match the configured
            benchmark, or if no check exists for the configured benchmark.
    """
    column_statistics_df = pd.read_csv(f"{STATISTICS_PATH}/column_meta_data.csv", sep='|')
    column_statistics_df['distinct_values'] = np.int32(column_statistics_df['distinct_values'])

    # One inner dictionary (column -> distinct value count) per table.
    distinct_values = {}
    for table, column_df in column_statistics_df.groupby('table_name'):
        distinct_values[table] = dict(zip(column_df.column_name, column_df.distinct_values))

    # Sanity check: the statistics must cover every table of the benchmark.
    num_tables = len(distinct_values)
    if BENCHMARK == "TPCH":
        assert num_tables == 8, f"TPCH has 8 tables, but got {num_tables}"
    elif BENCHMARK == "TPCDS":
        assert num_tables == 24, f"TPCDS has 24 tables, but got {num_tables}"
    else:
        assert False, "Insert a benchmark specific check here"

    return distinct_values

In [9]:
class AbstractModel:
    """Common base of the clustering models.

    Attributes:
        table_scans: the scan statistics the model works on; must provide a
            'COLUMN_NAME' column for extract_interesting_columns().
        correlations: dictionary mapping a column to the columns correlated with it.
    """

    def __init__(self, table_scans, correlations=None):
        """Store the scans and the correlation information.

        `correlations` defaults to a fresh empty dictionary per instance; a mutable
        default argument would be shared between all instances.
        """
        self.table_scans = table_scans
        self.correlations = {} if correlations is None else correlations

    def extract_interesting_columns(self):
        """Return the distinct column names that occur in the scans, in order of first appearance."""
        return list(self.table_scans['COLUMN_NAME'].unique())

    def round_up_to_next_multiple(self, number_to_round, base_for_multiple):
        """Return the smallest multiple of `base_for_multiple` that is >= `number_to_round`."""
        quotient = number_to_round // base_for_multiple
        if number_to_round % base_for_multiple != 0:
            quotient += 1
        return quotient * base_for_multiple

    # return a list of possible clusterings
    def suggest_clustering(self, first_k=1):
        """Return the `first_k` best clusterings; must be implemented by subclasses.

        Raises:
            NotImplementedError: always, in this base class.
        """
        # NotImplemented is a comparison sentinel and not callable; the exception type
        # for abstract methods is NotImplementedError.
        raise NotImplementedError()

In [10]:
class SimpleModel(AbstractModel):
    """Ranks (pruning column, sorted column) pairs by the summed estimated scan runtime."""

    def __init__(self, table_scans, correlations = {}):
        super().__init__(table_scans, correlations)

    def suggest_clustering(self, first_k=1):
        """Return the `first_k` cheapest [column pair, total runtime] entries."""
        interesting_columns = self.extract_interesting_columns()

        # Every ordered pair of interesting columns is a candidate, including (c, c).
        candidates = [
            self.estimate_total_runtime(self.table_scans, column_pair)
            for column_pair in itertools.product(interesting_columns, interesting_columns)
        ]
        ranked = sorted(candidates, key=lambda candidate: candidate[1])

        return ranked[:first_k]

    def estimate_total_runtime(self, single_table, clustering_columns):
        """Estimate the summed runtime of all scans in `single_table` for one column pair.

        `clustering_columns[0]` is the pruning column, `clustering_columns[1]` the
        sorted column. Returns the list [clustering_columns, total runtime].
        """
        pruning_col, sorted_col = clustering_columns[0], clustering_columns[1]

        def compute_runtime(row):
            col_name = row['COLUMN_NAME']

            if col_name == pruning_col:
                # Scan on the pruning column: logarithmic if that column is also the sorted one.
                if pruning_col == sorted_col:
                    return row['optimal_log_runtime']
                return row['optimal_runtime']

            if col_name == sorted_col:
                # Only reachable when pruning and sorted column differ.
                # TODO: should this be affected by correlation?
                # we will get less chunks, so a linear scan should be close to optimal_runtime,
                # but log time should beat it anyway
                return row['log_runtime']

            if col_name in self.correlations.get(pruning_col, []):
                # correlated to pruning column -> a lot of pruning, no sortedness
                # TODO: better measure correlation
                return 1.2 * row['optimal_runtime']

            # Neither clustered nor correlated: the measured runtime stays as it is.
            return row['RUNTIME_NS']

        effective_runtime = single_table.apply(compute_runtime, axis=1)
        return [clustering_columns, effective_runtime.sum()]

In [11]:
class SingleTableMdcModel(AbstractModel):
    """Estimates scan runtimes of a single table for one- and two-dimensional clusterings.

    Attributes:
        table_size: number of rows of the table.
        distinct_values: dictionary column name -> number of distinct values.
        target_chunksize: chunk size the clustering aims for; also the granularity
            to which estimated pruned table sizes are rounded up.
    """

    def __init__(self, table_scans, table_size, distinct_values, target_chunksize, correlations = {}):
        super().__init__(table_scans, correlations)
        self.table_size = table_size
        self.distinct_values = distinct_values
        self.target_chunksize = target_chunksize

    def suggest_clustering(self, first_k=1):
        """Return the `first_k` cheapest entries [ [(column, split)+], sorting_column, runtime ]."""
        interesting_columns = self.extract_interesting_columns()

        def unique(seq):
            # Order-preserving de-duplication; turns the pair (c, c) into the one-dimensional [c].
            seen = set()
            return [x for x in seq if not (x in seen or seen.add(x))]

        # Unordered pairs only: (a, b) and (b, a) would describe the same clustering.
        candidate_pairs = itertools.product(interesting_columns, interesting_columns)
        candidate_pairs = filter(lambda x: x[0] <= x[1], candidate_pairs)
        candidate_clusterings = [unique(clustering) for clustering in candidate_pairs]
        sort_columns = interesting_columns

        # Flatten the per-clustering result lists. chain.from_iterable also handles the case
        # of no candidates, where reduce() without an initial value raises a TypeError.
        clusterings_with_runtimes = list(itertools.chain.from_iterable(
            self.estimate_total_runtime(clustering, sort_columns) for clustering in candidate_clusterings
        ))
        clusterings_with_runtimes.sort(key=lambda x: x[2], reverse=False)

        return clusterings_with_runtimes[0:first_k]


    def estimate_total_runtime(self, clustering_columns, sorting_columns):
        """Estimate the total scan runtime for one clustering, once per candidate sorting column.

        Args:
            clustering_columns: list of one or two column names to cluster by.
            sorting_columns: column names to evaluate as sort column.

        Returns:
            One entry [ [(column, split_factor)+], sorting_column, runtime ] per sorting column.
        """
        #print(f"testing clustering {clustering_columns} with sorting columns {sorting_columns}")
        split_factors = self.determine_split_factors(clustering_columns)

        def compute_unprunable_parts(row, split_factors): #TODO
            # Fraction of the table that this scan cannot prune under the given clustering.
            def clustering_columns_correlated_to(column):
                return [clustering_column for clustering_column in clustering_columns if column in self.correlations.get(clustering_column, {})]

            def correlates_to_clustering_column(column):
                return len(clustering_columns_correlated_to(column)) > 0

            column_name = row['COLUMN_NAME']

            if not row['useful_for_pruning']:
                selectivity = 1
            elif column_name in clustering_columns:
                scan_selectivity = row['selectivity']
                split_factor = split_factors[clustering_columns.index(column_name)]
                # Pruning works at the granularity of one cluster, i.e. 1 / split_factor of the table.
                selectivity =  self.round_up_to_next_multiple(scan_selectivity, 1 / split_factor)
            elif correlates_to_clustering_column(column_name):
                scan_selectivity = row['selectivity']
                correlated_clustering_columns = clustering_columns_correlated_to(column_name)

                # ToDo this is hacky, but for now assume there is just one correlated column
                assert len(correlated_clustering_columns) == 1, f"expected just 1 correlated clustering column, but got {len(correlated_clustering_columns)}"

                split_factor = split_factors[clustering_columns.index(correlated_clustering_columns[0])]
                # Correlated columns prune slightly worse: 20% penalty, capped at the whole table.
                selectivity = min(1, 1.2 * self.round_up_to_next_multiple(scan_selectivity, 1 / split_factor))
            else:
                selectivity = 1

            return selectivity

        def compute_runtimes(row, sorting_column):
            # TODO what about correlations?
            assert row['estimated_input_rows'] > 0, row
            assert row['runtime_per_row'] > 0, row
            row_count = row['estimated_input_rows']

            if row['COLUMN_NAME'] == sorting_column:
                # TODO is this the best way to simulate sorted access?
                row_count = np.log2(row_count)
            return row_count * row['runtime_per_row']

        total_runtimes = {sorting_column: 0 for sorting_column in sorting_columns}

        # Sorting first puts the scan with the largest input at position 0 of every group.
        scans_per_query = self.table_scans.sort_values(['INPUT_ROWS'], ascending=False).groupby(['QUERY_HASH', 'OPERATOR_POINTER'])
        for _, operator_scans in scans_per_query:
            number_of_scans = len(operator_scans)
            assert number_of_scans > 0 and number_of_scans < 25, f"weird scan length: {number_of_scans}\nScans:\n{operator_scans}"
            # TODO: kinda unrealistic assumption: everything not in the table scan result can be pruned

            unprunable_parts = operator_scans.apply(compute_unprunable_parts, axis=1, args=(split_factors,))
            unprunable_part = unprunable_parts.product()
            assert unprunable_part > 0, "no unprunable part"

            # Round to the chunk size this model was configured with rather than to the
            # notebook-global CHUNK_SIZE, so the estimate always matches `target_chunksize`.
            estimated_pruned_table_size = self.round_up_to_next_multiple(unprunable_part * self.table_size, self.target_chunksize)

            runtimes = pd.DataFrame()
            runtimes['runtime_per_row'] = operator_scans['time_per_ir']
            runtimes['COLUMN_NAME'] = operator_scans['COLUMN_NAME']
            # the pruned table inputs should be reflected in 'estimated_input_rows'
            runtimes['estimated_input_rows'] = operator_scans.apply(lambda x: x['INPUT_ROWS'], axis=1)

            # Only the first (largest) scan gets the pruned input size.
            runtimes.iloc[0, runtimes.columns.get_loc('estimated_input_rows')] = estimated_pruned_table_size
            assert runtimes['estimated_input_rows'].iloc[0] == estimated_pruned_table_size, f"value is {runtimes.iloc[0]['estimated_input_rows']}, but should be {estimated_pruned_table_size}"
            # TODO modify input sizes of subsequent scans

            for sorting_column in sorting_columns:
                total_runtimes[sorting_column] += runtimes.apply(compute_runtimes, axis=1, args=(sorting_column,)).sum()

        clusterings = [[list(zip(clustering_columns, split_factors)), sorting_column, np.int64(total_runtimes[sorting_column])] for sorting_column in sorting_columns]
        return clusterings

    def determine_split_factors(self, clustering_columns):
        """Choose one split factor per clustering column.

        The product of the factors is at least table_size / target_chunksize, so the
        resulting chunks are not larger than `target_chunksize`. The factors are
        weighted by ceil(0.5 + log2(distinct values)) of the respective column.
        """
        approximate_split_factor = self.table_size / self.target_chunksize
        individual_distinct_values = [self.distinct_values[column] for column in clustering_columns]
        log_distinct_values = [math.ceil(0.5+np.log2(x)) for x in individual_distinct_values]
        log_distinct_values_product = reduce(operator.mul, log_distinct_values, 1)
        assert log_distinct_values_product > 0, "cannot have a distinct value count of 0"

        # Scale all dimensions by the same factor so that the product reaches the required split.
        global_modification_factor = approximate_split_factor / log_distinct_values_product
        num_dimensions = len(clustering_columns)
        individual_modification_factor = np.power(global_modification_factor, 1.0 / num_dimensions)
        split_factors = [math.ceil(x * individual_modification_factor) for x in log_distinct_values]

        # testing
        actual_split_factor = reduce(operator.mul, split_factors, 1)
        assert actual_split_factor > 0, "there was a split up factor of 0"
        estimated_chunksize = self.table_size / actual_split_factor
        assert estimated_chunksize <= self.target_chunksize, "chunks should be smaller, not larger than target_chunksize"
        allowed_percentage = 0.55
        if estimated_chunksize < allowed_percentage * self.target_chunksize:
            print(f"Warning: chunks should not be too much smaller than target_chunksize: {estimated_chunksize} < {allowed_percentage} * {self.target_chunksize}")
        #assert estimated_chunksize >= allowed_percentage * self.target_chunksize, f"chunks should not be too much smaller than target_chunksize: {estimated_chunksize} < {allowed_percentage} * {self.target_chunksize}"

        return split_factors
        
        
    
    

In [12]:
assert_correct_statistics_loaded()

def extract_single_table(table_scans, table_name):
    """Return the rows of `table_scans` whose TABLE_NAME equals `table_name`."""
    belongs_to_table = table_scans['TABLE_NAME'] == table_name
    return table_scans.loc[belongs_to_table]

def get_table_names(table_scans):
    """Return the distinct values of the TABLE_NAME column, in order of first appearance."""
    name_column = table_scans['TABLE_NAME']
    return name_column.unique()



def default_benchmark_config():
    """Return the default clustering per table for the configured benchmark.

    The result maps a table name to a list of [column, split count] entries; for
    TPCH the split counts scale with SCALE_FACTOR, for TPCDS the config is empty.

    Raises:
        Exception: if BENCHMARK is neither "TPCH" nor "TPCDS".
    """
    if BENCHMARK == "TPCH":
        return {
            'lineitem': [['l_shipdate', 240 * SCALE_FACTOR + 1]],
            'orders': [['o_orderdate', 60 * SCALE_FACTOR]]
        }
    if BENCHMARK == "TPCDS":
        return dict()
    raise Exception("unknown benchmark, please provide a default config and correlation information")

def get_correlations():
    """Return the known column correlations for the configured benchmark.

    The result maps table name -> column -> list of columns correlated with it.

    Raises:
        Exception: if BENCHMARK is neither "TPCH" nor "TPCDS".
    """
    if BENCHMARK == "TPCH":
        return {
            'lineitem': {
                'l_shipdate': ['l_receiptdate', 'l_commitdate'],
                'l_receiptdate': ['l_shipdate', 'l_commitdate'],
            }
        }
    if BENCHMARK == "TPCDS":
        return dict()
    raise Exception("unknown benchmark")


def format_table_clustering(clustering_config):
    """Convert one model result into the clustering list used in benchmark configs.

    Input format:  [ [(column, split)+ ], sorting_column, runtime ]
    Output format: [ (column, split)+ ]; (sorting_column, 1) is appended unless the
    sorting column already is the last clustering column.
    """
    assert len(clustering_config) == 3, "config should have exactly three entries: clustering columns, sort column, runtime"
    clustering_columns = clustering_config[0]
    assert len(clustering_columns) <= 2, "atm the model is at most 2-dimensional"

    last_clustering_column_name = clustering_columns[-1][0]
    sorting_column = clustering_config[1]

    if last_clustering_column_name == sorting_column:
        # The last clustering dimension already is the sort column.
        return clustering_columns
    return clustering_columns + [(sorting_column, 1)]

def get_config_name(clustering_config):
    """Build a name for a config dict (table name -> list of (column, split) entries).

    Every entry becomes "<column>-<split>"; all entries of all tables are joined
    with "_" in the iteration order of the dictionary.
    """
    table_entries = []
    for table_config in clustering_config.values():
        entry_names = (f"{config_entry[0]}-{config_entry[1]}" for config_entry in table_config)
        table_entries.append("_".join(entry_names))
    return "_".join(table_entries)


def create_benchmark_configs():
    """Create the default config plus up to three suggested clusterings per table.

    Each suggestion replaces (or adds) the clustering of one table in a copy of the
    default config. Returns a dictionary config name -> config.
    """
    clusterings = {"default" : default_benchmark_config()}

    distinct_values = get_distinct_values_count()
    correlations = get_correlations()

    # Tables with at most five chunks worth of rows are skipped.
    minimum_table_size = 5 * CHUNK_SIZE

    for table_name in get_table_names(scans):
        table_size = table_sizes[table_name]
        if table_size <= minimum_table_size:
            continue

        single_table = extract_single_table(scans, table_name)
        model = SingleTableMdcModel(single_table, table_size, distinct_values[table_name], CHUNK_SIZE, correlations.get(table_name, {}))

        for table_clustering in model.suggest_clustering(3):
            config = default_benchmark_config()
            config[table_name] = format_table_clustering(table_clustering)
            # Configs that produce the same name overwrite each other.
            clusterings[get_config_name(config)] = config

    return clusterings

create_benchmark_configs()

{'default': {'lineitem': [['l_shipdate', 241]],
  'orders': [['o_orderdate', 60]]},
 'l_shipdate-241_o_orderdate-60_c_mktsegment-6_c_phone-1': {'lineitem': [['l_shipdate',
    241]],
  'orders': [['o_orderdate', 60]],
  'customer': [('c_mktsegment', 6), ('c_phone', 1)]},
 'l_shipdate-241_o_orderdate-60_c_phone-6': {'lineitem': [['l_shipdate', 241]],
  'orders': [['o_orderdate', 60]],
  'customer': [('c_phone', 6)]},
 'l_shipdate-241_o_orderdate-60_c_mktsegment-1_c_phone-6': {'lineitem': [['l_shipdate',
    241]],
  'orders': [['o_orderdate', 60]],
  'customer': [('c_mktsegment', 1), ('c_phone', 6)]},
 'l_shipdate-241_o_orderdate-16_o_orderstatus-4_o_comment-1': {'lineitem': [['l_shipdate',
    241]],
  'orders': [('o_orderdate', 16), ('o_orderstatus', 4), ('o_comment', 1)]},
 'l_shipdate-241_o_orderdate-60_o_comment-1': {'lineitem': [['l_shipdate',
    241]],
  'orders': [('o_orderdate', 60), ('o_comment', 1)]},
 'l_shipdate-241_o_comment-11_o_orderdate-6_o_comment-1': {'lineitem': [['

In [13]:
scans['time_per_ir'].max()

9219.8

Outdated code fragments (older model versions) are kept below.

In [14]:
# Column whose values are summed up per (table, column) below.
GAIN_COLUMN = 'runtime_gain'

# Total gain of all scans on the same column of the same table.
scans_groupby_columnname = scans.groupby(['TABLE_NAME', 'COLUMN_NAME'])
sum_of_gains = pd.DataFrame(scans_groupby_columnname[GAIN_COLUMN].sum())
# Displayed per table, largest gain first; the sorted frame is only shown, not stored.
sum_of_gains.sort_values(by=['TABLE_NAME', GAIN_COLUMN], ascending=[True, False])

Unnamed: 0_level_0,Unnamed: 1_level_0,runtime_gain
TABLE_NAME,COLUMN_NAME,Unnamed: 2_level_1
customer,c_phone,77516210.0
customer,c_mktsegment,3304774.0
lineitem,l_shipdate,399299700.0
lineitem,l_receiptdate,139880100.0
lineitem,l_shipmode,85379750.0
lineitem,l_discount,51147480.0
lineitem,l_quantity,15444470.0
nation,n_name,-570420000.0
orders,o_orderdate,132815900.0
orders,o_orderstatus,25226290.0


In [15]:
assert_correct_statistics_loaded()

# Table inspected by the legacy sorting model below; every benchmark other than TPCH
# falls back to "customer_demographics".
TABLE = "lineitem" if BENCHMARK == "TPCH" else "customer_demographics"

import itertools

def extract_single_table(table_name):
    """Return the rows of the global `scans` frame that belong to `table_name`.

    NOTE(review): this legacy helper rebinds the name `extract_single_table`, which an
    earlier cell defines with the signature (table_scans, table_name) and which
    create_benchmark_configs() calls with two arguments. Re-running that function
    after this cell fails until the earlier cell is executed again.
    """
    belongs_to_table = scans['TABLE_NAME'] == table_name
    return scans.loc[belongs_to_table]

def extract_interesting_columns(df):
    """Return the distinct COLUMN_NAME values of `df` as a list, in order of first appearance."""
    column_names = df['COLUMN_NAME']
    return [column_name for column_name in column_names.unique()]


# Column -> columns treated as correlated with it by table_sorting_options().
# Flat (not keyed by table) and, unlike get_correlations(), it also lists 'l_commitdate' as a key.
correlations = {
    'l_shipdate': ['l_receiptdate', 'l_commitdate'],
    'l_receiptdate': ['l_shipdate', 'l_commitdate'],
    'l_commitdate': ['l_receiptdate', 'l_shipdate']
}
#correlations = {}
def table_sorting_options(table_name):
    """Estimate the total scan runtime of a table for every (pruning, sorted) column pair.

    Returns a DataFrame with the columns 'columns' (the pair as a tuple) and 'time'
    (the summed estimated runtime of all scans on the table).
    """
    single_table = extract_single_table(table_name)
    interesting_cols = extract_interesting_columns(single_table)

    def estimate_scan_runtime(row, pruning_col, sorted_col):
        col_name = row['COLUMN_NAME']

        if col_name == pruning_col:
            # Scan on the pruning column: logarithmic if that column is also the sorted one.
            if pruning_col == sorted_col:
                return row['optimal_log_runtime']
            return row['optimal_runtime']

        if col_name == sorted_col:
            # Only reachable when pruning and sorted column differ.
            # TODO: should this be affected by correlation?
            # we will get less chunks, so a linear scan should be close to optimal_runtime,
            # but log time should beat it anyway
            return row['log_runtime']

        if col_name in correlations.get(pruning_col, []):
            # correlated to pruning column -> a lot of pruning, no sortedness
            # TODO: better measure correlation
            return 1.2 * row['optimal_runtime']

        # Neither clustered nor correlated: the measured runtime stays as it is.
        return row['RUNTIME_NS']

    total_times = []
    for pruning_col, sorted_col in itertools.product(interesting_cols, interesting_cols):
        effective_runtime = single_table.apply(estimate_scan_runtime, axis=1, args=(pruning_col, sorted_col))
        total_times.append([(pruning_col, sorted_col), effective_runtime.sum()])

    return pd.DataFrame(total_times, columns=['columns', 'time'])

# Rank all (pruning column, sorted column) pairs of TABLE, cheapest estimated total runtime first.
options = table_sorting_options(TABLE)
options.sort_values(by=['time'], ascending=True)

Unnamed: 0,columns,time
20,"(l_receiptdate, l_shipdate)",229063100.0
0,"(l_shipdate, l_shipdate)",234160800.0
15,"(l_shipmode, l_shipdate)",283563500.0
10,"(l_discount, l_shipdate)",317795800.0
5,"(l_quantity, l_shipdate)",353498800.0
3,"(l_shipdate, l_shipmode)",549887200.0
2,"(l_shipdate, l_discount)",586393400.0
1,"(l_shipdate, l_quantity)",616348500.0
4,"(l_shipdate, l_receiptdate)",621480100.0
23,"(l_receiptdate, l_shipmode)",628370700.0


In [16]:
# Unlike the other statistics files, aggregates.csv is read with ',' as separator.
aggregates = pd.read_csv(f"{STATISTICS_PATH}/aggregates.csv", sep=',')

# it looks like column names are mixed up.
# COLUMN_NAME -> actually GROUP_BY_COLUMN_COUNT
# GROUP_BY_COLUMN_COUNT -> actually AGGREGATE_COLUMN_COUNT
# AGGREGATE_COLUMN_COUNT -> actually COLUMN_NAME

# The constants below compensate for that mix-up: each one names the CSV header under
# which the intended value is presumably stored - TODO confirm against the exporter.
COL_NAME = 'AGGREGATE_COLUMN_COUNT'
GROUPBY_COL = 'COLUMN_NAME'
AGG_COL = 'GROUP_BY_COLUMN_COUNT'

# All aggregates have to read the entire table, so we cannot skip chunks.
# But getting all groups consecutive could provide a speedup
# As a result, we care only about aggregates with group by columns

interesting_aggregates = aggregates[aggregates[GROUPBY_COL] > 0]
stats = interesting_aggregates.groupby(['TABLE_NAME', COL_NAME])
out_columns = pd.DataFrame(stats['OUTPUT_ROWS'].max())
# NOTE(review): the sorted frame is neither assigned nor the last expression of the cell,
# so this line has no visible effect; only the DATA-column filter below is displayed.
out_columns.sort_values(by=['TABLE_NAME', 'OUTPUT_ROWS'], ascending=[True, False])
aggregates[aggregates['COLUMN_TYPE'] == 'DATA']

Unnamed: 0,QUERY_HASH,AGGREGATE_HASH,COLUMN_TYPE,TABLE_NAME,COLUMN_NAME,GROUP_BY_COLUMN_COUNT,AGGREGATE_COLUMN_COUNT,INPUT_ROWS,OUTPUT_ROWS,RUNTIME_NS,DESCRIPTION
