In [5]:
#STATISTICS_PATH = 'TPC-H__SF_1.000000__RUNS_10'
STATISTICS_PATH = 'TPC-H__SF_1.000000__RUNS_10_originalscans'
#STATISTICS_PATH = 'TPC-H__SF_1.000000__RUNS_10_unsort'    # this one is actually unsorted
#STATISTICS_PATH = 'TPC-H__SF_1.000000__RUNS_10_unsorted' # this one is actually default-sorted
#STATISTICS_PATH = 'TPC-DS__SF_1.000000__RUNS_1'

CHUNK_SIZE = 100000

import pandas as pd
import numpy as np

In [6]:
scans = pd.read_csv(f"{STATISTICS_PATH}/table_scans.csv", sep='|')

# If there is only one chunk, we cannot really prune
scans = scans[scans['INPUT_ROWS'] > CHUNK_SIZE]


def round_up_to_chunksize(row):
    if row['OUTPUT_ROWS'] % CHUNK_SIZE == 0:
        return row['OUTPUT_ROWS']
    else:
        return row['OUTPUT_ROWS'] + (CHUNK_SIZE - (row['OUTPUT_ROWS'] % CHUNK_SIZE))

scans['pruned_minimum_input_rows'] = scans.apply(round_up_to_chunksize, axis=1)


# do we care about reference segments, or data only?


# how much do we gain if we prune as much as possible? (per column, 1d)
scans['gain'] = scans['INPUT_ROWS'] - scans['pruned_minimum_input_rows']

scans['selectivity'] = scans['OUTPUT_ROWS'] / scans['INPUT_ROWS']
scans['actual_selectivity'] = scans['SINGLE_OUTPUT_ROWS'] / scans['SINGLE_INPUT_ROWS']

scans['time_per_ir'] = scans['INPUT_ROWS'] / scans['RUNTIME_NS']
scans['time_per_or'] = scans['OUTPUT_ROWS'] / scans['RUNTIME_NS']

# optimal runtime assuming perfect pruning, but not sortedness
scans['optimal_runtime'] = scans['time_per_ir'] * scans['pruned_minimum_input_rows']
scans['runtime_gain'] = scans['RUNTIME_NS'] - scans['optimal_runtime']


# log runtime for sorted columns
scans['log_runtime'] = np.log2(scans['RUNTIME_NS'])
scans['optimal_log_runtime'] = np.log2(1+scans['optimal_runtime'])

scans

Unnamed: 0,QUERY_HASH,COLUMN_TYPE,TABLE_NAME,COLUMN_NAME,INPUT_ROWS,OUTPUT_ROWS,RUNTIME_NS,DESCRIPTION,SINGLE_INPUT_ROWS,SINGLE_OUTPUT_ROWS,pruned_minimum_input_rows,gain,selectivity,actual_selectivity,time_per_ir,time_per_or,optimal_runtime,runtime_gain,log_runtime,optimal_log_runtime
7,6730c267d3eac48a,DATA,orders,o_orderstatus,1500000,729413,8571255,TableScan Impl: ColumnVsValue o_orderstatus = 'F',1500000,729413,800000,700000,0.486275,0.486275,0.175004,0.085100,140002.835057,8.431252e+06,23.031075,17.095107
20,6ec3126b032024be,DATA,orders,o_orderstatus,1500000,729413,8631811,TableScan Impl: ColumnVsValue o_orderstatus = 'F',1500000,729413,800000,700000,0.486275,0.486275,0.173776,0.084503,139020.652792,8.492790e+06,23.041232,17.084950
33,7324393c05ab5301,DATA,orders,o_orderstatus,1500000,729413,8730991,TableScan Impl: ColumnVsValue o_orderstatus = 'F',1500000,729413,800000,700000,0.486275,0.486275,0.171802,0.083543,137441.442787,8.593550e+06,23.057714,17.068468
46,37e2ba0a1c4e865f,DATA,orders,o_orderstatus,1500000,729413,8556796,TableScan Impl: ColumnVsValue o_orderstatus = 'F',1500000,729413,800000,700000,0.486275,0.486275,0.175299,0.085244,140239.407367,8.416557e+06,23.028639,17.097543
59,a17cb368eadced8f,DATA,orders,o_orderstatus,1500000,729413,9111721,TableScan Impl: ColumnVsValue o_orderstatus = 'F',1500000,729413,800000,700000,0.486275,0.486275,0.164623,0.080052,131698.501304,8.980022e+06,23.119292,17.006890
72,bfb403aee0d212a,DATA,orders,o_orderstatus,1500000,729413,8816203,TableScan Impl: ColumnVsValue o_orderstatus = 'F',1500000,729413,800000,700000,0.486275,0.486275,0.170141,0.082736,136113.018269,8.680090e+06,23.071726,17.054456
85,98aa70b345defa5b,DATA,part,p_name,200000,2233,439679,TableScan Impl: ColumnBetween p_name BETWEEN U...,200000,2233,100000,100000,0.011165,0.011165,0.454877,0.005079,45487.730822,3.941913e+05,18.746091,15.473222
86,98aa70b345defa5b,DATA,part,p_name,200000,2233,439679,TableScan Impl: ColumnBetween p_name BETWEEN U...,200000,2233,100000,100000,0.011165,0.011165,0.454877,0.005079,45487.730822,3.941913e+05,18.746091,15.473222
87,98aa70b345defa5b,DATA,part,p_name,200000,2233,439679,TableScan Impl: ColumnBetween p_name BETWEEN U...,200000,2233,100000,100000,0.011165,0.011165,0.454877,0.005079,45487.730822,3.941913e+05,18.746091,15.473222
89,98aa70b345defa5b,DATA,part,p_name,200000,2233,439679,TableScan Impl: ColumnBetween p_name BETWEEN U...,200000,2233,100000,100000,0.011165,0.011165,0.454877,0.005079,45487.730822,3.941913e+05,18.746091,15.473222


In [7]:
GAIN_COLUMN = 'runtime_gain'

scans_groupby_columnname = scans.groupby(['TABLE_NAME', 'COLUMN_NAME'])
sum_of_gains = pd.DataFrame(scans_groupby_columnname[GAIN_COLUMN].sum())
sum_of_gains.sort_values(by=['TABLE_NAME', GAIN_COLUMN], ascending=[True, False])

Unnamed: 0_level_0,Unnamed: 1_level_0,runtime_gain
TABLE_NAME,COLUMN_NAME,Unnamed: 2_level_1
customer,c_mktsegment,9565916.0
lineitem,l_shipdate,1029285000.0
lineitem,l_receiptdate,609466800.0
lineitem,l_shipmode,226148500.0
lineitem,l_discount,72679630.0
lineitem,l_returnflag,67217460.0
lineitem,l_quantity,35944440.0
orders,o_comment,1610368000.0
orders,o_orderdate,358589400.0
orders,o_orderstatus,51594260.0


In [9]:
TABLE = "lineitem"
#TABLE = "customer_demographics"

import itertools

def extract_single_table(table_name):
    return scans[scans['TABLE_NAME'] == table_name]

def extract_interesting_columns(df):
    return list(df['COLUMN_NAME'].unique())


correlations = {
    'l_shipdate': ['l_receiptdate', 'l_commitdate'],
    'l_receiptdate': ['l_shipdate', 'l_commitdate'],
    'l_commitdate': ['l_receiptdate', 'l_shipdate']
}
#correlations = {}
def table_sorting_options(table_name):
    single_table = extract_single_table(table_name)
    interesting_cols = extract_interesting_columns(single_table)
    pairs = itertools.product(interesting_cols, interesting_cols)
    
    total_times = []
    for pair in pairs:
        pruning_col = pair[0]
        sorted_col = pair[1]

        def compute_runtime(row):
            col_name = row['COLUMN_NAME']
            if pruning_col == sorted_col:
                if col_name == pruning_col:
                    return row['optimal_log_runtime']
                else:
                    if col_name in correlations.get(pruning_col, []):
                        # correlated to pruning column -> a lot of pruning, no sortedness
                        # TODO: better measure correlation
                        return 1.2 * row['optimal_runtime']
                    else:
                        return row['RUNTIME_NS']

            else:
                if col_name == pruning_col:
                    return row['optimal_runtime']
                elif col_name == sorted_col:
                    # TODO: should this be affected by correlation?
                    # we will get less chunks, so a linear scan should be close to optimal_runtime,
                    # but log time should beat it anyway
                    return row['log_runtime']
                else:
                    if col_name in correlations.get(pruning_col, []):
                        # correlated to pruning column -> a lot of pruning, no sortedness
                        # TODO: better measure correlation
                        return 1.2 * row['optimal_runtime']
                    else:
                        return row['RUNTIME_NS']

        effective_runtime = single_table.apply(compute_runtime, axis=1)
        total_times.append([pair, effective_runtime.sum()])    
    total_times = pd.DataFrame(total_times, columns=['columns', 'time'])    
    return total_times

options = table_sorting_options(TABLE)
options.sort_values(by=['time'], ascending=True)

Unnamed: 0,columns,time
10,"(l_shipdate, l_shipmode)",211503800.0
34,"(l_receiptdate, l_shipmode)",211792400.0
9,"(l_shipdate, l_discount)",365976600.0
33,"(l_receiptdate, l_discount)",366265100.0
6,"(l_shipdate, l_returnflag)",371675900.0
30,"(l_receiptdate, l_returnflag)",371964400.0
8,"(l_shipdate, l_quantity)",402926000.0
32,"(l_receiptdate, l_quantity)",403214500.0
31,"(l_receiptdate, l_shipdate)",419055700.0
11,"(l_shipdate, l_receiptdate)",420497800.0


In [3]:
data = scans[scans['COLUMN_TYPE'] == 'DATA']
data = data[data['INPUT_ROWS'] > 1500000]
data

# this shows whether we are logging pruned inputs

Unnamed: 0,QUERY_HASH,COLUMN_TYPE,TABLE_NAME,COLUMN_NAME,INPUT_ROWS,OUTPUT_ROWS,RUNTIME_NS,DESCRIPTION,SINGLE_INPUT_ROWS,SINGLE_OUTPUT_ROWS,pruned_minimum_input_rows,gain,selectivity,time_per_ir,time_per_or,optimal_runtime,runtime_gain,log_runtime,optimal_log_runtime
144,9762c3a887e47469,DATA,lineitem,l_shipdate,6001215,77313,10652028,TableScan Impl: ColumnBetween l_shipdate BETWE...,6001215,77313,100000,5901215,0.012883,0.563387,0.007258,5.633871e+04,1.059569e+07,23.344625,15.781864
450,716441263d3331d0,DATA,lineitem,l_shipdate,6001215,77905,9864657,TableScan Impl: ColumnBetween l_shipdate BETWE...,6001215,77905,100000,5901215,0.012982,0.608355,0.007897,6.083552e+04,9.803821e+06,23.233837,15.892650
671,6cf267cd7600d268,DATA,lineitem,l_shipdate,6001215,913487,16348712,TableScan Impl: ColumnBetween l_shipdate BETWE...,6001215,913487,1000000,5001215,0.152217,0.367076,0.055875,3.670757e+05,1.598164e+07,23.962674,18.485722
731,45337615a2f78d4e,DATA,lineitem,l_shipdate,6001215,908721,15531517,TableScan Impl: ColumnBetween l_shipdate BETWE...,6001215,908721,1000000,5001215,0.151423,0.386389,0.058508,3.863895e+05,1.514513e+07,23.888695,18.559700
734,7994976cdc25c131,DATA,lineitem,l_shipdate,6001215,909455,15227199,TableScan Impl: ColumnBetween l_shipdate BETWE...,6001215,909455,1000000,5001215,0.151545,0.394112,0.059726,3.941115e+05,1.483309e+07,23.860147,18.588248
792,497648c7e308ba08,DATA,lineitem,l_shipdate,6001215,914963,15771996,TableScan Impl: ColumnBetween l_shipdate BETWE...,6001215,914963,1000000,5001215,0.152463,0.380498,0.058012,3.804981e+05,1.539150e+07,23.910862,18.537534
797,b4a9d8860da090d4,DATA,lineitem,l_receiptdate,6001215,913018,15743150,TableScan Impl: ColumnBetween l_receiptdate BE...,6001215,913018,1000000,5001215,0.152139,0.381195,0.057995,3.811953e+05,1.536195e+07,23.908221,18.540175
799,b4a9d8860da090d4,DATA,lineitem,l_receiptdate,6001215,913018,15743150,TableScan Impl: ColumnBetween l_receiptdate BE...,6001215,913018,1000000,5001215,0.152139,0.381195,0.057995,3.811953e+05,1.536195e+07,23.908221,18.540175
801,b4a9d8860da090d4,DATA,lineitem,l_receiptdate,6001215,913018,15743150,TableScan Impl: ColumnBetween l_receiptdate BE...,6001215,913018,1000000,5001215,0.152139,0.381195,0.057995,3.811953e+05,1.536195e+07,23.908221,18.540175
803,b4a9d8860da090d4,DATA,lineitem,l_receiptdate,6001215,913018,15743150,TableScan Impl: ColumnBetween l_receiptdate BE...,6001215,913018,1000000,5001215,0.152139,0.381195,0.057995,3.811953e+05,1.536195e+07,23.908221,18.540175


In [None]:
aggregates = pd.read_csv(f"{STATISTICS_PATH}/aggregates.csv", sep=',')

# it looks like column names are mixed up.
# COLUMN_NAME -> actually GROUP_BY_COLUMN_COUNT
# GROUP_BY_COLUMN_COUNT -> actually AGGREGATE_COLUMN_COUNT
# AGGREGATE_COLUMN_COUNT -> actually COLUMN_NAME

COL_NAME = 'AGGREGATE_COLUMN_COUNT'
GROUPBY_COL = 'COLUMN_NAME'
AGG_COL = 'GROUP_BY_COLUMN_COUNT'

# All aggregates have to read the entire table, so we cannot skip chunks.
# But getting all groups consecutive could provide a speedup
# As a result, we care only about aggregates with group by columns

interesting_aggregates = aggregates[aggregates[GROUPBY_COL] > 0]
stats = interesting_aggregates.groupby(['TABLE_NAME', COL_NAME])
out_columns = pd.DataFrame(stats['OUTPUT_ROWS'].max())
out_columns.sort_values(by=['TABLE_NAME', 'OUTPUT_ROWS'], ascending=[True, False])
aggregates[aggregates['COLUMN_TYPE'] == 'DATA']