# TuK1 - Exercise: Physical Optimization
## Compression Selection

In [7]:
import enum
import pickle
import numpy as np
import pandas as pd

class EncodingType(enum.Enum):
    """Encoding options that can be selected for a column.

    Each member's integer value doubles as the index into the third axis of
    the runtime and size arrays used in this notebook, e.g.
    ``deserialized_sizes[table_id, column_id, EncodingType.LZ4SIMDBP128.value]``.
    """
    DictionaryFSBA = 0
    DictionarySIMDBP128 = 1
    FrameOfReferenceFSBA = 2
    FrameOfReferenceSIMDBP128 = 3
    FixedStringDictionaryFSBA = 4
    FixedStringDictionarySIMDBP128 = 5
    Unencoded = 6
    RunLength = 7
    LZ4SIMDBP128 = 8

In [8]:
# NOTE: pickle.load can execute arbitrary code during deserialization; only
# load these files when they come from a trusted source.
# The file handles are deliberately not called `input`, which would shadow the
# builtin of the same name for every later cell of the notebook.
with open('runtimes.pickle', 'rb') as runtimes_file:
    deserialized_runtimes = pickle.load(runtimes_file)

with open('sizes.pickle', 'rb') as sizes_file:
    deserialized_sizes = pickle.load(sizes_file)

# Both structures are three-dimensional numpy arrays, indexed as
# [table_id, column_id, encoding_value].
# To obtain the size of table_id 5, column 1, and LZ4 Encoding (see above) use deserialized_sizes[5, 1, 8]
assert np.shape(deserialized_runtimes) == np.shape(deserialized_sizes), \
    'runtimes and sizes must have the same shape'

In [9]:
# Per-table metadata; later cells use it to map a TABLE_NAME to its TABLE_ID.
tables = pd.read_csv('table_meta_data.csv')
tables

Unnamed: 0,TABLE_ID,TABLE_NAME,ROW_COUNT,MAX_CHUNK_SIZE
0,0,customer,150000,100000
1,1,lineitem,6001215,100000
2,2,nation,25,100000
3,3,orders,1500000,100000
4,4,part,200000,100000
5,5,partsupp,800000,100000
6,6,region,5,100000
7,7,supplier,10000,100000


In [10]:
# Per-column metadata. ATTRIBUTE_ID appears to have the form
# "<table_id>_<column_id>" (see the output below); later cells parse the
# column index out of it.
attributes = pd.read_csv('attribute_meta_data.csv')
attributes

Unnamed: 0,ATTRIBUTE_ID,TABLE_NAME,COLUMN_NAME,DATA_TYPE,DISTINCT_VALUE_COUNT,IS_NULLABLE
0,0_0,customer,c_custkey,int,150000,False
1,0_1,customer,c_name,string,150000,False
2,0_2,customer,c_address,string,150000,False
3,0_3,customer,c_nationkey,int,25,False
4,0_4,customer,c_phone,string,150000,False
...,...,...,...,...,...,...
56,7_2,supplier,s_address,string,10000,False
57,7_3,supplier,s_nationkey,int,25,False
58,7_4,supplier,s_phone,string,10000,False
59,7_5,supplier,s_acctbal,float,9955,False


### Let's take a look at the options for the `l_shipdate` column of the `lineitem` table

In [13]:
# Resolve the numeric id of the `lineitem` table from the table metadata.
lineitem_id = tables.query('TABLE_NAME == "lineitem"').iloc[0]['TABLE_ID']
# Restrict the lookup to `lineitem` so that an equally named column of another
# table can never be picked up by accident.
shipdate = attributes.query(
    'TABLE_NAME == "lineitem" and COLUMN_NAME == "l_shipdate"'
).iloc[0]['ATTRIBUTE_ID']
# The part of ATTRIBUTE_ID after the underscore is used as the column index.
shipdate_id = int(str(shipdate).split('_')[1])
print(shipdate_id)

# Sizes equal to the largest float64 mark encodings that are not available for
# this column; computed once instead of on every loop iteration.
unsupported_size = np.finfo(np.float64).max
for encoding in EncodingType:
    # The enum value selects the entry along the third (encoding) axis.
    runtime = deserialized_runtimes[lineitem_id, shipdate_id, encoding.value]
    size = deserialized_sizes[lineitem_id, shipdate_id, encoding.value]
    if size < unsupported_size:
        print(f'Encoding {encoding: <44} >> runtime: {runtime: >15,.2f}', end='')
        print(f'\tsize: {size: >15,.2f}')
    else:
        print(f'\tLeft out {encoding} as it apparently not supporting the data type of the requested column.')
        print('\tThis is marked (admittedly not very nice) with 0.0 runtimes and a size of np.finfo(np.float64).max bytes')