# TuK1 - Exercise: Physical Optimization
## Compression Selection

In [1]:
import enum
import pickle
import numpy as np
import pandas as pd

@enum.unique
class EncodingType(enum.Enum):
    """Segment encodings for which runtime and size measurements exist.

    Each member's value is used as the index along the last axis of
    `deserialized_runtimes` and `deserialized_sizes`, i.e.
    `deserialized_sizes[table_id, column_id, EncodingType.X.value]`.

    `enum.unique` makes the class definition fail if two members ever share
    a value. Without it the second member would silently become an alias of
    the first and be skipped when iterating over `EncodingType`.
    """

    # Dictionary-based encodings.
    # NOTE(review): the FSBA / SIMDBP128 suffixes presumably name the
    # compression of the attribute vector; confirm against the data source.
    DictionaryFSBA = 0
    DictionarySIMDBP128 = 1
    # Frame-of-reference encodings.
    FrameOfReferenceFSBA = 2
    FrameOfReferenceSIMDBP128 = 3
    # Fixed-string dictionary encodings.
    FixedStringDictionaryFSBA = 4
    FixedStringDictionarySIMDBP128 = 5
    # No encoding applied.
    Unencoded = 6
    RunLength = 7
    LZ4SIMDBP128 = 8

In [2]:
# NOTE(security): pickle.load can execute arbitrary code embedded in the file.
# Only run this cell on pickle files that come from a trusted source.
#
# The file handles are deliberately not called `input`: a top-level
# `with ... as input` would rebind the builtin input() to a closed file
# object for the rest of the notebook session.
with open('runtimes.pickle', 'rb') as runtimes_file:
    deserialized_runtimes = pickle.load(runtimes_file)

with open('sizes.pickle', 'rb') as sizes_file:
    deserialized_sizes = pickle.load(sizes_file)

# Both structures are three-dimensional numpy arrays, indexed as
# [table_id, column_id, encoding value].
# To obtain the size of table_id 5, column 1, and LZ4 encoding (value 8 in
# EncodingType), use deserialized_sizes[5, 1, 8].
assert np.shape(deserialized_runtimes) == np.shape(deserialized_sizes), \
    'runtimes and sizes must have identical shapes'

In [3]:
# Load the per-table metadata and display it as the cell result.
# NOTE(review): the rendered output contains an 'Unnamed: 0' column, which
# suggests the CSV carries a serialized index. Confirm before relying on
# positional column access.
tables = pd.read_csv(filepath_or_buffer='table_meta_data.csv')
tables

Unnamed: 0,TABLE_ID,TABLE_NAME,ROW_COUNT,MAX_CHUNK_SIZE
0,0,customer,150000,100000
1,1,lineitem,6001215,100000
2,2,nation,25,100000
3,3,orders,1500000,100000
4,4,part,200000,100000
5,5,partsupp,800000,100000
6,6,region,5,100000
7,7,supplier,10000,100000


In [4]:
# Load the per-column metadata and display it as the cell result.
# NOTE(review): the rendered output contains an 'Unnamed: 0' column, which
# suggests the CSV carries a serialized index. Confirm before relying on
# positional column access.
attributes = pd.read_csv(filepath_or_buffer='attribute_meta_data.csv')
attributes

Unnamed: 0,ATTRIBUTE_ID,TABLE_NAME,COLUMN_NAME,DATA_TYPE,DISTINCT_VALUE_COUNT,IS_NULLABLE
0,0_0,customer,c_custkey,int,150000,False
1,0_1,customer,c_name,string,150000,False
2,0_2,customer,c_address,string,150000,False
3,0_3,customer,c_nationkey,int,25,False
4,0_4,customer,c_phone,string,150000,False
...,...,...,...,...,...,...
56,7_2,supplier,s_address,string,10000,False
57,7_3,supplier,s_nationkey,int,25,False
58,7_4,supplier,s_phone,string,10000,False
59,7_5,supplier,s_acctbal,float,9955,False


### Let's take a look at the options for the `l_shipdate` column of the `lineitem` table

In [5]:
# Resolve the array indices for lineitem.l_shipdate.
# The table index comes from the table metadata. The column index is the
# part after the underscore in ATTRIBUTE_ID (values look like '0_3').
lineitem_id = tables[tables['TABLE_NAME'] == 'lineitem'].iloc[0]['TABLE_ID']
shipdate = attributes[attributes['COLUMN_NAME'] == 'l_shipdate'].iloc[0]['ATTRIBUTE_ID']
shipdate_id = int(str(shipdate).split('_')[1])
print(shipdate_id)

# Report runtime and size of every encoding for that column.
for encoding in EncodingType:
    runtime = deserialized_runtimes[lineitem_id, shipdate_id, encoding.value]
    size = deserialized_sizes[lineitem_id, shipdate_id, encoding.value]

    # A size equal to the largest float64 is the sentinel for "no measurement".
    # Written as `not <` (rather than `>=`) so a NaN size is treated the same
    # way as by the plain `<` comparison.
    if not size < np.finfo(np.float64).max:
        print(f'\tLeft out {encoding} as it apparently not supporting the data type of the requested column.')
        print('\tThis is marked (admittedly not very nice) with 0.0 runtimes and a size of np.finfo(np.float64).max bytes')
        continue

    print(f'Encoding {encoding: <44} >> runtime: {runtime: >15,.2f}', end='')
    print(f'\tsize: {size: >15,.2f}')

10
Encoding EncodingType.DictionaryFSBA                  >> runtime:        1,335.04	size:      146,729.13
Encoding EncodingType.DictionarySIMDBP128             >> runtime:       24,452.76	size:      116,592.49
	Left out EncodingType.FrameOfReferenceFSBA as it apparently not supporting the data type of the requested column.
	This is marked (admittedly not very nice) with 0.0 runtimes and a size of np.finfo(np.float64).max bytes
	Left out EncodingType.FrameOfReferenceSIMDBP128 as it apparently not supporting the data type of the requested column.
	This is marked (admittedly not very nice) with 0.0 runtimes and a size of np.finfo(np.float64).max bytes
Encoding EncodingType.FixedStringDictionaryFSBA       >> runtime:          349.11	size:      138,721.16
Encoding EncodingType.FixedStringDictionarySIMDBP128  >> runtime:       15,711.15	size:      108,546.12
Encoding EncodingType.Unencoded                       >> runtime:        4,604.35	size:    3,200,168.00
Encoding EncodingType.RunLengt

In [6]:
# Display the raw runtimes array; the numpy repr elides most entries with '...'.
deserialized_runtimes


array([[[2.83876726e+02, 7.36556745e+02, 6.38298148e+02, ...,
         4.25807111e+02, 4.36194302e+02, 3.61068933e+03],
        [3.10553082e+01, 1.89906038e+02, 0.00000000e+00, ...,
         1.54406480e+04, 1.69843212e+04, 2.93114848e+06],
        [4.14598925e+02, 2.78730004e+03, 0.00000000e+00, ...,
         2.39168497e+05, 2.63137761e+05, 4.56215356e+07],
        ...,
        [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
        [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
        [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00]],

       [[4.54822218e+02, 1.22094423e+03, 1.00611359e+03, ...,
         6.85267221e+02, 8.38319493e+02, 7.69692132e+03],
        [1.14385957e+02, 3.15203577e+02, 1.93101313e+02, ...,
         1.62516111e+02, 1.10415956e+03, 1.61598835e+04],
        [8.60730261e+02, 