In [1]:
# Benchmark suite and table whose CSV export is analysed in this notebook.
BENCHMARK = 'tpch'
TABLE_NAME = 'lineitem'

# The CSV export lives under exported_tables/<benchmark>/; its column metadata
# is stored right next to it, under the same file name plus a ".json" suffix.
TABLE_PATH = 'exported_tables/{}/{}.csv'.format(BENCHMARK, TABLE_NAME)
METADATA_PATH = TABLE_PATH + '.json'

import pandas as pd
import numpy as np
import json

In [2]:
# Load the column descriptions stored next to the CSV export. The code below
# reads a 'name' and a 'type' entry from every item of the 'columns' list.
with open(METADATA_PATH, 'r') as f:
    columns = json.load(f)['columns']

column_names = [x['name'] for x in columns]
# Every column whose name contains "date" is later passed to the CSV reader
# as a column to parse as dates.
date_columns = [name for name in column_names if "date" in name]

# Metadata type name -> dtype used when reading the CSV.
# The builtin `object` is used instead of the `np.object` alias: the alias was
# deprecated in NumPy 1.20 and removed in NumPy 1.24, where accessing it raises
# AttributeError. Both refer to the same type, so the dtype is unchanged.
type_matchings = {
    "int": np.int32,
    "float": np.float32,
    "string": object
}

# Fail right here, with a clear message, when the metadata contains a type we
# have no mapping for -- rather than storing an "Unknown type: ..." placeholder
# string (which is not a real dtype) in the dtype mapping.
unknown_types = sorted({x['type'] for x in columns if x['type'] not in type_matchings})
if unknown_types:
    raise ValueError(f"Unknown type: {unknown_types}")

dtypes = {x['name']: type_matchings[x['type']] for x in columns}

In [3]:
# Read the CSV export. The file is read without a header row (header=None);
# column names and dtypes are the ones derived from the metadata, and the
# date columns are parsed while loading.
table = pd.read_csv(
    TABLE_PATH,
    sep=',',
    header=None,
    names=column_names,
    dtype=dtypes,
    parse_dates=date_columns,
)

In [4]:
# Pairwise differences between the three lineitem date columns, stored as
# new columns on `table` (the rendered output shows them as day counts).
ship_dates = table['l_shipdate']
commit_dates = table['l_commitdate']
receipt_dates = table['l_receiptdate']

# receipt - ship
table['diff_receipt_ship'] = receipt_dates - ship_dates
# commit - receipt
table['diff_commit_receipt'] = commit_dates - receipt_dates
# commit - ship
table['diff_commit_ship'] = commit_dates - ship_dates

# Display the frame including the new columns.
table

Unnamed: 0,l_orderkey,l_partkey,l_suppkey,l_linenumber,l_quantity,l_extendedprice,l_discount,l_tax,l_returnflag,l_linestatus,l_shipdate,l_commitdate,l_receiptdate,l_shipinstruct,l_shipmode,l_comment,diff_receipt_ship,diff_commit_receipt,diff_commit_ship
0,721220,177803,5355,2,19.0,35735.199219,0.08,0.03,R,F,1992-01-02,1992-02-04,1992-01-09,TAKE BACK RETURN,SHIP,. slyly even accounts,7 days,26 days,33 days
1,842980,188156,5711,4,5.0,6220.750000,0.01,0.03,A,F,1992-01-02,1992-03-20,1992-01-20,COLLECT COD,REG AIR,lly regular asymptotes. unu,18 days,60 days,78 days
2,904677,56678,1689,1,43.0,70290.796875,0.08,0.01,R,F,1992-01-02,1992-03-22,1992-01-14,COLLECT COD,AIR,fix. quickly ironic instruct,12 days,68 days,80 days
3,990147,154290,4291,1,6.0,8065.740234,0.10,0.01,R,F,1992-01-02,1992-03-01,1992-01-15,NONE,REG AIR,lyly according to the caref,13 days,46 days,59 days
4,1054181,16217,6218,1,45.0,50994.398438,0.03,0.08,R,F,1992-01-02,1992-02-05,1992-01-15,NONE,MAIL,y unusual instructions. furiously reg,13 days,21 days,34 days
5,1111877,134177,1717,3,20.0,24223.400391,0.10,0.07,A,F,1992-01-02,1992-02-28,1992-01-07,TAKE BACK RETURN,FOB,re. ideas wake,5 days,52 days,57 days
6,1332613,53982,1498,1,14.0,27103.699219,0.08,0.07,A,F,1992-01-02,1992-02-11,1992-01-18,TAKE BACK RETURN,TRUCK,y against the furiously regular,16 days,24 days,40 days
7,1552449,159307,1823,2,28.0,38256.398438,0.08,0.06,R,F,1992-01-02,1992-03-14,1992-01-05,TAKE BACK RETURN,SHIP,hely final excuses. bold accounts wake. q,3 days,69 days,72 days
8,2167527,17849,7850,3,39.0,68906.796875,0.00,0.06,R,F,1992-01-02,1992-02-18,1992-01-11,TAKE BACK RETURN,AIR,foxes sleep blithely along the idle exc,9 days,38 days,47 days
9,2184032,139420,4447,5,14.0,20431.900391,0.06,0.02,A,F,1992-01-02,1992-02-25,1992-01-15,DELIVER IN PERSON,RAIL,even ideas breach slyly above the d,13 days,41 days,54 days


In [10]:
# Overall span (latest minus earliest value) of every date column.
# NOTE: max - min leaves out one endpoint; the number of distinct days in the
# inclusive range would be this span + 1 day. The same holds for the two
# *_diff_range values below. The spans are intentionally left as max - min.
ranges = {}
for column_name in date_columns:
    date_values = table[column_name]
    ranges[column_name] = date_values.max() - date_values.min()

# How far l_receiptdate lies from l_shipdate across all rows.
receipt_gaps = table['diff_receipt_ship']
min_receipt_value, max_receipt_value = receipt_gaps.min(), receipt_gaps.max()
receipt_ship_diff_range = max_receipt_value - min_receipt_value
print(f"l_receiptdate is between {min_receipt_value} and {max_receipt_value} different from l_shipdate (range of {receipt_ship_diff_range})")

# How far l_commitdate lies from l_shipdate across all rows.
commit_gaps = table['diff_commit_ship']
min_commit_value, max_commit_value = commit_gaps.min(), commit_gaps.max()
commit_ship_diff_range = max_commit_value - min_commit_value
print(f"l_commitdate is between {min_commit_value} and {max_commit_value} different from l_shipdate (range of {commit_ship_diff_range})")
print()

# Share of each column's overall span that is covered by the observed
# offsets from l_shipdate, printed as a percentage.
print(f"for a fixed l_shipdate, {100*receipt_ship_diff_range / ranges['l_receiptdate']}% of unique l_receiptdate values qualify")
print(f"for a fixed l_shipdate, {100*commit_ship_diff_range / ranges['l_commitdate']}% of unique l_commitdate values qualify")

# Cell output: the overall span of l_shipdate.
ranges['l_shipdate']

l_receiptdate is between 1 days 00:00:00 and 30 days 00:00:00 different from l_shipdate (range of 29 days 00:00:00)
l_commitdate is between -91 days +00:00:00 and 89 days 00:00:00 different from l_shipdate (range of 180 days 00:00:00)

for a fixed l_shipdate, 1.1359185272228751% of unique l_receiptdate values qualify
for a fixed l_shipdate, 7.302231237322515% of unique l_commitdate values qualify


Timedelta('2525 days 00:00:00')