## Dataset 

- [tpch-3tb](https://github.com/awslabs/amazon-redshift-utils/blob/master/src/CloudDataWarehouseBenchmark/Cloud-DWB-Derived-from-TPCH/3TB/ddl.sql)
- [amazon redshift load sample data](https://docs.aws.amazon.com/redshift/latest/mgmt/query-editor-v2-loading.html)
- [amazon redshift workshop schema](https://catalog.us-east-1.prod.workshops.aws/workshops/380e0b8a-5d4c-46e3-95a8-82d68cf5789a/en-US/gettingstarted/lab2)

```sql
select count(*) from customer;  -- 450000000 (450M)
select count(*) from lineitem;  -- 18000048306 (18B)
select count(*) from nation;    -- 25
select count(*) from orders;    -- 4500000000 (4.5B)
select count(*) from part;      -- 600000000 (600M)
select count(*) from partsupp;  -- 2400000000 (2.4B)
select count(*) from region;    -- 5
select count(*) from supplier;  -- 30000000 (30M)
```

Size of lineitem: 2.2TB 

```bash 
aws s3 ls --summarize --human-readable --recursive s3://redshift-downloads/TPC-H/2.18/3TB/lineitem/
```

Size of customer: 64GB

```bash
aws s3 ls --summarize --human-readable --recursive s3://redshift-downloads/TPC-H/2.18/3TB/customer/
```

In [0]:
-- Make the public schema the one searched for unqualified table names
-- in the statements that follow.
set search_path = public;

-- Echo the setting back so the cell output confirms it.
show search_path;

## Create Tables

In [0]:
-- Start from a clean slate: remove any earlier copy of the eight TPC-H tables.
-- "if exists" keeps this cell re-runnable when some or all tables are absent.
drop table if exists partsupp;
drop table if exists lineitem;
drop table if exists supplier;
drop table if exists part;
drop table if exists orders;
drop table if exists customer;
drop table if exists nation;
drop table if exists region;

-- customer: one row per c_custkey (the declared primary key).
-- Rows are both distributed and sorted on that same key.
create table customer (
    c_custkey    bigint        not null,
    c_name       varchar(25)   not null,
    c_address    varchar(40)   not null,
    c_nationkey  integer       not null,
    c_phone      char(15)      not null,
    c_acctbal    decimal(12,2) not null,
    c_mktsegment char(10)      not null,
    c_comment    varchar(117)  not null,
    primary key (c_custkey)
)
distkey (c_custkey)
sortkey (c_custkey);

-- lineitem: one row per (l_orderkey, l_linenumber), the declared primary key.
-- Distributed on l_orderkey -- the orders table is distributed on o_orderkey,
-- presumably so joins on the order key stay local (confirm against the
-- benchmark queries). Sorted by ship date first, then by order key.
create table lineitem (
    l_orderkey      bigint        not null,
    l_partkey       bigint        not null,
    l_suppkey       integer       not null,
    l_linenumber    integer       not null,
    l_quantity      decimal(12,2) not null,
    l_extendedprice decimal(12,2) not null,
    l_discount      decimal(12,2) not null,
    l_tax           decimal(12,2) not null,
    l_returnflag    char(1)       not null,
    l_linestatus    char(1)       not null,
    l_shipdate      date          not null,
    l_commitdate    date          not null,
    l_receiptdate   date          not null,
    l_shipinstruct  char(25)      not null,
    l_shipmode      char(10)      not null,
    l_comment       varchar(44)   not null,
    primary key (l_orderkey, l_linenumber)
)
distkey (l_orderkey)
sortkey (l_shipdate, l_orderkey);

-- nation: one row per n_nationkey (the declared primary key);
-- n_regionkey carries the key of the region the nation belongs to.
-- Distributed and sorted on n_nationkey.
create table nation (
    n_nationkey integer      not null,
    n_name      char(25)     not null,
    n_regionkey integer      not null,
    n_comment   varchar(152) not null,
    primary key (n_nationkey)
)
distkey (n_nationkey)
sortkey (n_nationkey);

-- orders: one row per o_orderkey (the declared primary key).
-- Distributed on o_orderkey; sorted by order date first, then by order key.
create table orders (
    o_orderkey      bigint        not null,
    o_custkey       bigint        not null,
    o_orderstatus   char(1)       not null,
    o_totalprice    decimal(12,2) not null,
    o_orderdate     date          not null,
    o_orderpriority char(15)      not null,
    o_clerk         char(15)      not null,
    o_shippriority  integer       not null,
    o_comment       varchar(79)   not null,
    primary key (o_orderkey)
)
distkey (o_orderkey)
sortkey (o_orderdate, o_orderkey);

-- part: one row per p_partkey (the declared primary key).
-- Rows are both distributed and sorted on that same key.
create table part (
    p_partkey     bigint        not null,
    p_name        varchar(55)   not null,
    p_mfgr        char(25)      not null,
    p_brand       char(10)      not null,
    p_type        varchar(25)   not null,
    p_size        integer       not null,
    p_container   char(10)      not null,
    p_retailprice decimal(12,2) not null,
    p_comment     varchar(23)   not null,
    primary key (p_partkey)
)
distkey (p_partkey)
sortkey (p_partkey);

-- partsupp: one row per (ps_partkey, ps_suppkey), the declared primary key.
-- Distributed and sorted on ps_partkey -- the part table uses p_partkey for
-- both as well.
create table partsupp (
    ps_partkey    bigint        not null,
    ps_suppkey    integer       not null,
    ps_availqty   integer       not null,
    ps_supplycost decimal(12,2) not null,
    ps_comment    varchar(199)  not null,
    primary key (ps_partkey, ps_suppkey)
)
distkey (ps_partkey)
sortkey (ps_partkey);

-- region: one row per r_regionkey (the declared primary key).
-- Distributed and sorted on r_regionkey.
create table region (
    r_regionkey integer      not null,
    r_name      char(25)     not null,
    r_comment   varchar(152) not null,
    primary key (r_regionkey)
)
distkey (r_regionkey)
sortkey (r_regionkey);

-- supplier: one row per s_suppkey (the declared primary key).
-- Rows are both distributed and sorted on that same key.
create table supplier (
    s_suppkey   integer       not null,
    s_name      char(25)      not null,
    s_address   varchar(40)   not null,
    s_nationkey integer       not null,
    s_phone     char(15)      not null,
    s_acctbal   decimal(12,2) not null,
    s_comment   varchar(101)  not null,
    primary key (s_suppkey)
)
distkey (s_suppkey)
sortkey (s_suppkey);

## COPY Data From S3

In [0]:
-- Bulk-load every table from the 3TB TPC-H files under
-- s3://redshift-downloads/TPC-H/2.18/3TB/. Each load reads pipe-delimited
-- files from a bucket in us-east-1 and authenticates with the default IAM
-- role (the "iam_role default" option).

copy region
from 's3://redshift-downloads/TPC-H/2.18/3TB/region/'
iam_role default
delimiter '|'
region 'us-east-1';

copy nation
from 's3://redshift-downloads/TPC-H/2.18/3TB/nation/'
iam_role default
delimiter '|'
region 'us-east-1';

copy lineitem
from 's3://redshift-downloads/TPC-H/2.18/3TB/lineitem/'
iam_role default
delimiter '|'
region 'us-east-1';

copy orders
from 's3://redshift-downloads/TPC-H/2.18/3TB/orders/'
iam_role default
delimiter '|'
region 'us-east-1';

copy part
from 's3://redshift-downloads/TPC-H/2.18/3TB/part/'
iam_role default
delimiter '|'
region 'us-east-1';

copy supplier
from 's3://redshift-downloads/TPC-H/2.18/3TB/supplier/'
iam_role default
delimiter '|'
region 'us-east-1';

copy partsupp
from 's3://redshift-downloads/TPC-H/2.18/3TB/partsupp/'
iam_role default
delimiter '|'
region 'us-east-1';

copy customer
from 's3://redshift-downloads/TPC-H/2.18/3TB/customer/'
iam_role default
delimiter '|'
region 'us-east-1';

In [0]:
-- Post-load sanity check: the Dataset section lists 18000048306 rows for lineitem.
select count(*)
from lineitem;