## Create an external table 
- **External** means the data itself is stored in Amazon S3.
- An **external database** is a database defined in the AWS Glue Data Catalog.
- An **external table** is a table defined in the AWS Glue Data Catalog.

In [None]:
-- Start from a clean slate: remove the `spectrum` schema if an earlier run created it.
DROP SCHEMA IF EXISTS spectrum;

In [None]:
-- Register the external schema `spectrum`, mapped to the 'default' database
-- of the data catalog. The catalog database is created if it does not exist.
CREATE EXTERNAL SCHEMA spectrum
    FROM DATA CATALOG
    DATABASE 'default'
    IAM_ROLE 'arn:aws:iam::111222333444:role/RedshiftAssociateIAMRoleForDataEngineer'
    CREATE EXTERNAL DATABASE IF NOT EXISTS;

In [None]:
-- Remove any previous definition of the external table so the CREATE below can be re-run.
DROP TABLE IF EXISTS spectrum.sales;

In [None]:
-- External table over the tab-delimited sales text files stored under the
-- S3 prefix given in LOCATION. Only the table definition is created here;
-- the data stays in S3.
CREATE EXTERNAL TABLE spectrum.sales (
    salesid     INTEGER,
    listid      INTEGER,
    sellerid    INTEGER,
    buyerid     INTEGER,
    eventid     INTEGER,
    dateid      SMALLINT,
    qtysold     SMALLINT,
    pricepaid   DECIMAL(8,2),
    commission  DECIMAL(8,2),
    saletime    TIMESTAMP
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE
LOCATION 's3://lake-entest-demo-002/tickit/spectrum/sales/'
-- numRows supplies a row-count estimate for the table definition.
TABLE PROPERTIES ('numRows'='172000');

In [None]:
-- Preview up to 1000 rows of the external table, listing every declared
-- column explicitly (same columns and order as the table definition).
SELECT
    salesid,
    listid,
    sellerid,
    buyerid,
    eventid,
    dateid,
    qtysold,
    pricepaid,
    commission,
    saletime
FROM spectrum.sales
LIMIT 1000;

In [None]:
-- Total price paid for all sales made on 2008-06-26.
-- The day is expressed as a half-open timestamp range instead of casting
-- saletime to a date, so the filter is a plain comparison on the raw column.
-- It selects exactly the same rows as `saletime::date = '2008-06-26'`.
SELECT
    SUM(pricepaid) AS total_pricepaid
FROM spectrum.sales
WHERE saletime >= '2008-06-26'
  AND saletime <  '2008-06-27';

## Join external table with a local table

In [None]:
-- Peek at ten rows of the `event` table that the queries below join against.
SELECT *
FROM event
LIMIT 10;

In [None]:
-- Top 10 events by total price paid, counting only sales priced above 30.
-- The external table spectrum.sales is joined to the `event` table with an
-- explicit INNER JOIN, so the join condition cannot be dropped by accident,
-- and the sort key is named rather than positional.
SELECT TOP 10
    s.eventid,
    SUM(s.pricepaid) AS total_pricepaid
FROM spectrum.sales AS s
INNER JOIN event AS e
    ON s.eventid = e.eventid
WHERE s.pricepaid > 30
GROUP BY s.eventid
ORDER BY total_pricepaid DESC;

In [None]:
-- Show the query plan for the top-10-events-by-price-paid join between the
-- external table spectrum.sales and the `event` table.
-- The join is written as an explicit INNER JOIN and the sort key is named.
EXPLAIN
SELECT TOP 10
    s.eventid,
    SUM(s.pricepaid) AS total_pricepaid
FROM spectrum.sales AS s
INNER JOIN event AS e
    ON s.eventid = e.eventid
WHERE s.pricepaid > 30
GROUP BY s.eventid
ORDER BY total_pricepaid DESC;

## Partitioned external table

Redshift Spectrum must be told which partitions exist so that it knows which S3 directories to read. The partitioned data set is laid out with one prefix per month:

PRE saledate=2008-01/ \
PRE saledate=2008-02/ \
PRE saledate=2008-03/ \
PRE saledate=2008-04/ \
PRE saledate=2008-05/ \
PRE saledate=2008-06/ \
PRE saledate=2008-07/ \
PRE saledate=2008-08/ \
PRE saledate=2008-09/ \
PRE saledate=2008-10/ \
PRE saledate=2008-11/ \
PRE saledate=2008-12/

In [None]:
-- Remove any previous definition of the partitioned external table so the CREATE below can be re-run.
DROP TABLE IF EXISTS spectrum.sales_partitioned;

In [None]:
-- External table over the pipe-delimited sales text files, partitioned by
-- saledate. The partition column is declared in PARTITIONED BY, not in the
-- column list. Partitions must be registered separately before they can be
-- queried.
CREATE EXTERNAL TABLE spectrum.sales_partitioned (
    salesid     INTEGER,
    listid      INTEGER,
    sellerid    INTEGER,
    buyerid     INTEGER,
    eventid     INTEGER,
    dateid      SMALLINT,
    qtysold     SMALLINT,
    pricepaid   DECIMAL(8,2),
    commission  DECIMAL(8,2),
    saletime    TIMESTAMP
)
PARTITIONED BY (saledate DATE)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '|'
STORED AS TEXTFILE
LOCATION 's3://lake-entest-demo-002/tickit/spectrum/sales_partition/'
-- numRows supplies a row-count estimate for the table definition.
TABLE PROPERTIES ('numRows'='172000');

## Update partition information for Redshift

In [None]:
-- Register the twelve monthly partitions of 2008. Each S3 prefix
-- `saledate=YYYY-MM/` is mapped to the partition value for the first day
-- of that month. IF NOT EXISTS keeps the statement re-runnable.
ALTER TABLE spectrum.sales_partitioned ADD IF NOT EXISTS
    PARTITION (saledate='2008-01-01')
        LOCATION 's3://lake-entest-demo-002/tickit/spectrum/sales_partition/saledate=2008-01/'
    PARTITION (saledate='2008-02-01')
        LOCATION 's3://lake-entest-demo-002/tickit/spectrum/sales_partition/saledate=2008-02/'
    PARTITION (saledate='2008-03-01')
        LOCATION 's3://lake-entest-demo-002/tickit/spectrum/sales_partition/saledate=2008-03/'
    PARTITION (saledate='2008-04-01')
        LOCATION 's3://lake-entest-demo-002/tickit/spectrum/sales_partition/saledate=2008-04/'
    PARTITION (saledate='2008-05-01')
        LOCATION 's3://lake-entest-demo-002/tickit/spectrum/sales_partition/saledate=2008-05/'
    PARTITION (saledate='2008-06-01')
        LOCATION 's3://lake-entest-demo-002/tickit/spectrum/sales_partition/saledate=2008-06/'
    PARTITION (saledate='2008-07-01')
        LOCATION 's3://lake-entest-demo-002/tickit/spectrum/sales_partition/saledate=2008-07/'
    PARTITION (saledate='2008-08-01')
        LOCATION 's3://lake-entest-demo-002/tickit/spectrum/sales_partition/saledate=2008-08/'
    PARTITION (saledate='2008-09-01')
        LOCATION 's3://lake-entest-demo-002/tickit/spectrum/sales_partition/saledate=2008-09/'
    PARTITION (saledate='2008-10-01')
        LOCATION 's3://lake-entest-demo-002/tickit/spectrum/sales_partition/saledate=2008-10/'
    PARTITION (saledate='2008-11-01')
        LOCATION 's3://lake-entest-demo-002/tickit/spectrum/sales_partition/saledate=2008-11/'
    PARTITION (saledate='2008-12-01')
        LOCATION 's3://lake-entest-demo-002/tickit/spectrum/sales_partition/saledate=2008-12/';

In [None]:
-- Top 10 events by total price paid for December 2008 sales priced above 30,
-- read from the unpartitioned external table.
-- The month is expressed as a half-open timestamp range on saletime. It
-- selects the same rows as `date_trunc('month', saletime) = '2008-12-01'`
-- without wrapping the filter column in a function.
-- Every column is qualified, and the join is an explicit INNER JOIN.
SELECT TOP 10
    s.eventid,
    SUM(s.pricepaid) AS total_pricepaid
FROM spectrum.sales AS s
INNER JOIN event AS e
    ON s.eventid = e.eventid
WHERE s.pricepaid > 30
  AND s.saletime >= '2008-12-01'
  AND s.saletime <  '2009-01-01'
GROUP BY s.eventid
ORDER BY total_pricepaid DESC;

In [None]:
-- Top 10 events by total price paid for December 2008 sales priced above 30,
-- read from the partitioned external table.
-- The month is selected by filtering on the partition column saledate
-- (the December partition is registered as '2008-12-01').
-- Every column is qualified, and the join is an explicit INNER JOIN.
SELECT TOP 10
    sp.eventid,
    SUM(sp.pricepaid) AS total_pricepaid
FROM spectrum.sales_partitioned AS sp
INNER JOIN event AS e
    ON sp.eventid = e.eventid
WHERE sp.pricepaid > 30
  AND sp.saledate = '2008-12-01'
GROUP BY sp.eventid
ORDER BY total_pricepaid DESC;