In [54]:
%load_ext sql
import configparser
import json
import os
from time import time
from urllib.parse import quote

import boto3

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


## Import variables from terraform

In [16]:
# Read the Redshift database settings from the Terraform variables file
with open('../terraform/terraform.tvars.json') as f:
    data = json.load(f)

# Database name and admin credentials, unpacked in one step
dwh_db, dwh_db_user, dwh_db_password = (
    data[key]
    for key in (
        'redshift_database_name',
        'redshift_admin_username',
        'redshift_admin_password',
    )
)

In [17]:
# Read the values exported by Terraform from terraform_output.json
with open('../terraform/terraform_output.json') as f:
    data = json.load(f)

# Each output is a nested object; the actual setting sits under 'value'
dwh_endpoint, dwh_role_arn = (
    data[name]['value'] for name in ('cluster_endpoint', 'redshift_iam_arn')
)

## Connect to Redshift Cluster

In [20]:
conn_str = f'postgresql://{dwh_db_user}:{dwh_db_password}@{dwh_endpoint}/{dwh_db}'
%sql $conn_str

'Connected: admin@redshift_dwh'

## Connect to Udacity S3 bucket

In [40]:
# Locate the shared AWS credentials file in the user's home directory
home = os.path.expanduser('~')
credentials_path = os.path.join(home, '.aws', 'credentials')

# Parse the credentials file; the context manager guarantees the file
# handle is closed once parsing is finished
config = configparser.ConfigParser()
with open(credentials_path) as credentials_file:
    config.read_file(credentials_file)

# Pull the access key pair from the 'default' profile
AWS_ACCESS_KEY = config.get('default', 'aws_access_key_id')
AWS_SECRET_KEY = config.get('default', 'aws_secret_access_key')

In [48]:
# Open an S3 resource handle in the bucket's region using the key pair
s3 = boto3.resource(
    's3',
    region_name='us-west-2',
    aws_access_key_id=AWS_ACCESS_KEY,
    aws_secret_access_key=AWS_SECRET_KEY,
)
sampleDbBucket = s3.Bucket('udacity-labs')

# Print every object whose key starts with 'tickets'
ticket_objects = sampleDbBucket.objects.filter(Prefix='tickets')
for obj in ticket_objects:
    print(obj)

s3.ObjectSummary(bucket_name='udacity-labs', key='tickets/')
s3.ObjectSummary(bucket_name='udacity-labs', key='tickets/full/')
s3.ObjectSummary(bucket_name='udacity-labs', key='tickets/full/full.csv.gz')
s3.ObjectSummary(bucket_name='udacity-labs', key='tickets/split/')
s3.ObjectSummary(bucket_name='udacity-labs', key='tickets/split/part-00000-d33afb94-b8af-407d-abd5-59c0ee8f5ee8-c000.csv.gz')
s3.ObjectSummary(bucket_name='udacity-labs', key='tickets/split/part-00001-d33afb94-b8af-407d-abd5-59c0ee8f5ee8-c000.csv.gz')
s3.ObjectSummary(bucket_name='udacity-labs', key='tickets/split/part-00002-d33afb94-b8af-407d-abd5-59c0ee8f5ee8-c000.csv.gz')
s3.ObjectSummary(bucket_name='udacity-labs', key='tickets/split/part-00003-d33afb94-b8af-407d-abd5-59c0ee8f5ee8-c000.csv.gz')
s3.ObjectSummary(bucket_name='udacity-labs', key='tickets/split/part-00004-d33afb94-b8af-407d-abd5-59c0ee8f5ee8-c000.csv.gz')
s3.ObjectSummary(bucket_name='udacity-labs', key='tickets/split/part-00005-d33afb94-b8af-407d-abd5-

## Create tables in Redshift

In [49]:
%%sql 
-- Drop any previous copy of the table so this cell can be re-run
DROP TABLE IF EXISTS "sporting_event_ticket";
-- Table that receives the rows copied from the split ticket files
-- NOTE(review) the id default refers to sporting_event_ticket_seq, which is
-- not created in the visible part of this notebook; confirm it is intended
CREATE TABLE "sporting_event_ticket" (
    "id" double precision DEFAULT nextval('sporting_event_ticket_seq') NOT NULL,
    "sporting_event_id" double precision NOT NULL,
    "sport_location_id" double precision NOT NULL,
    "seat_level" numeric(1,0) NOT NULL,
    "seat_section" character varying(15) NOT NULL,
    "seat_row" character varying(10) NOT NULL,
    "seat" character varying(10) NOT NULL,
    "ticketholder_id" double precision,
    "ticket_price" numeric(8,2) NOT NULL
);

 * postgresql://admin:***@redshift-cluster.cspf33xnqwsx.eu-central-1.redshift.amazonaws.com:5439/redshift_dwh
Done.
Done.


[]

## Explore how loading partitioned data is faster than loading a single bulk file
#### * Here the results show otherwise, probably because the cluster (eu-central-1) and the bucket (us-west-2) are in different regions

In [56]:
# Load the partitioned data into the cluster by copying every file with the "part" prefix
# We use COMPUPDATE OFF so that compression analysis does not distort the timing

In [62]:
%%time
# Time a COPY of the gzipped, ';'-delimited objects under the key prefix
# tickets/split/part into sporting_event_ticket, authorising the load with
# the IAM role ARN held in dwh_role_arn
query = f'''
    COPY sporting_event_ticket from 's3://udacity-labs/tickets/split/part'
    credentials 'aws_iam_role={dwh_role_arn}'
    gzip delimiter ';' compupdate off region 'us-west-2';
'''
# Run the statement through the %sql line magic
%sql $query

 * postgresql://admin:***@redshift-cluster.cspf33xnqwsx.eu-central-1.redshift.amazonaws.com:5439/redshift_dwh
Done.
CPU times: user 14.1 ms, sys: 0 ns, total: 14.1 ms
Wall time: 31.6 s


[]

In [59]:
# Load the non-partitioned data (a single gzipped file) into the cluster and time the process
# First drop and recreate the target table

In [60]:
%%sql
-- Drop any previous copy of the table so this cell can be re-run
DROP TABLE IF EXISTS "sporting_event_ticket_full";
-- Table with the same columns as sporting_event_ticket, meant for the
-- single-file load
-- NOTE(review) the id default refers to sporting_event_ticket_seq, which is
-- not created in the visible part of this notebook; confirm it is intended
CREATE TABLE "sporting_event_ticket_full" (
    "id" double precision DEFAULT nextval('sporting_event_ticket_seq') NOT NULL,
    "sporting_event_id" double precision NOT NULL,
    "sport_location_id" double precision NOT NULL,
    "seat_level" numeric(1,0) NOT NULL,
    "seat_section" character varying(15) NOT NULL,
    "seat_row" character varying(10) NOT NULL,
    "seat" character varying(10) NOT NULL,
    "ticketholder_id" double precision,
    "ticket_price" numeric(8,2) NOT NULL
);

 * postgresql://admin:***@redshift-cluster.cspf33xnqwsx.eu-central-1.redshift.amazonaws.com:5439/redshift_dwh
Done.
Done.


[]

In [61]:
%%time
query = f'''
    COPY sporting_event_ticket from 's3://udacity-labs/tickets/full/full.csv.gz'
    credentials 'aws_iam_role={dwh_role_arn}'
    gzip delimiter ';' compupdate off region 'us-west-2';
'''
%sql $query

 * postgresql://admin:***@redshift-cluster.cspf33xnqwsx.eu-central-1.redshift.amazonaws.com:5439/redshift_dwh
Done.
CPU times: user 1.85 ms, sys: 6.74 ms, total: 8.59 ms
Wall time: 23.8 s


[]