# Exercise 2: Parallel ETL

In [None]:
%load_ext sql
from time import time
import configparser
import matplotlib.pyplot as plt
import pandas as pd
import boto3

In [5]:
# Read the AWS credentials and warehouse settings from the local dwh.cfg file
# so that no secrets are written into the notebook itself.
config = configparser.ConfigParser()
# A context manager closes the file handle as soon as parsing finishes instead
# of leaving it open until garbage collection.
with open('dwh.cfg') as cfg_file:
    config.read_file(cfg_file)

# [AWS] section: access key pair handed to boto3 further down.
KEY = config.get('AWS', 'key')
SECRET = config.get('AWS', 'secret')

# [DWH] section: database name, login and port used to build the connection string.
DWH_DB = config.get("DWH", "DWH_DB")
DWH_DB_USER = config.get("DWH", "DWH_DB_USER")
DWH_DB_PASSWORD = config.get("DWH", "DWH_DB_PASSWORD")
DWH_PORT = config.get("DWH", "DWH_PORT")

In [6]:
# Cluster-specific values set directly in the notebook: the Redshift endpoint
# host used in the connection string and the IAM role ARN named in the COPY
# commands.
DWH_ENDPOINT = "udacity-demo-cluster.ceff7eol3zgd.us-west-2.redshift.amazonaws.com"
DWH_ROLE_ARN = "arn:aws:iam::038345561986:role/RedshiftS3Access"

### Connect to the Redshift Cluster

In [7]:
conn_string=f"postgresql://{DWH_DB_USER}:{DWH_DB_PASSWORD}@{DWH_ENDPOINT}:{DWH_PORT}/{DWH_DB}"
print(conn_string)
%sql $conn_string

postgresql://admin:***@udacity-demo-cluster.ceff7eol3zgd.us-west-2.redshift.amazonaws.com:5439/udacity_nano_de_dwh


In [8]:
# Resource-level S3 handle authenticated with the key pair read from dwh.cfg.
s3 = boto3.resource(
    "s3",
    region_name="us-west-2",
    aws_access_key_id=KEY,
    aws_secret_access_key=SECRET,
)

# Bucket that holds the sample ticket data.
sampleDbBucket = s3.Bucket("udacity-labs")

# Print one summary line per object stored under the "tickets" prefix.
ticket_objects = sampleDbBucket.objects.filter(Prefix="tickets")
for obj in ticket_objects:
    print(obj)

s3.ObjectSummary(bucket_name='udacity-labs', key='tickets/')
s3.ObjectSummary(bucket_name='udacity-labs', key='tickets/full/')
s3.ObjectSummary(bucket_name='udacity-labs', key='tickets/full/full.csv.gz')
s3.ObjectSummary(bucket_name='udacity-labs', key='tickets/split/')
s3.ObjectSummary(bucket_name='udacity-labs', key='tickets/split/part-00000-d33afb94-b8af-407d-abd5-59c0ee8f5ee8-c000.csv.gz')
s3.ObjectSummary(bucket_name='udacity-labs', key='tickets/split/part-00001-d33afb94-b8af-407d-abd5-59c0ee8f5ee8-c000.csv.gz')
s3.ObjectSummary(bucket_name='udacity-labs', key='tickets/split/part-00002-d33afb94-b8af-407d-abd5-59c0ee8f5ee8-c000.csv.gz')
s3.ObjectSummary(bucket_name='udacity-labs', key='tickets/split/part-00003-d33afb94-b8af-407d-abd5-59c0ee8f5ee8-c000.csv.gz')
s3.ObjectSummary(bucket_name='udacity-labs', key='tickets/split/part-00004-d33afb94-b8af-407d-abd5-59c0ee8f5ee8-c000.csv.gz')
s3.ObjectSummary(bucket_name='udacity-labs', key='tickets/split/part-00005-d33afb94-b8af-407d-abd5-

### Create Tables

In [9]:
%%sql
DROP TABLE IF EXISTS sporting_event_ticket;
-- Recreated from scratch on every run so the cell can be re-executed safely.
-- This table is the target of the COPY from the split ticket files.
CREATE TABLE sporting_event_ticket (
    id                DOUBLE PRECISION DEFAULT nextval('sporting_event_ticket_seq') NOT NULL,
    sporting_event_id DOUBLE PRECISION NOT NULL,
    sport_location_id DOUBLE PRECISION NOT NULL,
    seat_level        NUMERIC(1,0)     NOT NULL,
    seat_section      VARCHAR(15)      NOT NULL,
    seat_row          VARCHAR(10)      NOT NULL,
    seat              VARCHAR(10)      NOT NULL,
    ticketholder_id   DOUBLE PRECISION,  -- the only nullable column
    ticket_price      NUMERIC(8,2)     NOT NULL
);

 * postgresql://admin:***@udacity-demo-cluster.ceff7eol3zgd.us-west-2.redshift.amazonaws.com:5439/udacity_nano_de_dwh
Done.
Done.


[]

### Load Partitioned data into the cluster

In [10]:
%%time
# Load the split ticket files into sporting_event_ticket with a single COPY
# command; the %%time magic reports how long the whole cell takes.
# The S3 path is a key prefix shared by the part-0000N files listed earlier.
# The options declare gzip-compressed, semicolon-delimited input in us-west-2,
# and "compupdate off" turns off automatic compression analysis during the load.
qry = f"""
       copy sporting_event_ticket from 's3://udacity-labs/tickets/split/part'
       credentials 'aws_iam_role={DWH_ROLE_ARN}'
       gzip delimiter ';' compupdate off region 'us-west-2';
       """
# Run the statement on the connection opened earlier by the %sql magic.
%sql $qry

 * postgresql://admin:***@udacity-demo-cluster.ceff7eol3zgd.us-west-2.redshift.amazonaws.com:5439/udacity_nano_de_dwh
Done.
Wall time: 13.7 s


[]

### Create Tables for the non-partitioned data

In [13]:
%%sql
DROP TABLE IF EXISTS sporting_event_ticket_full;
-- Recreated from scratch on every run so the cell can be re-executed safely.
-- Same column layout as sporting_event_ticket; this table is the target of the
-- COPY from the single full.csv.gz file.
CREATE TABLE sporting_event_ticket_full (
    id                DOUBLE PRECISION DEFAULT nextval('sporting_event_ticket_seq') NOT NULL,
    sporting_event_id DOUBLE PRECISION NOT NULL,
    sport_location_id DOUBLE PRECISION NOT NULL,
    seat_level        NUMERIC(1,0)     NOT NULL,
    seat_section      VARCHAR(15)      NOT NULL,
    seat_row          VARCHAR(10)      NOT NULL,
    seat              VARCHAR(10)      NOT NULL,
    ticketholder_id   DOUBLE PRECISION,  -- the only nullable column
    ticket_price      NUMERIC(8,2)     NOT NULL
);

 * postgresql://admin:***@udacity-demo-cluster.ceff7eol3zgd.us-west-2.redshift.amazonaws.com:5439/udacity_nano_de_dwh
Done.
Done.


[]

### Load non-partitioned data into the cluster
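- Note how it's slower than loading the partitioned data (25 s here versus 13.7 s for the split files above): the split files can be loaded in parallel, whereas a single compressed file cannot.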

In [14]:
%%time
# Load the single full.csv.gz file into sporting_event_ticket_full with the
# same COPY options as the split load, so the two wall times can be compared.
# The options declare gzip-compressed, semicolon-delimited input in us-west-2,
# and "compupdate off" turns off automatic compression analysis during the load.
qry = f"""
       copy sporting_event_ticket_full from 's3://udacity-labs/tickets/full/full.csv.gz' 
       credentials 'aws_iam_role={DWH_ROLE_ARN}' 
       gzip delimiter ';' compupdate off region 'us-west-2';
       """
# Run the statement on the connection opened earlier by the %sql magic.
%sql $qry

 * postgresql://admin:***@udacity-demo-cluster.ceff7eol3zgd.us-west-2.redshift.amazonaws.com:5439/udacity_nano_de_dwh
Done.
Wall time: 25 s


[]