In [5]:
## Install ipython-sql for running magical sql commands :
## %sql for single line commands and %%sql for multi line sql commands.
## We can also use the variables defined in python program along with %sql command by using $ as a prefix to the python variables.

## uncomment the pip command below to run it.

#!pip install ipython-sql



You should consider upgrading via the 'python -m pip install --upgrade pip' command.


In [7]:
## Libraries

%reload_ext sql     
## I have used reload intead of load because I ran this program multiple times. Simple load command will throw error when used multiple times.
import psycopg2     ## Psycopg2 is the most popular PostgreSQL database adapter for the Python programming language.
                    ## Since redshift is also a modified postgres db, thus it is used to access redshift from outside the VPC(virtual private cloud (aws)).
import pandas as pd  
import boto3        ## Boto3 is the Amazon Web Services (AWS) Software Development Kit (SDK) for Python.
                    ## It allows Python developers to write software that makes use of services like Amazon S3 ,IAM, and  Amazon EC2 etc.
import json          
import configparser  ## This is used to programmatically access the configuration file (dwh.cfg)

In [8]:
## Access the config file

config = configparser.ConfigParser()
config.read_file(open('dwh.cfg'))

## Save the credentials in the form of variables

KEY                    = config.get('AWS','KEY')
SECRET                 = config.get('AWS','SECRET')

DWH_CLUSTER_TYPE       = config.get("DWH","DWH_CLUSTER_TYPE")
DWH_NUM_NODES          = config.get("DWH","DWH_NUM_NODES")
DWH_NODE_TYPE          = config.get("DWH","DWH_NODE_TYPE")

DWH_CLUSTER_IDENTIFIER = config.get("DWH","DWH_CLUSTER_IDENTIFIER")
DWH_DB                 = config.get("DWH","DWH_DB")
DWH_DB_USER            = config.get("DWH","DWH_DB_USER")
DWH_DB_PASSWORD        = config.get("DWH","DWH_DB_PASSWORD")
DWH_PORT               = config.get("DWH","DWH_PORT")

DWH_IAM_ROLE_NAME      = config.get("DWH", "DWH_IAM_ROLE_NAME")

(DWH_DB_USER, DWH_DB_PASSWORD, DWH_DB)

pd.DataFrame({"Param":
                  ["DWH_CLUSTER_TYPE", "DWH_NUM_NODES", "DWH_NODE_TYPE", "DWH_CLUSTER_IDENTIFIER", "DWH_DB", "DWH_DB_USER", "DWH_DB_PASSWORD", "DWH_PORT", "DWH_IAM_ROLE_NAME"],
              "Value":
                  [DWH_CLUSTER_TYPE, DWH_NUM_NODES, DWH_NODE_TYPE, DWH_CLUSTER_IDENTIFIER, DWH_DB, DWH_DB_USER, DWH_DB_PASSWORD, DWH_PORT, DWH_IAM_ROLE_NAME]
             })

Unnamed: 0,Param,Value
0,DWH_CLUSTER_TYPE,multi-node
1,DWH_NUM_NODES,4
2,DWH_NODE_TYPE,dc2.large
3,DWH_CLUSTER_IDENTIFIER,dwh-cluster
4,DWH_DB,dwh
5,DWH_DB_USER,dwhuser
6,DWH_DB_PASSWORD,Passw0rd
7,DWH_PORT,5439
8,DWH_IAM_ROLE_NAME,dwhRole


In [9]:
## Access clients and resources in aws vpc from jupyter

## Create boto3 objects for connecting to aws S3 , IAM, and redshift through boto3, on behalf of the user (created and named as dwhadmin in aws).
## It (user: dwhadmin) has associated key and secret.

## An easy way to understand the difference between the user and the client is as follows:
## Resource: Something which is going to be used by the User. S3 will be used by User, and a pseudo user redshift (redshift will read the data from s3)
## Client: Someone who use the resources such as User and pseudo users with assigned roles.

s3 = boto3.resource('s3',
                       region_name="ap-southeast-2",
                       aws_access_key_id=KEY,
                       aws_secret_access_key=SECRET
                   )

iam = boto3.client('iam',aws_access_key_id=KEY,
                     aws_secret_access_key=SECRET,
                     region_name='ap-southeast-2'
                  )

redshift = boto3.client('redshift',
                       region_name="ap-southeast-2",
                       aws_access_key_id=KEY,
                       aws_secret_access_key=SECRET
                       )

In [10]:
type(s3)  ## boto3 S3 object

boto3.resources.factory.s3.ServiceResource

In [11]:
type(iam)

botocore.client.IAM

In [12]:
type(redshift)

botocore.client.Redshift

In [13]:
## Connect to a S3 bucket in aws vpc through BOTO3 object S3 by creating bucket object.
bucket=s3.Bucket('udacity-dend')


## Access the log data files in the s3 bucket 'udacity-dend' , and filter the files based on the prefix.
log_data_files = [filename.key for filename in bucket.objects.filter(Prefix='log-data')]
log_data_files[:10]

['log-data/',
 'log-data/2018/11/2018-11-01-events.json',
 'log-data/2018/11/2018-11-02-events.json',
 'log-data/2018/11/2018-11-03-events.json',
 'log-data/2018/11/2018-11-04-events.json',
 'log-data/2018/11/2018-11-05-events.json',
 'log-data/2018/11/2018-11-06-events.json',
 'log-data/2018/11/2018-11-07-events.json',
 'log-data/2018/11/2018-11-08-events.json',
 'log-data/2018/11/2018-11-09-events.json']

In [14]:
## Access the song data files in the s3 bucket 'udacity-dend', and filter the files based on the prefix.
song_data_files = [filename.key for filename in bucket.objects.filter(Prefix='song-data/A')]
song_data_files[:10]

['song-data/A/A/A/TRAAAAK128F9318786.json',
 'song-data/A/A/A/TRAAAAV128F421A322.json',
 'song-data/A/A/A/TRAAABD128F429CF47.json',
 'song-data/A/A/A/TRAAACN128F9355673.json',
 'song-data/A/A/A/TRAAAEA128F935A30D.json',
 'song-data/A/A/A/TRAAAED128E0783FAB.json',
 'song-data/A/A/A/TRAAAEM128F93347B9.json',
 'song-data/A/A/A/TRAAAEW128F42930C0.json',
 'song-data/A/A/A/TRAAAFD128F92F423A.json',
 'song-data/A/A/A/TRAAAGR128F425B14B.json']

## Establishing connection with the Redshift DB

In [15]:
## Create Variables for redshift DB connectivity from config file.
LOG_DATA          = config.get("S3","LOG_DATA")
LOG_PATH          = config.get("S3","LOG_JSONPATH")
SONG_DATA         = config.get("S3","SONG_DATA")
IAM_ROLE          = config.get("IAM_ROLE","ARN")
HOST              = config.get("CLUSTER","HOST")

#### Connection between python and redshift can be done in multiple ways:

#### 1. Using psycopg2:  https://www.blendo.co/blog/access-your-data-in-amazon-redshift-and-postgresql-with-python-and-r/

#### 2. Using SQLAlchemy : 

#### 3. using simple SQL using URL as given below.

In [16]:
## Format for calling theURL: postgresql://username:password@host:port/databasename
## AWS Documentation for Copying Data to Redshift:  https://docs.aws.amazon.com/redshift/latest/dg/r_COPY.html#r_COPY-syntax-overview-data-source
## https://docs.aws.amazon.com/redshift/latest/dg/t_Loading_tables_with_the_COPY_command.html
conn_string = "postgresql://{}:{}@{}:{}/{}".format(DWH_DB_USER, DWH_DB_PASSWORD, HOST, DWH_PORT, DWH_DB)

In [17]:
print(conn_string) # Check the format in the print output

postgresql://dwhuser:Passw0rd@dwh-cluster.culrjdfmjjzn.ap-southeast-2.redshift.amazonaws.com:5439/dwh


In [19]:
## The output should be 'Connected: user_name@db_name'. 
## Important: It only gets connected when we run the object conn_string or call the get request using the above link.

%sql $conn_string  

'Connected: dwhuser@dwh'

## Creating Staging Tables and Star Schema Tables to create a data warehouse

In [20]:
# DROP TABLES if they already exist.

staging_events_table_drop = "DROP TABLE IF EXISTS staging_events"   
staging_songs_table_drop  = "DROP TABLE IF EXISTS staging_songs"

%sql $staging_events_table_drop
%sql $staging_songs_table_drop

 * postgresql://dwhuser:***@dwh-cluster.culrjdfmjjzn.ap-southeast-2.redshift.amazonaws.com:5439/dwh
Done.
 * postgresql://dwhuser:***@dwh-cluster.culrjdfmjjzn.ap-southeast-2.redshift.amazonaws.com:5439/dwh
Done.


[]

In [21]:
## CREATE Staging TABLES: 

staging_events_table_create= ("""
CREATE TABLE IF NOT EXISTS staging_events
(
artist          VARCHAR,
auth            VARCHAR, 
firstName       VARCHAR,
gender          VARCHAR,   
itemInSession   INTEGER,
lastName        VARCHAR,
length          FLOAT,
level           VARCHAR, 
location        VARCHAR,
method          VARCHAR,
page            VARCHAR,
registration    BIGINT,
sessionId       INTEGER,
song            VARCHAR,
status          INTEGER,
ts              TIMESTAMP,
userAgent       VARCHAR,
userId          INTEGER
);
""")

staging_songs_table_create = ("""
CREATE TABLE IF NOT EXISTS staging_songs
(
song_id            VARCHAR,
num_songs          INTEGER,
title              VARCHAR,
artist_name        VARCHAR,
artist_latitude    FLOAT,
year               INTEGER,
duration           FLOAT,
artist_id          VARCHAR,
artist_longitude   FLOAT,
artist_location    VARCHAR
);
""")

In [22]:
%sql $staging_events_table_create
%sql $staging_songs_table_create

 * postgresql://dwhuser:***@dwh-cluster.culrjdfmjjzn.ap-southeast-2.redshift.amazonaws.com:5439/dwh
Done.
 * postgresql://dwhuser:***@dwh-cluster.culrjdfmjjzn.ap-southeast-2.redshift.amazonaws.com:5439/dwh
Done.


[]

## Load the Staging Tables into the redshift Database by using COPY command.
### AWS Documentation for Copying Data to Redshift:  https://docs.aws.amazon.com/redshift/latest/dg/r_COPY.html#r_COPY-syntax-overview-data-source

### https://docs.aws.amazon.com/redshift/latest/dg/t_loading-tables-from-s3.html

### https://docs.aws.amazon.com/redshift/latest/dg/tutorial-loading-data.html

### The staging table is be used to create the data-warehouse (here we are creating only one datamart in data warehouse) having a star schema.

In [25]:
# STAGING TABLES

## Remember the S3 buckets we are calling here is in US.

staging_events_copy = ("""
    COPY staging_events FROM {}
    CREDENTIALS 'aws_iam_role={}'
    COMPUPDATE OFF region 'us-west-2'
    TIMEFORMAT as 'epochmillisecs'
    TRUNCATECOLUMNS BLANKSASNULL EMPTYASNULL
    FORMAT AS JSON 'auto'
""").format(LOG_DATA, IAM_ROLE)


staging_songs_copy = ("""
    COPY staging_songs FROM {}
    CREDENTIALS 'aws_iam_role={}'
    COMPUPDATE OFF region 'us-west-2'
    FORMAT AS JSON 'auto' 
    TRUNCATECOLUMNS BLANKSASNULL EMPTYASNULL;
""").format(SONG_DATA, IAM_ROLE)

In [None]:
%sql $staging_events_copy
%sql $staging_songs_copy

 * postgresql://dwhuser:***@dwh-cluster.culrjdfmjjzn.ap-southeast-2.redshift.amazonaws.com:5439/dwh
Done.
 * postgresql://dwhuser:***@dwh-cluster.culrjdfmjjzn.ap-southeast-2.redshift.amazonaws.com:5439/dwh


In [None]:
%%sql 
select * from staging_songs limit 5

In [None]:
%%sql 
select * from staging_events limit 5