## Quality Check of the Data

In [1]:
## Libraries

%reload_ext sql     
## I have used reload intead of load because I ran this program multiple times. Simple load command will throw error when used multiple times.


import psycopg2     ## Psycopg2 is the most popular PostgreSQL database adapter for the Python programming language.
                    ## Since redshift is also a modified postgres db, thus it is used to access redshift from outside the VPC(virtual private cloud (aws)).
    
    
import pandas as pd  
import boto3        ## Boto3 is the Amazon Web Services (AWS) Software Development Kit (SDK) for Python.
                    ## It allows Python developers to write software that makes use of services like Amazon S3 ,IAM, and  Amazon EC2 etc.
    
    
import json          
import configparser  ## This is used to programmatically access the configuration file (dwh.cfg)

In [2]:
## Access the config file

config = configparser.ConfigParser()
config.read_file(open('dwh.cfg'))





## Save the credentials in the form of variables

KEY                    = config.get('AWS','KEY')
SECRET                 = config.get('AWS','SECRET')

DWH_CLUSTER_TYPE       = config.get("DWH","DWH_CLUSTER_TYPE")
DWH_NUM_NODES          = config.get("DWH","DWH_NUM_NODES")
DWH_NODE_TYPE          = config.get("DWH","DWH_NODE_TYPE")

DWH_CLUSTER_IDENTIFIER = config.get("DWH","DWH_CLUSTER_IDENTIFIER")
DWH_DB                 = config.get("DWH","DWH_DB")
DWH_DB_USER            = config.get("DWH","DWH_DB_USER")
DWH_DB_PASSWORD        = config.get("DWH","DWH_DB_PASSWORD")
DWH_PORT               = config.get("DWH","DWH_PORT")

DWH_IAM_ROLE_NAME      = config.get("DWH", "DWH_IAM_ROLE_NAME")



## Create a dataframe for the configuration parameters, just to have a quick look.
pd.DataFrame({"Param":
                  ["DWH_CLUSTER_TYPE", "DWH_NUM_NODES", "DWH_NODE_TYPE", "DWH_CLUSTER_IDENTIFIER", "DWH_DB", "DWH_DB_USER", "DWH_DB_PASSWORD", "DWH_PORT", "DWH_IAM_ROLE_NAME"],
              "Value":
                  [DWH_CLUSTER_TYPE, DWH_NUM_NODES, DWH_NODE_TYPE, DWH_CLUSTER_IDENTIFIER, DWH_DB, DWH_DB_USER, DWH_DB_PASSWORD, DWH_PORT, DWH_IAM_ROLE_NAME]
             })

Unnamed: 0,Param,Value
0,DWH_CLUSTER_TYPE,multi-node
1,DWH_NUM_NODES,4
2,DWH_NODE_TYPE,dc2.large
3,DWH_CLUSTER_IDENTIFIER,dwh-cluster
4,DWH_DB,dwh
5,DWH_DB_USER,dwhuser
6,DWH_DB_PASSWORD,Passw0rd
7,DWH_PORT,5439
8,DWH_IAM_ROLE_NAME,dwhRole


In [3]:
## Access clients and resources in aws vpc from jupyter

## Create boto3 sdk objects for connecting to aws S3 , IAM, and redshift through boto3, on behalf of the user (created and named as dwhadmin in aws).
## It (user: dwhadmin) has associated key and secret.

## boto3 actually offers two different styles of API ‘’Resource API’’ (high-level, recommended) and ‘’Client API ‘’(low-level). 
#### You can refer to the links for details. More about boto3: https://boto3.readthedocs.io/en/latest/reference/services/s3.html
#### https://medium.com/@rogerxujiang/use-s3-storage-on-aws-c4e5ce4fa46e

## --------------------------------------------------------------------

## Create an object s3 through which we can access the s3 buckets in aws.


s3 = boto3.resource('s3',
                       region_name="ap-southeast-2",
                       aws_access_key_id=KEY,
                       aws_secret_access_key=SECRET
                   )


## Create an object iam through which we can access the iam roles in aws. It has not been used , but created just to show the procedure.


iam = boto3.client('iam',aws_access_key_id=KEY,
                     aws_secret_access_key=SECRET,
                     region_name='ap-southeast-2'
                  )


## Create an object redshift through which we can access redshift in aws. It has not been used , but created just to show the procedure.


redshift = boto3.client('redshift',
                       region_name="ap-southeast-2",
                       aws_access_key_id=KEY,
                       aws_secret_access_key=SECRET
                       )

In [4]:
## Create Variables for redshift DB connectivity from config file.
LOG_DATA          = config.get("S3","LOG_DATA")
LOG_PATH          = config.get("S3","LOG_JSONPATH")
SONG_DATA         = config.get("S3","SONG_DATA")
IAM_ROLE          = config.get("IAM_ROLE","ARN")
HOST              = config.get("CLUSTER","HOST")

In [5]:
## Format for calling theURL: postgresql://username:password@host:port/databasename
## AWS Documentation for Copying Data to Redshift:  https://docs.aws.amazon.com/redshift/latest/dg/r_COPY.html#r_COPY-syntax-overview-data-source
## https://docs.aws.amazon.com/redshift/latest/dg/t_Loading_tables_with_the_COPY_command.html



conn_string = "postgresql://{}:{}@{}:{}/{}".format(DWH_DB_USER, DWH_DB_PASSWORD, HOST, DWH_PORT, DWH_DB)

In [6]:
print(conn_string) # Check the format in the print output

postgresql://dwhuser:Passw0rd@dwh-cluster.culrjdfmjjzn.ap-southeast-2.redshift.amazonaws.com:5439/dwh


In [7]:
%sql $conn_string  

'Connected: dwhuser@dwh'

In [8]:
%%sql 
select * from staging_songs limit 5

 * postgresql://dwhuser:***@dwh-cluster.culrjdfmjjzn.ap-southeast-2.redshift.amazonaws.com:5439/dwh
5 rows affected.


song_id,num_songs,title,artist_name,artist_latitude,year,duration,artist_id,artist_longitude,artist_location
SOABWAP12A8C13F82A,1,Take Time,Chaka Khan_ Rufus,41.88415,1978,258.89914,AR5LMPY1187FB573FE,-87.63241,"Chicago, IL"
SONYRZV12AB018AF70,1,Burning In The Aftermath,The Suicide Machines,42.33168,2003,95.68608,ARWYVP51187B98C516,-83.04792,"Detroit, MI"
SOCHGUG12A58A7E184,1,Love Gives Love Takes ( LP Version ),The Corrs,,1997,222.17098,AROVU6Z1187B9AE74E,,
SOOBEML12A8C138C91,1,Johnny Leary's Polka_ O'Keefe's Polka_ Johnny I do Miss You,De Dannan,,0,197.642,ARP4O0W1187FB5A06B,,
SOXXQCX12A8C1370EA,1,Quieres Volar Sin Mi,Omar Geles,,0,270.94159,ARG0XZL1187B9B6CBF,,


In [None]:
## Query the DWH Now:

In [11]:
%%sql 
select count(*) as numRows from staging_events 

 * postgresql://dwhuser:***@dwh-cluster.culrjdfmjjzn.ap-southeast-2.redshift.amazonaws.com:5439/dwh
1 rows affected.


numrows
8056


In [12]:
%%sql 
select count(*) as numRows from staging_songs 

 * postgresql://dwhuser:***@dwh-cluster.culrjdfmjjzn.ap-southeast-2.redshift.amazonaws.com:5439/dwh
1 rows affected.


numrows
14896


In [13]:
%%sql 
select count(Distinct song_id) as numSongId , count(Distinct artist_id) as numArtistId from staging_songs

 * postgresql://dwhuser:***@dwh-cluster.culrjdfmjjzn.ap-southeast-2.redshift.amazonaws.com:5439/dwh
1 rows affected.


numsongid,numartistid
14896,9553


In [14]:
%%sql 
select count(Distinct song_id) as numSongId , count(Distinct artist_id) as numArtistId from dim_song

 * postgresql://dwhuser:***@dwh-cluster.culrjdfmjjzn.ap-southeast-2.redshift.amazonaws.com:5439/dwh
1 rows affected.


numsongid,numartistid
14896,9553
