In [1]:
import pandas as pd
import boto3
import json

In [2]:
import configparser
config = configparser.ConfigParser()
config.read_file(open('dwh.cfg'))

### 1. Create  new role / attach policies, create Redshift cluster and add HOST and ARN arguments to config file

In [3]:
try:
    # IAM user KEY/SECRET pair with administrator access (added from AWS's GUI)
    KEY = config.get('AWS','KEY')
    SECRET = config.get('AWS','SECRET')
    
    if KEY == '':
        print('Have you added your KEY credentials to the config file?')
    if SECRET == '':
        print('Have you added your SECRET credentials to the config file?')
        
except Exception as e:
    print(e)
    

Have you added your KEY credentials to the config file?
Have you added your SECRET credentials to the config file?


In [3]:
# IAM role name
DWH_IAM_ROLE_NAME = config.get('IAM_ROLE','DWH_IAM_ROLE_NAME')

# DWH Hardware
DWH_CLUSTER_TYPE = config.get('DWH_HW','DWH_CLUSTER_TYPE')
DWH_NUM_NODES = config.get('DWH_HW','DWH_NUM_NODES')
DWH_NODE_TYPE = config.get('DWH_HW','DWH_NODE_TYPE')

# DWH Identifiers and Credentials
DWH_CLUSTER_IDENTIFIER = config.get('CLUSTER','CLUSTER_IDENTIFIER')
DWH_DB = config.get('CLUSTER','DB_NAME')
DWH_DB_USER = config.get('CLUSTER','DB_USER')
DWH_DB_PASSWORD = config.get('CLUSTER','DB_PASSWORD')
DWH_PORT = config.get('CLUSTER','DB_PORT')

In [4]:
redshift = boto3.client('redshift',
                       region_name="us-west-2",
                       aws_access_key_id=KEY,
                       aws_secret_access_key=SECRET
                       )

iam = boto3.client('iam',aws_access_key_id=KEY,
                     aws_secret_access_key=SECRET,
                     region_name='us-west-2'
                  )

In [5]:
# Add new role name and policy to read S3
try:
    # Create role
    dwhRole = iam.create_role(
            Path='/',
            RoleName = DWH_IAM_ROLE_NAME,
            Description = "Allows Redshift clusters to call AWS services on your behalf.",
            AssumeRolePolicyDocument=json.dumps(
                {'Statement': [{'Action': 'sts:AssumeRole',
                   'Effect': 'Allow',
                   'Principal': {'Service': 'redshift.amazonaws.com'}}],
                   'Version': '2012-10-17'}),
    )

except Exception as e:
    print(e)

An error occurred (EntityAlreadyExists) when calling the CreateRole operation: Role with name dwhRole already exists.


In [6]:
# Attach policy to read S3
iam.attach_role_policy(RoleName=DWH_IAM_ROLE_NAME,
                      PolicyArn="arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess"
                      )['ResponseMetadata']['HTTPStatusCode']

# Attach policy for Redshift access
iam.attach_role_policy(RoleName=DWH_IAM_ROLE_NAME,
                      PolicyArn="arn:aws:iam::aws:policy/AmazonRedshiftFullAccess"
                      )['ResponseMetadata']['HTTPStatusCode']

# Attach policy for Redshift read only access (required to acces the query editor apparently)
iam.attach_role_policy(RoleName=DWH_IAM_ROLE_NAME,
                      PolicyArn="arn:aws:iam::aws:policy/AmazonRedshiftReadOnlyAccess"
                      )['ResponseMetadata']['HTTPStatusCode']

# Attach policy for Redshift query editor
iam.attach_role_policy(RoleName=DWH_IAM_ROLE_NAME,
                      PolicyArn="arn:aws:iam::aws:policy/AmazonRedshiftQueryEditor"
                      )['ResponseMetadata']['HTTPStatusCode']

# Attach administrator access policy
iam.attach_role_policy(RoleName=DWH_IAM_ROLE_NAME,
                      PolicyArn="arn:aws:iam::aws:policy/AdministratorAccess"
                      )['ResponseMetadata']['HTTPStatusCode']


# Show role arn
roleArn = iam.get_role(RoleName=DWH_IAM_ROLE_NAME)['Role']['Arn']
print(roleArn)

arn:aws:iam::429665464532:role/dwhRole


In [7]:
# Create redshift cluster
try:
    response = redshift.create_cluster(        
        # Hardware
        ClusterType=DWH_CLUSTER_TYPE,
        NodeType=DWH_NODE_TYPE,
        NumberOfNodes=int(DWH_NUM_NODES),

        # Identifiers & Credentials
        DBName=DWH_DB,
        ClusterIdentifier=DWH_CLUSTER_IDENTIFIER,
        MasterUsername=DWH_DB_USER,
        MasterUserPassword=DWH_DB_PASSWORD,
        
        # Roles (for S3 access)
        IamRoles=[roleArn]  
    )
except Exception as e:
    print(e)

In [8]:
status = ''
while status != 'available':
    try:
        desc_cluster = redshift.describe_clusters(
            ClusterIdentifier=DWH_CLUSTER_IDENTIFIER
        )['Clusters'][0]
        
        status = desc_cluster['ClusterStatus']

    except Exception as e:
        print(e)
        break
        
        
HOST = desc_cluster['Endpoint']['Address']
ARN = desc_cluster['IamRoles'][0]['IamRoleArn']

# Add these two in the config file
print(HOST)
print(ARN)

#####################################################################################################################
# Write to config file
# config.set('CLUSTER','HOST',HOST)
# config.set('IAM_ROLE','ARN',ARN)

# with open('dwh.cfg', 'w') as conf:
#     config.write(conf)

#####################################################################################################################
# def prettyRSProps(props):
#     keysToShow = ["ClusterIdentifier", "NodeType", "ClusterStatus",
#                   "MasterUsername", "DBName", "Endpoint", "NumberOfNodes", 'VpcId']

#     x = [(k, v) for k,v in props.items() if k in keysToShow]
    
#     return pd.DataFrame(data=x, columns=["Key", "Value"])

dwhcluster.ctxd38v4otyi.us-west-2.redshift.amazonaws.com
arn:aws:iam::429665464532:role/dwhRole


In [9]:
ec2 = boto3.resource('ec2',
                       region_name="us-west-2",
                       aws_access_key_id=KEY,
                       aws_secret_access_key=SECRET
                    )

# Security settings

try:
    vpc = ec2.Vpc(id=desc_cluster['VpcId'])
    defaultSg = list(vpc.security_groups.all())[0]
    
    print(defaultSg)
    
    defaultSg.authorize_ingress(
        GroupName=defaultSg.group_name,
        CidrIp='0.0.0.0/0',
        IpProtocol='TCP',
        FromPort=int(DWH_PORT),
        ToPort=int(DWH_PORT)
    )
except Exception as e:
    print(e)

ec2.SecurityGroup(id='sg-929b68ae')
An error occurred (InvalidPermission.Duplicate) when calling the AuthorizeSecurityGroupIngress operation: the specified rule "peer: 0.0.0.0/0, TCP, from port: 5439, to port: 5439, ALLOW" already exists


### 2. Connect to DB to perform sample queries
##### Run this bit after running create_tables.py and etl.py

In [16]:
%load_ext sql

In [17]:
conn_string="postgresql://{}:{}@{}:{}/{}".format(DWH_DB_USER, DWH_DB_PASSWORD, HOST, DWH_PORT, DWH_DB)
print(conn_string)

postgresql://dwhuser:aws_Password2@dwhcluster.ctxd38v4otyi.us-west-2.redshift.amazonaws.com:5439/dwh


In [18]:
%sql $conn_string

'Connected: dwhuser@dwh'

In [19]:
%sql SELECT * FROM songplays WHERE artist_id IS NOT NULL AND song_id IS NOT NULL LIMIT 5;

 * postgresql://dwhuser:***@dwhcluster.ctxd38v4otyi.us-west-2.redshift.amazonaws.com:5439/dwh
5 rows affected.


songplay_id,start_time,user_id,level,song_id,artist_id,session_id,location,user_agent
1482,2018-11-11 20:08:52,80,paid,SOBNYLJ12AB0189934,AREIK8S1187B9A6071,435,"Portland-South Portland, ME","""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36"""
2442,2018-11-27 18:52:58,28,free,SOEKSGJ12A67AE227E,ARQUMH41187B9AF699,270,"Portland-Vancouver-Hillsboro, OR-WA",Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0
977,2018-11-15 12:28:39,30,paid,SOIOESO12A6D4F621D,ARVLXWP1187FB5B94A,324,"San Jose-Sunnyvale-Santa Clara, CA",Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) Gecko/20100101 Firefox/31.0
2001,2018-11-10 09:29:42,44,paid,SOWLLXC12AB0180FFE,AR66PLO1187FB4C8E5,350,"Waterloo-Cedar Falls, IA",Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:31.0) Gecko/20100101 Firefox/31.0
2385,2018-11-22 13:21:20,101,free,SOUHTWB12A8C13BA4D,AR96LYR1187B9ABABD,790,"New Orleans-Metairie, LA","""Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36"""


In [20]:
%sql SELECT * FROM users LIMIT 5;

 * postgresql://dwhuser:***@dwhcluster.ctxd38v4otyi.us-west-2.redshift.amazonaws.com:5439/dwh
5 rows affected.


user_id,first_name,last_name,gender,level
62,Connar,Moreno,M,free
37,Jordan,Hicks,F,free
81,Sienna,Colon,F,free
92,Ryann,Smith,F,free
23,Morris,Gilmore,M,free


In [21]:
%sql SELECT * FROM time LIMIT 5;

 * postgresql://dwhuser:***@dwhcluster.ctxd38v4otyi.us-west-2.redshift.amazonaws.com:5439/dwh
5 rows affected.


start_time,hour,day,week,month,weekday,year
2018-11-03 01:12:26,1,3,44,11,6,2018
2018-11-03 15:59:38,15,3,44,11,6,2018
2018-11-03 16:34:03,16,3,44,11,6,2018
2018-11-03 16:54:02,16,3,44,11,6,2018
2018-11-03 17:10:11,17,3,44,11,6,2018


In [22]:
# Checks for errors
# %sql SELECT * FROM stl_load_errors;

### 3. Kill cluster and detach policies from role

In [27]:
redshift.delete_cluster( ClusterIdentifier=DWH_CLUSTER_IDENTIFIER,  SkipFinalClusterSnapshot=True)

ClusterNotFoundFault: An error occurred (ClusterNotFound) when calling the DeleteCluster operation: Cluster dwhcluster not found.

In [26]:
status = 'deleting'
while status == 'deleting':
    try:
        desc_cluster = redshift.describe_clusters(
            ClusterIdentifier=DWH_CLUSTER_IDENTIFIER
        )['Clusters'][0]
        
        status = desc_cluster['ClusterStatus']

    except Exception as e:
        status='deleted'
        print('Cluster ' + status)
        print(e)
        

Cluster deleted
An error occurred (ClusterNotFound) when calling the DescribeClusters operation: Cluster dwhcluster not found.


In [34]:
iam.detach_role_policy(RoleName=DWH_IAM_ROLE_NAME, PolicyArn="arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess")

DeleteConflictException: An error occurred (DeleteConflict) when calling the DeleteRole operation: Cannot delete entity, must detach all policies first.

In [None]:
iam.detach_role_policy(RoleName=DWH_IAM_ROLE_NAME, PolicyArn="arn:aws:iam::aws:policy/AmazonRedshiftFullAccess")

In [None]:
iam.detach_role_policy(RoleName=DWH_IAM_ROLE_NAME, PolicyArn="arn:aws:iam::aws:policy/AmazonRedshiftQueryEditor")

In [None]:
iam.detach_role_policy(RoleName=DWH_IAM_ROLE_NAME, PolicyArn="arn:aws:iam::aws:policy/AdministratorAccess")

In [None]:
iam.delete_role(RoleName=DWH_IAM_ROLE_NAME)