# Exercise 1: Creating Redshift Cluster using the AWS python SDK 

In [1]:
import pandas as pd
from typing import List
import boto3
import json
import pprint
import configparser

#### Load DWH Params from config file

In [21]:
config = configparser.ConfigParser()
config.read_file(open('dwh.cfg'))

KEY                    = config.get('AWS','KEY')
SECRET                 = config.get('AWS','SECRET')
DWH_CLUSTER_TYPE       = config.get("DWH","DWH_CLUSTER_TYPE")
DWH_REGION_NAME        = config.get("DWH","DWH_REGION_NAME") 
DWH_NUM_NODES          = config.get("DWH","DWH_NUM_NODES")
DWH_NODE_TYPE          = config.get("DWH","DWH_NODE_TYPE")
DWH_CLUSTER_IDENTIFIER = config.get("DWH","DWH_CLUSTER_IDENTIFIER")
DWH_DB                 = config.get("DWH","DWH_DB")
DWH_DB_USER            = config.get("DWH","DWH_DB_USER")
DWH_DB_PASSWORD        = config.get("DWH","DWH_DB_PASSWORD")
DWH_PORT               = config.get("DWH","DWH_PORT")

pd.DataFrame({
    "Param": ["DWH_CLUSTER_TYPE", "DWH_NUM_NODES", "DWH_NODE_TYPE", "DWH_CLUSTER_IDENTIFIER", "DWH_DB", "DWH_DB_USER", "DWH_DB_PASSWORD", "DWH_PORT", "DWH_REGION_NAME"],
    "Value": [DWH_CLUSTER_TYPE, DWH_NUM_NODES, DWH_NODE_TYPE, DWH_CLUSTER_IDENTIFIER, DWH_DB, DWH_DB_USER, DWH_DB_PASSWORD, DWH_PORT, DWH_REGION_NAME]
    }
)

Unnamed: 0,Param,Value
0,DWH_CLUSTER_TYPE,multi-node
1,DWH_NUM_NODES,4
2,DWH_NODE_TYPE,dc2.large
3,DWH_CLUSTER_IDENTIFIER,dend-airflow-cluster-project-z
4,DWH_DB,dev
5,DWH_DB_USER,admin
6,DWH_DB_PASSWORD,Udacitynanodedegree12
7,DWH_PORT,5439
8,DWH_REGION_NAME,us-west-2


#### Create clients for EC2, S3, IAM, and Redshift

In [3]:
ec2 = boto3.resource(
      'ec2', 
      region_name=DWH_REGION_NAME,
      aws_access_key_id=KEY,
      aws_secret_access_key=SECRET
)

s3 = boto3.resource(
      's3', 
      region_name=DWH_REGION_NAME,
      aws_access_key_id=KEY,
      aws_secret_access_key=SECRET
)

iam = boto3.client(
      'iam', 
      region_name=DWH_REGION_NAME,
      aws_access_key_id=KEY,
      aws_secret_access_key=SECRET
)

redshift = boto3.client(
      'redshift', 
      region_name=DWH_REGION_NAME,
      aws_access_key_id=KEY,
      aws_secret_access_key=SECRET
)

### Check out the sample data sources on S3

In [57]:
sample_db_bucket =  s3.Bucket("udacity-dend")

for object in sample_db_bucket.objects.filter(Prefix="song_data"):
    print(object)

s3.ObjectSummary(bucket_name='udacity-dend', key='song_data/')
s3.ObjectSummary(bucket_name='udacity-dend', key='song_data/A/A/A/TRAAAAK128F9318786.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='song_data/A/A/A/TRAAAAV128F421A322.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='song_data/A/A/A/TRAAABD128F429CF47.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='song_data/A/A/A/TRAAACN128F9355673.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='song_data/A/A/A/TRAAAEA128F935A30D.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='song_data/A/A/A/TRAAAED128E0783FAB.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='song_data/A/A/A/TRAAAEM128F93347B9.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='song_data/A/A/A/TRAAAEW128F42930C0.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='song_data/A/A/A/TRAAAFD128F92F423A.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='song_data/A/A/A/TRAAAGR128F425B14B.json')
s3.ObjectSummary(

KeyboardInterrupt: 

### IAM ROLE
- Create an IAM Role that makes Redshift able to access S3 bucket (ReadOnly)

In [4]:
def delete_role(iam, role_name: str, policy_names: List[str]):
    
    # Detach policy from role
    for policy_name in policy_names:
        policy_arn = f"arn:aws:iam::aws:policy/{policy_name}"
        try:            
            detach_policy_response = iam.detach_role_policy(
                RoleName=role_name,
                PolicyArn=policy_arn
            )
            print(f"Detached policy {policy_name}")
        except Exception as e:
            print(e)

    # Delete role
    try:
        delete_role_response = iam.delete_role(
            RoleName=role_name
        )
        print(f"Deleted role {role_name}")
    except Exception as e:
        print(e)

In [22]:
# Setup policy document
assume_role_policy_document = dict(
    Statement=[dict(
        Action='sts:AssumeRole',
        Effect='Allow',
        Principal=dict(Service='redshift.amazonaws.com')
    )],
    Version='2012-10-17'
)

# Set role name & policy
role_name = 'airflow-access'
role_policies = ['AmazonS3FullAccess', 'AmazonRedshiftFullAccess']

# Create new role
try:
    create_role_response = iam.create_role(
        Path='/',
        RoleName=role_name,
        Description='Allows Redshift clusters to call AWS services on your behalf',
        AssumeRolePolicyDocument=json.dumps(assume_role_policy_document)
    )
    
    print(f"Created a new IAM Role named {role_name}")

except Exception as e:
    print(e)

# Attach role policy 
try:
    for role in role_policies:
        attach_role_response = iam.attach_role_policy(
            RoleName=role_name,
            PolicyArn=f'arn:aws:iam::aws:policy/{role}'
        )
        print(f"Attached policy {role} to {role_name}")
except Exception as e:
    print(e)

# Get role  arn
try:
    get_role_response = iam.get_role(
        RoleName=role_name
    )
    print(f"IAM role ARN equals to {get_role_response['Role']['Arn']}")
except Exception as e:
    print(e)

An error occurred (EntityAlreadyExists) when calling the CreateRole operation: Role with name airflow-access already exists.
Attached policy AmazonS3FullAccess to airflow-access
Attached policy AmazonRedshiftFullAccess to airflow-access
IAM role ARN equals to arn:aws:iam::110635350229:role/airflow-access


### Redshift Cluster

In [30]:
try:
    create_cluster_response = redshift.create_cluster(        
        # Define hardware
        ClusterType=DWH_CLUSTER_TYPE,
        NodeType=DWH_NODE_TYPE,
        NumberOfNodes=int(DWH_NUM_NODES),

        # Set identifiers and credentials
        DBName=DWH_DB,
        ClusterIdentifier=DWH_CLUSTER_IDENTIFIER,
        MasterUsername=DWH_DB_USER,
        MasterUserPassword=DWH_DB_PASSWORD,

        # Set role
        IamRoles=[get_role_response['Role']['Arn']]
    )
    print(f"Cluster {DWH_CLUSTER_IDENTIFIER} created")
except Exception as e:
    print(e)

Cluster dend-airflow-cluster-project-z created


### Describe the cluster to see its status

In [33]:
def prettyRedshiftProps(props):
    pd.set_option('display.max_colwidth', None)
    keysToShow = ["ClusterIdentifier", "NodeType", "ClusterStatus", "MasterUsername", "DBName", "Endpoint", "NumberOfNodes", 'VpcId']
    x = [(k, v) for k,v in props.items() if k in keysToShow]
    return pd.DataFrame(data=x, columns=["Key", "Value"])

myClusterProps = redshift.describe_clusters(ClusterIdentifier=DWH_CLUSTER_IDENTIFIER)['Clusters'][0]
prettyRedshiftProps(myClusterProps)

Unnamed: 0,Key,Value
0,ClusterIdentifier,dend-airflow-cluster-project-z
1,NodeType,dc2.large
2,ClusterStatus,available
3,MasterUsername,admin
4,DBName,dev
5,Endpoint,"{'Address': 'dend-airflow-cluster-project-z.ceuboc2dr9on.us-west-2.redshift.amazonaws.com', 'Port': 5439}"
6,VpcId,vpc-0f2ab756b94cf7742
7,NumberOfNodes,4


### Set cluster endpoint and role ARN

In [36]:
DWH_ENDPOINT = myClusterProps['Endpoint']['Address']
DWH_ROLE_ARN = myClusterProps['IamRoles'][0]['IamRoleArn']
print("DWH_ENDPOINT :: ", DWH_ENDPOINT)
print("DWH_ROLE_ARN :: ", DWH_ROLE_ARN)

DWH_ENDPOINT ::  dend-airflow-cluster-project-z.ceuboc2dr9on.us-west-2.redshift.amazonaws.com
DWH_ROLE_ARN ::  arn:aws:iam::110635350229:role/airflow-access


### Open an incoming  TCP port to access the cluster endpoint

In [13]:
try:
    vpc = ec2.Vpc(id=myClusterProps['VpcId'])
    defaultSg = list(vpc.security_groups.all())[0]
    print(defaultSg)
    
    defaultSg.authorize_ingress(
        GroupName=defaultSg.group_name,
        CidrIp='0.0.0.0/0', 
        IpProtocol='TCP',  
        FromPort=int(DWH_PORT),
        ToPort=int(DWH_PORT)
    )
except Exception as e:
    print(e)

ec2.SecurityGroup(id='sg-00b62716edb70c140')


### Make sure we can connect to the clusterConnect to the cluster

In [14]:
%load_ext sql
conn_string=f"postgresql://{DWH_DB_USER}:{DWH_DB_PASSWORD}@{DWH_ENDPOINT}:{DWH_PORT}/{DWH_DB}"
print(conn_string)
%sql $conn_string

postgresql://admin:Udacitynanodedegree12@udacity-nano-de-demo-cluster.ceuboc2dr9on.us-west-2.redshift.amazonaws.com:5439/udacity_nano_de_dwh


### Clean up your resources

In [38]:
# Uncomment & run to delete the created cluster
# redshift.delete_cluster(ClusterIdentifier=DWH_CLUSTER_IDENTIFIER,  SkipFinalClusterSnapshot=True)

In [43]:
myClusterProps = redshift.describe_clusters(ClusterIdentifier=DWH_CLUSTER_IDENTIFIER)['Clusters'][0]
prettyRedshiftProps(myClusterProps)

Unnamed: 0,Key,Value
0,ClusterIdentifier,dend-airflow-cluster-prods
1,NodeType,dc2.large
2,ClusterStatus,deleting
3,MasterUsername,admin
4,DBName,dev
5,Endpoint,"{'Address': 'dend-airflow-cluster-prods.ceuboc2dr9on.us-west-2.redshift.amazonaws.com', 'Port': 5439}"
6,VpcId,vpc-0f2ab756b94cf7742
7,NumberOfNodes,4


In [59]:
# Uncomment & run to delete the created role
delete_role(iam, 'airflow-access', ['AmazonS3FullAccess', 'AmazonRedshiftFullAccess'])

Detached policy AmazonS3FullAccess
Detached policy AmazonRedshiftFullAccess
Deleted role airflow-access
