# Exercise 2: Creating Redshift Cluster using the AWS python SDK 
## An example of Infrastructure-as-code

In [1]:
import os
import pandas as pd
import boto3
import json

# STEP 0: Make sure you have an AWS secret and access key

- Create a new IAM user in your AWS account
- Give it `AdministratorAccess`, From `Attach existing policies directly` Tab
- Take note of the access key and secret 
- Edit the file `dwh.cfg` in the same folder as this notebook and fill
<font color='red'>
<BR>
[AWS]<BR>
KEY= YOUR_AWS_KEY<BR>
SECRET= YOUR_AWS_SECRET<BR>
<font/>


# Load DWH Params from a file

In [2]:
import configparser
config = configparser.ConfigParser()
config.read_file(open('dwh.cfg'))

KEY                    = os.getenv("UDACITY_KEY")
SECRET                 = os.getenv("UDACITY_SECRET")

DWH_CLUSTER_TYPE       = config.get("DWH","DWH_CLUSTER_TYPE")
DWH_NUM_NODES          = config.get("DWH","DWH_NUM_NODES")
DWH_NODE_TYPE          = config.get("DWH","DWH_NODE_TYPE")

DWH_CLUSTER_IDENTIFIER = config.get("DWH","DWH_CLUSTER_IDENTIFIER")
DWH_DB                 = config.get("DWH","DWH_DB")
DWH_DB_USER            = config.get("DWH","DWH_DB_USER")
DWH_DB_PASSWORD        = config.get("DWH","DWH_DB_PASSWORD")
DWH_PORT               = config.get("DWH","DWH_PORT")

DWH_IAM_ROLE_NAME      = config.get("DWH", "DWH_IAM_ROLE_NAME")

(DWH_DB_USER, DWH_DB_PASSWORD, DWH_DB)

pd.DataFrame({"Param":
                  ["DWH_CLUSTER_TYPE", "DWH_NUM_NODES", "DWH_NODE_TYPE", "DWH_CLUSTER_IDENTIFIER", "DWH_DB", "DWH_DB_USER", "DWH_DB_PASSWORD", "DWH_PORT", "DWH_IAM_ROLE_NAME"],
              "Value":
                  [DWH_CLUSTER_TYPE, DWH_NUM_NODES, DWH_NODE_TYPE, DWH_CLUSTER_IDENTIFIER, DWH_DB, DWH_DB_USER, DWH_DB_PASSWORD, DWH_PORT, DWH_IAM_ROLE_NAME]
             })

Unnamed: 0,Param,Value
0,DWH_CLUSTER_TYPE,multi-node
1,DWH_NUM_NODES,4
2,DWH_NODE_TYPE,dc2.large
3,DWH_CLUSTER_IDENTIFIER,dwhCluster
4,DWH_DB,dwh
5,DWH_DB_USER,dwhuser
6,DWH_DB_PASSWORD,Passw0rd
7,DWH_PORT,5439
8,DWH_IAM_ROLE_NAME,dwhRole


## Create clients for EC2, S3, IAM, and Redshift

In [3]:
ec2 = boto3.client(
    "ec2",
    region_name="us-west-2",
    aws_access_key_id=KEY,
    aws_secret_access_key=SECRET
)

s3 = boto3.client(
    "s3",
    region_name="us-west-2",
    aws_access_key_id=KEY,
    aws_secret_access_key=SECRET
)

iam = boto3.client(
    "iam",
    region_name="us-west-2",
    aws_access_key_id=KEY,
    aws_secret_access_key=SECRET
)

redshift = boto3.client(
    "redshift",
    region_name="us-west-2",
    aws_access_key_id=KEY,
    aws_secret_access_key=SECRET
)

## Check out the sample data sources on S3

In [4]:
for obj in s3.list_objects(Bucket="awssampledbuswest2", Prefix="ssbgz/")["Contents"]:
    print(obj["Key"])

ssbgz/
ssbgz/customer0002_part_00.gz
ssbgz/dwdate.tbl.gz
ssbgz/lineorder0000_part_00.gz
ssbgz/lineorder0001_part_00.gz
ssbgz/lineorder0002_part_00.gz
ssbgz/lineorder0003_part_00.gz
ssbgz/lineorder0004_part_00.gz
ssbgz/lineorder0005_part_00.gz
ssbgz/lineorder0006_part_00.gz
ssbgz/lineorder0007_part_00.gz
ssbgz/part0000_part_00.gz
ssbgz/part0001_part_00.gz
ssbgz/part0002_part_00.gz
ssbgz/part0003_part_00.gz
ssbgz/supplier.tbl_0000_part_00.gz
ssbgz/supplier0001_part_00.gz
ssbgz/supplier0002_part_00.gz
ssbgz/supplier0003_part_00.gz


## STEP 1: IAM ROLE
- Create an IAM Role that makes Redshift able to access S3 bucket (ReadOnly)

In [5]:
assume_role_doc = json.dumps({
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Principal": {
                "Service": "redshift.amazonaws.com"
            },
            "Action": "sts:AssumeRole"
        }
    ]
})

dwhRole = iam.create_role(
    RoleName=DWH_IAM_ROLE_NAME,
    Description="Allow redshift to read from S3 Bucket",
    AssumeRolePolicyDocument=assume_role_doc
)

iam.attach_role_policy(
    PolicyArn="arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess",
    RoleName=DWH_IAM_ROLE_NAME
)

# Copied from AWS Console
## https://console.aws.amazon.com/iam/home?region=us-west-1#/policies/arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess$serviceLevelSummary
## arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess


{'ResponseMetadata': {'RequestId': '1b8cab90-1641-4621-b37b-a5c9d5639d86',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '1b8cab90-1641-4621-b37b-a5c9d5639d86',
   'content-type': 'text/xml',
   'content-length': '212',
   'date': 'Sat, 09 May 2020 17:01:23 GMT'},
  'RetryAttempts': 0}}

In [6]:
roleArn = iam.get_role(
    RoleName=DWH_IAM_ROLE_NAME 
)["Role"]["Arn"]

## STEP 2:  Redshift Cluster

- Create a RedShift Cluster
- For complete arguments to `create_cluster`, see [docs](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/redshift.html#Redshift.Client.create_cluster)

In [7]:
response = redshift.create_cluster(
    # add parameters for hardware
    DBName=DWH_DB,
    ClusterIdentifier=DWH_CLUSTER_IDENTIFIER,
    ClusterType=DWH_CLUSTER_TYPE,
    NodeType=DWH_NODE_TYPE,
    NumberOfNodes=int(DWH_NUM_NODES),
    # add parameters for identifiers & credentials
    MasterUsername=DWH_DB_USER,
    MasterUserPassword=DWH_DB_PASSWORD,
    # add parameter for role (to allow s3 access)
    IamRoles=[
      roleArn
    ]
)

In [14]:
DWH_CLUSTER_IDENTIFIER

'dwhCluster'

In [19]:
existing_cluster_list = []
for x in redshift.describe_clusters()["Clusters"]:
    existing_cluster_list.append(x["ClusterIdentifier"])

print(existing_cluster_list)

['dwhcluster']


## 2.1 *Describe* the cluster to see its status
- run this block several times until the cluster status becomes `Available`

In [10]:
metadata_of_interest = [
    "ClusterIdentifier",
    "NodeType",
    "ClusterStatus",
    "MasterUsername",
    "DBName",
    "Endpoint",
    "NumberOfNodes",
    "VpcId"
]

describe_dict = redshift.describe_clusters(
    ClusterIdentifier=DWH_CLUSTER_IDENTIFIER
)["Clusters"][0]

{k: v for (k, v) in describe_dict.items() if k in metadata_of_interest}

{'ClusterIdentifier': 'dwhcluster',
 'NodeType': 'dc2.large',
 'ClusterStatus': 'available',
 'MasterUsername': 'dwhuser',
 'DBName': 'dwh',
 'Endpoint': {'Address': 'dwhcluster.ctcwpuiwdyvj.us-west-2.redshift.amazonaws.com',
  'Port': 5439},
 'VpcId': 'vpc-65761a1d',
 'NumberOfNodes': 4}

In [11]:
print("DWH_ENDPOINT: ", describe_dict["Endpoint"]["Address"])
print("DWH_ROLE_ARN: ", describe_dict["IamRoles"][0]["IamRoleArn"])
print("Role ARN: ", roleArn)

DWH_ENDPOINT = describe_dict["Endpoint"]["Address"]
DWH_ROLE_ARN = describe_dict["IamRoles"][0]["IamRoleArn"]

DWH_ENDPOINT:  dwhcluster.ctcwpuiwdyvj.us-west-2.redshift.amazonaws.com
DWH_ROLE_ARN:  arn:aws:iam::702717750718:role/dwhRole
Role ARN:  arn:aws:iam::702717750718:role/dwhRole


<h2> 2.2 Take note of the cluster <font color='red'> endpoint and role ARN </font> </h2>

<font color='red'>DO NOT RUN THIS unless the cluster status becomes "Available" </font>

## STEP 3: Open an incoming  TCP port to access the cluster ednpoint

In [12]:
defaultSg = ec2.describe_security_groups(
    GroupNames=["default"]
)["SecurityGroups"][0]["GroupId"]


## https://us-west-1.console.aws.amazon.com/ec2/home?region=us-west-1#SecurityGroup:group-id=sg-2d57a950
responseSg = ec2.authorize_security_group_ingress(
    IpProtocol="TCP",
    CidrIp="0.0.0.0/0",
    FromPort=int(DWH_PORT),
    ToPort=int(DWH_PORT),
    GroupId=defaultSg
#     GroupName="defaultSg"
)

ClientError: An error occurred (InvalidPermission.Duplicate) when calling the AuthorizeSecurityGroupIngress operation: the specified rule "peer: 0.0.0.0/0, TCP, from port: 5439, to port: 5439, ALLOW" already exists

## STEP 4: Make sure you can connect to the clusterConnect to the cluster

In [12]:
%load_ext sql

In [13]:
conn_string="postgresql://{}:{}@{}:{}/{}".format(DWH_DB_USER, DWH_DB_PASSWORD, DWH_ENDPOINT, DWH_PORT,DWH_DB)
print(conn_string)
%sql $conn_string

postgresql://dwhuser:Passw0rd@dwhcluster.ctcwpuiwdyvj.us-west-2.redshift.amazonaws.com:5439/dwh


In [68]:
%%sql 

SELECT *
FROM information_schema.tables
WHERE table_schema = 'public'

 * postgresql://dwhuser:***@dwhcluster.ctcwpuiwdyvj.us-west-2.redshift.amazonaws.com:5439/dwh
7 rows affected.


table_catalog,table_schema,table_name,table_type,self_referencing_column_name,reference_generation,user_defined_type_catalog,user_defined_type_schema,user_defined_name
dwh,public,staging_events,BASE TABLE,,,,,
dwh,public,staging_songs,BASE TABLE,,,,,
dwh,public,songplays,BASE TABLE,,,,,
dwh,public,users,BASE TABLE,,,,,
dwh,public,songs,BASE TABLE,,,,,
dwh,public,artists,BASE TABLE,,,,,
dwh,public,timestamps,BASE TABLE,,,,,


## STEP 5: Clean up your resources

<b><font color='red'>DO NOT RUN THIS UNLESS YOU ARE SURE <br/> 
    We will be using these resources in the next exercises</span></b>

In [85]:
#### CAREFUL!!
#-- Uncomment & run to delete the created resources
redshift.delete_cluster( ClusterIdentifier=DWH_CLUSTER_IDENTIFIER,  SkipFinalClusterSnapshot=True)
#### CAREFUL!!

{'Cluster': {'AllowVersionUpgrade': True,
  'AutomatedSnapshotRetentionPeriod': 1,
  'AvailabilityZone': 'us-west-2b',
  'ClusterCreateTime': datetime.datetime(2019, 2, 16, 6, 21, 30, 630000, tzinfo=tzutc()),
  'ClusterIdentifier': 'dwhcluster',
  'ClusterParameterGroups': [{'ParameterApplyStatus': 'in-sync',
    'ParameterGroupName': 'default.redshift-1.0'}],
  'ClusterSecurityGroups': [],
  'ClusterStatus': 'deleting',
  'ClusterSubnetGroupName': 'default',
  'ClusterVersion': '1.0',
  'DBName': 'dwh',
  'Encrypted': False,
  'Endpoint': {'Address': 'dwhcluster.csmamz5zxmle.us-west-2.redshift.amazonaws.com',
   'Port': 5439},
  'EnhancedVpcRouting': False,
  'IamRoles': [{'ApplyStatus': 'in-sync',
    'IamRoleArn': 'arn:aws:iam::988332130976:role/dwhRole'}],
  'MasterUsername': 'dwhuser',
  'NodeType': 'dc2.large',
  'NumberOfNodes': 4,
  'PendingModifiedValues': {},
  'PreferredMaintenanceWindow': 'fri:10:30-fri:11:00',
  'PubliclyAccessible': True,
  'Tags': [],
  'VpcId': 'vpc-54d

- run this block several times until the cluster really deleted

In [59]:
{k: v for (k, v) in describe_dict.items() if k in metadata_of_interest}

{'ClusterIdentifier': 'dwhcluster',
 'NodeType': 'dc2.large',
 'ClusterStatus': 'available',
 'MasterUsername': 'dwhuser',
 'DBName': 'dwh',
 'Endpoint': {'Address': 'dwhcluster.chytho4frjsg.us-west-1.redshift.amazonaws.com',
  'Port': 5439},
 'VpcId': 'vpc-efc5f788',
 'NumberOfNodes': 4}

In [87]:
#### CAREFUL!!
#-- Uncomment & run to delete the created resources
iam.detach_role_policy(RoleName=DWH_IAM_ROLE_NAME, PolicyArn="arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess")
iam.delete_role(RoleName=DWH_IAM_ROLE_NAME)
#### CAREFUL!!

{'ResponseMetadata': {'HTTPHeaders': {'content-length': '200',
   'content-type': 'text/xml',
   'date': 'Sat, 16 Feb 2019 07:13:50 GMT',
   'x-amzn-requestid': '694f8d91-31ba-11e9-9438-d3ce9c613ef8'},
  'HTTPStatusCode': 200,
  'RequestId': '694f8d91-31ba-11e9-9438-d3ce9c613ef8',
  'RetryAttempts': 0}}