# Project: Data Warehouse
## Setting up AWS resources

In [1]:
import pandas as pd
import boto3
import json

### Loading DWH Params from a file

In [2]:
import configparser
config = configparser.ConfigParser()
config.read_file(open('dwh.cfg'))

config = configparser.ConfigParser()
config.read_file(open('dwh.cfg'))

# AWS
KEY = config.get('AWS', 'KEY')
SECRET = config.get('AWS', 'SECRET')

# IAM
IAM_ROLE_NAME = config.get('IAM_ROLE', 'ROLE_NAME')

# DWS
DWH_CLUSTER_TYPE = config.get('DWH', 'DWH_CLUSTER_TYPE')
DWH_NUM_NODES = config.get('DWH', 'DWH_NUM_NODES')
DWH_NODE_TYPE = config.get('DWH', 'DWH_NODE_TYPE')

DWH_CLUSTER_IDENTIFIER = config.get('DWH', 'DWH_CLUSTER_IDENTIFIER')
DWH_DB = config.get('DWH', 'DWH_DB')
DWH_DB_USER = config.get('DWH', 'DWH_DB_USER')
DWH_DB_PASSWORD = config.get('DWH', 'DWH_DB_PASSWORD')
DWH_PORT = config.get('DWH', 'DWH_PORT')

DWH_IAM_ROLE_NAME      = config.get("IAM_ROLE", "ROLE_NAME")

(DWH_DB_USER, DWH_DB_PASSWORD, DWH_DB)

pd.DataFrame({"Param":
                  ["DWH_CLUSTER_TYPE", "DWH_NUM_NODES", "DWH_NODE_TYPE", "DWH_CLUSTER_IDENTIFIER", "DWH_DB", "DWH_DB_USER", "DWH_DB_PASSWORD", "DWH_PORT", "DWH_IAM_ROLE_NAME"],
              "Value":
                  [DWH_CLUSTER_TYPE, DWH_NUM_NODES, DWH_NODE_TYPE, DWH_CLUSTER_IDENTIFIER, DWH_DB, DWH_DB_USER, DWH_DB_PASSWORD, DWH_PORT, DWH_IAM_ROLE_NAME]
             })

Unnamed: 0,Param,Value
0,DWH_CLUSTER_TYPE,single-node
1,DWH_NUM_NODES,1
2,DWH_NODE_TYPE,dc2.large
3,DWH_CLUSTER_IDENTIFIER,dwh-cfg
4,DWH_DB,dwh_db
5,DWH_DB_USER,dwh_db_akemi
6,DWH_DB_PASSWORD,Dwhakemi1
7,DWH_PORT,5439
8,DWH_IAM_ROLE_NAME,DWH_IAM_ROLE


### Creating clients and resources for IAM, EC2, S3, and Redshift


In [3]:
ec2 = boto3.resource('ec2', region_name='us-east-1', aws_access_key_id='AKIAVKINSJA5VKLJY3NB', aws_secret_access_key='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')

s3 = boto3.resource('s3', region_name='us-east-1', aws_access_key_id='AKIAVKINSJA5VKLJY3NB', aws_secret_access_key='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')

iam = boto3.client('iam', aws_access_key_id='AKIAVKINSJA5VKLJY3NB', aws_secret_access_key='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', region_name='us-east-1')

redshift = boto3.client('redshift', region_name='us-east-1', aws_access_key_id='AKIAVKINSJA5VKLJY3NB', aws_secret_access_key='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')

#### IAM ROLE
- Create an IAM Role that makes Redshift able to access S3 bucket (ReadOnly)

In [4]:
try:
    print('1.1 Creating a new IAM Role')
    dwhRole = iam.create_role(Path='/', 
                              RoleName='DWH_IAM_ROLE', 
                              Description = 'Allows Redshift clusters to call AWS services in your behalf', 
                              AssumeRolePolicyDocument=json.dumps({'Statement': [{'Action':'sts:AssumeRole', 
                                                                                  'Effect': 'Allow', 
                                                                                  'Principal': {'Service':'redshift.amazonaws.com'}}], 
                                                                   'Version': '2012-10-17'}))
    
except Exception as e:
    print(e)

1.1 Creating a new IAM Role
An error occurred (EntityAlreadyExists) when calling the CreateRole operation: Role with name DWH_IAM_ROLE already exists.


In [5]:
#Attaching policy
print('1.2 Attaching Policy')
iam.attach_role_policy(RoleName='DWH_IAM_ROLE', 
                       PolicyArn='arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess'
                      )['ResponseMetadata']['HTTPStatusCode']

1.2 Attaching Policy


200

In [6]:
print('1.3 Get the IAM role ARN')
roleArn = iam.get_role(RoleName='DWH_IAM_ROLE')['Role']['Arn']

print(roleArn)

1.3 Get the IAM role ARN
arn:aws:iam::365637552187:role/DWH_IAM_ROLE


#### Redshift Cluster


In [7]:
try:
    response = redshift.create_cluster(        
        ClusterType='single-node',
        NodeType='dc2.large',
        NumberOfNodes=int(1),

        DBName='dwh_db',
        ClusterIdentifier='dwh-cfg',
        MasterUsername='dwh_db_akemi',
        MasterUserPassword='Dwhakemi1',
        
        IamRoles=['arn:aws:iam::365637552187:role/DWH_IAM_ROLE'] 
    )
except Exception as e:
    print(e)

##### Checking cluster status

In [12]:
def prettyRedshiftProps(props):
    pd.set_option('display.max_colwidth', None)
    keysToShow = ["ClusterIdentifier", "NodeType", "ClusterStatus", "MasterUsername", "DBName", "Endpoint", "NumberOfNodes", 'VpcId']
    x = [(k, v) for k,v in props.items() if k in keysToShow]
    return pd.DataFrame(data=x, columns=["Key", "Value"])

myClusterProps = redshift.describe_clusters(ClusterIdentifier='dwh-cfg')['Clusters'][0]
prettyRedshiftProps(myClusterProps)

Unnamed: 0,Key,Value
0,ClusterIdentifier,dwh-cfg
1,NodeType,dc2.large
2,ClusterStatus,available
3,MasterUsername,dwh_db_akemi
4,DBName,dwh_db
5,Endpoint,"{'Address': 'dwh-cfg.cfwcyzyylbus.us-east-1.redshift.amazonaws.com', 'Port': 5439}"
6,VpcId,vpc-08f78e4e705eb0bf4
7,NumberOfNodes,1


##### Cluster endpoint and ARN role

In [13]:
DWH_ENDPOINT = myClusterProps['Endpoint']['Address']
DWH_ROLE_ARN = myClusterProps['IamRoles'][0]['IamRoleArn']
print("DWH_ENDPOINT :: ", DWH_ENDPOINT)
print("DWH_ROLE_ARN :: ", DWH_ROLE_ARN)

DWH_ENDPOINT ::  dwh-cfg.cfwcyzyylbus.us-east-1.redshift.amazonaws.com
DWH_ROLE_ARN ::  arn:aws:iam::365637552187:role/DWH_IAM_ROLE


##### Opening an incoming TCP port to access the cluster endpoint

In [14]:
try:
    vpc = ec2.Vpc(id=myClusterProps['VpcId'])
    defaultSg = list(vpc.security_groups.all())[0]
    print(defaultSg)
    
    defaultSg.authorize_ingress(
        GroupName= 'default',  # TODO: fill out
        CidrIp='0.0.0.0/0',  # TODO: fill out
        IpProtocol='TCP',  # TODO: fill out
        FromPort=int(DWH_PORT),
        ToPort=int(DWH_PORT)
    )
except Exception as e:
    print(e)

ec2.SecurityGroup(id='sg-08c29c967d8ef4d22')
An error occurred (InvalidPermission.Duplicate) when calling the AuthorizeSecurityGroupIngress operation: the specified rule "peer: 0.0.0.0/0, TCP, from port: 5439, to port: 5439, ALLOW" already exists


##### Connecting the clusterConnect to the cluster

In [15]:
%load_ext sql

In [16]:
import sqlalchemy
print(sqlalchemy.__version__)  # deve ser >= 2.0


1.4.49


In [18]:
import redshift_connector
conn = redshift_connector.connect(
     host='dwh-cfg.cfwcyzyylbus.us-east-1.redshift.amazonaws.com',
     database='dwh_db',
     port=5439,
     user='dwh_db_akemi',
     password='Dwhakemi1'
  )
print("Conexão estabelecida com sucesso")

Conexão estabelecida com sucesso


## Cleaninup resources

In [2]:
# run to delete the created resources
redshift.delete_cluster( ClusterIdentifier='dwh-cfg',  SkipFinalClusterSnapshot=True)


NameError: name 'redshift' is not defined

In [8]:
myClusterProps = redshift.describe_clusters(ClusterIdentifier='dwh-cfg')['Clusters'][0]
prettyRedshiftProps(myClusterProps)

ClusterNotFoundFault: An error occurred (ClusterNotFound) when calling the DescribeClusters operation: Cluster my-redshift-cluster-1 not found.

In [7]:
# run to delete the created resources
iam.detach_role_policy(RoleName='DWH_IAM_ROLE', PolicyArn="arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess")
iam.delete_role(RoleName='DWH_IAM_ROLE')


{'ResponseMetadata': {'RequestId': 'aa40b092-ac93-4b10-ad7b-f20bc56e138b',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'date': 'Thu, 01 May 2025 16:46:29 GMT',
   'x-amzn-requestid': 'aa40b092-ac93-4b10-ad7b-f20bc56e138b',
   'content-type': 'text/xml',
   'content-length': '200'},
  'RetryAttempts': 0}}