In [3]:
import pandas as pd
import boto3
import json
import configparser
from time import sleep

In [47]:


@unique
class TableType(Enum):
    FACT = 'fact'
    DIM = 'dim'
    STAGE = 'staging'


@unique
class Table(Enum):
    BUSINESS = 'business'
    CITY = 'city'
    REVIEW = 'review'
    TIP = 'tip'
    USERS = 'users'
    STOCK = 'stock'

    def get_table_name(self, table_type: TableType):
        return f"{self.name}_{table_type.value}"

    def get_partitions(self):
        return {
            self.USERS: {'YEAR': 2004, 'MONTH': 10, 'DAY': 12},
            self.REVIEW: {'YEAR': 2005, 'MONTH': 3, 'DAY': 3},
            self.TIP: {'YEAR': 2009, 'MONTH': 12, 'DAY': 15}
        }.get(self)

    def get_s3_path(self):
        if self == self.BUSINESS:
            return "s3://yelp-customer-reviews/processed/business/"
        elif self == self.STOCK:
            return "s3://yelp-customer-reviews/stock-data/cmg.us.txt"
        else:
            path = f"s3://yelp-customer-reviews/data-lake/{self.value}".replace('users', 'user')
            path = path + "/pyear={YEAR}/pmonth={MONTH}/pday={DAY}"
            return path.format(**self.get_partitions())

In [48]:
Table.BUSINESS.get_table_name(TableType.STAGE)

'BUSINESS_staging'

In [49]:
Table.USERS.get_s3_path()

's3://yelp-customer-reviews/data-lake/user/pyear=2004/pmonth=10/pday=12'

In [29]:
from enum import Enum, unique
@unique
class SqlQueries(Enum):

    setup_foreign_keys = ("""
    ALTER TABLE "tip_fact" ADD FOREIGN KEY ("business_id") REFERENCES "business_fact" ("business_id");
    ALTER TABLE "tip_fact" ADD FOREIGN KEY ("user_id") REFERENCES "users_fact" ("user_id");
    ALTER TABLE "business_fact" ADD FOREIGN KEY ("city_id") REFERENCES "city_fact" ("city_id");
    ALTER TABLE "review_dim" ADD FOREIGN KEY ("business_id") REFERENCES "business_fact" ("business_id");
    ALTER TABLE "review_dim" ADD FOREIGN KEY ("user_id") REFERENCES "users_fact" ("user_id");
    ALTER TABLE "review_fact" ADD FOREIGN KEY ("review_id") REFERENCES "review_dim" ("review_id");
    ALTER TABLE "stock_fact" ADD FOREIGN KEY ("business_name") REFERENCES "business_fact" ("name");
    """)

    business_fact_create = ("""
        CREATE TABLE IF NOT EXISTS "business_fact" (
        "business_id" varchar PRIMARY KEY,
        "name" varchar,
        "categories" varchar,
        "review_count" bigint,
        "stars" count,
        "city_id" varchar,
        "address" varchar,
        "postal_code" varchar
        );
        DISTSTYLE EVEN;
    """)

    city_fact_create = ("""
        CREATE TABLE IF NOT EXISTS "city_fact" (
        "city_id" varchar PRIMARY KEY,
        "state" varchar,
        "city" varchar
        );

    """)

    users_fact_create = ("""
        CREATE TABLE IF NOT EXISTS "users_fact" (
        "user_id" varchar PRIMARY KEY,
        "yelping_since" timestamp,
        "name" varchar,
        "average_stars" int,
        "review_count" bigint
        );
        DISTSTYLE EVEN;
    """)

    review_dim_create = ("""
        CREATE TABLE IF NOT EXISTS "review_dim" (
        "review_id" varchar PRIMARY KEY,
        "review_date" timestamp,
        "business_id" varchar,
        "user_id" varchar
        );
        DISTSTYLE EVEN;
    """)

    review_fact_create = ("""
        CREATE TABLE IF NOT EXISTS "review_fact" (
        "review_id" varchar PRIMARY KEY,
        "stars" int,
        "text" varchar
        );
        DISTSTYLE EVEN;
    """)

    stock_fact_create = ("""
        CREATE TABLE IF NOT EXISTS "stock_fact" (
        "stock_id" varchar PRIMARY KEY,
        "business_name" varchar,
        "date" timestamp,
        "close_value" float
        );
    """)

    tip_fact_create = ("""
        CREATE TABLE IF NOT EXISTS "tip_fact" (
        "tip_id" varchar PRIMARY KEY,
        "business_id" varchar,
        "user_id" varchar,
        "text" varchar,
        "tip_date" timestamp,
        "compliment_count" bigint
        );
        DISTSTYLE EVEN;
    """)

    review_stage_create = ("""
        CREATE TABLE IF NOT EXISTS "review_staging" (
        "business_id" varchar
        "cool" bigint,
        "funny" bigint,
        "review_id" varchar,
        "stars" double,
        "text" varchar,
        "useful" bigint,
        "user_id" string,
        "dt" varchar
        );
    """)
    business_stage_create = ("""
        CREATE TABLE IF NOT EXISTS "business_staging" (
        "business_id" varchar,
        "categories" varchar,
        "state" varchar,
        "city" varchar,
        "address" varchar,
        "postal_code" string,
        "review_count" bigint,
        "stars" double
        );
    """)
    tip_stage_create = ("""
        CREATE TABLE IF NOT EXISTS "tip_staging" (
        "business_id" varchar,
        "compliment_count" bigint,
        "text" varchar,
        "user_id" varchar,
        "dt" varchar
        );
    """)
    users_stage_create = ("""
        CREATE TABLE IF NOT EXISTS "users_staging" (
        "average_stars" varchar 
        "compliment_cool" bigint,
        "compliment_cute" bigint,
        "compliment_funny" bigint,
        "compliment_hot" bigint,
        "compliment_list" bigint,
        "compliment_more" bigint,
        "compliment_note" bigint,
        "compliment_photos" bigint,
        "compliment_plain" bigint,
        "compliment_profile" bigint,
        "compliment_writer" bigint,
        "cool" bigint,
        "elite" varchar,
        "fans" bigint,
        "friends" varchar,
        "funny" bigint,
        "name" varchar,
        "review_count" bigint,
        "useful" bigint,
        "user_id" varchar,
        "yelping_since" varchar
        );
    """)

    stock_stage_create = ("""
        CREATE TABLE IF NOT EXISTS "stock_staging" (
        "Date" varchar,
        "Open" double,
        "High" double,
        "Low" double,
        "Close" double,
        "Volume" bigint,
        "OpenInt" bigint
        );
    """)

    users_fact_insert = ("""
        INSERT INTO users_fact (
            user_id,
            yelping_since,
            name,
            average_stars,
            review_count
            )
        SELECT distinct 
            user_id, 
            CAST(yelping_since as timestamp) AS yelping_since,
            name, 
            average_stars, 
            review_count
        FROM users_staging
    """)

    business_fact_insert = ("""
        INSERT INTO business_fact (
            business_id,
            name,
            categories,
            review_count,
            stars,
            city_id,
            address,
            postal_code
            )
        SELECT distinct 
            business_id,
            name,
            categories,
            review_count,
            stars,
            b.city_id,
            address,
            postal_code
        FROM business_staging a
        LEFT JOIN city_fact b ON a.city = b.city AND a.state = b.state
    """)

    city_fact_insert = ("""
        INSERT INTO city_fact (
            city_id,
            state,
            city
            )
        SELECT distinct
            md5(state || city) city_id,
            state,
            city
        FROM business_staging
    """)

    review_dim_insert = ("""
        INSERT INTO review_dim (
            review_id,
            review_date,
            business_id,
            user_id
            )
        SELECT distinct
            review_id,
            CAST(dt as timestamp) AS review_date,
            business_id,
            user_id
        FROM review_staging
    """)

    review_fact_insert = ("""
        INSERT INTO review_fact (
            review_id,
            stars,
            text
            )
        SELECT distinct
            review_id,
            stars,
            text
        FROM review_staging
    """)

    tip_fact_insert = ("""
        INSERT INTO tip_fact (
            tip_id,
            business_id,
            user_id,
            text,
            tip_date,
            compliment_count
            )
        SELECT distinct
            md5(business_id || user_id || tip_date)  tip_id,
            business_id,
            user_id,
            text,
            CAST(dt as timestamp) AS tip_date,
            compliment_count
        FROM tip_staging
    """)

    stock_fact_insert = ("""
        INSERT INTO stock_fact (
            stock_id,
            business_name,
            date,
            close_value
            )
        SELECT distinct
            md5('cmg' || date ) stock_id,
            'chipotle' AS business_name,
            Date,
            Close
        FROM stock_staging
    """)


In [38]:
setup_database_dict = {
    query.name: query.value for query in SqlQueries if ('create' in query.name)
    
}
setup_database_dict[SqlQueries.setup_foreign_keys.name]= SqlQueries.setup_foreign_keys.value
setup_database_dict.keys()

dict_keys(['business_fact_create', 'city_fact_create', 'users_fact_create', 'review_dim_create', 'review_fact_create', 'stock_fact_create', 'tip_fact_create', 'review_stage_create', 'business_stage_create', 'tip_stage_create', 'users_stage_create', 'stock_stage_create', 'setup_foreign_keys'])

In [2]:
!pip install pandas

Collecting pandas
  Downloading pandas-1.0.5-cp38-cp38-manylinux1_x86_64.whl (10.0 MB)
[K     |████████████████████████████████| 10.0 MB 1.7 MB/s eta 0:00:01    |███████████████████▎            | 6.0 MB 1.7 MB/s eta 0:00:03
Collecting numpy>=1.13.3
  Downloading numpy-1.19.0-cp38-cp38-manylinux2010_x86_64.whl (14.6 MB)
[K     |████████████████████████████████| 14.6 MB 5.6 MB/s eta 0:00:01     |██████████████████████▏         | 10.1 MB 5.5 MB/s eta 0:00:01
[?25hCollecting pytz>=2017.2
  Downloading pytz-2020.1-py2.py3-none-any.whl (510 kB)
[K     |████████████████████████████████| 510 kB 8.0 MB/s eta 0:00:01
Installing collected packages: numpy, pytz, pandas
Successfully installed numpy-1.19.0 pandas-1.0.5 pytz-2020.1


## Load Config Parameters
The file `dwh.cfg` contains all parameers necessary to proceed with the Cluster creation.
In addition to it, the AWS IAM Roles parameters are also defined in the file

In [16]:
config = configparser.ConfigParser()
config.read_file(open('dwh.cfg'))

KEY                    = config.get('AWS','KEY')
SECRET                 = config.get('AWS','SECRET')

DWH_CLUSTER_TYPE       = config.get("DWH","DWH_CLUSTER_TYPE")
DWH_NUM_NODES          = config.get("DWH","DWH_NUM_NODES")
DWH_NODE_TYPE          = config.get("DWH","DWH_NODE_TYPE")

DWH_CLUSTER_IDENTIFIER = config.get("DWH","DWH_CLUSTER_IDENTIFIER")
DWH_DB                 = config.get("DWH","DWH_DB")
DWH_DB_USER            = config.get("DWH","DWH_DB_USER")
DWH_DB_PASSWORD        = config.get("DWH","DWH_DB_PASSWORD")
DWH_PORT               = config.get("DWH","DWH_PORT")

DWH_IAM_ROLE_NAME      = config.get("DWH", "DWH_IAM_ROLE_NAME")

(DWH_DB_USER, DWH_DB_PASSWORD, DWH_DB)

pd.DataFrame({"Param":
                  ["DWH_CLUSTER_TYPE", "DWH_NUM_NODES", "DWH_NODE_TYPE", "DWH_CLUSTER_IDENTIFIER", "DWH_DB", "DWH_DB_USER", "DWH_DB_PASSWORD", "DWH_PORT", "DWH_IAM_ROLE_NAME"],
              "Value":
                  [DWH_CLUSTER_TYPE, DWH_NUM_NODES, DWH_NODE_TYPE, DWH_CLUSTER_IDENTIFIER, DWH_DB, DWH_DB_USER, DWH_DB_PASSWORD, DWH_PORT, DWH_IAM_ROLE_NAME]
             })

Unnamed: 0,Param,Value
0,DWH_CLUSTER_TYPE,multi-node
1,DWH_NUM_NODES,4
2,DWH_NODE_TYPE,dc2.large
3,DWH_CLUSTER_IDENTIFIER,dwhCluster
4,DWH_DB,dwh
5,DWH_DB_USER,dwhuser
6,DWH_DB_PASSWORD,Passw0rd
7,DWH_PORT,5439
8,DWH_IAM_ROLE_NAME,dwhRole


## Create Clients

In [18]:
ec2 = boto3.resource('ec2',
                       region_name="us-west-2",
                       aws_access_key_id=KEY,
                       aws_secret_access_key=SECRET
                    )

s3 = boto3.resource('s3',
                       region_name="us-west-2",
                       aws_access_key_id=KEY,
                       aws_secret_access_key=SECRET
                   )

iam = boto3.client('iam',aws_access_key_id=KEY,
                     aws_secret_access_key=SECRET,
                     region_name='us-west-2'
                  )

redshift = boto3.client('redshift',
                       region_name="us-west-2",
                       aws_access_key_id=KEY,
                       aws_secret_access_key=SECRET
                       ) 

In [19]:
from botocore.exceptions import ClientError
try:
    print("Creating a new IAM Role") 
    dwhRole = iam.create_role(
        Path='/',
        RoleName=DWH_IAM_ROLE_NAME,
        Description = "Allows Redshift clusters to call AWS services on your behalf.",
        AssumeRolePolicyDocument=json.dumps(
            {'Statement': [{'Action': 'sts:AssumeRole',
               'Effect': 'Allow',
               'Principal': {'Service': 'redshift.amazonaws.com'}}],
             'Version': '2012-10-17'})
    )    
except Exception as e:
    print(e)    
print("Attaching Policy")

iam.attach_role_policy(RoleName=DWH_IAM_ROLE_NAME,
                       PolicyArn="arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess"
                      )['ResponseMetadata']['HTTPStatusCode']

print("Get the IAM role ARN")
roleArn = iam.get_role(RoleName=DWH_IAM_ROLE_NAME)['Role']['Arn']
print(roleArn)

Creating a new IAM Role
An error occurred (EntityAlreadyExists) when calling the CreateRole operation: Role with name dwhRole already exists.
Attaching Policy
Get the IAM role ARN
arn:aws:iam::500349149336:role/dwhRole


## Creating a Redshift Cluster

In [20]:
try:
    response = redshift.create_cluster(        
        #HW
        ClusterType=DWH_CLUSTER_TYPE,
        NodeType=DWH_NODE_TYPE,
        NumberOfNodes=int(DWH_NUM_NODES),

        #Identifiers & Credentials
        DBName=DWH_DB,
        ClusterIdentifier=DWH_CLUSTER_IDENTIFIER,
        MasterUsername=DWH_DB_USER,
        MasterUserPassword=DWH_DB_PASSWORD,
        
        #Roles (for s3 access)
        IamRoles=[roleArn]  
    )
except Exception as e:
    print(e)
    
def prettyRedshiftProps(props):
    pd.set_option('display.max_colwidth', -1)
    keysToShow = ["ClusterIdentifier", "NodeType", "ClusterStatus", "MasterUsername", "DBName", "Endpoint", "NumberOfNodes", 'VpcId']
    x = [(k, v) for k,v in props.items() if k in keysToShow]
    return pd.DataFrame(data=x, columns=["Key", "Value"])

cluster_status = 'Undefined'
while cluster_status != 'available':
    sleep(30)
    myClusterProps = redshift.describe_clusters(ClusterIdentifier=DWH_CLUSTER_IDENTIFIER)['Clusters'][0]
    cluster_status = myClusterProps['ClusterStatus']
    

prettyRedshiftProps(myClusterProps)

DWH_ENDPOINT = myClusterProps['Endpoint']['Address']
DWH_ROLE_ARN = myClusterProps['IamRoles'][0]['IamRoleArn']
print("DWH_ENDPOINT :: ", DWH_ENDPOINT)
print("DWH_ROLE_ARN :: ", DWH_ROLE_ARN)

try:
    vpc = ec2.Vpc(id=myClusterProps['VpcId'])
    defaultSg = list(vpc.security_groups.all())[0]
    print(defaultSg)
    defaultSg.authorize_ingress(
        GroupName=defaultSg.group_name,
        CidrIp='0.0.0.0/0',
        IpProtocol='TCP',
        FromPort=int(DWH_PORT),
        ToPort=int(DWH_PORT)
    )
except Exception as e:
    print(e)

  pd.set_option('display.max_colwidth', -1)


DWH_ENDPOINT ::  dwhcluster.cbynmx43gg9v.us-west-2.redshift.amazonaws.com
DWH_ROLE_ARN ::  arn:aws:iam::500349149336:role/dwhRole
ec2.SecurityGroup(id='sg-00282197ebcc2d058')
An error occurred (UnauthorizedOperation) when calling the AuthorizeSecurityGroupIngress operation: You are not authorized to perform this operation. Encoded authorization failure message: d_ZbjBHYqrZgpH_OHyF6TrFGwyb4o_F6gpcoXb3_xsWA-Gl-_eWE4d2L8bv-apsOQHRqTReMoLcqbjrWG9q8jw5NWLF9X0CXhG7i5Tb6WCdJhIQiyoK60ZxF6nbB5s9XEulsOz75q24Q762ex6HaIrL0I2WPLjDkhOcnUJLo1P6vnCbV7o1eRul0LvIXiWHO8FgkWdMMCKV6cbyemf-0wAs-2ImgjMf6QJUeuWKdBbxj5ajXIcCd3DRsYSTEWEfIt0wrM9rcP-vl7xVbFz4D_6QGAYHeeIA7tlLbvK8GZMWT0qGQKjyvELgd0F7e2pjr3mNgOH94jQCMe3J3_ATZhqtCAV2J5pwrIels7zZujJSn6VJbJe5Hvp-W1jG3CyIi3x78E_gwGdVxWrni7yvXkrvJKC6aUEeHyyXszcrEPUP77RE1MpEPPchN1L8xG4lxYyRmxJLXxy8wBifQ2YplMTqxx-XpclYfQHLwzj_GVa3mNKjDjEF1E-q2kZFHzykw0oP9DSZErOR66_9MiHYb6U6JK6qfk8lhd8Gj_FoPV6fvnjbBDxZUbvl_B8OnsCdL-qSFoPlDKMbOiEOHHDv4sNY


In [21]:
print("host={} dbname={} user={} password={} port={}".format(DWH_ENDPOINT, DWH_DB, DWH_DB_USER, DWH_DB_PASSWORD, DWH_PORT))

host=dwhcluster.cbynmx43gg9v.us-west-2.redshift.amazonaws.com dbname=dwh user=dwhuser password=Passw0rd port=5439


## Delete Cluster

In [None]:
#### CAREFUL!!
#-- Uncomment & run to delete the created resources
redshift.delete_cluster( ClusterIdentifier=DWH_CLUSTER_IDENTIFIER,  SkipFinalClusterSnapshot=True)
#### CAREFUL!!