In [None]:
#sql_queries.py

"""
This file contains all the SQL statements and the query lists used by "create_tables.py" and "etl.py".
"""

import configparser

# CONFIG
# Parsed once at import time from "dwh.cfg" (looked up relative to the current
# working directory). The COPY statements further down are formatted from it.
config = configparser.ConfigParser()
config.read(filenames='dwh.cfg')

# DROP TABLES
# One idempotent DROP per table. Each name must match the table name used in
# the corresponding CREATE statement, otherwise the old table survives a rebuild.

staging_events_table_drop = "DROP TABLE IF EXISTS staging_events"
staging_songs_table_drop = "DROP TABLE IF EXISTS staging_songs"
songplay_table_drop = "DROP TABLE IF EXISTS songplays"
user_table_drop = "DROP TABLE IF EXISTS users"
song_table_drop = "DROP TABLE IF EXISTS songs"
# The table is created as "artists" (plural); dropping "artist" left it in place.
artist_table_drop = "DROP TABLE IF EXISTS artists"
time_table_drop = "DROP TABLE IF EXISTS time"

# CREATE TABLES__________________________________________________________________________________________________________________________________________________________________________________________________

# STAGING table creation________________________________________

# Staging table that receives the raw song records loaded by staging_songs_copy.
# Every column is VARCHAR; the warehouse INSERT statements cast year, duration
# and the artist coordinates to numeric types. num_songs is the only NOT NULL column.
staging_songs_table_create = ("""
CREATE TABLE staging_songs 
(
  num_songs     VARCHAR NOT NULL,
  artist_id       VARCHAR,
  artist_latitude        VARCHAR,
  artist_longitude   VARCHAR,
  artist_location      VARCHAR,
  artist_name       VARCHAR,
  song_id        VARCHAR,
  title        VARCHAR, 
  duration    VARCHAR,
  year       VARCHAR  
);
""")

# Staging table that receives the raw event-log records loaded by staging_events_copy.
# Every column is VARCHAR with no constraints; the warehouse INSERT statements
# cast ts, userId and sessionid to the types they need.
staging_events_table_create= ("""
CREATE TABLE staging_events 
(
  artist     VARCHAR,
  auth       VARCHAR,
  firstName        VARCHAR,
  gender   VARCHAR,
  iteminSession      VARCHAR,
  lastName       VARCHAR,
  length        VARCHAR,
  level        VARCHAR, 
  location    VARCHAR,
  method    VARCHAR,
  page    VARCHAR,
  registration    VARCHAR,
  sessionid    VARCHAR,
  song    VARCHAR,
  status    VARCHAR,
  ts    VARCHAR,
  userAgent    VARCHAR,
  userId       VARCHAR  
);
""")

#Warehouse tables creation ______________________________________

# Fact table: one row per song play.
# songplay_id is an auto-generated identity key; session_id is the sort key.
# start_time is stored as a timestamp (not text) so that it has the same type
# as time.start_time and the value produced by the songplays INSERT.
songplay_table_create = ("""

CREATE TABLE IF NOT EXISTS songplays (
songplay_id int identity(0,1) PRIMARY KEY, 
start_time timestamp, 
user_id int NOT NULL, 
song_id varchar, 
artist_id varchar, 
session_id int sortkey, 
location varchar, 
user_agent varchar,
level varchar
);
""")

# Dimension table of users. user_id is NOT NULL and is the sort key;
# no primary key is declared.
user_table_create = ("""

CREATE TABLE IF NOT EXISTS users (
user_id int not null sortkey, 
first_name varchar, 
last_name varchar, 
gender varchar, 
level varchar 
);
""")

# Dimension table of songs. song_id is the sort key; song_id, artist_id and
# duration are NOT NULL, so staging rows missing any of them will fail the insert.
song_table_create = ("""

CREATE TABLE IF NOT EXISTS songs (
song_id varchar not null sortkey, 
title varchar, 
artist_id varchar NOT NULL, 
year int, 
duration numeric NOT NULL 
);
""")

# Dimension table of artists. artist_id is the sort key.
# The column defaults apply only when an INSERT omits the column entirely.
# NOTE: the column is spelled "lattitude"; artist_table_insert uses the same
# spelling, so the two must be renamed together if this is ever corrected.
artist_table_create = ("""

CREATE TABLE IF NOT EXISTS artists (
artist_id varchar not null sortkey, 
name varchar, 
location varchar DEFAULT 'None', 
lattitude float DEFAULT 0, 
longitude float DEFAULT 0
);
""")

# Dimension table of timestamps broken down into calendar parts.
# time_id is an auto-generated identity key; start_time is the sort key.
time_table_create = ("""

CREATE TABLE IF NOT EXISTS time (
time_id int identity(0,1) PRIMARY KEY , 
start_time TIMESTAMP not null sortkey, 
hour int,
day int, 
week int, 
month int, 
year int, 
weekday int 
);
""")


# STAGING TABLES population _____________________________________________________________________________________________________________________________________________________________________________________

# Copy Commands to populate staging tables from S3

# COPY statement for staging_events, formatted once at import time.
# Placeholders, in order: S3.LOG_DATA (source), IAM_ROLE.ARN (credentials),
# S3.LOG_JSONPATH (argument to FORMAT AS JSON).
# NOTE(review): the source and JSON placeholders are not quoted by the template,
# so the dwh.cfg values presumably carry their own single quotes, while the ARN
# must be unquoted because the template already wraps it - confirm against dwh.cfg.
staging_events_copy = ("""
COPY staging_events FROM {}
    credentials 'aws_iam_role={}'
    REGION 'us-west-2'
    FORMAT AS JSON {};
""").format(config['S3']['LOG_DATA'], config['IAM_ROLE']['ARN'], config['S3']['LOG_JSONPATH'])

# COPY statement for staging_songs, formatted once at import time.
# Placeholders, in order: S3.SONG_DATA (source), IAM_ROLE.ARN (credentials).
# Uses JSON 'auto' rather than an explicit JSONPaths argument, and COMPUPDATE OFF.
# NOTE(review): the source placeholder is not quoted by the template, so the
# dwh.cfg value presumably carries its own single quotes - confirm.
staging_songs_copy = ("""
    COPY staging_songs FROM {}
    credentials 'aws_iam_role={}'
    JSON 'auto'
    REGION 'us-west-2'
    COMPUPDATE OFF;
""").format(config['S3']['SONG_DATA'], config['IAM_ROLE']['ARN'])

 
# WAREHOUSE TABLES population ___________________________________________________________________________________________________________________________________________________________________________________

# Populates songplays from the two staging tables.
# The trailing backslashes are Python line continuations inside the string
# literal, so the statement is sent to the database as a single line.
# The inner join keeps only events whose artist and song text exactly match
# artist_name and title in staging_songs.
# NOTE(review): the timestamp expression treats ts as a count of seconds since
# the epoch; if the log stores milliseconds it needs a division by 1000 - confirm.
songplay_table_insert = ("""
INSERT INTO songplays (start_time, user_id, level, song_id, artist_id, session_id, location, user_agent) \
SELECT \
TIMESTAMP WITH TIME ZONE 'epoch' + CAST(staging_events.ts AS NUMERIC) * INTERVAL '1 Second ', \
CAST(staging_events.userId AS INT), \
staging_events.level, \
staging_songs.song_id, \
staging_songs.artist_id, \
CAST(staging_events.sessionid AS INT), \
staging_events.location, \
staging_events.userAgent \
FROM staging_events join staging_songs on staging_events.artist = staging_songs.artist_name and staging_events.song = staging_songs.title 

""")

# Populates users with the distinct (userId, firstName, lastName, gender, level)
# combinations found in staging_events, casting userId to INT.
# NOTE(review): the filter compares userId with a single space; presumably it is
# meant to skip events that carry no user id - confirm it also excludes ''.
user_table_insert = ("""
INSERT INTO users (user_id , first_name, last_name, gender, level) SELECT DISTINCT CAST(userId AS INT), firstName, lastName, gender, level FROM staging_events where userId != ' '; 
""")

# Populates songs from every row of staging_songs (no DISTINCT, no filter),
# casting year to INT and duration to NUMERIC.
song_table_insert = ("""
INSERT INTO songs (song_id, title, artist_id, year, duration) SELECT song_id, title, artist_id, CAST(year AS INT), CAST(duration AS NUMERIC) FROM staging_songs;
""")

# Populates artists with the distinct artist rows of staging_songs, casting the
# coordinates to FLOAT. The target column is spelled "lattitude" to match the
# artists table definition.
artist_table_insert = ("""
INSERT INTO artists (artist_id, name, location, lattitude, longitude) SELECT DISTINCT artist_id, artist_name, artist_location , CAST(artist_latitude AS FLOAT), CAST(artist_longitude AS FLOAT) FROM staging_songs;
""")

# Populates the time dimension from the ts column of staging_events.
# The trailing backslashes are Python line continuations inside the string
# literal, so the statement is sent to the database as a single line.
# SELECT DISTINCT keeps one row per timestamp: several events can share the same
# ts, and duplicate dimension rows would multiply results when joined on start_time.
# NOTE(review): the timestamp expression treats ts as a count of seconds since
# the epoch; if the log stores milliseconds it needs a division by 1000 - confirm.
time_table_insert = ("""
INSERT INTO time (start_time, hour, day, week, month, year, weekday) \
SELECT DISTINCT \
TIMESTAMP WITH TIME ZONE 'epoch' + CAST(ts AS NUMERIC) * INTERVAL '1 Second ' AS c_Time, \
extract(hr from (TIMESTAMP WITH TIME ZONE 'epoch' + CAST(ts AS NUMERIC) * INTERVAL '1 Second ')) AS c_Hour, \
extract(day from (TIMESTAMP WITH TIME ZONE 'epoch' + CAST(ts AS NUMERIC)  * INTERVAL '1 Second ')) AS c_Day, \
extract(w from (TIMESTAMP WITH TIME ZONE 'epoch' + CAST(ts AS NUMERIC)  * INTERVAL '1 Second ')) AS c_Week, \
extract(mon from (TIMESTAMP WITH TIME ZONE 'epoch' + CAST(ts AS NUMERIC) * INTERVAL '1 Second '))AS c_Month, \
extract(y from (TIMESTAMP WITH TIME ZONE 'epoch' + CAST(ts AS NUMERIC) * INTERVAL '1 Second ')) AS c_Year, \
extract(weekday from (TIMESTAMP WITH TIME ZONE 'epoch' + CAST(ts AS NUMERIC) * INTERVAL '1 Second ')) AS c_Weekday \
FROM staging_events
""")

# QUERY LISTS_____________________________________________________________________________________________________________________________________________________________________________________________________  

# Statements are executed in list order by the scripts that import these lists.

# DROP statements for every staging and warehouse table.
drop_table_queries = [
    staging_events_table_drop,
    staging_songs_table_drop,
    songplay_table_drop,
    user_table_drop,
    song_table_drop,
    artist_table_drop,
    time_table_drop,
]

# CREATE statements: staging tables first, then the warehouse tables.
create_table_queries = [
    staging_songs_table_create,
    staging_events_table_create,
    songplay_table_create,
    user_table_create,
    song_table_create,
    artist_table_create,
    time_table_create,
]

# COPY statements that fill the staging tables.
copy_table_queries = [
    staging_songs_copy,
    staging_events_copy,
]

# INSERT ... SELECT statements that fill the warehouse tables from staging.
insert_table_queries = [
    songplay_table_insert,
    artist_table_insert,
    user_table_insert,
    song_table_insert,
    time_table_insert,
]



In [None]:
#etl.py
import configparser
import psycopg2
from sql_queries import copy_table_queries, insert_table_queries 


def load_staging_tables(cur, conn):
    """
    Run every statement in ``copy_table_queries`` to fill the staging tables.

    Each statement is executed on ``cur`` and committed on ``conn`` before the
    next one starts. A progress line is printed before and after the batch.
    """
    print('Executing Staging Tables load...')
    for copy_statement in copy_table_queries:
        cur.execute(copy_statement)
        conn.commit()
    print('Loading into Staging Tables Completed...')

 
def insert_tables(cur, conn):
    """
    Load data from the staging tables into the warehouse tables.

    Runs every statement in ``insert_table_queries`` on ``cur``, committing on
    ``conn`` after each one and printing the statement text once it succeeded.
    An exception raised by a statement propagates to the caller; statements
    already committed stay committed.
    """
    print('Populating Warehouse tables...')
    for query in insert_table_queries:
        cur.execute(query)
        conn.commit()
        print('{} ..Success!'.format(query))
    # '\n' (not '/n') so the summary is preceded by a blank line.
    print('\n Warehouse tables populated!')

def main():
    """
    Control function: load data from the sources into the targets.

    Reads ``dwh.cfg``, opens a database connection from its ``[CLUSTER]``
    section, fills the staging tables and then the warehouse tables. The
    connection is closed even when one of the load steps raises.
    """
    config = configparser.ConfigParser()
    config.read('dwh.cfg')

    # The values are passed positionally, so the [CLUSTER] section must list
    # host, dbname, user, password and port in exactly that order.
    conn = psycopg2.connect("host={} dbname={} user={} password={} port={}".format(*config['CLUSTER'].values()))
    try:
        cur = conn.cursor()

        load_staging_tables(cur, conn)
        insert_tables(cur, conn)
    finally:
        # Release the connection on failure as well as on success.
        conn.close()


if __name__ == "__main__":
    main()

In [None]:
#create_tables.py
"""

This script (create_tables.py) is called by the "datawarehouse.ipynb" notebook during the execution of the project.
The primary purpose is to create all the table objects used in the project.
The queries run are all defined in the "sql_queries.py" file

"""

import configparser
import psycopg2
from sql_queries import drop_table_queries, create_table_queries 



# Start-up banner. It sits outside the __main__ guard, so it is printed on
# import as well as when the file is run as a script.
print('"create_tables" running....\n')

def drop_tables(cur, conn):
    """
    Execute every statement in ``drop_table_queries``.

    Each statement is run on ``cur`` and committed on ``conn``, and a
    confirmation line containing the statement text is printed afterwards.
    """
    report = '{} ran succesfully...'.format
    for statement in drop_table_queries:
        cur.execute(statement)
        conn.commit()
        print(report(statement))


def create_tables(cur, conn):
    """
    Execute every statement in ``create_table_queries``.

    Each statement is run on ``cur`` and committed on ``conn``, and a
    confirmation line containing the statement text, followed by a blank
    line, is printed afterwards.
    """
    for ddl in create_table_queries:
        cur.execute(ddl)
        conn.commit()
        confirmation = '{} ran succesfully...\n'.format(ddl)
        print(confirmation)


def main():
    """
    Control the creation of all the tables used for the project.

    Reads ``dwh.cfg``, opens a database connection from its ``[CLUSTER]``
    section, drops the existing tables and creates them again. The connection
    is closed even when one of the statements raises.
    """
    config = configparser.ConfigParser()
    config.read('dwh.cfg')

    # The values are passed positionally, so the [CLUSTER] section must list
    # host, dbname, user, password and port in exactly that order.
    conn = psycopg2.connect("host={} dbname={} user={} password={} port={}".format(*config['CLUSTER'].values()))
    try:
        cur = conn.cursor()

        drop_tables(cur, conn)
        create_tables(cur, conn)
    finally:
        # Release the connection on failure as well as on success.
        conn.close()


if __name__ == "__main__":
    main()