Using Python to Perform Extract-Transform-Load (ETL Processing)
    Import Necessary Libraries:

In [1]:
import os
import numpy
import pandas as pd
from sqlalchemy import create_engine

#### Declare & Assign Connection Variables for the MySQL Server & Databases with which You'll be Working 

In [2]:
host_name = "localhost"
host_ip = "127.0.0.1"
port = "3306"
user_id = "burrelllizzie"
pwd = "178Chandler"

src_dbname = "sakila"
dst_dbname = "sakila_dw"

#### Define Functions for Getting Data From and Setting Data Into Databases

In [3]:
def get_dataframe(user_id, pwd, host_name, db_name, sql_query):
    conn_str = f"mysql+pymysql://{user_id}:{pwd}@{host_name}/{db_name}"
    sqlEngine = create_engine(conn_str, pool_recycle=3600)
    connection = sqlEngine.connect()
    dframe = pd.read_sql(sql_query, connection);
    connection.close()
    
    return dframe


def set_dataframe(user_id, pwd, host_name, db_name, df, table_name, pk_column, db_operation):
    conn_str = f"mysql+pymysql://{user_id}:{pwd}@{host_name}/{db_name}"
    sqlEngine = create_engine(conn_str, pool_recycle=3600)
    connection = sqlEngine.connect()
    
    if db_operation == "insert":
        df.to_sql(table_name, con=connection, index=False, if_exists='replace')
        sqlEngine.execute(f"ALTER TABLE {table_name} ADD PRIMARY KEY ({pk_column});")
            
    elif db_operation == "update":
        df.to_sql(table_name, con=connection, index=False, if_exists='append')
    
    connection.close()

Create the New Data Warehouse database, and to Use it, Switch the Connection Context.

In [4]:
conn_str = f"mysql+pymysql://{user_id}:{pwd}@{host_name}"
sqlEngine = create_engine(conn_str, pool_recycle=3600)

sqlEngine.execute(f"DROP DATABASE IF EXISTS `{dst_dbname}`;")
sqlEngine.execute(f"CREATE DATABASE `{dst_dbname}`;")
sqlEngine.execute(f"USE {dst_dbname};")

<sqlalchemy.engine.result.ResultProxy at 0x18c2e3a3cd0>

Create and Populate the Dimension Tables

In [5]:
sql_customer = "SELECT * FROM sakila.customer;"
df_customer = get_dataframe(user_id, pwd, host_name, src_dbname, sql_customer)
df_customer.head(2)

Unnamed: 0,customer_id,store_id,first_name,last_name,email,address_id,active,create_date,last_update
0,1,1,MARY,SMITH,MARY.SMITH@sakilacustomer.org,5,1,2006-02-14 22:04:36,2006-02-15 04:57:20
1,2,1,PATRICIA,JOHNSON,PATRICIA.JOHNSON@sakilacustomer.org,6,1,2006-02-14 22:04:36,2006-02-15 04:57:20


In [6]:
sql_staff = "SELECT * FROM sakila.staff;"
df_staff = get_dataframe(user_id, pwd, host_name, src_dbname, sql_staff)
df_staff.head(2)

Unnamed: 0,staff_id,first_name,last_name,address_id,picture,email,store_id,active,username,password,last_update
0,1,Mike,Hillyer,3,b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\...,Mike.Hillyer@sakilastaff.com,1,1,Mike,8cb2237d0679ca88db6464eac60da96345513964,2006-02-15 03:57:16
1,2,Jon,Stephens,4,,Jon.Stephens@sakilastaff.com,2,1,Jon,,2006-02-15 03:57:16


In [7]:
sql_film = "SELECT * FROM sakila.film;"
df_film = get_dataframe(user_id, pwd, host_name, src_dbname, sql_film)
df_film.head(2)

Unnamed: 0,film_id,title,description,release_year,language_id,original_language_id,rental_duration,rental_rate,length,replacement_cost,rating,special_features,last_update
0,1,ACADEMY DINOSAUR,A Epic Drama of a Feminist And a Mad Scientist...,2006,1,,6,0.99,86,20.99,PG,"Deleted Scenes,Behind the Scenes",2006-02-15 05:03:42
1,2,ACE GOLDFINGER,A Astounding Epistle of a Database Administrat...,2006,1,,3,4.99,48,12.99,G,"Trailers,Deleted Scenes",2006-02-15 05:03:42


In [8]:
sql_store = "SELECT * FROM sakila.store;"
df_store = get_dataframe(user_id, pwd, host_name, src_dbname, sql_store)
df_store.head(2)

Unnamed: 0,store_id,manager_staff_id,address_id,last_update
0,1,1,1,2006-02-15 04:57:12
1,2,2,2,2006-02-15 04:57:12


Perform any necessary transformations

In [9]:
drop_cols = ['email','active','create_date','last_update']
df_customer.drop(drop_cols, axis=1, inplace=True)
df_customer.rename(columns={"customer_id":"customer_key"}, inplace=True)

df_customer.head(2)

Unnamed: 0,customer_key,store_id,first_name,last_name,address_id
0,1,1,MARY,SMITH,5
1,2,1,PATRICIA,JOHNSON,6


In [10]:
df_store.rename(columns={"store_id":"store_key"}, inplace=True)

df_store.head(2)

Unnamed: 0,store_key,manager_staff_id,address_id,last_update
0,1,1,1,2006-02-15 04:57:12
1,2,2,2,2006-02-15 04:57:12


In [11]:
drop_cols = ['picture','email','username','password']
df_staff.drop(drop_cols, axis=1, inplace=True)
df_staff.rename(columns={"staff_id":"staff_key"}, inplace=True)

df_staff.head(2)

Unnamed: 0,staff_key,first_name,last_name,address_id,store_id,active,last_update
0,1,Mike,Hillyer,3,1,1,2006-02-15 03:57:16
1,2,Jon,Stephens,4,2,1,2006-02-15 03:57:16


In [12]:
drop_cols = ['release_year','description','special_features', 'language_id', 'original_language_id']
df_film.drop(drop_cols, axis=1, inplace=True)
df_film.rename(columns={"film_id":"film_key"}, inplace=True)

df_film.head(2)

Unnamed: 0,film_key,title,rental_duration,rental_rate,length,replacement_cost,rating,last_update
0,1,ACADEMY DINOSAUR,6,0.99,86,20.99,PG,2006-02-15 05:03:42
1,2,ACE GOLDFINGER,3,4.99,48,12.99,G,2006-02-15 05:03:42


Loading dataframes into new data warehouse:

In [13]:
db_operation = "insert"

tables = [('dim_customer', df_customer, 'customer_key'),
          ('dim_staff', df_staff, 'staff_key'),
          ('dim_film', df_film, 'film_key'),
          ('dim_store', df_store, 'store_key')]

In [14]:
for table_name, dataframe, primary_key in tables:
    set_dataframe(user_id, pwd, host_name, dst_dbname, dataframe, table_name, primary_key, db_operation)

Creating and Populating the Fact Table:
    Using a SQL select statement

In [15]:
sql_fact_rentals = """
   SELECT r.rental_id,
    r.rental_date,
	r.inventory_id,
    r.customer_id,
    r.return_date,
    r.staff_id,
    r.last_update, 

    p.amount,
    p.payment_date

FROM sakila.rental as r
INNER JOIN sakila.payment as p
ON r.rental_id = p.rental_id;

"""

df_fact_rentals = get_dataframe(user_id, pwd, host_name, src_dbname, sql_fact_rentals)
df_fact_rentals.head(2)

Unnamed: 0,rental_id,rental_date,inventory_id,customer_id,return_date,staff_id,last_update,amount,payment_date
0,1,2005-05-24 22:53:30,367,130,2005-05-26 22:04:30,1,2006-02-15 21:30:53,2.99,2005-05-24 22:53:30
1,2,2005-05-24 22:54:33,1525,459,2005-05-28 19:40:33,1,2006-02-15 21:30:53,2.99,2005-05-24 22:54:33


Creating a fact table using the Pandas Dataframe:
    Getting data from both tables involved

In [17]:
sql_rentals = "SELECT * FROM sakila.rental;"
df_rentals = get_dataframe(user_id, pwd, host_name, src_dbname, sql_rentals)
df_rentals.head(2)

Unnamed: 0,rental_id,rental_date,inventory_id,customer_id,return_date,staff_id,last_update
0,1,2005-05-24 22:53:30,367,130,2005-05-26 22:04:30,1,2006-02-15 21:30:53
1,2,2005-05-24 22:54:33,1525,459,2005-05-28 19:40:33,1,2006-02-15 21:30:53


In [18]:
sql_payment = "SELECT * FROM sakila.payment;"
df_payment = get_dataframe(user_id, pwd, host_name, src_dbname, sql_payment)
df_payment.head(2)

Unnamed: 0,payment_id,customer_id,staff_id,rental_id,amount,payment_date,last_update
0,1,1,1,76,2.99,2005-05-25 11:30:37,2006-02-15 22:12:30
1,2,1,1,573,0.99,2005-05-28 10:35:23,2006-02-15 22:12:30


Combining the rentals and payment tables

In [21]:
df_rental = pd.merge(df_rentals, df_payment, on='rental_id', how='inner')
df_rental.drop(['last_update'], axis=1, inplace=True)
df_rental.head(2)

Unnamed: 0,rental_id,rental_date,inventory_id,customer_id_x,return_date,staff_id_x,last_update_x,payment_id_x,customer_id_y,staff_id_y,amount_x,payment_date_x,last_update_y,payment_id_y,customer_id,staff_id,amount_y,payment_date_y
0,1,2005-05-24 22:53:30,367,130,2005-05-26 22:04:30,1,2006-02-15 21:30:53,3504,130,1,2.99,2005-05-24 22:53:30,2006-02-15 22:13:16,3504,130,1,2.99,2005-05-24 22:53:30
1,2,2005-05-24 22:54:33,1525,459,2005-05-28 19:40:33,1,2006-02-15 21:30:53,12377,459,2,2.99,2005-05-24 22:54:33,2006-02-15 22:19:16,12377,459,2,2.99,2005-05-24 22:54:33


##### 2.2.5. Perform any Additional Transformations
In this step we can prepare the DataFrame so that it defines exactly what we want to see created in the database.  Issues may include dropping unwanted columns, reordering the columns, and in our case, creating a new column to serve as the primary key.

In [22]:
drop_columns = ['last_update','inventory_id']
df_fact_rentals.drop(drop_columns, axis=1, inplace=True)

# Reorder the columns

ordered_columns = ['rental_id','rental_date','r.inventory_id','r.customer_id','r.return_date',
                    'r.staff_id','r.last_update', 'p.amount','p.payment_date']

# Insert a new column, with an ever-incrementing numeric value, to serve as the primary key.
df_fact_rentals.insert(0, "rental_key", range(1, df_fact_rentals.shape[0]+1))
df_fact_rentals.head(2)

Unnamed: 0,rental_key,rental_id,rental_date,customer_id,return_date,staff_id,amount,payment_date
0,1,1,2005-05-24 22:53:30,130,2005-05-26 22:04:30,1,2.99,2005-05-24 22:53:30
1,2,2,2005-05-24 22:54:33,459,2005-05-28 19:40:33,1,2.99,2005-05-24 22:54:33


##### 2.2.6. Write the DataFrame Back to the Database

In [25]:
table_name = "fact_rentals"
primary_key = "rental_key"
db_operation = "insert"

set_dataframe(user_id, pwd, host_name, dst_dbname, df_fact_rentals, table_name, primary_key, db_operation)

Demonstrate that the New Data Warehouse Exists and Contains the Correct Data
    Want the statement to return: Customer’s Last Name and Total amount of payment associated with each customer

In [45]:
sql_test = """
    SELECT customers.`last_name` AS `customer_name`,
         SUM(rentals.`amount`) AS `total_rental_payment`
        FROM `sakila_dw`.`fact_rentals` AS rentals
        INNER JOIN `sakila_dw`.`dim_customer` AS customers
        ON rentals.customer_id = customers.customer_key
        GROUP BY customers.`last_name`
        ORDER BY total_rental_payment DESC;
    """
df_test = get_dataframe(user_id, pwd, host_name, src_dbname, sql_test)

In [46]:
df_test.head()

Unnamed: 0,customer_name,total_rental_payment
0,SEAL,221.55
1,HUNT,216.54
2,SHAW,195.58
3,KENNEDY,194.61
4,SNYDER,194.61


Want this statement to return the amount of payment for rentals facilitated by each staff member

In [49]:
sql_test = """
    SELECT staff.`last_name` AS `staff_name`,
        SUM(rentals.`amount`) AS `total_rental_payment`
        FROM `sakila_dw`.`fact_rentals` AS rentals
        INNER JOIN `sakila_dw`.`dim_staff` AS staff
        ON rentals.staff_id = staff.staff_key
        GROUP BY staff.`last_name`
        ORDER BY total_rental_payment DESC;
    """

df_test = get_dataframe(user_id, pwd, host_name, src_dbname, sql_test)

In [50]:
df_test.head()

Unnamed: 0,staff_name,total_rental_payment
0,Stephens,33881.94
1,Hillyer,33524.62
