In [47]:
# using python to perform ETL

In [49]:
!pip install sqlalchemy



In [50]:
import getpass
import os

import numpy
import pandas as pd
from sqlalchemy import create_engine, text

In [51]:
# Declare & assign connection variables for the MySQL server and the databases used in this notebook.
host_name = "localhost"
# NOTE: `port` is declared here but the connection URLs built in this notebook do not include it.
port = "3306"
user_id = "root"

# The password is intentionally not stored in the notebook: it is read from the
# MYSQL_PASSWORD environment variable, falling back to an interactive prompt when unset or empty.
pwd = os.environ.get("MYSQL_PASSWORD") or getpass.getpass(f"MySQL password for {user_id}@{host_name}: ")

src_dbname = "classicmodels"     # source (operational) database
dst_dbname = "classicmodels_dw"  # destination (data warehouse) database

In [52]:
# Define Functions for Getting Data From and Setting Data Into Databases
def get_dataframe(user_id, pwd, host_name, db_name, sql_query):
    """Run a SQL query against a MySQL database and return the result as a DataFrame.

    Parameters
    ----------
    user_id, pwd : str
        Credentials embedded in the connection URL.
    host_name : str
        Server host; the URL is built without an explicit port.
    db_name : str
        Database the connection is opened against.
    sql_query : str
        Query passed unchanged to ``pd.read_sql``.

    Returns
    -------
    pandas.DataFrame
        The rows returned by ``sql_query``.
    """
    conn_str = f"mysql+pymysql://{user_id}:{pwd}@{host_name}/{db_name}"
    sqlEngine = create_engine(conn_str, pool_recycle=3600)
    try:
        # The context manager closes the connection even if the query raises.
        with sqlEngine.connect() as connection:
            return pd.read_sql(sql_query, connection)
    finally:
        # A new engine is created on every call, so release its pool instead of
        # leaving it for garbage collection.
        sqlEngine.dispose()


def set_dataframe(user_id, pwd, host_name, db_name, df, table_name, pk_column, db_operation):
    """Write a DataFrame to a MySQL table.

    Parameters
    ----------
    user_id, pwd : str
        Credentials embedded in the connection URL.
    host_name : str
        Server host; the URL is built without an explicit port.
    db_name : str
        Database the connection is opened against.
    df : pandas.DataFrame
        Data to write; the DataFrame index is not written.
    table_name : str
        Destination table.
    pk_column : str
        Column declared as PRIMARY KEY after an "insert"; unused for "update".
    db_operation : str
        "insert" replaces the table with ``df`` and then adds the primary key;
        "update" appends ``df`` to the table.

    Raises
    ------
    ValueError
        If ``db_operation`` is neither "insert" nor "update".
    """
    # Fail fast on a mistyped operation instead of silently writing nothing.
    if db_operation not in ("insert", "update"):
        raise ValueError(
            f"Unsupported db_operation {db_operation!r}; expected 'insert' or 'update'."
        )

    conn_str = f"mysql+pymysql://{user_id}:{pwd}@{host_name}/{db_name}"
    sqlEngine = create_engine(conn_str, pool_recycle=3600)
    try:
        # engine.begin() commits on success, rolls back on error, and always
        # closes the connection.
        with sqlEngine.begin() as connection:
            if db_operation == "insert":
                df.to_sql(table_name, con=connection, index=False, if_exists='replace')
                # table_name / pk_column are interpolated as identifiers, so they
                # must come from trusted code, never from user input.
                connection.execute(text(f"ALTER TABLE {table_name} ADD PRIMARY KEY ({pk_column});"))
            else:  # "update"
                df.to_sql(table_name, con=connection, index=False, if_exists='append')
    finally:
        # A new engine is created on every call, so release its pool explicitly.
        sqlEngine.dispose()

In [53]:
# Create a fresh, empty data warehouse database.
# WARNING: this drops any existing database named `dst_dbname` before recreating it.
# No `USE` statement is issued here: this connection is closed right away, and
# get_dataframe / set_dataframe each open their own connection with the database name in the URL.
conn_str = f"mysql+pymysql://{user_id}:{pwd}@{host_name}"
sqlEngine = create_engine(conn_str, pool_recycle=3600)

try:
    # The context manager closes the connection even if a statement fails.
    with sqlEngine.connect() as connection:
        connection.execute(text(f"DROP DATABASE IF EXISTS `{dst_dbname}`;"))
        connection.execute(text(f"CREATE DATABASE `{dst_dbname}`;"))
finally:
    # This engine is not reused by later cells, so release its pool.
    sqlEngine.dispose()

In [59]:
# Fetch the data for the dimension tables

In [61]:
# Extract: read every row of the customers table from the source database.
# The schema name comes from `src_dbname` so the query always reads the same
# database that the connection is opened against.
sql_customers = f"SELECT * FROM `{src_dbname}`.customers;"
df_customers = get_dataframe(user_id, pwd, host_name, src_dbname, sql_customers)
df_customers.head(2)

Unnamed: 0,customerNumber,customerName,contactLastName,contactFirstName,phone,addressLine1,addressLine2,city,state,postalCode,country,salesRepEmployeeNumber,creditLimit
0,103,Atelier graphique,Schmitt,Carine,40.32.2555,"54, rue Royale",,Nantes,,44000,France,1370.0,21000.0
1,112,Signal Gift Stores,King,Jean,7025551838,8489 Strong St.,,Las Vegas,NV,83030,USA,1166.0,71800.0


In [63]:
# Extract: read every row of the products table from the source database.
# The schema name comes from `src_dbname` so the query always reads the same
# database that the connection is opened against.
sql_products = f"SELECT * FROM `{src_dbname}`.products;"
df_products = get_dataframe(user_id, pwd, host_name, src_dbname, sql_products)
df_products.head(2)

Unnamed: 0,productCode,productName,productLine,productScale,productVendor,productDescription,quantityInStock,buyPrice,MSRP
0,S10_1678,1969 Harley Davidson Ultimate Chopper,Motorcycles,1:10,Min Lin Diecast,"This replica features working kickstand, front...",7933,48.81,95.7
1,S10_1949,1952 Alpine Renault 1300,Classic Cars,1:10,Classic Metal Creations,Turnable front wheels; steering function; deta...,7305,98.58,214.3


In [65]:
# At this point, run the DS2002 Midterm dim_date SQL file before continuing.

In [67]:
# Perform any necessary transformations

In [69]:
# CUSTOMERS
# Columns that are not carried into the customer dimension.
drop_cols = ['contactLastName','contactFirstName','phone','addressLine2','salesRepEmployeeNumber','creditLimit']

# The presence of "customer_key" marks the frame as already transformed, so
# re-running this cell is a no-op instead of failing on the dropped columns.
if "customer_key" not in df_customers.columns:
    df_customers = (
        df_customers
        # 1. Remove the columns listed above.
        .drop(columns=drop_cols)
        # 2. "customerNumber" becomes "customer_id", the business key used for lookups.
        .rename(columns={"customerNumber": "customer_id"})
    )
    # 3. Add an incrementing surrogate key (1..n) as the first column; it serves as the primary key.
    df_customers.insert(0, "customer_key", range(1, len(df_customers) + 1))

# 4. Display the first 2 rows of the dataframe to validate the result.
df_customers.head(2)

Unnamed: 0,customer_key,customer_id,customerName,addressLine1,city,state,postalCode,country
0,1,103,Atelier graphique,"54, rue Royale",Nantes,,44000,France
1,2,112,Signal Gift Stores,8489 Strong St.,Las Vegas,NV,83030,USA


In [71]:
# PRODUCTS
# Columns that are not carried into the product dimension.
drop_cols = ['productScale','productVendor','productDescription','quantityInStock']

# The presence of "product_key" marks the frame as already transformed, so
# re-running this cell is a no-op instead of failing on the dropped columns.
if "product_key" not in df_products.columns:
    df_products = (
        df_products
        # 1. Remove the columns listed above.
        .drop(columns=drop_cols)
        # 2. "productCode" becomes "product_id", the business key used for lookups.
        .rename(columns={"productCode": "product_id"})
    )
    # 3. Add an incrementing surrogate key (1..n) as the first column; it serves as the primary key.
    df_products.insert(0, "product_key", range(1, len(df_products) + 1))

# 4. Display the first 2 rows of the dataframe to validate the result.
df_products.head(2)

Unnamed: 0,product_key,product_id,productName,productLine,buyPrice,MSRP
0,1,S10_1678,1969 Harley Davidson Ultimate Chopper,Motorcycles,48.81,95.7
1,2,S10_1949,1952 Alpine Renault 1300,Classic Cars,98.58,214.3


In [73]:
# load the Transformed DataFrames into the New Data Warehouse by Creating New Tables

In [75]:
# "insert" makes set_dataframe replace each table and then add its primary key.
db_operation = "insert"

# One entry per dimension: (destination table, source DataFrame, primary-key column).
tables = [
    ('dim_customers', df_customers, 'customer_key'),
    ('dim_products', df_products, 'product_key'),
]

In [77]:
# Write each dimension DataFrame to its table in the data warehouse.
for table_name, dataframe, primary_key in tables:
    set_dataframe(
        user_id=user_id,
        pwd=pwd,
        host_name=host_name,
        db_name=dst_dbname,
        df=dataframe,
        table_name=table_name,
        pk_column=primary_key,
        db_operation=db_operation,
    )

In [79]:
# creating and populating the fact table called sales_fact

In [95]:
# Extract the rows for the sales fact table by joining orders to payments.
# The schema name comes from `src_dbname` so the query always reads the same
# database that the connection is opened against.
#
# NOTE(review): the join condition is customerNumber only, so every order is paired
# with every payment made by the same customer (the preview below shows order 10100
# twice, once per payment). Summing payment_amount over this result will over-count;
# confirm this grain is intended before building measures on it.
# NOTE(review): `payment_status` is populated from orders.status, i.e. it is the order's
# status rather than a property of the payment; the alias is kept so the column
# layout of the fact table does not change.
sql_sales_fact = f"""
SELECT
    o.orderNumber AS order_id,
    o.customerNumber AS customer_id,
    o.orderDate AS order_date_key,
    o.shippedDate AS shipped_date,
    p.paymentDate AS payment_date_key,
    p.amount AS payment_amount,
    o.status AS payment_status
FROM `{src_dbname}`.orders AS o
LEFT JOIN `{src_dbname}`.payments AS p
ON o.customerNumber = p.customerNumber;
"""

df_sales_fact = get_dataframe(user_id, pwd, host_name, src_dbname, sql_sales_fact)
df_sales_fact.head(2)

Unnamed: 0,order_id,customer_id,order_date_key,shipped_date,payment_date_key,payment_amount,payment_status
0,10100,363,2003-01-06,2003-01-10,2004-11-17,50799.69,Shipped
1,10100,363,2003-01-06,2003-01-10,2003-01-16,10223.83,Shipped


In [103]:
# Add an incrementing surrogate key (1..n) as the first column; it serves as the primary key.
# The guard makes the cell safe to re-run: DataFrame.insert raises if the column already exists.
if "sales_fact_key" not in df_sales_fact.columns:
    df_sales_fact.insert(0, "sales_fact_key", range(1, len(df_sales_fact) + 1))

# Display the first 2 rows of the dataframe to validate the result.
df_sales_fact.head(2)

Unnamed: 0,sales_fact_key,order_id,customer_id,order_date_key,shipped_date,payment_date_key,payment_amount,payment_status
0,1,10100,363,2003-01-06,2003-01-10,2004-11-17,50799.69,Shipped
1,2,10100,363,2003-01-06,2003-01-10,2003-01-16,10223.83,Shipped


In [107]:
# Load the fact table into the data warehouse. "insert" makes set_dataframe
# replace the table and then declare `primary_key` as its PRIMARY KEY.
table_name = "sales_fact"
primary_key = "sales_fact_key"
db_operation = "insert"

set_dataframe(user_id, pwd, host_name, dst_dbname, df_sales_fact, table_name, primary_key, db_operation)

In [43]:
# demonstrate that the new data warehouse exists and contains the correct data