# Project 1: Mid-Term - ETL Processing

#### Import the Necessary Libraries

In [1]:
import os
import json
import numpy
import datetime
import certifi
import pandas as pd

import pymongo
import sqlalchemy
from sqlalchemy import create_engine, text

In [2]:
# Report the installed driver versions, one per line.
print(
    f"Running SQL Alchemy Version: {sqlalchemy.__version__}",
    f"Running PyMongo Version: {pymongo.__version__}",
    sep="\n",
)

Running SQL Alchemy Version: 2.0.38
Running PyMongo Version: 4.11.2


In [3]:
# Connection settings for the MySQL server, the working databases and MongoDB.
#
# NOTE(review): the credentials below were hard-coded in the notebook. Each
# one can now be overridden through an environment variable; the original
# literal is kept as the fallback so an unconfigured run behaves exactly as
# before. Prefer setting the variables and removing the literals before
# sharing this file.
host_name = os.environ.get("ETL_MYSQL_HOST", "localhost")
port = os.environ.get("ETL_MYSQL_PORT", "3306")  # not referenced by the connection strings visible in this notebook
user_id = os.environ.get("ETL_MYSQL_USER", "bhowe")
pwd = os.environ.get("ETL_MYSQL_PWD", "Daufuskie1")

src_dbname = "adventureworks"
dst_dbname = "adventureworks_dw"

# -------------------------------------------------------------------------------------------------------------------------

# Keyword-argument form of the MySQL settings, consumed by the
# get_sql_dataframe / set_sql_dataframe helpers. Built from the variables
# above so the two representations cannot drift apart.
mysql_args = {
    "uid": user_id,
    "pwd": pwd,
    "hostname": host_name,
    "dbname": dst_dbname,
}

# The 'cluster_location' must either be "atlas" or "local".
mongodb_args = {
    "user_name": os.environ.get("ETL_MONGO_USER", "nkr7fg"),
    "password": os.environ.get("ETL_MONGO_PWD", "Daufuskie1"),
    "cluster_name": "ds2002",
    "cluster_subnet": "zsrbu",
    "cluster_location": "atlas",  # "local"
    "db_name": "adventureworks_sales",
}

In [4]:
def get_dataframe(user_id, pwd, host_name, db_name, sql_query):
    """Run a SQL query against a MySQL database and return a DataFrame.

    Args:
        user_id: MySQL user name.
        pwd: MySQL password.
        host_name: MySQL server host.
        db_name: Default schema for the connection.
        sql_query: SQL text passed to ``pd.read_sql``.

    Returns:
        A pandas DataFrame holding the query result.
    """
    conn_str = f"mysql+pymysql://{user_id}:{pwd}@{host_name}/{db_name}"
    sqlEngine = create_engine(conn_str, pool_recycle=3600)

    try:
        # The context manager closes the connection even when the query fails.
        with sqlEngine.connect() as connection:
            return pd.read_sql(sql_query, connection)
    finally:
        # A new engine is built on every call, so release its pool here
        # instead of leaving pooled connections open.
        sqlEngine.dispose()


def set_dataframe(user_id, pwd, host_name, db_name, df, table_name, pk_column, db_operation):
    """Write a DataFrame to a MySQL table.

    Args:
        user_id: MySQL user name.
        pwd: MySQL password.
        host_name: MySQL server host.
        db_name: Schema that receives the table.
        df: DataFrame to write (its index is not written).
        table_name: Destination table name.
        pk_column: Column promoted to PRIMARY KEY after an "insert".
        db_operation: "insert" replaces the table and adds the primary key;
            "update" appends the rows to the existing table.

    Raises:
        ValueError: If ``db_operation`` is neither "insert" nor "update".
    """
    # Fail fast on a typo instead of silently writing nothing.
    if db_operation not in ("insert", "update"):
        raise ValueError(
            f"db_operation must be 'insert' or 'update', got {db_operation!r}"
        )

    conn_str = f"mysql+pymysql://{user_id}:{pwd}@{host_name}/{db_name}"
    sqlEngine = create_engine(conn_str, pool_recycle=3600)

    try:
        # begin() commits on success, rolls back on error, and always
        # returns the connection to the pool.
        with sqlEngine.begin() as connection:
            if db_operation == "insert":
                df.to_sql(table_name, con=connection, index=False, if_exists='replace')
                connection.execute(text(f"ALTER TABLE {table_name} ADD PRIMARY KEY ({pk_column});"))
            else:
                df.to_sql(table_name, con=connection, index=False, if_exists='append')
    finally:
        # A new engine is built on every call; release its pool.
        sqlEngine.dispose()

# -----------------------------------------------------------------------------------------------------------------------------

def get_sql_dataframe(sql_query, **args):
    """Run a SQL query against MySQL and return the result as a DataFrame.

    Args:
        sql_query: SQL text passed to ``pd.read_sql``.
        **args: Connection settings; the keys 'uid', 'pwd', 'hostname' and
            'dbname' are required.

    Returns:
        A pandas DataFrame holding the query result.

    Raises:
        KeyError: If one of the required connection keys is missing.
    """
    # Create a connection to the MySQL database.
    conn_str = f"mysql+pymysql://{args['uid']}:{args['pwd']}@{args['hostname']}/{args['dbname']}"
    sqlEngine = create_engine(conn_str, pool_recycle=3600)

    try:
        # Query the database and fill a DataFrame; the context manager closes
        # the connection even when the query fails.
        with sqlEngine.connect() as connection:
            return pd.read_sql(sql_query, connection)
    finally:
        # A new engine is built on every call; release its pool.
        sqlEngine.dispose()


def set_sql_dataframe(df, table_name, pk_column, db_operation, **args):
    """Write a DataFrame to a MySQL table.

    Args:
        df: DataFrame to write (its index is not written).
        table_name: Destination table name.
        pk_column: Column promoted to PRIMARY KEY after an "insert".
        db_operation: "insert" replaces the table and adds the primary key;
            "update" appends the rows to the existing table.
        **args: Connection settings; the keys 'uid', 'pwd', 'hostname' and
            'dbname' are required.

    Raises:
        ValueError: If ``db_operation`` is neither "insert" nor "update".
        KeyError: If one of the required connection keys is missing.
    """
    # Fail fast on a typo instead of silently writing nothing.
    if db_operation not in ("insert", "update"):
        raise ValueError(
            f"db_operation must be 'insert' or 'update', got {db_operation!r}"
        )

    # Create a connection to the MySQL database.
    conn_str = f"mysql+pymysql://{args['uid']}:{args['pwd']}@{args['hostname']}/{args['dbname']}"
    sqlEngine = create_engine(conn_str, pool_recycle=3600)

    try:
        # begin() commits on success, rolls back on error, and always
        # returns the connection to the pool.
        with sqlEngine.begin() as connection:
            # Use DataFrame.to_sql() to either create, or append to, a table.
            if db_operation == "insert":
                df.to_sql(table_name, con=connection, index=False, if_exists='replace')
                connection.execute(text(f"ALTER TABLE {table_name} ADD PRIMARY KEY ({pk_column});"))
            else:
                df.to_sql(table_name, con=connection, index=False, if_exists='append')
    finally:
        # A new engine is built on every call; release its pool.
        sqlEngine.dispose()


def get_mongo_client(**args):
    """Create a PyMongo client for an Atlas cluster or a local server.

    Args:
        **args: Connection settings. 'cluster_location' is always required
            and must be 'atlas' or 'local'. For 'atlas', the keys
            'user_name', 'password', 'cluster_name' and 'cluster_subnet'
            are also required.

    Returns:
        A ``pymongo.MongoClient`` instance.

    Raises:
        ValueError: If 'cluster_location' is not 'atlas' or 'local'.
        KeyError: If a required key is missing.
    """
    from urllib.parse import quote_plus

    # Validate proper input.
    location = args["cluster_location"]
    if location not in ('atlas', 'local'):
        raise ValueError("You must specify either 'atlas' or 'local' for the cluster_location parameter.")

    if location == "local":
        return pymongo.MongoClient("mongodb://localhost:27017/")

    # Percent-escape the credentials so reserved characters such as '@', ':'
    # or '/' cannot corrupt the connection URI.
    user = quote_plus(str(args['user_name']))
    password = quote_plus(str(args['password']))
    connect_str = (
        f"mongodb+srv://{user}:{password}@"
        f"{args['cluster_name']}.{args['cluster_subnet']}.mongodb.net"
    )
    return pymongo.MongoClient(connect_str, tlsCAFile=certifi.where())


def get_mongo_dataframe(mongo_client, db_name, collection, query):
    """Query a MongoDB collection and return the documents as a DataFrame.

    The client is closed before returning, so callers must create a new
    client for each call.

    Args:
        mongo_client: Client object indexable by database name.
        db_name: Name of the database to query.
        collection: Name of the collection to query.
        query: Filter passed to ``find()``.

    Returns:
        A DataFrame of the matching documents without the '_id' column.
        An empty DataFrame is returned when nothing matches.
    """
    try:
        documents = list(mongo_client[db_name][collection].find(query))
    finally:
        # Close the client even when the query raises.
        mongo_client.close()

    dframe = pd.DataFrame(documents)
    # errors='ignore' keeps an empty result (no '_id' column) from raising
    # KeyError.
    dframe.drop(columns=['_id'], errors='ignore', inplace=True)

    return dframe


def set_mongo_collections(mongo_client, db_name, data_directory, json_files):
    """Recreate MongoDB collections from JSON files.

    Each collection is dropped and then refilled with the documents loaded
    from its file. The client is closed before returning.

    Args:
        mongo_client: Client object indexable by database name.
        db_name: Name of the target database.
        data_directory: Directory that contains the JSON files.
        json_files: Mapping of collection name to JSON file name.
    """
    try:
        db = mongo_client[db_name]

        for collection_name, file_name in json_files.items():
            db.drop_collection(collection_name)
            json_path = os.path.join(data_directory, file_name)
            # Explicit encoding so the load does not depend on the platform
            # default.
            with open(json_path, 'r', encoding='utf-8') as openfile:
                documents = json.load(openfile)

            # insert_many() rejects an empty list; in that case the
            # collection is simply left dropped.
            if documents:
                db[collection_name].insert_many(documents)
    finally:
        # Close the client even when a file or insert fails.
        mongo_client.close()

In [19]:
# (Re)create the destination data-warehouse schema from scratch.
# NOTE(review): DROP DATABASE removes every table already in the warehouse,
# including any previously created dim_date table. Confirm the date-dimension
# script is run after this cell.
conn_str = f"mysql+pymysql://{user_id}:{pwd}@{host_name}"
sqlEngine = create_engine(conn_str, pool_recycle=3600)

try:
    # The context manager closes the connection even if a statement fails.
    with sqlEngine.connect() as connection:
        connection.execute(text(f"DROP DATABASE IF EXISTS `{dst_dbname}`;"))
        connection.execute(text(f"CREATE DATABASE `{dst_dbname}`;"))
        # Quoted like the two statements above for consistency.
        connection.execute(text(f"USE `{dst_dbname}`;"))
finally:
    sqlEngine.dispose()

#### Extract Data from the Source Database Tables and Perform Necessary Transformations

Customers

In [6]:
# Customers: read the whole source table into a DataFrame.
sql_customers = "SELECT * FROM adventureworks.customer;"
df_customers = get_dataframe(
    user_id=user_id,
    pwd=pwd,
    host_name=host_name,
    db_name=src_dbname,
    sql_query=sql_customers,
)
df_customers.head(2)

Unnamed: 0,CustomerID,TerritoryID,AccountNumber,CustomerType,rowguid,ModifiedDate
0,1,1,AW00000001,S,b'^\xe9Z?}\xb8\xedJ\x95\xb4\xc3yz\xfc\xb7O',2004-10-13 11:15:07
1,2,1,AW00000002,S,b'W\xf6R\xe5\xaf\xa9}J\xa6E\xc4)\xd6\xe0$\x91',2004-10-13 11:15:07


In [7]:
# Remove the listed columns from the extracted frame.
drop_cols = ['rowguid', 'ModifiedDate']
df_customers.drop(columns=drop_cols, inplace=True)

# Surrogate primary key: 1..N in row order, placed as the first column.
df_customers.insert(0, "customer_key", range(1, len(df_customers) + 1))

df_customers.head(2)

Unnamed: 0,customer_key,CustomerID,TerritoryID,AccountNumber,CustomerType
0,1,1,1,AW00000001,S
1,2,2,1,AW00000002,S


Products

In [8]:
# Products: read the whole source table into a DataFrame.
sql_products = "SELECT * FROM adventureworks.product;"
df_products = get_dataframe(
    user_id=user_id,
    pwd=pwd,
    host_name=host_name,
    db_name=src_dbname,
    sql_query=sql_products,
)
df_products.head(2)

Unnamed: 0,ProductID,Name,ProductNumber,MakeFlag,FinishedGoodsFlag,Color,SafetyStockLevel,ReorderPoint,StandardCost,ListPrice,...,ProductLine,Class,Style,ProductSubcategoryID,ProductModelID,SellStartDate,SellEndDate,DiscontinuedDate,rowguid,ModifiedDate
0,1,Adjustable Race,AR-5381,b'\x00',b'\x00',,1000,750,0.0,0.0,...,,,,,,1998-06-01,NaT,,b'\xb7\x15Bi\xf7\x08\rL\xac\xb1\xd74\xbaD\xc0\...,2004-03-11 10:01:36
1,2,Bearing Ball,BA-8327,b'\x00',b'\x00',,1000,750,0.0,0.0,...,,,,,,1998-06-01,NaT,,b' <\xaeX:OIG\xa7\xd4\xd5h\x80l\xc57',2004-03-11 10:01:36


In [9]:
# Remove the listed columns from the extracted frame.
drop_cols = [
    'MakeFlag',
    'FinishedGoodsFlag',
    'SellEndDate',
    'DiscontinuedDate',
    'SizeUnitMeasureCode',
    'WeightUnitMeasureCode',
    'rowguid',
    'ModifiedDate',
]
df_products.drop(columns=drop_cols, inplace=True)

# Surrogate primary key: 1..N in row order, placed as the first column.
df_products.insert(0, "product_key", range(1, len(df_products) + 1))

df_products.head(2)

Unnamed: 0,product_key,ProductID,Name,ProductNumber,Color,SafetyStockLevel,ReorderPoint,StandardCost,ListPrice,Size,Weight,DaysToManufacture,ProductLine,Class,Style,ProductSubcategoryID,ProductModelID,SellStartDate
0,1,1,Adjustable Race,AR-5381,,1000,750,0.0,0.0,,,0,,,,,,1998-06-01
1,2,2,Bearing Ball,BA-8327,,1000,750,0.0,0.0,,,0,,,,,,1998-06-01


Sales Person

In [10]:
# Sales Person: read the whole source table into a DataFrame.
sql_salesperson = "SELECT * FROM adventureworks.salesperson;"
df_salesperson = get_dataframe(
    user_id=user_id,
    pwd=pwd,
    host_name=host_name,
    db_name=src_dbname,
    sql_query=sql_salesperson,
)
df_salesperson.head(2)

Unnamed: 0,SalesPersonID,TerritoryID,SalesQuota,Bonus,CommissionPct,SalesYTD,SalesLastYear,rowguid,ModifiedDate
0,268,,,0.0,0.0,677558.5,0.0,b'\x92IuH\xe0\x9e\x0eL\x8c\x94\x94Q`N>\x02',2001-01-28
1,275,2.0,300000.0,4100.0,0.012,4557045.0,1750406.0,b'tr\n\x1ed0XO\x88\xeeLe\x86\xc8qi',2001-06-24


In [11]:
# Remove the listed columns from the extracted frame.
drop_cols = ['rowguid', 'ModifiedDate']
df_salesperson.drop(columns=drop_cols, inplace=True)

# Surrogate primary key: 1..N in row order, placed as the first column.
df_salesperson.insert(0, "salesperson_key", range(1, len(df_salesperson) + 1))

df_salesperson.head(2)

Unnamed: 0,salesperson_key,SalesPersonID,TerritoryID,SalesQuota,Bonus,CommissionPct,SalesYTD,SalesLastYear
0,1,268,,,0.0,0.0,677558.5,0.0
1,2,275,2.0,300000.0,4100.0,0.012,4557045.0,1750406.0


#### Create the Date Dimension Table
Executed the script from earlier Lab 2c to create and populate a date dimension table (dim_date) that will be extracted from MySQL.

#### Load the Transformed DataFrames into "adventureworks_dw" by Creating New Tables

# Next Steps

#### Declare & Assign Connection Variables for the MongoDB Server, the MySQL Server & Working Databases

#### Define Functions for Getting Data From and Setting Data Into Databases

Currency Rate

In [12]:
client = get_mongo_client(**mongodb_args)

# The 'data' directory under this notebook's current working directory.
data_dir = os.path.join(os.getcwd(), 'data')

# Collection name -> JSON file name inside data_dir.
json_files = {"currencyrate": 'currencyrate.json'}

set_mongo_collections(
    mongo_client=client,
    db_name=mongodb_args["db_name"],
    data_directory=data_dir,
    json_files=json_files,
)

In [13]:
client = get_mongo_client(**mongodb_args)

collection = "currencyrate"
query = {}  # Empty filter: all fields of all documents.

df_currencyrate = get_mongo_dataframe(
    mongo_client=client,
    db_name=mongodb_args["db_name"],
    collection=collection,
    query=query,
)
df_currencyrate.head(2)

Unnamed: 0,AverageRate,EndOfDayRate,ModifiedDate,CurrencyRateID,ToCurrencyCode,FromCurrencyCode
0,1.0,1.0002,2001-07-01 00:00:00.000000,1,ARS,USD
1,1.5491,1.55,2001-07-01 00:00:00.000000,2,AUD,USD


In [14]:
# Remove the listed column from the extracted frame.
drop_cols = ['ModifiedDate']
df_currencyrate.drop(columns=drop_cols, inplace=True)

# Surrogate primary key: 1..N in row order, placed as the first column.
df_currencyrate.insert(0, "currencyrate_key", range(1, len(df_currencyrate) + 1))

df_currencyrate.head(2)

Unnamed: 0,currencyrate_key,AverageRate,EndOfDayRate,CurrencyRateID,ToCurrencyCode,FromCurrencyCode
0,1,1.0,1.0002,1,ARS,USD
1,2,1.5491,1.55,2,AUD,USD


Ship Method

In [15]:
column_names = ["ShipMethodID", "Name", "ShipBase", "ShipRate", "ModifiedDate"]

# The original path is specific to one machine. SHIPMETHOD_CSV lets other
# environments point at their own copy; the original location stays the
# fallback so existing runs are unchanged.
shipmethod_csv = os.environ.get(
    "SHIPMETHOD_CSV",
    "C://Users//puckh//Desktop//Brice Documents//Fourth Year//DS Systems 2002//shipmethod.csv",
)

# header=None: the first line of the file is read as data, and the column
# names come from column_names.
df_shipmethod = pd.read_csv(
    shipmethod_csv,
    names=column_names,
    header=None,
)

df_shipmethod.head(2)

Unnamed: 0,ShipMethodID,Name,ShipBase,ShipRate,ModifiedDate
0,1,XRQ - TRUCK GROUND,3.95,0.99,1998-06-01 00:00:00
1,2,ZY - EXPRESS,9.95,1.99,1998-06-01 00:00:00


In [16]:
# Remove the listed column from the extracted frame.
drop_cols = ['ModifiedDate']
df_shipmethod.drop(columns=drop_cols, inplace=True)

# Surrogate primary key: 1..N in row order, placed as the first column.
df_shipmethod.insert(0, "shipmethod_key", range(1, len(df_shipmethod) + 1))

# Show the first 2 rows to validate the result.
df_shipmethod.head(2)

Unnamed: 0,shipmethod_key,ShipMethodID,Name,ShipBase,ShipRate
0,1,1,XRQ - TRUCK GROUND,3.95,0.99
1,2,2,ZY - EXPRESS,9.95,1.99


#### Load the Transformed DataFrames into "adventureworks_dw" by Creating New Tables

In [17]:
# "insert" makes set_dataframe replace each table and add its primary key.
db_operation = "insert"

# One entry per dimension: (destination table, DataFrame, primary-key column).
tables = [
    ('dim_customers', df_customers, 'customer_key'),
    ('dim_products', df_products, 'product_key'),
    ('dim_salesperson', df_salesperson, 'salesperson_key'),
    ('dim_shipmethod', df_shipmethod, 'shipmethod_key'),
    ('dim_currencyrate', df_currencyrate, 'currencyrate_key'),
]

In [20]:
# Write every dimension DataFrame to its table in the destination database.
for table_name, dataframe, primary_key in tables:
    set_dataframe(
        user_id=user_id,
        pwd=pwd,
        host_name=host_name,
        db_name=dst_dbname,
        df=dataframe,
        table_name=table_name,
        pk_column=primary_key,
        db_operation=db_operation,
    )

#### Create and Populate the Sales Fact Table

In [21]:
# Sales order detail: read the whole source table into a DataFrame.
sql_sales_od = "SELECT * FROM adventureworks.salesorderdetail;"
df_sales_od = get_dataframe(
    user_id=user_id,
    pwd=pwd,
    host_name=host_name,
    db_name=src_dbname,
    sql_query=sql_sales_od,
)
df_sales_od.head(2)

Unnamed: 0,SalesOrderID,SalesOrderDetailID,CarrierTrackingNumber,OrderQty,ProductID,SpecialOfferID,UnitPrice,UnitPriceDiscount,LineTotal,rowguid,ModifiedDate
0,43659,1,4911-403C-98,1,776,1,2024.994,0.0,2024.994,"b'm\xc9\x07\xb2\xe6\xd9+@\x84p,\xc1v\xc4""\x83'",2001-07-01
1,43659,2,4911-403C-98,3,777,1,2024.994,0.0,6074.982,"b'\r`\xbbzw\x1e\xbeA\x9f\xe5\xb9\x14,\xfc\x08\...",2001-07-01


In [24]:
# Remove the listed columns from the order-detail frame.
drop_cols = ['rowguid', 'UnitPriceDiscount', 'ModifiedDate']
df_sales_od.drop(columns=drop_cols, inplace=True)

df_sales_od.head(2)

Unnamed: 0,SalesOrderID,SalesOrderDetailID,CarrierTrackingNumber,OrderQty,ProductID,SpecialOfferID,UnitPrice,LineTotal
0,43659,1,4911-403C-98,1,776,1,2024.994,2024.994
1,43659,2,4911-403C-98,3,777,1,2024.994,6074.982


In [32]:
# Sales order header: read the whole source table into a DataFrame.
sql_sales_oh = "SELECT * FROM adventureworks.salesorderheader;"
df_sales_oh = get_dataframe(
    user_id=user_id,
    pwd=pwd,
    host_name=host_name,
    db_name=src_dbname,
    sql_query=sql_sales_oh,
)
df_sales_oh.head(2)

Unnamed: 0,SalesOrderID,RevisionNumber,OrderDate,DueDate,ShipDate,Status,OnlineOrderFlag,SalesOrderNumber,PurchaseOrderNumber,AccountNumber,...,CreditCardID,CreditCardApprovalCode,CurrencyRateID,SubTotal,TaxAmt,Freight,TotalDue,Comment,rowguid,ModifiedDate
0,43659,1,2001-07-01,2001-07-13,2001-07-08,5,b'\x00',SO43659,PO522145787,10-4020-000676,...,16281.0,105041Vi84182,,24643.9362,1971.5149,616.0984,27231.5495,,b'!S\xb6y\xca9\x15A\x9c\xba\x8f\xe0\x90>\x12\xe6',2001-07-08
1,43660,1,2001-07-01,2001-07-13,2001-07-08,5,b'\x00',SO43660,PO18850127500,10-4020-000117,...,5618.0,115213Vi29411,,1553.1035,124.2483,38.8276,1716.1794,,"b'-\xc4\x8ds;\xd0\xa1H\x98""\xf9Zg\xeas\x89'",2001-07-08


In [33]:
# Remove the listed columns from the order-header frame.
drop_cols = ['rowguid', 'OnlineOrderFlag', 'Comment', 'ModifiedDate']
df_sales_oh.drop(columns=drop_cols, inplace=True)

df_sales_oh.head(2)

Unnamed: 0,SalesOrderID,RevisionNumber,OrderDate,DueDate,ShipDate,Status,SalesOrderNumber,PurchaseOrderNumber,AccountNumber,CustomerID,...,BillToAddressID,ShipToAddressID,ShipMethodID,CreditCardID,CreditCardApprovalCode,CurrencyRateID,SubTotal,TaxAmt,Freight,TotalDue
0,43659,1,2001-07-01,2001-07-13,2001-07-08,5,SO43659,PO522145787,10-4020-000676,676,...,985,985,5,16281.0,105041Vi84182,,24643.9362,1971.5149,616.0984,27231.5495
1,43660,1,2001-07-01,2001-07-13,2001-07-08,5,SO43660,PO18850127500,10-4020-000117,117,...,921,921,5,5618.0,115213Vi29411,,1553.1035,124.2483,38.8276,1716.1794


In [34]:
# Join each order-detail row to its order header on SalesOrderID.
df_fact_sales = df_sales_od.merge(df_sales_oh, on='SalesOrderID', how='inner')
df_fact_sales.head(2)

Unnamed: 0,SalesOrderID,SalesOrderDetailID,CarrierTrackingNumber,OrderQty,ProductID,SpecialOfferID,UnitPrice,LineTotal,RevisionNumber,OrderDate,...,BillToAddressID,ShipToAddressID,ShipMethodID,CreditCardID,CreditCardApprovalCode,CurrencyRateID,SubTotal,TaxAmt,Freight,TotalDue
0,43659,1,4911-403C-98,1,776,1,2024.994,2024.994,1,2001-07-01,...,985,985,5,16281.0,105041Vi84182,,24643.9362,1971.5149,616.0984,27231.5495
1,43659,2,4911-403C-98,3,777,1,2024.994,6074.982,1,2001-07-01,...,985,985,5,16281.0,105041Vi84182,,24643.9362,1971.5149,616.0984,27231.5495


Fetch the Primary Key and Business Key from the Date Dimension Table

In [35]:
# Read the date dimension's surrogate key and business key from the warehouse.
# The connection now uses the warehouse schema (dst_dbname) that the query
# targets, rather than the source schema.
sql_dim_date = f"SELECT date_key, full_date FROM {dst_dbname}.dim_date;"
df_dim_date = get_dataframe(user_id, pwd, host_name, dst_dbname, sql_dim_date)

# Convert full_date to datetime.date values, the same conversion applied to
# the fact table's date columns before they are merged on.
df_dim_date["full_date"] = df_dim_date["full_date"].astype('datetime64[ns]').dt.date
df_dim_date.head(2)

Unnamed: 0,date_key,full_date
0,20000101,2000-01-01
1,20000102,2000-01-02


In [36]:
# Look up the surrogate key (date_key) that corresponds to each OrderDate.
df_dim_order_date = df_dim_date.rename(
    columns={"date_key": "order_date_key", "full_date": "OrderDate"}
)
df_fact_sales["OrderDate"] = df_fact_sales["OrderDate"].astype('datetime64[ns]').dt.date

# A left join keeps every fact row; the business date is dropped once the key
# is attached.
df_fact_sales = df_fact_sales.merge(df_dim_order_date, on='OrderDate', how='left')
df_fact_sales.drop(columns=['OrderDate'], inplace=True)
df_fact_sales.head(2)

Unnamed: 0,SalesOrderID,SalesOrderDetailID,CarrierTrackingNumber,OrderQty,ProductID,SpecialOfferID,UnitPrice,LineTotal,RevisionNumber,DueDate,...,ShipToAddressID,ShipMethodID,CreditCardID,CreditCardApprovalCode,CurrencyRateID,SubTotal,TaxAmt,Freight,TotalDue,order_date_key
0,43659,1,4911-403C-98,1,776,1,2024.994,2024.994,1,2001-07-13,...,985,5,16281.0,105041Vi84182,,24643.9362,1971.5149,616.0984,27231.5495,20010701
1,43659,2,4911-403C-98,3,777,1,2024.994,6074.982,1,2001-07-13,...,985,5,16281.0,105041Vi84182,,24643.9362,1971.5149,616.0984,27231.5495,20010701


In [37]:
# Look up the surrogate key (date_key) that corresponds to each ShipDate.
df_dim_ship_date = df_dim_date.rename(
    columns={"date_key": "ship_date_key", "full_date": "ShipDate"}
)
df_fact_sales["ShipDate"] = df_fact_sales["ShipDate"].astype('datetime64[ns]').dt.date

# A left join keeps every fact row; the business date is dropped once the key
# is attached.
df_fact_sales = df_fact_sales.merge(df_dim_ship_date, on='ShipDate', how='left')
df_fact_sales.drop(columns=['ShipDate'], inplace=True)
df_fact_sales.head(2)

Unnamed: 0,SalesOrderID,SalesOrderDetailID,CarrierTrackingNumber,OrderQty,ProductID,SpecialOfferID,UnitPrice,LineTotal,RevisionNumber,DueDate,...,ShipMethodID,CreditCardID,CreditCardApprovalCode,CurrencyRateID,SubTotal,TaxAmt,Freight,TotalDue,order_date_key,ship_date_key
0,43659,1,4911-403C-98,1,776,1,2024.994,2024.994,1,2001-07-13,...,5,16281.0,105041Vi84182,,24643.9362,1971.5149,616.0984,27231.5495,20010701,20010708
1,43659,2,4911-403C-98,3,777,1,2024.994,6074.982,1,2001-07-13,...,5,16281.0,105041Vi84182,,24643.9362,1971.5149,616.0984,27231.5495,20010701,20010708


In [38]:
# Look up the surrogate key (date_key) that corresponds to each DueDate.
df_dim_due_date = df_dim_date.rename(
    columns={"date_key": "due_date_key", "full_date": "DueDate"}
)
df_fact_sales["DueDate"] = df_fact_sales["DueDate"].astype('datetime64[ns]').dt.date

# A left join keeps every fact row; the business date is dropped once the key
# is attached.
df_fact_sales = df_fact_sales.merge(df_dim_due_date, on='DueDate', how='left')
df_fact_sales.drop(columns=['DueDate'], inplace=True)
df_fact_sales.head(2)

Unnamed: 0,SalesOrderID,SalesOrderDetailID,CarrierTrackingNumber,OrderQty,ProductID,SpecialOfferID,UnitPrice,LineTotal,RevisionNumber,Status,...,CreditCardID,CreditCardApprovalCode,CurrencyRateID,SubTotal,TaxAmt,Freight,TotalDue,order_date_key,ship_date_key,due_date_key
0,43659,1,4911-403C-98,1,776,1,2024.994,2024.994,1,5,...,16281.0,105041Vi84182,,24643.9362,1971.5149,616.0984,27231.5495,20010701,20010708,20010713
1,43659,2,4911-403C-98,3,777,1,2024.994,6074.982,1,5,...,16281.0,105041Vi84182,,24643.9362,1971.5149,616.0984,27231.5495,20010701,20010708,20010713


Fetching the Surrogate Primary Key and the Business Key from each of the remaining Dimension tables

In [39]:
# Select 'customer_key' and 'CustomerID' from the warehouse's dim_customers.
# The connection uses the warehouse schema (dst_dbname) that the query targets.
sql_dim_customers = f"SELECT customer_key, CustomerID FROM {dst_dbname}.dim_customers;"
df_dim_customers = get_dataframe(user_id, pwd, host_name, dst_dbname, sql_dim_customers)
df_dim_customers.head(2)

Unnamed: 0,customer_key,CustomerID
0,1,1
1,2,2


In [40]:
# Select 'product_key' and 'ProductID' from the warehouse's dim_products.
# The connection uses the warehouse schema (dst_dbname) that the query targets.
sql_dim_products = f"SELECT product_key, ProductID FROM {dst_dbname}.dim_products;"
df_dim_products = get_dataframe(user_id, pwd, host_name, dst_dbname, sql_dim_products)
df_dim_products.head(2)

Unnamed: 0,product_key,ProductID
0,1,1
1,2,2


In [41]:
# Select 'salesperson_key' and 'SalesPersonID' from the warehouse's dim_salesperson.
# The connection uses the warehouse schema (dst_dbname) that the query targets.
sql_dim_salesperson = f"SELECT salesperson_key, SalesPersonID FROM {dst_dbname}.dim_salesperson;"
df_dim_salesperson = get_dataframe(user_id, pwd, host_name, dst_dbname, sql_dim_salesperson)
df_dim_salesperson.head(2)

Unnamed: 0,salesperson_key,SalesPersonID
0,1,268
1,2,275


In [42]:
# Select 'shipmethod_key' and 'ShipMethodID' from the warehouse's dim_shipmethod.
# The connection uses the warehouse schema (dst_dbname) that the query targets.
sql_dim_shipmethod = f"SELECT shipmethod_key, ShipMethodID FROM {dst_dbname}.dim_shipmethod;"
df_dim_shipmethod = get_dataframe(user_id, pwd, host_name, dst_dbname, sql_dim_shipmethod)
df_dim_shipmethod.head(2)

Unnamed: 0,shipmethod_key,ShipMethodID
0,1,1
1,2,2


In [43]:
# Select 'currencyrate_key' and 'CurrencyRateID' from the warehouse's dim_currencyrate.
# The connection uses the warehouse schema (dst_dbname) that the query targets.
sql_dim_currencyrate = f"SELECT currencyrate_key, CurrencyRateID FROM {dst_dbname}.dim_currencyrate;"
df_dim_currencyrate = get_dataframe(user_id, pwd, host_name, dst_dbname, sql_dim_currencyrate)
df_dim_currencyrate.head(2)

Unnamed: 0,currencyrate_key,CurrencyRateID
0,1,1
1,2,2


Using the Business Keys, look up the corresponding Surrogate Primary Key values in the Dimension tables

In [44]:
# 1. Left-join 'df_fact_sales' to 'df_dim_customers' on the 'CustomerID' column
# 2. Drop the 'CustomerID' business key now that 'customer_key' is attached
# 3. Display the first 2 rows of the dataframe to validate the result
df_fact_sales = df_fact_sales.merge(df_dim_customers, on='CustomerID', how='left')
df_fact_sales.drop(columns=['CustomerID'], inplace=True)
df_fact_sales.head(2)

Unnamed: 0,SalesOrderID,SalesOrderDetailID,CarrierTrackingNumber,OrderQty,ProductID,SpecialOfferID,UnitPrice,LineTotal,RevisionNumber,Status,...,CreditCardApprovalCode,CurrencyRateID,SubTotal,TaxAmt,Freight,TotalDue,order_date_key,ship_date_key,due_date_key,customer_key
0,43659,1,4911-403C-98,1,776,1,2024.994,2024.994,1,5,...,105041Vi84182,,24643.9362,1971.5149,616.0984,27231.5495,20010701,20010708,20010713,676
1,43659,2,4911-403C-98,3,777,1,2024.994,6074.982,1,5,...,105041Vi84182,,24643.9362,1971.5149,616.0984,27231.5495,20010701,20010708,20010713,676


In [45]:
# 1. Left-join 'df_fact_sales' to 'df_dim_products' on the 'ProductID' column
# 2. Drop the 'ProductID' business key now that 'product_key' is attached
# 3. Display the first 2 rows of the dataframe to validate the result
df_fact_sales = df_fact_sales.merge(df_dim_products, on='ProductID', how='left')
df_fact_sales.drop(columns=['ProductID'], inplace=True)
df_fact_sales.head(2)

Unnamed: 0,SalesOrderID,SalesOrderDetailID,CarrierTrackingNumber,OrderQty,SpecialOfferID,UnitPrice,LineTotal,RevisionNumber,Status,SalesOrderNumber,...,CurrencyRateID,SubTotal,TaxAmt,Freight,TotalDue,order_date_key,ship_date_key,due_date_key,customer_key,product_key
0,43659,1,4911-403C-98,1,1,2024.994,2024.994,1,5,SO43659,...,,24643.9362,1971.5149,616.0984,27231.5495,20010701,20010708,20010713,676,281
1,43659,2,4911-403C-98,3,1,2024.994,6074.982,1,5,SO43659,...,,24643.9362,1971.5149,616.0984,27231.5495,20010701,20010708,20010713,676,282


In [46]:
# 1. Left-join 'df_fact_sales' to 'df_dim_salesperson' on the 'SalesPersonID' column
# 2. Drop the 'SalesPersonID' business key now that 'salesperson_key' is attached
# 3. Display the first 2 rows of the dataframe to validate the result
df_fact_sales = df_fact_sales.merge(df_dim_salesperson, on='SalesPersonID', how='left')
df_fact_sales.drop(columns=['SalesPersonID'], inplace=True)
df_fact_sales.head(2)

Unnamed: 0,SalesOrderID,SalesOrderDetailID,CarrierTrackingNumber,OrderQty,SpecialOfferID,UnitPrice,LineTotal,RevisionNumber,Status,SalesOrderNumber,...,SubTotal,TaxAmt,Freight,TotalDue,order_date_key,ship_date_key,due_date_key,customer_key,product_key,salesperson_key
0,43659,1,4911-403C-98,1,1,2024.994,2024.994,1,5,SO43659,...,24643.9362,1971.5149,616.0984,27231.5495,20010701,20010708,20010713,676,281,6.0
1,43659,2,4911-403C-98,3,1,2024.994,6074.982,1,5,SO43659,...,24643.9362,1971.5149,616.0984,27231.5495,20010701,20010708,20010713,676,282,6.0


In [47]:
# 1. Left-join 'df_fact_sales' to 'df_dim_shipmethod' on the 'ShipMethodID' column
# 2. Drop the 'ShipMethodID' business key now that 'shipmethod_key' is attached
# 3. Display the first 2 rows of the dataframe to validate the result
df_fact_sales = df_fact_sales.merge(df_dim_shipmethod, on='ShipMethodID', how='left')
df_fact_sales.drop(columns=['ShipMethodID'], inplace=True)
df_fact_sales.head(2)

Unnamed: 0,SalesOrderID,SalesOrderDetailID,CarrierTrackingNumber,OrderQty,SpecialOfferID,UnitPrice,LineTotal,RevisionNumber,Status,SalesOrderNumber,...,TaxAmt,Freight,TotalDue,order_date_key,ship_date_key,due_date_key,customer_key,product_key,salesperson_key,shipmethod_key
0,43659,1,4911-403C-98,1,1,2024.994,2024.994,1,5,SO43659,...,1971.5149,616.0984,27231.5495,20010701,20010708,20010713,676,281,6.0,5
1,43659,2,4911-403C-98,3,1,2024.994,6074.982,1,5,SO43659,...,1971.5149,616.0984,27231.5495,20010701,20010708,20010713,676,282,6.0,5


In [48]:
# 1. Left-join 'df_fact_sales' to 'df_dim_currencyrate' on the 'CurrencyRateID' column
# 2. Drop the 'CurrencyRateID' business key now that 'currencyrate_key' is attached
# 3. Display the first 2 rows of the dataframe to validate the result
df_fact_sales = df_fact_sales.merge(df_dim_currencyrate, on='CurrencyRateID', how='left')
df_fact_sales.drop(columns=['CurrencyRateID'], inplace=True)
df_fact_sales.head(2)

Unnamed: 0,SalesOrderID,SalesOrderDetailID,CarrierTrackingNumber,OrderQty,SpecialOfferID,UnitPrice,LineTotal,RevisionNumber,Status,SalesOrderNumber,...,Freight,TotalDue,order_date_key,ship_date_key,due_date_key,customer_key,product_key,salesperson_key,shipmethod_key,currencyrate_key
0,43659,1,4911-403C-98,1,1,2024.994,2024.994,1,5,SO43659,...,616.0984,27231.5495,20010701,20010708,20010713,676,281,6.0,5,
1,43659,2,4911-403C-98,3,1,2024.994,6074.982,1,5,SO43659,...,616.0984,27231.5495,20010701,20010708,20010713,676,282,6.0,5,


Final Transformations

In [None]:
# 1. Drop the columns that are not carried into the fact table.
drop_cols = [
    'SpecialOfferID',
    'RevisionNumber',
    'SalesOrderNumber',
    'CarrierTrackingNumber',
    'PurchaseOrderNumber',
    'AccountNumber',
    'BillToAddressID',
    'ShipToAddressID',
    'CreditCardApprovalCode',
]
df_fact_sales.drop(columns=drop_cols, inplace=True)

df_fact_sales.head(2)



Unnamed: 0,SalesOrderID,SalesOrderDetailID,CarrierTrackingNumber,OrderQty,UnitPrice,LineTotal,Status,ContactID,TerritoryID,CreditCardID,...,Freight,TotalDue,order_date_key,ship_date_key,due_date_key,customer_key,product_key,salesperson_key,shipmethod_key,currencyrate_key
0,43659,1,4911-403C-98,1,2024.994,2024.994,5,378,5,16281.0,...,616.0984,27231.5495,20010701,20010708,20010713,676,281,6.0,5,
1,43659,2,4911-403C-98,3,2024.994,6074.982,5,378,5,16281.0,...,616.0984,27231.5495,20010701,20010708,20010713,676,282,6.0,5,


In [53]:
# 2. Reorder the remaining columns: order identifiers, dimension keys, other
#    IDs, then the measures and status.
order_of_cols = [
    'SalesOrderID', 'SalesOrderDetailID',
    'customer_key', 'product_key', 'salesperson_key', 'shipmethod_key', 'currencyrate_key',
    'order_date_key', 'ship_date_key', 'due_date_key',
    'ContactID', 'TerritoryID', 'CreditCardID',
    'OrderQty', 'UnitPrice', 'LineTotal', 'SubTotal', 'TaxAmt', 'Freight', 'TotalDue',
    'Status',
]
df_fact_sales = df_fact_sales[order_of_cols]

# 3. Surrogate primary key: 1..N in row order, placed as the first column.
df_fact_sales.insert(0, 'fact_sales_key', range(1, len(df_fact_sales) + 1))

# 4. Display the first 2 rows of the dataframe to validate the result.
df_fact_sales.head(2)

Unnamed: 0,fact_sales_key,SalesOrderID,SalesOrderDetailID,customer_key,product_key,salesperson_key,shipmethod_key,currencyrate_key,order_date_key,ship_date_key,...,TerritoryID,CreditCardID,OrderQty,UnitPrice,LineTotal,SubTotal,TaxAmt,Freight,TotalDue,Status
0,1,43659,1,676,281,6.0,5,,20010701,20010708,...,5,16281.0,1,2024.994,2024.994,24643.9362,1971.5149,616.0984,27231.5495,5
1,2,43659,2,676,282,6.0,5,,20010701,20010708,...,5,16281.0,3,2024.994,6074.982,24643.9362,1971.5149,616.0984,27231.5495,5


In [54]:
# Write the fact table to the destination database and set its primary key.
table_name = "fact_sales"
primary_key = "fact_sales_key"
db_operation = "insert"

set_dataframe(
    user_id=user_id,
    pwd=pwd,
    host_name=host_name,
    db_name=dst_dbname,
    df=df_fact_sales,
    table_name=table_name,
    pk_column=primary_key,
    db_operation=db_operation,
)

#### SQL Queries