## Data Science 2002 - Anran Zhao (nrb6yu)

#### Import the Necessary Libraries

In [1]:
import os
import json
import numpy
import datetime
import certifi
import pandas as pd

import pymongo
import sqlalchemy
from sqlalchemy import create_engine

#### Declare & Assign Connection Variables for the MySQL Server & Databases with which You'll be Working 

In [2]:
host_name = "localhost"
port = "3306"
user_id = "root"
pwd = "Passw0rd123"

src_dbname = "sakila"
dst_dbname = "sakila2"

#### Define Functions for Getting Data From and Setting Data Into Databases

In [3]:
def get_dataframe(user_id, pwd, host_name, db_name, sql_query):
    conn_str = f"mysql+pymysql://{user_id}:{pwd}@{host_name}/{db_name}"
    sqlEngine = create_engine(conn_str, pool_recycle=3600)
    connection = sqlEngine.connect()
    dframe = pd.read_sql(sql_query, connection);
    connection.close()
    
    return dframe


def set_dataframe(user_id, pwd, host_name, db_name, df, table_name, pk_column, db_operation):
    conn_str = f"mysql+pymysql://{user_id}:{pwd}@{host_name}/{db_name}"
    sqlEngine = create_engine(conn_str, pool_recycle=3600)
    connection = sqlEngine.connect()
    
    if db_operation == "insert":
        df.to_sql(table_name, con=connection, index=False, if_exists='replace')
        sqlEngine.execute(f"ALTER TABLE {table_name} ADD PRIMARY KEY ({pk_column});")
            
    elif db_operation == "update":
        df.to_sql(table_name, con=connection, index=False, if_exists='append')
    
    connection.close()

#### Create the New Data Warehouse database, and to Use it, Switch the Connection Context.

In [4]:
conn_str = f"mysql+pymysql://{user_id}:{pwd}@{host_name}"
sqlEngine = create_engine(conn_str, pool_recycle=3600)

sqlEngine.execute(f"DROP DATABASE IF EXISTS `{dst_dbname}`;")
sqlEngine.execute(f"CREATE DATABASE `{dst_dbname}`;")
sqlEngine.execute(f"USE {dst_dbname};")

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x2501296f550>

#### Extract Data from the Source Database Tables from SQL

In [5]:
sql_customers = "SELECT * FROM sakila.customer;"
df_customers = get_dataframe(user_id, pwd, host_name, src_dbname, sql_customers)
df_customers.head(2)

Unnamed: 0,customer_id,store_id,first_name,last_name,email,address_id,active,create_date,last_update
0,1,1,MARY,SMITH,MARY.SMITH@sakilacustomer.org,5,1,2006-02-14 22:04:36,2006-02-14 23:57:20
1,2,1,PATRICIA,JOHNSON,PATRICIA.JOHNSON@sakilacustomer.org,6,1,2006-02-14 22:04:36,2006-02-14 23:57:20


In [6]:
sql_films = "SELECT * FROM sakila.film;"
df_films = get_dataframe(user_id, pwd, host_name, src_dbname, sql_films)
df_films.head(2)

Unnamed: 0,film_id,title,description,release_year,language_id,original_language_id,rental_duration,rental_rate,length,replacement_cost,rating,special_features,last_update
0,1,ACADEMY DINOSAUR,A Epic Drama of a Feminist And a Mad Scientist...,2006,1,,6,0.99,86,20.99,PG,"Deleted Scenes,Behind the Scenes",2006-02-15 00:03:42
1,2,ACE GOLDFINGER,A Astounding Epistle of a Database Administrat...,2006,1,,3,4.99,48,12.99,G,"Trailers,Deleted Scenes",2006-02-15 00:03:42


In [7]:
sql_staff = "SELECT * FROM sakila.staff;"
df_staff = get_dataframe(user_id, pwd, host_name, src_dbname, sql_staff)
df_staff.head(2)

Unnamed: 0,staff_id,first_name,last_name,address_id,picture,email,store_id,active,username,password,last_update
0,1,Mike,Hillyer,3,b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\...,Mike.Hillyer@sakilastaff.com,1,1,Mike,8cb2237d0679ca88db6464eac60da96345513964,2006-02-14 22:57:16
1,2,Jon,Stephens,4,,Jon.Stephens@sakilastaff.com,2,1,Jon,,2006-02-14 22:57:16


### Create the Date Dimension Table


In [8]:
# Customers
drop_cols = ['active','create_date']
df_customers.drop(drop_cols, axis=1, inplace=True)
df_customers.rename(columns={"id":"customer_id"}, inplace=True)
df_customers.insert(0, "customer_key", range(1, df_customers.shape[0]+1))

df_customers.head(2)

Unnamed: 0,customer_key,customer_id,store_id,first_name,last_name,email,address_id,last_update
0,1,1,1,MARY,SMITH,MARY.SMITH@sakilacustomer.org,5,2006-02-14 23:57:20
1,2,2,1,PATRICIA,JOHNSON,PATRICIA.JOHNSON@sakilacustomer.org,6,2006-02-14 23:57:20


In [9]:
# Films
drop_cols = ['language_id','original_language_id','replacement_cost','special_features']
df_films.drop(drop_cols, axis=1, inplace=True)
df_films.rename(columns={"id":"film_id"}, inplace=True)
df_films.insert(0, "film_key", range(1, df_films.shape[0]+1))

df_films.head(2)

Unnamed: 0,film_key,film_id,title,description,release_year,rental_duration,rental_rate,length,rating,last_update
0,1,1,ACADEMY DINOSAUR,A Epic Drama of a Feminist And a Mad Scientist...,2006,6,0.99,86,PG,2006-02-15 00:03:42
1,2,2,ACE GOLDFINGER,A Astounding Epistle of a Database Administrat...,2006,3,4.99,48,G,2006-02-15 00:03:42


In [10]:
# Staff
drop_cols = ['address_id','picture','store_id','active']
df_staff.drop(drop_cols, axis=1, inplace=True)
df_staff.rename(columns={"id":"staff_id"}, inplace=True)
df_staff.insert(0, "staff_key", range(1, df_staff.shape[0]+1))

df_staff.head(2)

Unnamed: 0,staff_key,staff_id,first_name,last_name,email,username,password,last_update
0,1,1,Mike,Hillyer,Mike.Hillyer@sakilastaff.com,Mike,8cb2237d0679ca88db6464eac60da96345513964,2006-02-14 22:57:16
1,2,2,Jon,Stephens,Jon.Stephens@sakilastaff.com,Jon,,2006-02-14 22:57:16


#### Load the Transformed DataFrames into the New Data Warehouse by Creating New Tables

In [11]:
db_operation = "insert"

tables = [('dim_customers', df_customers, 'customer_key'),
          ('dim_films', df_films, 'film_key'),
          ('dim_staff', df_staff, 'staff_key')]

In [12]:
for table_name, dataframe, primary_key in tables:
    set_dataframe(user_id, pwd, host_name, dst_dbname, dataframe, table_name, primary_key, db_operation)

#### Declare & Assign Connection Variables for the MongoDB Server, the MySQL Server & Databases with which You'll be Working 

In [13]:
mysql_args = {
    "uid" : "root",
    "pwd" : "Passw0rd123",
    "hostname" : "localhost",
    "dbname" : "sakila2"
}

# The 'cluster_location' must either be "atlas" or "local".
mongodb_args = {
    "user_name" : "nrb6yu",
    "password" : "Passw0rd123!",
    "cluster_name" : "sandbox",
    "cluster_subnet" : "mvvgsd3",
    "cluster_location" : "atlas", # "local"
    "db_name" : "northwind_purchasing"
}

#### Define Functions for Getting Data From and Setting Data Into Databases

In [14]:
def get_sql_dataframe(sql_query, **args):
    '''Create a connection to the MySQL database'''
    conn_str = f"mysql+pymysql://{args['uid']}:{args['pwd']}@{args['hostname']}/{args['dbname']}"
    sqlEngine = create_engine(conn_str, pool_recycle=3600)
    connection = sqlEngine.connect()
    
    '''Invoke the pd.read_sql() function to query the database, and fill a Pandas DataFrame.'''
    dframe = pd.read_sql(sql_query, connection);
    connection.close()
    
    return dframe
    

def set_dataframe(df, table_name, pk_column, db_operation, **args):
    '''Create a connection to the MySQL database'''
    conn_str = f"mysql+pymysql://{args['uid']}:{args['pwd']}@{args['hostname']}/{args['dbname']}"
    sqlEngine = create_engine(conn_str, pool_recycle=3600)
    connection = sqlEngine.connect()
    
    '''Invoke the Pandas DataFrame .to_sql( ) function to either create, or append to, a table'''
    if db_operation == "insert":
        df.to_sql(table_name, con=connection, index=False, if_exists='replace')
        connection.execute(f"ALTER TABLE {table_name} ADD PRIMARY KEY ({pk_column});")
            
    elif db_operation == "update":
        df.to_sql(table_name, con=connection, index=False, if_exists='append')
    
    connection.close()


def get_mongo_client(**args):
    '''Validate proper input'''
    if args["cluster_location"] not in ['atlas', 'local']:
        raise Exception("You must specify either 'atlas' or 'local' for the cluster_location parameter.")
    
    else:
        if args["cluster_location"] == "atlas":
            connect_str = f"mongodb+srv://{args['user_name']}:{args['password']}@"
            connect_str += f"{args['cluster_name']}.{args['cluster_subnet']}.mongodb.net"
            client = pymongo.MongoClient(connect_str, tlsCAFile=certifi.where())
            
        elif args["cluster_location"] == "local":
            client = pymongo.MongoClient("mongodb://localhost:27017/")
        
    return client


def get_mongo_dataframe(mongo_client, db_name, collection, query):
    '''Query MongoDB, and fill a python list with documents to create a DataFrame'''
    db = mongo_client[db_name]
    dframe = pd.DataFrame(list(db[collection].find(query)))
    dframe.drop(['_id'], axis=1, inplace=True)
    mongo_client.close()
    
    return dframe


def set_mongo_collections(mongo_client, db_name, data_directory, json_files):
    db = mongo_client[db_name]
    
    for file in json_files:
        db.drop_collection(file)
        json_file = os.path.join(data_directory, json_files[file])
        with open(json_file, 'r') as openfile:
            json_object = json.load(openfile)
            file = db[file]
            result = file.insert_many(json_object)
        
    mongo_client.close()

#### Populate MongoDB with Source Data

In [15]:
client = get_mongo_client(**mongodb_args)

# Gets the path of the Current Working Directory for this Notebook,
# and then Appends the 'data' directory.
data_dir = os.path.join(os.getcwd(), 'data')

json_files = {"inventory" : 'inventory.json'
             }

set_mongo_collections(client, mongodb_args["db_name"], data_dir, json_files)         

#### Create and Populate the New Dimension Tables from MongoDB

In [16]:
client = get_mongo_client(**mongodb_args)

query = {} # Select all elements (columns), and all documents (rows).
collection = "inventory"

df_inventory = get_mongo_dataframe(client, mongodb_args["db_name"], collection, query)
df_inventory.head(2)

Unnamed: 0,inventory_id,film_id,store_id
0,1,1,1
1,2,1,1


In [17]:
df_inventory.insert(0, "inventory_key", range(1, df_inventory.shape[0]+1))
df_inventory.head(2)

Unnamed: 0,inventory_key,inventory_id,film_id,store_id
0,1,1,1,1
1,2,2,1,1


#### Load the Transformed DataFrames into the New Data Warehouse by Creating New Tables

In [20]:
dataframe = df_customers
table_name = 'dim_customers'
primary_key = 'customer_key'
db_operation = "insert"

set_dataframe(dataframe, table_name, primary_key, db_operation, **mysql_args)

In [21]:
dataframe = df_films
table_name = 'dim_films'
primary_key = 'film_key'
db_operation = "insert"

set_dataframe(dataframe, table_name, primary_key, db_operation, **mysql_args)

In [22]:
dataframe = df_staff
table_name = 'dim_staff'
primary_key = 'staff_key'
db_operation = "insert"

set_dataframe(dataframe, table_name, primary_key, db_operation, **mysql_args)

In [23]:
dataframe = df_inventory
table_name = 'dim_inventory'
primary_key = 'inventory_key'
db_operation = "insert"

set_dataframe(dataframe, table_name, primary_key, db_operation, **mysql_args)

#### Validate that the New Dimension Tables were Created

In [24]:
sql_customers = "SELECT * FROM sakila2.dim_customers;"
df_dim_customers = get_sql_dataframe(sql_customers, **mysql_args)
df_dim_customers.head(2)

Unnamed: 0,customer_key,customer_id,store_id,first_name,last_name,email,address_id,last_update
0,1,1,1,MARY,SMITH,MARY.SMITH@sakilacustomer.org,5,2006-02-14 23:57:20
1,2,2,1,PATRICIA,JOHNSON,PATRICIA.JOHNSON@sakilacustomer.org,6,2006-02-14 23:57:20


In [25]:
sql_films = "SELECT * FROM sakila2.dim_films;"
df_dim_films = get_sql_dataframe(sql_films, **mysql_args)
df_dim_films.head(2)

Unnamed: 0,film_key,film_id,title,description,release_year,rental_duration,rental_rate,length,rating,last_update
0,1,1,ACADEMY DINOSAUR,A Epic Drama of a Feminist And a Mad Scientist...,2006,6,0.99,86,PG,2006-02-15 00:03:42
1,2,2,ACE GOLDFINGER,A Astounding Epistle of a Database Administrat...,2006,3,4.99,48,G,2006-02-15 00:03:42


In [26]:
sql_staff = "SELECT * FROM sakila2.dim_staff;"
df_dim_staff = get_sql_dataframe(sql_staff, **mysql_args)
df_dim_staff.head(2)

Unnamed: 0,staff_key,staff_id,first_name,last_name,email,username,password,last_update
0,1,1,Mike,Hillyer,Mike.Hillyer@sakilastaff.com,Mike,8cb2237d0679ca88db6464eac60da96345513964,2006-02-14 22:57:16
1,2,2,Jon,Stephens,Jon.Stephens@sakilastaff.com,Jon,,2006-02-14 22:57:16


In [27]:
sql_inventory = "SELECT * FROM sakila2.dim_inventory;"
df_dim_inventory = get_sql_dataframe(sql_inventory, **mysql_args)
df_dim_inventory.head(2)

Unnamed: 0,inventory_key,inventory_id,film_id,store_id
0,1,1,1,1
1,2,2,1,1


## Create and Populate the New Fact Tables

#### Instead, implement the solution using Pandas DataFrames to craft the table

In [28]:
sql_payments = "SELECT * FROM sakila.payment;"
df_payments = get_dataframe(user_id, pwd, host_name, src_dbname, sql_payments)
df_payments.rename(columns={"id":"payment_id"}, inplace=True)
df_payments.head(2)

Unnamed: 0,payment_id,customer_id,staff_id,rental_id,amount,payment_date,last_update
0,1,1,1,76,2.99,2005-05-25 11:30:37,2006-02-15 17:12:30
1,2,1,1,573,0.99,2005-05-28 10:35:23,2006-02-15 17:12:30


In [29]:
sql_inventory = "SELECT * FROM sakila.inventory;"
df_inventory = get_dataframe(user_id, pwd, host_name, src_dbname, sql_inventory)
df_inventory.rename(columns={"id":"inventory_id"}, inplace=True)
df_inventory.head(2)

Unnamed: 0,inventory_id,film_id,store_id,last_update
0,1,1,1,2006-02-15 00:09:17
1,2,1,1,2006-02-15 00:09:17


In [30]:
sql_rentals = "SELECT * FROM sakila.rental;"
df_rentals = get_dataframe(user_id, pwd, host_name, src_dbname, sql_rentals)
df_rentals.rename(columns={"id":"rental_id"}, inplace=True)
df_rentals.head(2)

Unnamed: 0,rental_id,rental_date,inventory_id,customer_id,return_date,staff_id,last_update
0,1,2005-05-24 22:53:30,367,130,2005-05-26 22:04:30,1,2006-02-15 16:30:53
1,2,2005-05-24 22:54:33,1525,459,2005-05-28 19:40:33,1,2006-02-15 16:30:53


#### Get the rental_payment_inventory column

In [31]:
df_rental_payment = pd.merge(df_rentals, df_payments, on='rental_id', how='inner')
df_rental_payment.head(2)

Unnamed: 0,rental_id,rental_date,inventory_id,customer_id_x,return_date,staff_id_x,last_update_x,payment_id,customer_id_y,staff_id_y,amount,payment_date,last_update_y
0,1,2005-05-24 22:53:30,367,130,2005-05-26 22:04:30,1,2006-02-15 16:30:53,3504,130,1,2.99,2005-05-24 22:53:30,2006-02-15 17:13:16
1,2,2005-05-24 22:54:33,1525,459,2005-05-28 19:40:33,1,2006-02-15 16:30:53,12377,459,2,2.99,2005-05-24 22:54:33,2006-02-15 17:19:16


In [32]:
df_rental_payment_inventory = pd.merge(df_rental_payment, df_inventory, on='inventory_id', how='inner')
df_rental_payment_inventory.rename(columns={"customer_id_x":"customer_id"}, inplace=True)
df_rental_payment_inventory.drop(['customer_id_y'], axis=1)
df_rental_payment_inventory.rename(columns={"staff_id_x":"staff_id"}, inplace=True)
df_rental_payment_inventory.drop(['staff_id_y'], axis=1)
df_rental_payment_inventory.head(2)

Unnamed: 0,rental_id,rental_date,inventory_id,customer_id,return_date,staff_id,last_update_x,payment_id,customer_id_y,staff_id_y,amount,payment_date,last_update_y,film_id,store_id,last_update
0,1,2005-05-24 22:53:30,367,130,2005-05-26 22:04:30,1,2006-02-15 16:30:53,3504,130,1,2.99,2005-05-24 22:53:30,2006-02-15 17:13:16,80,1,2006-02-15 00:09:17
1,1577,2005-06-16 04:03:28,367,327,2005-06-24 22:40:28,2,2006-02-15 16:30:53,8828,327,2,3.99,2005-06-16 04:03:28,2006-02-15 17:16:10,80,1,2006-02-15 00:09:17


In [33]:
df_rental_payment_inventory.shape

(16044, 16)

####  Lookup the Primary Keys from the Dimension Tables

In [34]:
sql_customers = "SELECT customer_key, customer_id FROM sakila2.dim_customers;"
df_dim_customers = get_dataframe(user_id, pwd, host_name, src_dbname, sql_customers)
df_dim_customers.head(2)

Unnamed: 0,customer_key,customer_id
0,1,1
1,2,2


In [35]:
sql_staff = "SELECT staff_key, staff_id FROM sakila2.dim_staff;"
df_dim_staff = get_dataframe(user_id, pwd, host_name, src_dbname, sql_staff)
df_dim_staff.head(2)

Unnamed: 0,staff_key,staff_id
0,1,1
1,2,2


In [36]:
sql_films = "SELECT film_key, film_id FROM sakila2.dim_films;"
df_dim_films = get_dataframe(user_id, pwd, host_name, src_dbname, sql_films)
df_dim_films.head(2)

Unnamed: 0,film_key,film_id
0,1,1
1,2,2


In [37]:
sql_inventory = "SELECT inventory_key, inventory_id FROM sakila2.dim_inventory;"
df_dim_inventory = get_dataframe(user_id, pwd, host_name, src_dbname, sql_inventory)
df_dim_inventory.head(2)

Unnamed: 0,inventory_key,inventory_id
0,1,1
1,2,2


#### Next, using the Business Keys, lookup the corresponding Surrogate Primary Key values in the Dimension tables

In [38]:
df_rental_payment_inventory = pd.merge(df_rental_payment_inventory, df_dim_customers, on='customer_id', how='right')
df_rental_payment_inventory.drop(['customer_id'], axis=1)
df_rental_payment_inventory.head(2)

Unnamed: 0,rental_id,rental_date,inventory_id,customer_id,return_date,staff_id,last_update_x,payment_id,customer_id_y,staff_id_y,amount,payment_date,last_update_y,film_id,store_id,last_update,customer_key
0,76,2005-05-25 11:30:37,3021,1,2005-06-03 12:00:37,2,2006-02-15 16:30:53,1,1,1,2.99,2005-05-25 11:30:37,2006-02-15 17:12:30,663,2,2006-02-15 00:09:17,1
1,573,2005-05-28 10:35:23,4020,1,2005-06-03 06:32:23,1,2006-02-15 16:30:53,2,1,1,0.99,2005-05-28 10:35:23,2006-02-15 17:12:30,875,2,2006-02-15 00:09:17,1


In [39]:
df_rental_payment_inventory = pd.merge(df_rental_payment_inventory, df_dim_staff, on='staff_id', how='right')
df_rental_payment_inventory.drop(['staff_id'], axis=1)
df_rental_payment_inventory.head(2)

Unnamed: 0,rental_id,rental_date,inventory_id,customer_id,return_date,staff_id,last_update_x,payment_id,customer_id_y,staff_id_y,amount,payment_date,last_update_y,film_id,store_id,last_update,customer_key,staff_key
0,573,2005-05-28 10:35:23,4020,1,2005-06-03 06:32:23,1,2006-02-15 16:30:53,2,1,1,0.99,2005-05-28 10:35:23,2006-02-15 17:12:30,875,2,2006-02-15 00:09:17,1,1
1,7273,2005-07-27 11:31:22,2465,1,2005-07-31 06:50:22,1,2006-02-15 16:30:53,15,1,2,2.99,2005-07-27 11:31:22,2006-02-15 17:12:30,539,1,2006-02-15 00:09:17,1,1


In [40]:
df_rental_payment_inventory = pd.merge(df_rental_payment_inventory, df_dim_films, on='film_id', how='right')
df_rental_payment_inventory.drop(['film_id'], axis=1)
df_rental_payment_inventory.head(2)

Unnamed: 0,rental_id,rental_date,inventory_id,customer_id,return_date,staff_id,last_update_x,payment_id,customer_id_y,staff_id_y,amount,payment_date,last_update_y,film_id,store_id,last_update,customer_key,staff_key,film_key
0,10141.0,2005-07-31 22:08:29,8.0,8.0,2005-08-06 16:59:29,1.0,2006-02-15 16:30:53,222.0,8.0,2.0,0.99,2005-07-31 22:08:29,2006-02-15 17:12:31,1,2.0,2006-02-15 00:09:17,8.0,1.0,1
1,12651.0,2005-08-18 18:36:16,8.0,34.0,2005-08-22 22:01:16,1.0,2006-02-15 16:30:53,947.0,34.0,1.0,0.99,2005-08-18 18:36:16,2006-02-15 17:12:37,1,2.0,2006-02-15 00:09:17,34.0,1.0,1


In [41]:
df_rental_payment_inventory = pd.merge(df_rental_payment_inventory, df_dim_inventory, on='inventory_id', how='right')
df_rental_payment_inventory.drop(['inventory_id'], axis=1)
df_rental_payment_inventory.head(2)

Unnamed: 0,rental_id,rental_date,inventory_id,customer_id,return_date,staff_id,last_update_x,payment_id,customer_id_y,staff_id_y,amount,payment_date,last_update_y,film_id,store_id,last_update,customer_key,staff_key,film_key,inventory_key
0,14714.0,2005-08-21 21:27:43,1.0,279.0,2005-08-30 22:26:43,1.0,2006-02-15 16:30:53,7578.0,279.0,1.0,3.99,2005-08-21 21:27:43,2006-02-15 17:15:20,1.0,1.0,2006-02-15 00:09:17,279.0,1.0,1.0,1
1,11433.0,2005-08-02 20:13:10,1.0,518.0,2005-08-11 21:35:10,1.0,2006-02-15 16:30:53,13956.0,518.0,2.0,3.99,2005-08-02 20:13:10,2006-02-15 17:20:51,1.0,1.0,2006-02-15 00:09:17,518.0,1.0,1.0,1


#### Lookup the DateKeys from the Date Dimension Table.

In [42]:
sql_dim_date = "SELECT date_key, full_date FROM sakila.dim_date;"
df_dim_date = get_dataframe(user_id, pwd, host_name, src_dbname, sql_dim_date)
df_dim_date.full_date = df_dim_date.full_date.astype('datetime64[ns]').dt.date
df_dim_date.head(2)

Unnamed: 0,date_key,full_date
0,20000101,2000-01-01
1,20000102,2000-01-02


In [43]:
df_dim_return_date = df_dim_date.rename(columns={"date_key" : "return_date_key", "full_date" : "return_date"})
df_rental_payment_inventory.return_date = df_rental_payment_inventory.return_date.astype('datetime64[ns]').dt.date

df_rental_payment_inventory = pd.merge(df_rental_payment_inventory, df_dim_return_date, on='return_date', how='left')
df_rental_payment_inventory.drop(['return_date'], axis=1, inplace=True)
df_rental_payment_inventory.head(2)

Unnamed: 0,rental_id,rental_date,inventory_id,customer_id,staff_id,last_update_x,payment_id,customer_id_y,staff_id_y,amount,payment_date,last_update_y,film_id,store_id,last_update,customer_key,staff_key,film_key,inventory_key,return_date_key
0,14714.0,2005-08-21 21:27:43,1.0,279.0,1.0,2006-02-15 16:30:53,7578.0,279.0,1.0,3.99,2005-08-21 21:27:43,2006-02-15 17:15:20,1.0,1.0,2006-02-15 00:09:17,279.0,1.0,1.0,1,20050830.0
1,11433.0,2005-08-02 20:13:10,1.0,518.0,1.0,2006-02-15 16:30:53,13956.0,518.0,2.0,3.99,2005-08-02 20:13:10,2006-02-15 17:20:51,1.0,1.0,2006-02-15 00:09:17,518.0,1.0,1.0,1,20050811.0


In [44]:
df_dim_rental_date = df_dim_date.rename(columns={"date_key" : "rental_date_key", "full_date" : "rental_date"})
df_rental_payment_inventory.rental_date = df_rental_payment_inventory.rental_date.astype('datetime64[ns]').dt.date

df_rental_payment_inventory = pd.merge(df_rental_payment_inventory, df_dim_rental_date, on='rental_date', how='left')
df_rental_payment_inventory.drop(['rental_date'], axis=1, inplace=True)
df_rental_payment_inventory.head(2)

Unnamed: 0,rental_id,inventory_id,customer_id,staff_id,last_update_x,payment_id,customer_id_y,staff_id_y,amount,payment_date,last_update_y,film_id,store_id,last_update,customer_key,staff_key,film_key,inventory_key,return_date_key,rental_date_key
0,14714.0,1.0,279.0,1.0,2006-02-15 16:30:53,7578.0,279.0,1.0,3.99,2005-08-21 21:27:43,2006-02-15 17:15:20,1.0,1.0,2006-02-15 00:09:17,279.0,1.0,1.0,1,20050830.0,20050821.0
1,11433.0,1.0,518.0,1.0,2006-02-15 16:30:53,13956.0,518.0,2.0,3.99,2005-08-02 20:13:10,2006-02-15 17:20:51,1.0,1.0,2006-02-15 00:09:17,518.0,1.0,1.0,1,20050811.0,20050802.0


In [45]:
df_dim_last_update = df_dim_date.rename(columns={"date_key" : "last_update_key", "full_date" : "last_update"})
df_rental_payment_inventory.last_update = df_rental_payment_inventory.last_update.astype('datetime64[ns]').dt.date

df_rental_payment_inventory = pd.merge(df_rental_payment_inventory, df_dim_last_update, on='last_update', how='left')
df_rental_payment_inventory.drop(['last_update'], axis=1, inplace=True)
df_rental_payment_inventory.head(2)

Unnamed: 0,rental_id,inventory_id,customer_id,staff_id,last_update_x,payment_id,customer_id_y,staff_id_y,amount,payment_date,last_update_y,film_id,store_id,customer_key,staff_key,film_key,inventory_key,return_date_key,rental_date_key,last_update_key
0,14714.0,1.0,279.0,1.0,2006-02-15 16:30:53,7578.0,279.0,1.0,3.99,2005-08-21 21:27:43,2006-02-15 17:15:20,1.0,1.0,279.0,1.0,1.0,1,20050830.0,20050821.0,20060215.0
1,11433.0,1.0,518.0,1.0,2006-02-15 16:30:53,13956.0,518.0,2.0,3.99,2005-08-02 20:13:10,2006-02-15 17:20:51,1.0,1.0,518.0,1.0,1.0,1,20050811.0,20050802.0,20060215.0


In [46]:
df_rental_payment_inventory.insert(0, "fact_order_key", range(1, df_rental_payment_inventory.shape[0]+1))
df_rental_payment_inventory.head(2)

Unnamed: 0,fact_order_key,rental_id,inventory_id,customer_id,staff_id,last_update_x,payment_id,customer_id_y,staff_id_y,amount,...,last_update_y,film_id,store_id,customer_key,staff_key,film_key,inventory_key,return_date_key,rental_date_key,last_update_key
0,1,14714.0,1.0,279.0,1.0,2006-02-15 16:30:53,7578.0,279.0,1.0,3.99,...,2006-02-15 17:15:20,1.0,1.0,279.0,1.0,1.0,1,20050830.0,20050821.0,20060215.0
1,2,11433.0,1.0,518.0,1.0,2006-02-15 16:30:53,13956.0,518.0,2.0,3.99,...,2006-02-15 17:20:51,1.0,1.0,518.0,1.0,1.0,1,20050811.0,20050802.0,20060215.0


 #### Write the DataFrame Back to the Database

In [47]:
def set_dataframe(user_id, pwd, host_name, db_name, df, table_name, pk_column, db_operation):
    conn_str = f"mysql+pymysql://{user_id}:{pwd}@{host_name}/{db_name}"
    sqlEngine = create_engine(conn_str, pool_recycle=3600)
    connection = sqlEngine.connect()
    
    if db_operation == "insert":
        df.to_sql(table_name, con=connection, index=False, if_exists='replace')
        sqlEngine.execute(f"ALTER TABLE {table_name} ADD PRIMARY KEY ({pk_column});")
            
    elif db_operation == "update":
        df.to_sql(table_name, con=connection, index=False, if_exists='append')
    
    connection.close()

In [48]:
table_name = "fact_orders"
primary_key = "fact_order_key"
db_operation = "insert"

set_dataframe(user_id, pwd, host_name, dst_dbname, df_rental_payment_inventory, table_name, primary_key, db_operation)

#### Demonstrate that the New Data Warehouse Exists and Contains the Correct Data

In [50]:
sql_test = """
SELECT films.`title` AS `film_name`,
    AVG(films.`rental_rate`) AS `average_rental_rate`,
    AVG(films.`rental_duration`) AS `average_rental_duration`
FROM `sakila2`.`fact_orders` AS orders
INNER JOIN `sakila2`.`dim_films` AS films
ON films.film_id = films.film_key
GROUP BY films.`title`
ORDER BY film_name ASC;
""".format(dst_dbname)

df_test = get_dataframe(user_id, pwd, host_name, src_dbname, sql_test)
df_test.head()

Unnamed: 0,film_name,average_rental_rate,average_rental_duration
0,ACADEMY DINOSAUR,0.99,6.0
1,ACE GOLDFINGER,4.99,3.0
2,ADAPTATION HOLES,2.99,7.0
3,AFFAIR PREJUDICE,2.99,5.0
4,AFRICAN EGG,2.99,6.0
