In [107]:
# using python to perform ETL

In [109]:
%%python -m pip install pymongo
!pip install sqlalchemy



In [110]:
import os
import json
import numpy
import datetime
import certifi
import pandas as pd

import pymongo
import sqlalchemy
from sqlalchemy import create_engine, text

In [112]:
print(f"Running SQL Alchemy Version: {sqlalchemy.__version__}")
print(f"Running PyMongo Version: {pymongo.__version__}")

Running SQL Alchemy Version: 2.0.34
Running PyMongo Version: 4.11.2


In [113]:
# MongoDB: Declare and Assign Connection Variables for the MongoDB Server 
mysql_args = {
    "uid" : "root",
    "pwd" : "Strawberrylime",
    "hostname" : "localhost",
    "dbname" : "classicmodels_dw"
}
conn_str = f"mysql+pymysql://{mysql_args['uid']}:{mysql_args['pwd']}@{mysql_args['hostname']}/{mysql_args['dbname']}"
sqlEngine = create_engine(conn_str, pool_recycle=3600)

# The 'cluster_location' must either be "atlas" or "local".
mongodb_args = {
    "user_name" : "aryaa_desh",
    "password" : "Strawberrylime",
    "cluster_name" : "Cluster0",
    "cluster_subnet" : "xxxxx",
    "cluster_location" : "local", # "local"
    "db_name" : "classicmodels_dw" ## Why does this say "northwind_purchasing"?
}

In [115]:
# Declare & Assign Connection Variables for the MySQL Server & Databases with which You'll be Working
host_name = "localhost"
port = "3306"
user_id = "root"
pwd = "Strawberrylime"

src_dbname = "classicmodels"
dst_dbname = "classicmodels_dw"

In [120]:
# Define Functions for Getting Data From and Setting Data into Database for MongoDB
def get_sql_dataframe(sql_query, **args):
    '''Create a connection to the MySQL database'''
    conn_str = f"mysql+pymysql://{args['uid']}:{args['pwd']}@{args['hostname']}/{args['dbname']}"
    sqlEngine = create_engine(conn_str, pool_recycle=3600)
    connection = sqlEngine.connect()
    
    '''Invoke the pd.read_sql() function to query the database, and fill a Pandas DataFrame.'''
    dframe = pd.read_sql(text(sql_query), connection);
    connection.close()
    
    return dframe
    

# def set_dataframe(df, table_name, pk_column, db_operation, **args):
#     '''Create a connection to the MySQL database'''
#     conn_str = f"mysql+pymysql://{args['uid']}:{args['pwd']}@{args['hostname']}/{args['dbname']}"
#     sqlEngine = create_engine(conn_str, pool_recycle=3600)
#     connection = sqlEngine.connect()
    
#     '''Invoke the Pandas DataFrame .to_sql( ) function to either create, or append to, a table'''
#     if db_operation == "insert":
#         df.to_sql(table_name, con=connection, index=False, if_exists='append')
#         connection.execute(text(f"ALTER TABLE {table_name} ADD PRIMARY KEY ({pk_column});"))
            
#     elif db_operation == "update":
#         df.to_sql(table_name, con=connection, index=False, if_exists='append')
    
#     connection.close()

def get_mongo_client(**args):
    '''Validate proper input'''
    if args["cluster_location"] not in ['atlas', 'local']:
        raise Exception("You must specify either 'atlas' or 'local' for the cluster_location parameter.")
    
    else:
        if args["cluster_location"] == "atlas":
            connect_str = f"mongodb+srv://{args['user_name']}:{args['password']}@"
            connect_str += f"{args['cluster_name']}.{args['cluster_subnet']}.mongodb.net"
            client = pymongo.MongoClient(connect_str, tlsCAFile=certifi.where())
            
        elif args["cluster_location"] == "local":
            client = pymongo.MongoClient("mongodb://localhost:27017/")
        
    return client


def get_mongo_dataframe(mongo_client, db_name, collection, query):
    '''Query MongoDB, and fill a python list with documents to create a DataFrame'''
    db = mongo_client[db_name]
    dframe = pd.DataFrame(list(db[collection].find(query)))
    dframe.drop(['_id'], axis=1, inplace=True)
    mongo_client.close()
    
    return dframe


def set_mongo_collections(mongo_client, db_name, data_directory, json_files):
    db = mongo_client[db_name]
    
    for file in json_files:
        db.drop_collection(file)
        json_file = os.path.join(data_directory, json_files[file])
        with open(json_file, 'r') as openfile:
            json_object = json.load(openfile)
            file = db[file]
            result = file.insert_many(json_object)
        
    mongo_client.close()


In [122]:
# Define CSV File Path (Ensure the file is in the correct directory)
data_dir = os.getcwd()  # Adjust if necessary
data_file = os.path.join(data_dir, "orders.csv")

# Read CSV into Pandas DataFrame
df = pd.read_csv(data_file)

# Load Data into MySQL
table_name = "orders"  # Ensure this matches your MySQL table name
df.to_sql(name=table_name, con=sqlEngine, if_exists="replace", index=False)

print(f"Data successfully inserted into `{table_name}` table in `{mysql_args['dbname']}` database.")

Data successfully inserted into `orders` table in `classicmodels_dw` database.


In [124]:
# Populate MongoDB with Source Data

In [126]:
!nslookup cluster_name.xxxxx.mongodb.net

Server:		128.143.2.7
Address:	128.143.2.7#53

** server can't find cluster_name.xxxxx.mongodb.net: NXDOMAIN



In [128]:
client = get_mongo_client(**mongodb_args)

# Gets the path of the Current Working Directory for this Notebook,
# and then Appends the 'data' directory.
data_dir = os.path.join(os.getcwd(), 'data')
print(f"Current working directory: {os.getcwd()}")

json_files = {"payments" : 'payments.json'}

set_mongo_collections(client, mongodb_args["db_name"], data_dir, json_files)      

Current working directory: /Users/aryaadeshpande/Desktop


In [130]:

# Extract Data from the Source MongoDV Collections into DataFrames
client = pymongo.MongoClient("mongodb://localhost:27017/")  # Adjust the URI as needed
db = client["db_name"]

client = get_mongo_client(**mongodb_args)

query = {} # Select all elements (columns), and all documents (rows).
collection = "payments"

df_payments = get_mongo_dataframe(client, mongodb_args["db_name"], collection, query)
df_payments.head(2)

Unnamed: 0,customerNumber,checkNumber,paymentDate,amount
0,103,HQ336336,2004-10-19,6066.78
1,103,JM555205,2003-06-05,14571.44


In [132]:
# At this point, run the [DS2002 Midterm dim_date SQL file]

In [135]:
# Look up the payments_date keys from the date dimension table
sql_dim_date = "SELECT date_key, full_date FROM classicmodels_dw.dim_date;"
df_dim_date = get_sql_dataframe(sql_dim_date, **mysql_args)
df_dim_date.full_date = df_dim_date.full_date.astype('datetime64[ns]').dt.date
df_dim_date.head(10)


Unnamed: 0,date_key,full_date
0,20000101,2000-01-01
1,20000102,2000-01-02
2,20000103,2000-01-03
3,20000104,2000-01-04
4,20000105,2000-01-05
5,20000106,2000-01-06
6,20000107,2000-01-07
7,20000108,2000-01-08
8,20000109,2000-01-09
9,20000110,2000-01-10


In [137]:
# Look up the Surrogate Primary Key (date_key) that corresponds to the paymentDate column

df_dim_paymentDate = df_dim_date.rename(columns={"date_key" : "paymentDate_key", "full_date" : "paymentDate"})
df_payments.paymentDate = df_payments.paymentDate.astype('datetime64[ns]').dt.date
df_payments = pd.merge(df_payments, df_dim_paymentDate, on='paymentDate', how='left')
df_payments.drop(['paymentDate'], axis=1, inplace=True)
df_payments.head(2)

Unnamed: 0,customerNumber,checkNumber,amount,paymentDate_key
0,103,HQ336336,6066.78,20041019
1,103,JM555205,14571.44,20030605


In [139]:
# Perform Any Necessary Transformations to the DataFrames

# 3. Insert a new column, with an ever-incrementing numeric value, to serve as the primary key.
df_payments.drop(['checkNumber'], axis=1, inplace=True)
df_payments.insert(0, "payment_key", range(1, df_payments.shape[0]+1))
df_payments.head(2)

Unnamed: 0,payment_key,customerNumber,amount,paymentDate_key
0,1,103,6066.78,20041019
1,2,103,14571.44,20030605


In [141]:
# Load the transformed payments DataFrames into the New Data Warehouse by creating new tables

In [144]:
# # Define Functions for Getting Data From and Setting Data Into Databases
def get_dataframe(user_id, pwd, host_name, db_name, sql_query):
    conn_str = f"mysql+pymysql://{user_id}:{pwd}@{host_name}/{db_name}"
    sqlEngine = create_engine(conn_str, pool_recycle=3600)
    connection = sqlEngine.connect()
    dframe = pd.read_sql(sql_query, connection);
    connection.close()
    
    return dframe


# def set_dataframe(user_id, pwd, host_name, db_name, df, table_name, pk_column, db_operation):
#     conn_str = f"mysql+pymysql://{user_id}:{pwd}@{host_name}/{db_name}"
#     sqlEngine = create_engine(conn_str, pool_recycle=3600)
#     connection = sqlEngine.connect()
    
#     if db_operation == "insert":
#         df.to_sql(table_name, con=connection, index=False, if_exists='replace')
#         connection.execute(text(f"ALTER TABLE {table_name} ADD PRIMARY KEY ({pk_column});"))
            
#     elif db_operation == "update":
#         df.to_sql(table_name, con=connection, index=False, if_exists='append')
    
#     connection.close()

def set_dataframe(df, table_name, pk_column, db_operation, **args):
    '''Create a connection to the MySQL database'''
    conn_str = f"mysql+pymysql://{args['uid']}:{args['pwd']}@{args['hostname']}/{args['dbname']}"
    sqlEngine = create_engine(conn_str, pool_recycle=3600)
    connection = sqlEngine.connect()

    # Debugging the query and connection
    print(f"Inserting into {table_name} with primary key {pk_column}")

    # Safely insert or update data
    if db_operation == "insert":
        try:
            df.to_sql(table_name, con=connection, index=False, if_exists='append')  # Append data without replacing
            print("Data inserted successfully.")
        except Exception as e:
            print(f"Error inserting data into {table_name}: {e}")

        # Ensure the primary key column exists with AUTO_INCREMENT before inserting
        try:
            # Check if the column exists
            result = connection.execute(f"SHOW COLUMNS FROM {table_name} LIKE '{pk_column}';")
            if not result.fetchone():
                print(f"Adding primary key column {pk_column} as AUTO_INCREMENT.")
                # Add primary key if it doesn't exist
                connection.execute(f"ALTER TABLE {table_name} ADD COLUMN {pk_column} INT AUTO_INCREMENT PRIMARY KEY;")
        except Exception as e:
            print(f"Error adding primary key to {table_name}: {e}")

    elif db_operation == "replace":
        try:
            df.to_sql(table_name, con=connection, index=False, if_exists='replace')  # ⚠️ Use this carefully!
            connection.execute(f"ALTER TABLE {table_name} ADD PRIMARY KEY ({pk_column});")
            print("Data replaced successfully.")
        except Exception as e:
            print(f"Error replacing data into {table_name}: {e}")

    connection.close()


In [146]:
# TODO: Upload the "payments" dataframe to create the new "dim_payments" dimension table
# Check for duplicates in the payment_key column

dataframe = df_payments
table_name = 'dim_payments'
primary_key = 'payment_key'
db_operation = "update"

set_dataframe(dataframe, table_name, primary_key, db_operation, **mysql_args)

Inserting into dim_payments with primary key payment_key


In [148]:
# fetching data for dimensions table

In [150]:
# fetching data from customers
sql_customers = "SELECT * FROM classicmodels.customers;"
df_customers = get_dataframe(user_id, pwd, host_name, src_dbname, sql_customers)
df_customers.head(2)

Unnamed: 0,customerNumber,customerName,contactLastName,contactFirstName,phone,addressLine1,addressLine2,city,state,postalCode,country,salesRepEmployeeNumber,creditLimit
0,103,Atelier graphique,Schmitt,Carine,40.32.2555,"54, rue Royale",,Nantes,,44000,France,1370.0,21000.0
1,112,Signal Gift Stores,King,Jean,7025551838,8489 Strong St.,,Las Vegas,NV,83030,USA,1166.0,71800.0


In [152]:
# fetching data from orders
sql_orders = "SELECT * FROM classicmodels_dw.orders"
df_orders = get_dataframe(user_id, pwd, host_name, dst_dbname, sql_orders)
df_orders.head(2)

Unnamed: 0,OrderID,OrderDate,RequiredDate,ShippedDate,Status,Comments,CustomerID
0,10100,2003-01-06,2003-01-13,2003-01-10,Shipped,,363
1,10101,2003-01-09,2003-01-18,2003-01-11,Shipped,Check on availability.,128


In [154]:
# fetching data from products
sql_products = "SELECT * FROM classicmodels.products;"
df_products = get_dataframe(user_id, pwd, host_name, src_dbname, sql_products)
df_products.head(2)

Unnamed: 0,productCode,productName,productLine,productScale,productVendor,productDescription,quantityInStock,buyPrice,MSRP
0,S10_1678,1969 Harley Davidson Ultimate Chopper,Motorcycles,1:10,Min Lin Diecast,"This replica features working kickstand, front...",7933,48.81,95.7
1,S10_1949,1952 Alpine Renault 1300,Classic Cars,1:10,Classic Metal Creations,Turnable front wheels; steering function; deta...,7305,98.58,214.3


In [156]:
# Perform any necessary transformations

In [158]:
# CUSTOMERS
# 1. Create a List that enumerates the names of each column you wish to remove (drop) from the Pandas DataFrame
drop_cols = ['contactLastName','contactFirstName','phone','addressLine2','salesRepEmployeeNumber','creditLimit']
df_customers.drop(drop_cols, axis=1, inplace=True)

# 2. Rename the "customerNumber" column to "customer_id" to reflect the entity as it will serve as the business key for lookup operations
df_customers.rename(columns={"customerNumber":"customer_id"}, inplace=True)

# 3. Insert a new column, with an ever-incrementing numeric value, to serve as the primary key.
df_customers.insert(0, "customer_key", range(1, df_customers.shape[0]+1))

# 4. Display the first 2 rows of the dataframe to validate your work
df_customers.head(2)

Unnamed: 0,customer_key,customer_id,customerName,addressLine1,city,state,postalCode,country
0,1,103,Atelier graphique,"54, rue Royale",Nantes,,44000,France
1,2,112,Signal Gift Stores,8489 Strong St.,Las Vegas,NV,83030,USA


In [160]:
# PRODUCTS
# 1. Create a List that enumerates the names of each column you wish to remove (drop) from the Pandas DataFrame
drop_cols = ['productScale','productVendor','productDescription','quantityInStock']
df_products.drop(drop_cols, axis=1, inplace=True)

# 2. Rename the "productCode" column to "product_id" to reflect the entity as it will serve as the business key for lookup operations
df_products.rename(columns={"productCode":"product_id"}, inplace=True)

# 3. Insert a new column, with an ever-incrementing numeric value, to serve as the primary key.
df_products.insert(0, "product_key", range(1, df_products.shape[0]+1))

# 4. Display the first 2 rows of the dataframe to validate your work
df_products.head(2)

Unnamed: 0,product_key,product_id,productName,productLine,buyPrice,MSRP
0,1,S10_1678,1969 Harley Davidson Ultimate Chopper,Motorcycles,48.81,95.7
1,2,S10_1949,1952 Alpine Renault 1300,Classic Cars,98.58,214.3


In [162]:
sql_check_table = "SHOW TABLES;"
df_tables = get_dataframe(user_id, pwd, host_name, dst_dbname, sql_check_table)
print(df_tables)

  Tables_in_classicmodels_dw
0              dim_customers
1                   dim_date
2               dim_payments
3               dim_products
4                     orders
5                 sales_fact


In [164]:
# load the Transformed DataFrames into the New Data Warehouse by Creating New Tables

In [166]:
db_operation = "update"

tables = [('dim_customers', df_customers, 'customer_key'),
          ('dim_products', df_products, 'product_key')]

In [168]:
for table_name, dataframe, primary_key in tables:
    set_dataframe(dataframe, table_name, primary_key, db_operation, **mysql_args)


Inserting into dim_customers with primary key customer_key
Inserting into dim_products with primary key product_key


In [170]:
# creating and populating the fact table called sales_fact

In [172]:
sql_check_table = "SHOW TABLES;"
df_tables = get_dataframe(user_id, pwd, host_name, dst_dbname, sql_check_table)
print(df_tables)

  Tables_in_classicmodels_dw
0              dim_customers
1                   dim_date
2               dim_payments
3               dim_products
4                     orders
5                 sales_fact


In [174]:
sql_sales_fact = """
SELECT 
    c.customer_key, 
    py.payment_key, 
    py.amount AS total_price,
    py.paymentDate_key
FROM dim_payments py
JOIN dim_customers c ON py.customerNumber = c.customer_key


"""

# sql_sales_fact = """
# SELECT 
#     c.customer_key, 
#     py.payment_key, 
#     py.amount AS total_price,
#     py.paymentDate_key,
#     sf.product_key,
#     p.productName  -- Optionally include the product name from dim_products
# FROM dim_payments py
# JOIN dim_customers c ON py.customerNumber = c.customer_key
# JOIN sales_fact sf ON sf.payment_key = py.payment_key
# LEFT JOIN dim_products p ON sf.product_key = p.product_key;
# """

df_sales_fact = get_dataframe(user_id, pwd, host_name, dst_dbname, sql_sales_fact)
print(df_sales_fact.shape)  # Should be a non-zero shape if data exists
print(df_sales_fact.head(2))  # Check the first 2 rows to ensure data is fetched


(17, 4)
   customer_key  payment_key  total_price  paymentDate_key
0           103            1      6066.78         20041019
1           103            2     14571.44         20030605


In [176]:
# 3. Insert a new column, with an ever-incrementing numeric value, to serve as the primary key.
df_sales_fact.insert(0, "sales_fact_key", range(1, df_sales_fact.shape[0] + 1))
# df_sales_fact.drop(columns=["sales_fact_key"], errors="ignore", inplace=True)

# 4. Display the first 2 rows of the dataframe to validate your work
df_sales_fact.head(2)

Unnamed: 0,sales_fact_key,customer_key,payment_key,total_price,paymentDate_key
0,1,103,1,6066.78,20041019
1,2,103,2,14571.44,20030605


In [178]:
# Insert data into the sales_fact table

df_sales_fact.to_sql('sales_fact', con=sqlEngine, if_exists='replace', index=False)


print("Data successfully inserted into the sales_fact table.")
df_sales_fact.head(5)

Data successfully inserted into the sales_fact table.


Unnamed: 0,sales_fact_key,customer_key,payment_key,total_price,paymentDate_key
0,1,103,1,6066.78,20041019
1,2,103,2,14571.44,20030605
2,3,103,3,1676.14,20041218
3,4,112,4,14191.12,20041217
4,5,112,5,32641.98,20030606


In [180]:
# demonstrate that the new data warehouse exists and contains the correct data

In [183]:
#This query retrieves total sales data for each customer, including total transactions, revenue, and average payment amount.

sql_query = """
SELECT 
    c.customer_key, 
    c.customerName, 
    COUNT(sf.sales_fact_key) AS total_sales, 
    SUM(py.amount) AS total_revenue, 
    AVG(py.amount) AS avg_payment_amount
FROM sales_fact sf
JOIN dim_payments py ON sf.payment_key = py.payment_key
JOIN dim_customers c ON py.customerNumber = c.customer_key
GROUP BY c.customer_key, c.customerName
ORDER BY total_revenue DESC;
"""

# Execute the SQL query and load the results into a Pandas DataFrame
df_total_sales_by_customer = pd.read_sql(sql_query, sqlEngine)

# Display the first 2 rows to check the output
print(df_total_sales_by_customer.head(2))

   customer_key              customerName  total_sales  total_revenue  \
0           114       Mit Vergnügen & Co.            4      180585.07   
1           119  Signal Collectibles Ltd.            3      116949.68   

   avg_payment_amount  
0        45146.267500  
1        38983.226667  
