In [1]:
import os 
import duckdb 
from dotenv import load_dotenv

In [2]:
# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

# Paths used by the rest of the notebook:
#   base_path -> root folder that contains the `datasets/` directory
#   database  -> folder that contains `database.db`
base_path = os.getenv('base_path')
database = os.getenv('database_path')

# os.getenv returns None for an unset variable; without this check the later
# f-strings would silently build paths such as "None/database.db".
_missing_env = [
    env_name
    for env_name, env_value in (('base_path', base_path), ('database_path', database))
    if not env_value
]
if _missing_env:
    raise EnvironmentError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_env)
        + ". Define them in the environment or in the .env file."
    )

In [3]:
# Full path of the DuckDB file inside the configured database folder.
database_path = "{}/database.db".format(database)

# Open a connection to that database file; `ddb` is reused by every later cell.
ddb = duckdb.connect(database_path)

In [4]:
# Creating a view over the raw orders CSV.
orders_csv = f"{base_path}/datasets/olist_orders_dataset.csv"

# The view is only created when it does not already exist in the database.
create_view_sql = f'''
        CREATE VIEW IF NOT EXISTS view_orders_data AS 
        SELECT *
        FROM read_csv_auto('{orders_csv}');
    '''

ddb.execute(create_view_sql)

<duckdb.duckdb.DuckDBPyConnection at 0x105ea5eb0>

In [5]:
# Build the dim_orders table from the orders view, keeping only the order id,
# its status and the approval / delivery timestamps.
# The statement is a no-op when dim_orders already exists.
create_dim_orders_sql = '''
        CREATE TABLE IF NOT EXISTS dim_orders AS 
        SELECT 
            order_id,
            order_status,
            order_approved_at,
            order_delivered_carrier_date,
            order_delivered_customer_date,
            order_estimated_delivery_date
        FROM view_orders_data;
    '''

ddb.execute(create_dim_orders_sql)

<duckdb.duckdb.DuckDBPyConnection at 0x105ea5eb0>

In [6]:
# Renaming the columns of dim_orders (source name -> final name).
# NOTE: "EstimatedDelieryDate" is misspelt, but it is kept as-is because the
# null-check query and the exported CSV header use that exact name.
column_renames = {
    "order_id": "OrderId",
    "order_status": "OrderStatus",
    "order_approved_at": "ApprovedAt",
    "order_delivered_carrier_date": "DeliveredCarrierDate",
    "order_delivered_customer_date": "DeliveredCustomerDate",
    "order_estimated_delivery_date": "EstimatedDelieryDate",
}

# dim_orders lives in a persistent database file and is created with
# IF NOT EXISTS, so on a re-run the columns may already carry their new names.
# Look up the current column names and rename only the ones still pending,
# which keeps this cell safe under "Restart Kernel -> Run All".
current_columns = {
    row[0].lower()
    for row in ddb.execute(
        '''
            SELECT column_name
            FROM information_schema.columns
            WHERE table_name = 'dim_orders';
        '''
    ).fetchall()
}

for old_name, new_name in column_renames.items():
    if old_name in current_columns:
        ddb.execute(
            f'''
                ALTER TABLE dim_orders 
                RENAME {old_name} TO {new_name};
            '''
        )

In [7]:
# Inspect the name and data type of each dim_orders column.
column_types_sql = '''
        SELECT column_name, data_type
        FROM information_schema.columns
        WHERE table_name = 'dim_orders';
    '''

# Last expression of the cell, so the resulting relation is displayed.
ddb.sql(column_types_sql)

┌───────────────────────┬───────────┐
│      column_name      │ data_type │
│        varchar        │  varchar  │
├───────────────────────┼───────────┤
│ OrderId               │ VARCHAR   │
│ OrderStatus           │ VARCHAR   │
│ ApprovedAt            │ TIMESTAMP │
│ DeliveredCarrierDate  │ TIMESTAMP │
│ DeliveredCustomerDate │ TIMESTAMP │
│ EstimatedDelieryDate  │ TIMESTAMP │
└───────────────────────┴───────────┘

In [8]:
# Count the NULLs per column of dim_orders, next to the total row count.
# Each result column is named after the table column it counts NULLs for.
null_counts_sql = '''
        SELECT
            COUNT(*) AS total_rows,
            COUNT(CASE WHEN OrderId IS NULL THEN 1 END) AS OrderId,
            COUNT(CASE WHEN OrderStatus IS NULL THEN 1 END) AS OrderStatus,
            COUNT(CASE WHEN ApprovedAt IS NULL THEN 1 END) AS ApprovedAt,
            COUNT(CASE WHEN DeliveredCarrierDate IS NULL THEN 1 END) AS DeliveredCarrierDate,
            COUNT(CASE WHEN DeliveredCustomerDate IS NULL THEN 1 END) AS DeliveredCustomerDate,
            COUNT(CASE WHEN EstimatedDelieryDate IS NULL THEN 1 END) AS EstimatedDelieryDate,
        FROM 
            dim_orders;
    '''

# Last expression of the cell, so the resulting relation is displayed.
ddb.sql(null_counts_sql)

┌────────────┬─────────┬─────────────┬────────────┬──────────────────────┬───────────────────────┬──────────────────────┐
│ total_rows │ OrderId │ OrderStatus │ ApprovedAt │ DeliveredCarrierDate │ DeliveredCustomerDate │ EstimatedDelieryDate │
│   int64    │  int64  │    int64    │   int64    │        int64         │         int64         │        int64         │
├────────────┼─────────┼─────────────┼────────────┼──────────────────────┼───────────────────────┼──────────────────────┤
│      99441 │       0 │           0 │        160 │                 1783 │                  2965 │                    0 │
└────────────┴─────────┴─────────────┴────────────┴──────────────────────┴───────────────────────┴──────────────────────┘

In [10]:
# The nulls are retained on purpose: in ApprovedAt, DeliveredCarrierDate and DeliveredCustomerDate a missing value is itself meaningful for that column.

In [11]:
# Define the output directory and ensure it exists.
# The location can be overridden with the optional `analytics_path`
# environment variable; when it is unset the original location is used.
output_dir = os.getenv(
    'analytics_path',
    "/Users/macintoshcider/Documents/Programming/Python/ETL/climate-commerce/analytics",
)
# Create the directory (and any missing parents) so the export below does not
# depend on the folder having been created by hand; no-op when it exists.
os.makedirs(output_dir, exist_ok=True)

output_file = os.path.join(output_dir, "dim_orders.csv")

# Export the whole dim_orders table as a CSV file with a header row.
ddb.execute(
    f"""
        COPY dim_orders TO '{output_file}' (FORMAT CSV, HEADER);
    """
)
print(f"Data successfully exported to {output_file}")

Data successfully exported to /Users/macintoshcider/Documents/Programming/Python/ETL/climate-commerce/analytics/dim_orders.csv


In [12]:
# Finally, close the DuckDB connection opened at the top of the notebook.
# After this call `ddb` can no longer be used to run queries.
ddb.close()