In [17]:
import os 
import duckdb 
from dotenv import load_dotenv

In [18]:
# Load the variables defined in the local .env file into the process environment.
load_dotenv()

# Directories the rest of the notebook depends on:
#   base_path     -> prefix of the datasets/ folder the payments CSV is read from
#   database_path -> folder that holds the DuckDB database file
base_path = os.getenv('base_path')
database = os.getenv('database_path')

# Fail fast on missing configuration: os.getenv returns None for an unset
# variable, and None would otherwise be formatted as the literal text "None"
# inside the paths built from these values.
missing_settings = [
    name
    for name, value in (('base_path', base_path), ('database_path', database))
    if value is None
]
if missing_settings:
    raise EnvironmentError(
        "Missing required environment variable(s): "
        + ", ".join(missing_settings)
        + ". Define them in the .env file or in the environment."
    )

In [19]:
# Path of the DuckDB file inside the configured database directory.
database_path = "{}/database.db".format(database)

# Connect to the DuckDB database stored at that path.
ddb = duckdb.connect(database_path)

In [20]:
# Ingest the order-payments CSV into dim_payments. The table is only created
# when it does not exist yet; the CSV path is passed as a query parameter.
payments_csv = f"{base_path}/datasets/olist_order_payments_dataset.csv"
create_dim_payments = '''
        CREATE TABLE IF NOT EXISTS dim_payments AS 
        SELECT 
            *
        FROM 
            read_csv_auto(?);
    '''
ddb.execute(create_dim_payments, (payments_csv,))

<duckdb.duckdb.DuckDBPyConnection at 0x10e5437f0>

In [26]:
# Preview the rows currently stored in dim_payments.
preview_query = '''
        SELECT *
        FROM dim_payments;
    '''
ddb.sql(preview_query)

┌──────────────────────────────────┬───────────────────┬─────────────┬─────────────────────┬─────────┐
│             OrderId              │ PaymentSequential │ PaymentType │ PaymentInstallments │ Payment │
│             varchar              │       int64       │   varchar   │        int64        │ double  │
├──────────────────────────────────┼───────────────────┼─────────────┼─────────────────────┼─────────┤
│ b81ef226f3fe1789b1e8b2acac839d17 │                 1 │ credit_card │                   8 │   99.33 │
│ a9810da82917af2d9aefd1278f1dcfa0 │                 1 │ credit_card │                   1 │   24.39 │
│ 25e8ea4e93396b6fa0d3dd708e76c1bd │                 1 │ credit_card │                   1 │   65.71 │
│ ba78997921bbcdc1373bb41e913ab953 │                 1 │ credit_card │                   8 │  107.78 │
│ 42fdf880ba16b47b59251dd489d4441a │                 1 │ credit_card │                   2 │  128.45 │
│ 298fcdf1f73eb413e4d26d01b25bc1cd │                 1 │ credit_card │   

In [31]:
# Rename the snake_case CSV headers of dim_payments to PascalCase column names.
#
# dim_payments is stored in the on-disk database and is only created when it
# does not exist yet, so on a re-run the columns may already carry their new
# names. Each rename is therefore applied only while the old name is still
# present, which keeps this cell safe to execute more than once.
column_renames = {
    'order_id': 'OrderId',
    'payment_sequential': 'PaymentSequential',
    'payment_type': 'PaymentType',
    'payment_installments': 'PaymentInstallments',
    'payment_value': 'PaymentValue',
}

# Column names the table has right now.
existing_columns = {
    row[0]
    for row in ddb.execute(
        '''
            SELECT column_name
            FROM information_schema.columns
            WHERE table_name = 'dim_payments';
        '''
    ).fetchall()
}

for old_name, new_name in column_renames.items():
    if old_name not in existing_columns:
        # Already renamed (or never present): nothing to do for this column.
        continue
    ddb.execute(
        f'''
            ALTER TABLE dim_payments
            RENAME {old_name} TO {new_name};
        '''
    )

In [35]:
# List each dim_payments column together with its data type.
column_types_query = '''
        SELECT column_name, data_type
        FROM information_schema.columns
        WHERE table_name = 'dim_payments';
    '''
ddb.sql(column_types_query)

┌─────────────────────┬───────────┐
│     column_name     │ data_type │
│       varchar       │  varchar  │
├─────────────────────┼───────────┤
│ OrderId             │ VARCHAR   │
│ PaymentSequential   │ INTEGER   │
│ PaymentType         │ VARCHAR   │
│ PaymentInstallments │ INTEGER   │
│ PaymentValue        │ DOUBLE    │
└─────────────────────┴───────────┘

In [33]:
# Count the total rows and, per column, how many values are NULL.
# Each result column is aliased to the name of the column it counts NULLs for.
null_counts_query = '''
        SELECT
            COUNT(*) AS total_rows,
            COUNT(CASE WHEN OrderId IS NULL THEN 1 END) AS OrderId,
            COUNT(CASE WHEN PaymentSequential IS NULL THEN 1 END) AS PaymentSequential,
            COUNT(CASE WHEN PaymentType IS NULL THEN 1 END) AS PaymentType,
            COUNT(CASE WHEN PaymentInstallments IS NULL THEN 1 END) AS PaymentInstallments,
            COUNT(CASE WHEN PaymentValue IS NULL THEN 1 END) AS PaymentValue,
        FROM 
            dim_payments;
    '''
ddb.sql(null_counts_query)

┌────────────┬─────────┬───────────────────┬─────────────┬─────────────────────┬──────────────┐
│ total_rows │ OrderId │ PaymentSequential │ PaymentType │ PaymentInstallments │ PaymentValue │
│   int64    │  int64  │       int64       │    int64    │        int64        │    int64     │
├────────────┼─────────┼───────────────────┼─────────────┼─────────────────────┼──────────────┤
│     103886 │       0 │                 0 │           0 │                   0 │            0 │
└────────────┴─────────┴───────────────────┴─────────────┴─────────────────────┴──────────────┘

In [34]:
# Convert PaymentSequential and PaymentInstallments to INT, one ALTER per column.
for int_column in ("PaymentSequential", "PaymentInstallments"):
    ddb.sql(
        f'''
        ALTER TABLE dim_payments 
        ALTER COLUMN {int_column} SET DATA TYPE INT;
    '''
    )

In [36]:
# Export dim_payments to a CSV file with a header row.
#
# The destination directory can be overridden with the `analytics_path`
# environment variable; when it is unset, the original location is used so
# existing runs keep writing to the same place.
output_dir = os.getenv(
    'analytics_path',
    "/Users/macintoshcider/Documents/Programming/Python/ETL/climate-commerce/analytics",
)

# Actually ensure the directory exists, so the export does not depend on the
# folder having been created by hand beforehand.
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "dim_payments.csv")

# The path is embedded in a SQL string literal, so double any single quote in
# it to keep the statement well-formed.
escaped_output_file = output_file.replace("'", "''")
ddb.execute(
    f"""
        COPY dim_payments TO '{escaped_output_file}' (FORMAT CSV, HEADER);
    """
)
print(f"Data successfully exported to {output_file}")

Data successfully exported to /Users/macintoshcider/Documents/Programming/Python/ETL/climate-commerce/analytics/dim_payments.csv


In [37]:
# Close the DuckDB connection (`ddb`) as the last step of the notebook.
ddb.close()