In [1]:
import os 
import duckdb 
from dotenv import load_dotenv

In [3]:
# Load the variables declared in the local .env file into the process environment.
load_dotenv()

# Directories this notebook depends on, both supplied through the environment:
#   base_path     -> root folder that contains the `datasets/` directory
#   database_path -> folder that holds the DuckDB database file
base_path = os.getenv('base_path')
database = os.getenv('database_path')

# Fail early with a clear message: a missing variable would otherwise be
# formatted as the literal text "None" inside every path built from it.
missing_env = [
    env_name
    for env_name, env_value in (('base_path', base_path), ('database_path', database))
    if not env_value
]
if missing_env:
    raise EnvironmentError(
        f"Missing required environment variable(s): {', '.join(missing_env)}"
    )

In [4]:
# Full path of the DuckDB file inside the configured database directory.
database_path = "{}/database.db".format(database)

# Open a connection to that database file; `ddb` is used by every later cell.
ddb = duckdb.connect(database_path)

In [5]:
# Build dim_products from the products CSV joined to the category translation CSV.
# CREATE TABLE IF NOT EXISTS makes this a no-op when the table is already present.
# NOTE(review): this is an inner join, so products whose category has no matching
# translation row are left out of the table; confirm that this is intended.
products_csv = f"{base_path}/datasets/olist_products_dataset.csv"
translation_csv = f"{base_path}/datasets/product_category_name_translation.csv"

create_dim_products_sql = '''
        CREATE TABLE IF NOT EXISTS dim_products AS
        SELECT 
            p.*,
            t.product_category_name_english
        FROM 
            read_csv_auto(?) p
        JOIN
            read_csv_auto(?) t
        ON 
            p.product_category_name = t.product_category_name;
    '''

# The two `?` placeholders are bound, in order, to the products and translation files.
ddb.execute(create_dim_products_sql, (products_csv, translation_csv))

<duckdb.duckdb.DuckDBPyConnection at 0x103ea23f0>

In [11]:
# Show every column of dim_products together with its data type.
schema_query = '''
        SELECT column_name, data_type
        FROM information_schema.columns
        WHERE table_name = 'dim_products';
    '''
ddb.sql(schema_query)

┌─────────────────────┬───────────┐
│     column_name     │ data_type │
│       varchar       │  varchar  │
├─────────────────────┼───────────┤
│ ProductId           │ VARCHAR   │
│ CategoryName        │ VARCHAR   │
│ NameLength          │ BIGINT    │
│ DescriptionLength   │ BIGINT    │
│ ProductPhotosQty    │ BIGINT    │
│ ProductWeightInG    │ BIGINT    │
│ ProductLengthCM     │ BIGINT    │
│ ProductHeightCM     │ BIGINT    │
│ ProductWidthCM      │ BIGINT    │
│ CategoryNameEnglish │ VARCHAR   │
├─────────────────────┴───────────┤
│ 10 rows               2 columns │
└─────────────────────────────────┘

In [9]:
# Rename the raw Olist column names in dim_products to the PascalCase names
# used by the rest of the notebook (the source spelling "lenght" is intentional:
# it is how the columns are named in the original dataset).
column_renames = {
    "product_id": "ProductId",
    "product_category_name": "CategoryName",
    "product_name_lenght": "NameLength",
    "product_description_lenght": "DescriptionLength",
    "product_photos_qty": "ProductPhotosQty",
    "product_weight_g": "ProductWeightInG",
    "product_length_cm": "ProductLengthCM",
    "product_height_cm": "ProductHeightCM",
    "product_width_cm": "ProductWidthCM",
    "product_category_name_english": "CategoryNameEnglish",
}

# Columns currently present on the table. Checking first makes the cell safe to
# re-run: the table is only created once, so on a second run the old names no
# longer exist and an unconditional RENAME would raise.
existing_columns = {
    row[0]
    for row in ddb.execute(
        '''
            SELECT column_name
            FROM information_schema.columns
            WHERE table_name = 'dim_products';
        '''
    ).fetchall()
}

for old_name, new_name in column_renames.items():
    if old_name not in existing_columns:
        continue  # already renamed (or not present) -> nothing to do
    ddb.sql(
        f'''
            ALTER TABLE dim_products
            RENAME {old_name} TO {new_name};
        '''
    )

In [23]:
# Re-inspect the dim_products schema: one row per column with its data type.
dim_products_schema_sql = '''
        SELECT column_name, data_type
        FROM information_schema.columns
        WHERE table_name = 'dim_products';
    '''
ddb.sql(dim_products_schema_sql)

┌─────────────────────┬───────────┐
│     column_name     │ data_type │
│       varchar       │  varchar  │
├─────────────────────┼───────────┤
│ ProductId           │ VARCHAR   │
│ CategoryName        │ VARCHAR   │
│ NameLength          │ INTEGER   │
│ DescriptionLength   │ INTEGER   │
│ ProductPhotosQty    │ INTEGER   │
│ ProductWeightInG    │ INTEGER   │
│ ProductLengthCM     │ INTEGER   │
│ ProductHeightCM     │ INTEGER   │
│ ProductWidthCM      │ INTEGER   │
│ CategoryNameEnglish │ VARCHAR   │
├─────────────────────┴───────────┤
│ 10 rows               2 columns │
└─────────────────────────────────┘

In [21]:
# Null audit for dim_products: total row count plus, for every column, the
# number of rows where that column IS NULL (COUNT ignores the NULL produced by
# a CASE with no matching branch, so only the null rows are counted).
null_audit_sql = '''
        SELECT
            COUNT(*) AS total_rows,
            COUNT(CASE WHEN ProductId IS NULL THEN 1 END) AS ProductId,
            COUNT(CASE WHEN CategoryName IS NULL THEN 1 END) AS CategoryName,
            COUNT(CASE WHEN NameLength IS NULL THEN 1 END) AS NameLength,
            COUNT(CASE WHEN DescriptionLength IS NULL THEN 1 END) AS DescriptionLength,
            COUNT(CASE WHEN ProductPhotosQty IS NULL THEN 1 END) AS ProductPhotosQty,
            COUNT(CASE WHEN ProductWeightInG IS NULL THEN 1 END) AS ProductWeightInG,
            COUNT(CASE WHEN ProductLengthCM IS NULL THEN 1 END) AS ProductLengthCM,
            COUNT(CASE WHEN ProductHeightCM IS NULL THEN 1 END) AS ProductHeightCM,
            COUNT(CASE WHEN ProductWidthCM IS NULL THEN 1 END) AS ProductWidthCM,
            COUNT(CASE WHEN CategoryNameEnglish IS NULL THEN 1 END) AS CategoryNameEnglish,
            
        FROM 
            dim_products;
    '''
ddb.sql(null_audit_sql)

┌────────────┬───────────┬──────────────┬────────────┬───────────────────┬──────────────────┬──────────────────┬─────────────────┬─────────────────┬────────────────┬─────────────────────┐
│ total_rows │ ProductId │ CategoryName │ NameLength │ DescriptionLength │ ProductPhotosQty │ ProductWeightInG │ ProductLengthCM │ ProductHeightCM │ ProductWidthCM │ CategoryNameEnglish │
│   int64    │   int64   │    int64     │   int64    │       int64       │      int64       │      int64       │      int64      │      int64      │     int64      │        int64        │
├────────────┼───────────┼──────────────┼────────────┼───────────────────┼──────────────────┼──────────────────┼─────────────────┼─────────────────┼────────────────┼─────────────────────┤
│      32328 │         0 │            0 │          0 │                 0 │                0 │                0 │               0 │               0 │              0 │                   0 │
└────────────┴───────────┴──────────────┴────────────┴──────

In [20]:
# Cleaning null values in the weight and dimension columns.
# A missing measurement is replaced by the sentinel -1, meaning "value not given".
# Each column is coalesced on its own, so a row that is missing only one
# measurement keeps its other, valid measurements instead of having all four
# overwritten with -1.
ddb.sql(
    '''
        UPDATE dim_products
        SET 
            ProductWeightInG = COALESCE(ProductWeightInG, -1),
            ProductLengthCM = COALESCE(ProductLengthCM, -1),
            ProductHeightCM = COALESCE(ProductHeightCM, -1),
            ProductWidthCM = COALESCE(ProductWidthCM, -1)
        WHERE 
            ProductWeightInG IS NULL
            OR ProductLengthCM IS NULL
            OR ProductHeightCM IS NULL
            OR ProductWidthCM IS NULL;
    '''
)

In [22]:
# Change the numeric columns of dim_products to the INT data type.
# One ALTER statement is issued per column; the list below is the single place
# to edit when a column needs to be added or removed.
int_columns = (
    "NameLength",
    "DescriptionLength",
    "ProductPhotosQty",
    "ProductWeightInG",
    "ProductLengthCM",
    "ProductHeightCM",
    "ProductWidthCM",
)

for column_name in int_columns:
    ddb.sql(
        f'''
            ALTER TABLE dim_products 
            ALTER COLUMN {column_name} SET DATA TYPE INT;
        '''
    )

In [24]:
# Define the output directory and ensure it exists.
# The optional `analytics_path` environment variable overrides the location;
# when it is not set, the original hard-coded directory is used.
output_dir = os.getenv(
    "analytics_path",
    "/Users/macintoshcider/Documents/Programming/Python/ETL/climate-commerce/analytics",
)
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "dim_products.csv")

# The path is interpolated into a SQL string literal, so any single quote in it
# is doubled to keep the COPY statement well-formed.
escaped_output_file = output_file.replace("'", "''")

# Export the whole dim_products table as a CSV file with a header row.
ddb.execute(
    f"""
        COPY dim_products TO '{escaped_output_file}' (FORMAT CSV, HEADER);
    """
)
print(f"Data successfully exported to {output_file}")

Data successfully exported to /Users/macintoshcider/Documents/Programming/Python/ETL/climate-commerce/analytics/dim_products.csv


In [26]:
# Finally, close the DuckDB connection opened at the top of the notebook.
# After this call `ddb` can no longer be used to run queries.
ddb.close()