In [1]:
import os 
import duckdb
from dotenv import load_dotenv

In [2]:
# loading the env's 
load_dotenv()

# setting up the paths 
base_path = os.getenv('base_path')
database = os.getenv('database_path')

In [3]:
# setting up the database path 
database_path = f"{database}/database.db"

# setting up the connection to the database file 
ddb = duckdb.connect(database_path)

In [4]:
# ingesting customer csv table and create its relationship to other tables 
ddb.execute(
    '''
        CREATE TABLE IF NOT EXISTS dim_customers AS 
        SELECT 
            c.*,
            g.GeoLocId
        FROM 
            read_csv_auto(?) c
        JOIN
            dim_geolocation g
        ON
            c.customer_zip_code_prefix = g.GeoLocZipCodePrefix
        AND 
            c.customer_city = g.GeoLocCity
        AND 
            c.customer_state = g.GeoLocState;
    ''',
    (f"{base_path}/datasets/olist_customers_dataset.csv",)
)

<duckdb.duckdb.DuckDBPyConnection at 0x107a6ea70>

In [5]:
ddb.sql('''
    SELECT 
        *
    FROM dim_customers;
''')

┌──────────────────────────────────┬──────────────────────────────────┬──────────────────────────┬───────────────┬────────────────┬──────────────────────────────────────┐
│           customer_id            │        customer_unique_id        │ customer_zip_code_prefix │ customer_city │ customer_state │               GeoLocId               │
│             varchar              │             varchar              │         varchar          │    varchar    │    varchar     │                 uuid                 │
├──────────────────────────────────┼──────────────────────────────────┼──────────────────────────┼───────────────┼────────────────┼──────────────────────────────────────┤
│ 6519f59a7687a4661f7c8a7e24f28f94 │ 324ce23e08a1768007118a11836ea7d3 │ 01037                    │ sao paulo     │ SP             │ 00014595-6534-4929-8518-beaa133b92b0 │
│ f253f58c3e8f21a0997f6332ecc0c01d │ 9e5663921908dcd140cf657688ef6a0b │ 01046                    │ sao paulo     │ SP             │ 8589ccd2-af29

In [6]:
# renaming tables in dim_customer 
ddb.sql(
    '''
        ALTER TABLE dim_customers 
        RENAME customer_id to CustomerId;
    '''
)

ddb.sql(
    '''
        ALTER TABLE dim_customers 
        RENAME customer_unique_id to CustomerUniqueId;
    '''
)

ddb.sql(
    '''
        ALTER TABLE dim_customers 
        RENAME customer_zip_code_prefix to CustomerZipCodePrefix;
    '''
)

ddb.sql(
    '''
        ALTER TABLE dim_customers 
        RENAME customer_city to CustomerCity;
    '''
)

ddb.sql(
    '''
        ALTER TABLE dim_customers 
        RENAME customer_state to CustomerState;
    '''
)

In [11]:
# Checking data types of every column
ddb.sql(
    '''
        SELECT column_name, data_type
        FROM information_schema.columns
        WHERE table_name = 'dim_customers';
    '''
)

┌───────────────────────┬───────────┐
│      column_name      │ data_type │
│        varchar        │  varchar  │
├───────────────────────┼───────────┤
│ CustomerId            │ VARCHAR   │
│ CustomerUniqueId      │ VARCHAR   │
│ CustomerZipCodePrefix │ INTEGER   │
│ CustomerCity          │ VARCHAR   │
│ CustomerState         │ VARCHAR   │
│ GeoLocId              │ UUID      │
└───────────────────────┴───────────┘

In [8]:
# Checking null values in dim_customers table 
ddb.sql(
    '''
        SELECT
            COUNT(*) AS total_rows,
            COUNT(CASE WHEN CustomerId IS NULL THEN 1 END) AS CustomerId,
            COUNT(CASE WHEN CustomerUniqueId IS NULL THEN 1 END) AS CustomerUniqueId,
            COUNT(CASE WHEN CustomerZipCodePrefix IS NULL THEN 1 END) AS CustomerZipCodePrefix,
            COUNT(CASE WHEN CustomerCity IS NULL THEN 1 END) AS CustomerCity,
            COUNT(CASE WHEN CustomerState IS NULL THEN 1 END) AS CustomerState,
            COUNT(CASE WHEN GeoLocId IS NULL THEN 1 END) AS GeoLocId,
        FROM 
            dim_customers;
    '''
)

┌────────────┬────────────┬──────────────────┬───────────────────────┬──────────────┬───────────────┬──────────┐
│ total_rows │ CustomerId │ CustomerUniqueId │ CustomerZipCodePrefix │ CustomerCity │ CustomerState │ GeoLocId │
│   int64    │   int64    │      int64       │         int64         │    int64     │     int64     │  int64   │
├────────────┼────────────┼──────────────────┼───────────────────────┼──────────────┼───────────────┼──────────┤
│   14129363 │          0 │                0 │                     0 │            0 │             0 │        0 │
└────────────┴────────────┴──────────────────┴───────────────────────┴──────────────┴───────────────┴──────────┘

In [10]:
# Checking and changing the datatypes of the dim_customers table to its appropriate data 
# 1. Add a new column with the correct type
ddb.sql('''
    ALTER TABLE dim_customers
    ALTER COLUMN CustomerZipCodePrefix SET DATA TYPE INT;
''')

In [12]:
# Define the output directory and ensure it exists
output_dir = "/Users/macintoshcider/Documents/Programming/Python/ETL/climate-commerce/analytics"
output_file = os.path.join(output_dir, "dim_customers.csv")
ddb.execute(
    f"""
        COPY dim_customers TO '{output_file}' (FORMAT CSV, HEADER);
    """
)
print(f"Data successfully exported to {output_file}")

Data successfully exported to /Users/macintoshcider/Documents/Programming/Python/ETL/climate-commerce/analytics/dim_customers.csv


In [13]:
# finally close the duckdb connection
ddb.close()