In [1]:
import os 
import duckdb 
from dotenv import load_dotenv

In [4]:
# loading the env 
load_dotenv()

# setting up the paths 
base_path = os.getenv('base_path')
database = os.getenv('database_path')

In [6]:
# setting up the in memory database 
database_path = f"{database}/database.db"

# connecting to database file 
ddb = duckdb.connect(database_path)

In [9]:
# create table dim_seller and create all of its relationships 
ddb.execute(
    '''
        CREATE TABLE IF NOT EXISTS dim_sellers AS 
        SELECT 
            s.*, 
            g.GeoLocId
        FROM
            read_csv_auto(?) s 
        JOIN 
            dim_geolocation g 
        ON
            s.seller_zip_code_prefix = g.GeoLocZipCodePrefix
        AND 
            s.seller_city = g.GeoLocCity
        AND 
            s.seller_state = g.GeoLocState;
    ''',
    (f"{base_path}/datasets/olist_sellers_dataset.csv",)
)

<duckdb.duckdb.DuckDBPyConnection at 0x107953cf0>

In [12]:
# renaming tables in dim_seller 
ddb.sql(
    '''
        ALTER TABLE dim_sellers 
        RENAME seller_id TO SellerId;
    '''
)

ddb.sql(
    '''
        ALTER TABLE dim_sellers 
        RENAME seller_zip_code_prefix TO SellerZipCodePrefix;
    '''
)

ddb.sql(
    '''
        ALTER TABLE dim_sellers 
        RENAME seller_city TO SellerCity;
    '''
)

ddb.sql(
    '''
        ALTER TABLE dim_sellers 
        RENAME seller_state TO SellerState;
    '''
)

In [13]:
ddb.sql(
    '''
        SELECT *
        FROM dim_sellers;
    '''
)

┌──────────────────────────────────┬─────────────────────┬────────────┬─────────────┬──────────────────────────────────────┐
│             SellerId             │ SellerZipCodePrefix │ SellerCity │ SellerState │               GeoLocId               │
│             varchar              │       varchar       │  varchar   │   varchar   │                 uuid                 │
├──────────────────────────────────┼─────────────────────┼────────────┼─────────────┼──────────────────────────────────────┤
│ ad420dd0c4f92f8af951ac24b86d0cf5 │ 38230               │ fronteira  │ MG          │ 9340b6d0-57ff-4b38-b868-21b1594196ca │
│ ad420dd0c4f92f8af951ac24b86d0cf5 │ 38230               │ fronteira  │ MG          │ 42998436-ee2d-4150-8a79-7dadb2860784 │
│ ad420dd0c4f92f8af951ac24b86d0cf5 │ 38230               │ fronteira  │ MG          │ cb1c466d-178a-4ce1-adcc-9a2a7b21a68f │
│ ad420dd0c4f92f8af951ac24b86d0cf5 │ 38230               │ fronteira  │ MG          │ c0b136dc-6a7a-4706-b4bf-9f668596a8a0 │


In [15]:
# Checking the data types of every column
ddb.sql(
    '''
        SELECT column_name, data_type
        FROM information_schema.columns
        WHERE table_name = 'dim_sellers';
    '''
)

┌─────────────────────┬───────────┐
│     column_name     │ data_type │
│       varchar       │  varchar  │
├─────────────────────┼───────────┤
│ SellerId            │ VARCHAR   │
│ SellerZipCodePrefix │ VARCHAR   │
│ SellerCity          │ VARCHAR   │
│ SellerState         │ VARCHAR   │
│ GeoLocId            │ UUID      │
└─────────────────────┴───────────┘

In [17]:
# Checking null values in dim_sellers table 
ddb.sql(
    '''
        SELECT
            COUNT(*) AS total_rows,
            COUNT(CASE WHEN SellerId IS NULL THEN 1 END) AS SellerId,
            COUNT(CASE WHEN SellerZipCodePrefix IS NULL THEN 1 END) AS SellerZipCodePrefix,
            COUNT(CASE WHEN SellerCity IS NULL THEN 1 END) AS SellerCity,
            COUNT(CASE WHEN SellerState IS NULL THEN 1 END) AS SellerState,
            COUNT(CASE WHEN GeoLocId IS NULL THEN 1 END) AS GeoLocId,
        FROM 
            dim_sellers;
    '''
)

┌────────────┬──────────┬─────────────────────┬────────────┬─────────────┬──────────┐
│ total_rows │ SellerId │ SellerZipCodePrefix │ SellerCity │ SellerState │ GeoLocId │
│   int64    │  int64   │        int64        │   int64    │    int64    │  int64   │
├────────────┼──────────┼─────────────────────┼────────────┼─────────────┼──────────┤
│     385584 │        0 │                   0 │          0 │           0 │        0 │
└────────────┴──────────┴─────────────────────┴────────────┴─────────────┴──────────┘

In [18]:
# Checking the datatyoes of the dim_sellers table 
ddb.sql(
    '''
        SELECT 
            CAST(SellerId AS VARCHAR) AS SellerId,
            CAST(SellerZipCodePrefix AS INT) AS SellerZipCodePrefix,
            CAST(SellerCity AS VARCHAR) AS SellerCity,
            CAST(SellerState AS VARCHAR) AS SellerState,
            CAST(GeoLocId AS UUID) AS GeoLocId,
        FROM dim_sellers;
    '''
)

┌──────────────────────────────────┬─────────────────────┬────────────┬─────────────┬──────────────────────────────────────┐
│             SellerId             │ SellerZipCodePrefix │ SellerCity │ SellerState │               GeoLocId               │
│             varchar              │        int32        │  varchar   │   varchar   │                 uuid                 │
├──────────────────────────────────┼─────────────────────┼────────────┼─────────────┼──────────────────────────────────────┤
│ ad420dd0c4f92f8af951ac24b86d0cf5 │               38230 │ fronteira  │ MG          │ 9340b6d0-57ff-4b38-b868-21b1594196ca │
│ ad420dd0c4f92f8af951ac24b86d0cf5 │               38230 │ fronteira  │ MG          │ 42998436-ee2d-4150-8a79-7dadb2860784 │
│ ad420dd0c4f92f8af951ac24b86d0cf5 │               38230 │ fronteira  │ MG          │ cb1c466d-178a-4ce1-adcc-9a2a7b21a68f │
│ ad420dd0c4f92f8af951ac24b86d0cf5 │               38230 │ fronteira  │ MG          │ c0b136dc-6a7a-4706-b4bf-9f668596a8a0 │


In [19]:
# Define the output directory and ensure it exists
output_dir = "/Users/macintoshcider/Documents/Programming/Python/ETL/climate-commerce/analytics"
output_file = os.path.join(output_dir, "dim_sellers.csv")
ddb.execute(
    f"""
        COPY dim_sellers TO '{output_file}' (FORMAT CSV, HEADER);
    """
)
print(f"Data successfully exported to {output_file}")

Data successfully exported to /Users/macintoshcider/Documents/Programming/Python/ETL/climate-commerce/analytics/dim_sellers.csv


In [20]:
# finally closing the duckdb connection
ddb.close()