In [1]:
import os 
import duckdb
from dotenv import load_dotenv

In [2]:
# Load variables from the local .env file into the process environment.
load_dotenv()

# Project directories, both supplied through environment variables:
#   base_path     -> directory whose datasets/ sub-folder holds the source CSVs
#   database_path -> directory in which the DuckDB database file is stored
base_path = os.getenv('base_path')
database = os.getenv('database_path')

# Fail fast: a missing variable would otherwise be formatted into file paths
# as the literal text "None" and only surface later as a confusing file error.
missing_vars = [
    env_name
    for env_name, env_value in (('base_path', base_path), ('database_path', database))
    if not env_value
]
if missing_vars:
    raise EnvironmentError(
        f"Missing required environment variable(s): {', '.join(missing_vars)}"
    )

In [3]:
# Open the DuckDB database file stored inside the configured database
# directory. This is a persistent, file-backed database (not an in-memory
# one), so tables created by this notebook remain in the file between runs.
database_path = os.path.join(database, "database.db")
databse_path = database_path  # original (misspelled) name kept as an alias
ddb = duckdb.connect(database_path)

In [4]:
# Load the Olist geolocation CSV into the dim_geolocation table, prefixing
# every row with a generated UUID key (GeoLocId). Because of IF NOT EXISTS the
# statement does nothing when the table is already present in the database.
geolocation_csv = f"{base_path}/datasets/olist_geolocation_dataset.csv"
create_dim_geolocation = """
        CREATE TABLE IF NOT EXISTS dim_geolocation AS
        SELECT 
            uuid() AS GeoLocId,
            *
        FROM 
            read_csv_auto(?);
    """
ddb.execute(create_dim_geolocation, (geolocation_csv,))

<duckdb.duckdb.DuckDBPyConnection at 0x1057493b0>

In [5]:
# Rename the raw CSV column names to the dimension's naming convention.
#
# The database is a persistent file and dim_geolocation is only created when
# it does not exist yet, so on a re-run the columns may already carry their
# new names. A rename is therefore skipped for any column that is no longer
# present under its original name, which keeps this cell safe to run again.
GEOLOCATION_COLUMN_RENAMES = {
    "geolocation_zip_code_prefix": "GeoLocZipCodePrefix",
    "geolocation_lat": "GeoLocLat",
    "geolocation_lng": "GeoLocLng",
    "geolocation_city": "GeoLocCity",
    "geolocation_state": "GeoLocState",
}

# Column names currently present on the table, lower-cased for comparison.
existing_columns = {
    row[0].lower()
    for row in ddb.execute(
        """
            SELECT column_name
            FROM information_schema.columns
            WHERE table_name = 'dim_geolocation';
        """
    ).fetchall()
}

for old_name, new_name in GEOLOCATION_COLUMN_RENAMES.items():
    if old_name.lower() in existing_columns:
        # Identifiers come from the constant mapping above, never from user input.
        ddb.execute(f"ALTER TABLE dim_geolocation RENAME {old_name} TO {new_name};")

In [7]:
# Preview the contents of dim_geolocation after the column renames.
preview_query = '''
        SELECT *
        FROM dim_geolocation;
    '''
ddb.sql(preview_query)

┌──────────────────────────────────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┬─────────────┐
│               GeoLocId               │ GeoLocZipCodePrefix │      GeoLocLat      │      GeoLocLng      │ GeoLocCity │ GeoLocState │
│                 uuid                 │       varchar       │       double        │       double        │  varchar   │   varchar   │
├──────────────────────────────────────┼─────────────────────┼─────────────────────┼─────────────────────┼────────────┼─────────────┤
│ 00014595-6534-4929-8518-beaa133b92b0 │ 01037               │  -23.54562128115268 │  -46.63929204800168 │ sao paulo  │ SP          │
│ 8589ccd2-af29-4fb4-909c-5ce12c976616 │ 01046               │ -23.546081127035535 │  -46.64482029837157 │ sao paulo  │ SP          │
│ f70af1db-18e0-4091-8b93-25c8bad59b52 │ 01046               │  -23.54612896641469 │  -46.64295148361138 │ sao paulo  │ SP          │
│ 1b8662ba-32ad-4505-9ddb-601da8266e34 │ 01041               │

In [8]:
# Data-quality check: report the total row count and the number of NULLs in
# each source column of dim_geolocation.
# Every NULL counter carries the _nulls suffix so that it cannot be mistaken
# for (or shadow) the column it describes.
ddb.sql(
    '''
        SELECT
            COUNT(*) AS total_rows,
            COUNT(CASE WHEN GeoLocZipCodePrefix IS NULL THEN 1 END) AS GeoLocZipCodePrefix_nulls,
            COUNT(CASE WHEN GeoLocLat IS NULL THEN 1 END) AS GeoLocLat_nulls,
            COUNT(CASE WHEN GeoLocLng IS NULL THEN 1 END) AS GeoLocLng_nulls,
            COUNT(CASE WHEN GeoLocCity IS NULL THEN 1 END) AS GeoLocCity_nulls,
            COUNT(CASE WHEN GeoLocState IS NULL THEN 1 END) AS GeoLocState_nulls
        FROM 
            dim_geolocation;
    '''
)

┌────────────┬───────────────────────────┬─────────────────┬─────────────────┬────────────┬─────────────┐
│ total_rows │ GeoLocZipCodePrefix_nulls │ GeoLocLat_nulls │ GeoLocLng_nulls │ GeoLocCity │ GeoLocState │
│   int64    │           int64           │      int64      │      int64      │   int64    │    int64    │
├────────────┼───────────────────────────┼─────────────────┼─────────────────┼────────────┼─────────────┤
│    1000163 │                         0 │               0 │               0 │          0 │           0 │
└────────────┴───────────────────────────┴─────────────────┴─────────────────┴────────────┴─────────────┘

In [9]:
# Inspect the column names and data types recorded for dim_geolocation, as a
# basis for deciding which columns need a type change.
schema_query = '''
        SELECT column_name, data_type
        FROM information_schema.columns
        WHERE table_name = 'dim_geolocation';
    '''
ddb.sql(schema_query)

┌─────────────────────┬───────────┐
│     column_name     │ data_type │
│       varchar       │  varchar  │
├─────────────────────┼───────────┤
│ GeoLocId            │ UUID      │
│ GeoLocZipCodePrefix │ VARCHAR   │
│ GeoLocLat           │ DOUBLE    │
│ GeoLocLng           │ DOUBLE    │
│ GeoLocCity          │ VARCHAR   │
│ GeoLocState         │ VARCHAR   │
└─────────────────────┴───────────┘

In [10]:
# Change the data type of dim_geolocation.GeoLocZipCodePrefix to INT. The
# existing column is altered in place; no new column is added.
# NOTE(review): the previewed prefixes look like 01037, so the leading zero is
# presumably dropped once the value is stored as an integer - confirm that
# this is intended before relying on the exported values.
alter_zip_type = '''
    ALTER TABLE dim_geolocation
    ALTER COLUMN GeoLocZipCodePrefix SET DATA TYPE INT;
'''
ddb.sql(alter_zip_type)

In [11]:
# Export dim_geolocation as a CSV file (with header row) into the analytics
# folder.
# The destination comes from the optional `analytics_path` environment
# variable and falls back to <base_path>/analytics, rather than an absolute
# path that exists on a single machine only.
# NOTE(review): the fallback assumes base_path is the project root that holds
# the analytics/ folder - confirm, or set analytics_path explicitly.
output_dir = os.getenv("analytics_path") or os.path.join(base_path, "analytics")
os.makedirs(output_dir, exist_ok=True)  # create the folder when it is missing
output_file = os.path.join(output_dir, "dim_geolocation.csv")

# The target path is embedded as a SQL string literal, so any single quote in
# it is doubled to keep the COPY statement well-formed.
escaped_output_file = output_file.replace("'", "''")
ddb.execute(
    f"""
        COPY dim_geolocation TO '{escaped_output_file}' (FORMAT CSV, HEADER);
    """
)
print(f"Data successfully exported to {output_file}")

Data successfully exported to /Users/macintoshcider/Documents/Programming/Python/ETL/climate-commerce/analytics/dim_geolocation.csv


In [12]:
# Final step: close the DuckDB connection that was opened at the start of the
# notebook.
ddb.close()