In [12]:
import os 
import duckdb
from dotenv import load_dotenv

In [13]:
# load the env paths 
load_dotenv()

# setting the base path 
base_path = os.getenv('base_path')
database = os.getenv('database_path')

In [14]:
# setting an in-memory database. 
databse_path = f"{database}/database.db"
ddb = duckdb.connect(databse_path)

In [16]:
# ingest geolocation csv 
ddb.execute(
    """
        CREATE TABLE IF NOT EXISTS dim_geolocation AS
        SELECT 
            uuid() AS GeoLocId,
            *
        FROM 
            read_csv_auto(?);
    """, 
    (f"{base_path}/datasets/olist_geolocation_dataset.csv",)
)

<duckdb.duckdb.DuckDBPyConnection at 0x1089bfb30>

In [17]:
# Renaming columns. 
ddb.sql(
    '''
        ALTER TABLE dim_geolocation 
        RENAME geolocation_zip_code_prefix TO GeoLocZipCodePrefix;
    '''
)

ddb.sql(
    '''
        ALTER TABLE dim_geolocation 
        RENAME geolocation_lat TO GeoLocLat;
    '''
)

ddb.sql(
    '''
        ALTER TABLE dim_geolocation 
        RENAME geolocation_lng TO GeoLocLng;
    '''
)

ddb.sql(
    '''
        ALTER TABLE dim_geolocation 
        RENAME geolocation_city TO GeoLocCity;
    '''
)

ddb.sql(
    '''
        ALTER TABLE dim_geolocation 
        RENAME geolocation_state TO GeoLocState;
    '''
)

In [6]:
ddb.sql(
    '''
        SELECT *
        FROM geoloc_table;
    '''
)

┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┬─────────────┐
│ GeoLocZipCodePrefix │      GeoLocLat      │      GeoLocLng      │ GeoLocCity │ GeoLocState │
│       varchar       │       double        │       double        │  varchar   │   varchar   │
├─────────────────────┼─────────────────────┼─────────────────────┼────────────┼─────────────┤
│ 01037               │  -23.54562128115268 │  -46.63929204800168 │ sao paulo  │ SP          │
│ 01046               │ -23.546081127035535 │  -46.64482029837157 │ sao paulo  │ SP          │
│ 01046               │  -23.54612896641469 │  -46.64295148361138 │ sao paulo  │ SP          │
│ 01041               │   -23.5443921648681 │  -46.63949930627844 │ sao paulo  │ SP          │
│ 01035               │ -23.541577961711493 │  -46.64160722329613 │ sao paulo  │ SP          │
│ 01012               │ -23.547762303364266 │  -46.63536053788448 │ são paulo  │ SP          │
│ 01047               │ -23.546273112412678 │  -46

In [20]:
# Checking the table, clean and ensure its integrity - check null values
ddb.sql(
    '''
        SELECT
            COUNT(*) AS total_rows,
            COUNT(CASE WHEN GeoLocZipCodePrefix IS NULL THEN 1 END) AS GeoLocZipCodePrefix_nulls,
            COUNT(CASE WHEN GeoLocLat IS NULL THEN 1 END) AS GeoLocLat_nulls,
            COUNT(CASE WHEN GeoLocLng IS NULL THEN 1 END) AS GeoLocLng_nulls,
            COUNT(CASE WHEN GeoLocCity IS NULL THEN 1 END) AS GeoLocCity,
            COUNT(CASE WHEN GeoLocState IS NULL THEN 1 END) AS GeoLocState,
        FROM 
            dim_geolocation;
    '''
)

┌────────────┬───────────────────────────┬─────────────────┬─────────────────┬────────────┬─────────────┐
│ total_rows │ GeoLocZipCodePrefix_nulls │ GeoLocLat_nulls │ GeoLocLng_nulls │ GeoLocCity │ GeoLocState │
│   int64    │           int64           │      int64      │      int64      │   int64    │    int64    │
├────────────┼───────────────────────────┼─────────────────┼─────────────────┼────────────┼─────────────┤
│    1000163 │                         0 │               0 │               0 │          0 │           0 │
└────────────┴───────────────────────────┴─────────────────┴─────────────────┴────────────┴─────────────┘

In [22]:
# Checking the table, clean and ensure its integrity - ensuring data quality and integrity
ddb.sql(
    '''
        SELECT column_name, data_type
        FROM information_schema.columns
        WHERE table_name = 'dim_geolocation';
    '''
)

ddb.sql(
    '''
        SELECT 
            CAST(GeoLocZipCodePrefix AS INT) AS GeoLocZipCodePrefix,
            CAST(GeoLocLat AS DOUBLE) AS GeoLocLat,
            CAST(GeoLocLng AS DOUBLE) AS GeoLocLng,
            CAST(GeoLocCity AS VARCHAR) AS GeoLocCity,
            CAST(GeoLocState AS VARCHAR) AS GeoLocState,
        FROM dim_geolocation;
    '''
)

┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┬─────────────┐
│ GeoLocZipCodePrefix │      GeoLocLat      │      GeoLocLng      │ GeoLocCity │ GeoLocState │
│        int32        │       double        │       double        │  varchar   │   varchar   │
├─────────────────────┼─────────────────────┼─────────────────────┼────────────┼─────────────┤
│                1037 │  -23.54562128115268 │  -46.63929204800168 │ sao paulo  │ SP          │
│                1046 │ -23.546081127035535 │  -46.64482029837157 │ sao paulo  │ SP          │
│                1046 │  -23.54612896641469 │  -46.64295148361138 │ sao paulo  │ SP          │
│                1041 │   -23.5443921648681 │  -46.63949930627844 │ sao paulo  │ SP          │
│                1035 │ -23.541577961711493 │  -46.64160722329613 │ sao paulo  │ SP          │
│                1012 │ -23.547762303364266 │  -46.63536053788448 │ são paulo  │ SP          │
│                1047 │ -23.546273112412678 │  -46

In [24]:
ddb.sql(
    '''
        SELECT *
        FROM dim_geolocation;
    '''
)

┌──────────────────────────────────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┬─────────────┐
│               GeoLocId               │ GeoLocZipCodePrefix │      GeoLocLat      │      GeoLocLng      │ GeoLocCity │ GeoLocState │
│                 uuid                 │       varchar       │       double        │       double        │  varchar   │   varchar   │
├──────────────────────────────────────┼─────────────────────┼─────────────────────┼─────────────────────┼────────────┼─────────────┤
│ 23f1e31a-7457-4c8c-8d0a-63a8a77111f4 │ 01037               │  -23.54562128115268 │  -46.63929204800168 │ sao paulo  │ SP          │
│ 2eae17cc-a488-4329-826d-1ab76f2f6a68 │ 01046               │ -23.546081127035535 │  -46.64482029837157 │ sao paulo  │ SP          │
│ d10d7988-1ffb-456e-a915-0f4cd07e691f │ 01046               │  -23.54612896641469 │  -46.64295148361138 │ sao paulo  │ SP          │
│ 8f0ed4f0-1fa3-4c42-8dff-0841f1672c7e │ 01041               │

In [26]:
# Define the output directory and ensure it exists
output_dir = "/Users/macintoshcider/Documents/Programming/Python/ETL/climate-commerce/analytics"
output_file = os.path.join(output_dir, "dim_geolocation.csv")
ddb.execute(
    f"""
        COPY dim_geolocation TO '{output_file}' (FORMAT CSV, HEADER);
    """
)
print(f"Data successfully exported to {output_file}")

Data successfully exported to /Users/macintoshcider/Documents/Programming/Python/ETL/climate-commerce/analytics/dim_geolocation.csv


In [27]:
# finally close the database connection. 
ddb.close()