In [23]:
import os 
import duckdb 
from dotenv import load_dotenv

In [24]:
# Load variables from a local .env file (if one is found) into the process environment.
load_dotenv()

# Directories the rest of the notebook builds its paths from:
#   base_path     -> project root (the CSV datasets are read from below it)
#   database_path -> folder that holds the DuckDB database file
base_path = os.getenv('base_path')
database = os.getenv('database_path')

# os.getenv returns None for an unset variable, which would otherwise be
# interpolated into later paths as the literal text "None". Fail fast instead.
_missing_env = [
    env_name
    for env_name, env_value in (('base_path', base_path), ('database_path', database))
    if not env_value
]
if _missing_env:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_env)
        + ". Define them in the environment or in a .env file."
    )

In [26]:
# Location of the DuckDB file inside the configured database directory.
database_path = "{}/database.db".format(database)

# Open a connection to that database file; `ddb` is used by every later cell.
ddb = duckdb.connect(database_path)

In [29]:
# Expose the raw order-reviews CSV as the view `view_reviews_data`.
# IF NOT EXISTS leaves an already-defined view untouched on re-runs.
ddb.execute(f'''
        CREATE VIEW IF NOT EXISTS view_reviews_data AS 
        SELECT *
        FROM read_csv_auto('{base_path}/datasets/olist_order_reviews_dataset.csv');
    ''')

<duckdb.duckdb.DuckDBPyConnection at 0x10ce19070>

In [31]:
# Materialise the reviews view into the `dim_reviews` table, renaming the
# snake_case source columns to PascalCase.
# IF NOT EXISTS means an existing `dim_reviews` is kept as-is on re-runs.
# NOTE(review): only these six columns are carried over; any other column in
# the CSV is dropped here — confirm that is intended.
ddb.execute('''
        CREATE TABLE IF NOT EXISTS dim_reviews AS 
        SELECT 
            review_id AS ReviewId,
            order_id AS OrderId,
            review_comment_title AS CommentTitle,
            review_comment_message AS ReviewCommentMsg,
            review_creation_date AS ReviewCreationDate,
            review_answer_timestamp AS ReviewAnswerTimestamp,
        FROM 
            view_reviews_data;
    ''')

<duckdb.duckdb.DuckDBPyConnection at 0x10ce19070>

In [32]:
# Display the contents of dim_reviews as the cell output.
ddb.sql('''
        SELECT * FROM dim_reviews;
    ''')

┌──────────────────────────────────┬──────────────────────────────────┬───────────────┬─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬─────────────────────┬───────────────────────┐
│             ReviewId             │             OrderId              │ CommentTitle  │                                                                                        ReviewCommentMsg                                                                                         │ ReviewCreationDate  │ ReviewAnswerTimestamp │
│             varchar              │             varchar              │    varchar    │                                                                                             varchar                                                                                             │      timestamp      │       timestamp       │
├───────────────

In [33]:
# List each dim_reviews column with its data type, read from information_schema.
ddb.sql('''
        SELECT column_name, data_type
        FROM information_schema.columns
        WHERE table_name = 'dim_reviews';
    ''')

┌───────────────────────┬───────────┐
│      column_name      │ data_type │
│        varchar        │  varchar  │
├───────────────────────┼───────────┤
│ ReviewId              │ VARCHAR   │
│ OrderId               │ VARCHAR   │
│ CommentTitle          │ VARCHAR   │
│ ReviewCommentMsg      │ VARCHAR   │
│ ReviewCreationDate    │ TIMESTAMP │
│ ReviewAnswerTimestamp │ TIMESTAMP │
└───────────────────────┴───────────┘

In [37]:
# Count the NULLs per column of dim_reviews, next to the total row count.
# COUNT(CASE WHEN <col> IS NULL THEN 1 END) counts only the rows where the
# column is NULL, because the CASE yields NULL (not counted) otherwise.
# NOTE(review): this cell's execution count is later than the cleaning cells
# placed after it, so the displayed zeros appear to be post-cleaning — confirm.
ddb.sql('''
        SELECT
            COUNT(*) AS total_rows,
            COUNT(CASE WHEN ReviewId IS NULL THEN 1 END) AS ReviewId,
            COUNT(CASE WHEN OrderId IS NULL THEN 1 END) AS OrderId,
            COUNT(CASE WHEN CommentTitle IS NULL THEN 1 END) AS CommentTitle,
            COUNT(CASE WHEN ReviewCommentMsg IS NULL THEN 1 END) AS ReviewCommentMsg,
            COUNT(CASE WHEN ReviewCreationDate IS NULL THEN 1 END) AS ReviewCreationDate,
            COUNT(CASE WHEN ReviewAnswerTimestamp IS NULL THEN 1 END) AS ReviewAnswerTimestamp,
        FROM 
            dim_reviews;
    ''')

┌────────────┬──────────┬─────────┬──────────────┬──────────────────┬────────────────────┬───────────────────────┐
│ total_rows │ ReviewId │ OrderId │ CommentTitle │ ReviewCommentMsg │ ReviewCreationDate │ ReviewAnswerTimestamp │
│   int64    │  int64   │  int64  │    int64     │      int64       │       int64        │         int64         │
├────────────┼──────────┼─────────┼──────────────┼──────────────────┼────────────────────┼───────────────────────┤
│      40977 │        0 │       0 │            0 │                0 │                  0 │                     0 │
└────────────┴──────────┴─────────┴──────────────┴──────────────────┴────────────────────┴───────────────────────┘

In [35]:
# Cleaning: drop every review that has no comment message.
ddb.sql('''
        DELETE FROM dim_reviews 
        WHERE ReviewCommentMsg is NULL;
    ''')

In [36]:
# Cleaning: give reviews without a title the placeholder 'No Title'.
# The WHERE clause restricts the UPDATE to rows that actually need it, instead
# of rewriting every row with its own value; the resulting table is the same.
ddb.sql(
    '''
        UPDATE dim_reviews
        SET CommentTitle = 'No Title'
        WHERE CommentTitle IS NULL;
    '''
)

In [38]:
# Resolve the export directory. The original absolute path is kept as the
# default so existing runs behave the same; set `analytics_path` in the
# environment (or .env) to export somewhere else without editing the notebook.
output_dir = os.getenv(
    "analytics_path",
    "/Users/macintoshcider/Documents/Programming/Python/ETL/climate-commerce/analytics",
)

# Actually ensure the directory exists before exporting, so the COPY below
# does not fail on a missing folder. exist_ok makes re-runs harmless.
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "dim_reviews.csv")

# Write the whole dim_reviews table to a CSV file with a header row.
ddb.execute(
    f"""
        COPY dim_reviews TO '{output_file}' (FORMAT CSV, HEADER);
    """
)
print(f"Data successfully exported to {output_file}")

Data successfully exported to /Users/macintoshcider/Documents/Programming/Python/ETL/climate-commerce/analytics/dim_reviews.csv


In [39]:
# Finally, close the DuckDB connection held in `ddb`; it cannot be used for
# further queries after this cell runs.
ddb.close()