# Building an ETL Pipeline

## Importing Libraries

In [32]:
import pandas as pd # Data Transformation
import pytest       # Makes it easy to write small, readable tests
import os
from dotenv import load_dotenv
import sqlalchemy
from sqlalchemy import create_engine, text
from sqlalchemy.exc import SQLAlchemyError

## [L] Load
* Loading data to Postgres.
    * Open a SQL connection with SQLAlchemy
    * .to_sql()
* Data Quality checks:
    * Validate that data was correctly persisted in postgres
        * Ensure it can be queried
            * pd.read_sql()
        * Make sure counts match
        * Validate each row is present

In [33]:
# Create a Function that created the SQL Engine based on SQLAlchemy
def create_db_engine(connection_uri: str) -> Engine: # Arrow indicates that the function returns an object of type Engine
    """
    Create and return a SQLAlchemy engine based on the provided connection URI.

    Args:
        connection_uri (str): The connection URI for the database.

    Returns:
        Engine: A SQLAlchemy engine connected to the specified database.
    """
    db_engine = create_engine(connection_uri)
    return db_engine

In [34]:
# Load environment variables from .env file
load_dotenv()

# Retrieve individual components from environment variables
user = os.getenv('POSTGRES_USER')
password = os.getenv('POSTGRES_PASSWORD')
host = os.getenv('POSTGRES_HOST')
db_name = os.getenv('POSTGRES_DB')

# Ensure the connection URI is retrieved successfully
if not all([user, password, host, db_name]):
    raise ValueError("One or more environment variables for the database connection are not set")

# Construct the connection URI
connection_uri = f"postgresql://{user}:{password}@{host}/{db_name}"

# Ensure the connection URI is retrieved successfully
if connection_uri is None:
    raise ValueError("DATABASE_URL environment variable is not set")

# # Create the database engine
# db_engine = create_db_engine(connection_uri)

# # Close the engine connection
# db_engine.dispose()

In [36]:
def ingest_csv_to_bronze(csv_file, db_url, schema_name, table_name):
    try:
        # Load CSV data into a pandas DataFrame
        df = pd.read_csv(csv_file)
        print("CSV file loaded successfully.")

        # Create a SQLAlchemy engine to connect to the PostgreSQL database
        engine = create_engine(db_url)
        print("Database engine created successfully.")

        # Verify connection and schema existence
        with engine.connect() as connection:
            # Check if the schema exists
            result = connection.execute(
                text(f"SELECT schema_name FROM information_schema.schemata WHERE schema_name = :schema"),
                {"schema": schema_name}
            )
            schema_exists = result.fetchone() is not None
            if not schema_exists:
                raise ValueError(f"Schema '{schema_name}' does not exist in the database.")
            print(f"Schema '{schema_name}' verified to exist.")

            # Drop dependent views
            connection.execute(text("DROP VIEW IF EXISTS churn_gold.aggregated_summary CASCADE;"))
            connection.execute(text("DROP VIEW IF EXISTS churn_silver.customer_summary CASCADE;"))
            print("Dependent views dropped successfully.")

            # Set the search path to the specified schema
            connection.execute(text(f"SET search_path TO {schema_name};"))
            print(f"Search path set to schema '{schema_name}'.")

        # Write the DataFrame to the PostgreSQL table in the specified schema
        df.to_sql(table_name, engine, if_exists='replace', index=False, schema=schema_name)
        print(f"CSV data ingested successfully into {schema_name}.{table_name}.")

        # Recreate dependent views
        # connection.execute(text("""
        #     CREATE VIEW churn_silver.customer_summary AS
        #     SELECT
        #         customer_id,
        #         age,
        #         gender,
        #         total_transactions,
        #         last_purchase_date,
        #         churn_status,
        #         CASE
        #             WHEN total_transactions >= 100 AND last_purchase_date >= '2023-01-01' THEN 'active'
        #             ELSE 'inactive'
        #         END AS customer_status
        #     FROM churn_bronze.customer_data;
        # """))
        # connection.execute(text("""
        #     CREATE VIEW churn_gold.aggregated_summary AS
        #     SELECT
        #         customer_status,
        #         COUNT(*) AS customer_count,
        #         AVG(age) AS avg_age,
        #         AVG(total_transactions) AS avg_transactions
        #     FROM churn_silver.customer_summary
        #     GROUP BY customer_status;
        # """))
        # print("Dependent views recreated successfully.")

    except FileNotFoundError:
        print("Error: CSV file not found.")
    except SQLAlchemyError as e:
        print(f"Error occurred while connecting to the database or ingesting data: {str(e)}")
    except ValueError as ve:
        print(str(ve))

# Usage example:
csv_file_path = '/workspace/data/customer_churn.csv'
schema_name = 'churn_bronze'
table_name = 'customer_data'

ingest_csv_to_bronze(csv_file_path, connection_uri, schema_name, table_name)

CSV file loaded successfully.
Database engine created successfully.
Schema 'churn_bronze' verified to exist.
Dependent views dropped successfully.
Search path set to schema 'churn_bronze'.
Error occurred while connecting to the database or ingesting data: (psycopg2.errors.DependentObjectsStillExist) cannot drop table churn_bronze.customer_data because other objects depend on it
DETAIL:  view churn_silver.customer_summary depends on table churn_bronze.customer_data
view churn_gold.aggregated_summary depends on view churn_silver.customer_summary
HINT:  Use DROP ... CASCADE to drop the dependent objects too.

[SQL: 
DROP TABLE churn_bronze.customer_data]
(Background on this error at: https://sqlalche.me/e/20/2j85)


Note:
* In the Load function, we are basically checking if a table named 'crypto_mkt' exists. If it exists, the `'if_exists' = 'replace'`  argument will drop the table before inserting new values.
* If you want something different, please check the [to_sql()](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_sql.html) documentation.