# ETL Pipeline

## Importing Libraries

In [92]:
import pandas as pd # Data Transformation
from datetime import datetime
import os
from subprocess import call
from dotenv import load_dotenv
from sqlalchemy import create_engine, text
from sqlalchemy.exc import SQLAlchemyError

## Setting Up PostgreSQL Connection

### Loading Environmental Variables

In [93]:
# Load environment variables from .env file
load_dotenv()

# Retrieve individual components from environment variables
user = os.getenv('POSTGRES_USER')
password = os.getenv('POSTGRES_PASSWORD')
host = os.getenv('POSTGRES_HOST')
port = os.getenv('POSTGRES_PORT')
db_name = os.getenv('POSTGRES_DB')

# Ensure the connection URI is retrieved successfully
if not all([user, password, host, db_name]):
    raise ValueError("One or more environment variables for the database connection are not set")

# Construct the connection URI
connection_uri = f"postgresql://{user}:{password}@{host}:{port}/{db_name}"
connection_uri_2 = f"postgresql://{user}:{password}@localhost:5432/{db_name}"

# Ensure the connection URI is retrieved successfully
if connection_uri is None:
    raise ValueError("DATABASE_URL environment variable is not set")
else:
    print(f"The connection URI from within the Docker Container is: {connection_uri}")
    print(f"The connection URI for the local machine is: {connection_uri_2}")

The connection URI from within the Docker Container is: postgresql://myuser:mypassword@postgres:5432/mydatabase
The connection URI for the local machine is: postgresql://myuser:mypassword@localhost:5432/mydatabase


## Creating Schemas, Tables, and Views in PostgreSQL

### Creating a PostgreSQL Connection Engine with SQLAlchemy

In [94]:
# Define function to create an SQLAlchemy engine
def create_db_engine(connection_uri: str):
    """
    Create and return a SQLAlchemy engine based on the provided connection URI.

    Args:
        connection_uri (str): The connection URI for the database.

    Returns:
        Engine: A SQLAlchemy engine connected to the specified database.
    """
    try:
        db_engine = create_engine(connection_uri)
        print("Database engine created successfully.")
    except SQLAlchemyError as e:
        print(f"Error occurred while creating the database engine: {str(e)}")
        return None
    # Log or handle the error as needed
    except Exception as e:
        print(f"An unexpected error occurred: {str(e)}")
        return None
    
    return db_engine

### Creating Schemas in PostgreSQL

In [95]:
# Function to run SQL script using shell command
# I had to pass the env parameters explicitly  to the subprocess.call() -> (PGPASSWORD, PGUSER, PGHOST, PGPORT, PGDATABASE)
# This avoided Jupyter Notebook asking for password. 
def run_sql_script(script_name):
    script_path = f"/workspace/sql_scripts/{script_name}"
    print(f"{user}:{password}@{host}/{db_name}")
    command = f"psql -U {user} -d {db_name} -h {host} -p {port} -f {script_path}"
    return call(command, shell=True, env={
                                        'PGPASSWORD': password,
                                        'PGUSER': user,
                                        'PGHOST': host,
                                        'PGPORT': port,
                                        'PGDATABASE': db_name
    })

In [96]:
# Function to check schema existence
def check_schema_existence(connection_uri, schema_names):
    try:
        db_engine = create_db_engine(connection_uri)
        if db_engine is None:
            print("Failed to create the database engine.")
            return
        
        with db_engine.connect() as connection:
            for schema_name in schema_names:
                result = connection.execute(
                    text("SELECT schema_name FROM information_schema.schemata WHERE schema_name = :schema"),
                    {"schema": schema_name}
                )
                schema_exists = result.fetchone() is not None
                if schema_exists:
                    print(f"Schema '{schema_name}' exists in the database.")
                else:
                    print(f"Schema '{schema_name}' does not exist in the database.")
    
    except SQLAlchemyError as e:
        print(f"Error occurred while connecting to the database or executing query: {str(e)}")
    except Exception as e:
        print(f"An unexpected error occurred: {str(e)}")

## Executing

In [97]:
# Ingestion Parameters - Bronze Layer
schema_names = ['bronze', 'silver']

# Run create_schemas.sql
create_schemas_script_path = 'schemas/create_schemas.sql'
result = run_sql_script(create_schemas_script_path)
if result == 0:
    print("SQL script executed successfully. Schemas were created.")
    # Check if schemas exist in the database
    check_schema_existence(connection_uri, schema_names)
else:
    print("Error executing SQL script.")

myuser:mypassword@postgres/mydatabase
CREATE SCHEMA
CREATE SCHEMA
CREATE SCHEMA
SQL script executed successfully. Schemas were created.
Database engine created successfully.
Schema 'bronze' exists in the database.
Schema 'silver' exists in the database.
