# ETL Pipeline

## Importing Libraries

In [None]:
import pandas as pd # Data Transformation
from datetime import datetime
import os
from dotenv import load_dotenv
from sqlalchemy import create_engine, text
from sqlalchemy.exc import SQLAlchemyError

## Setting Up PostgreSQL Connection

### Loading Environment Variables

In [None]:
# Load environment variables from .env file
load_dotenv()

# Local import keeps this cell self-contained; quote_plus escapes characters
# such as '@', ':' and '/' that would otherwise corrupt the URI.
from urllib.parse import quote_plus

# Retrieve individual components from environment variables
user = os.getenv('POSTGRES_USER')
password = os.getenv('POSTGRES_PASSWORD')
host = os.getenv('POSTGRES_HOST')
db_name = os.getenv('POSTGRES_DB')

# Fail early, naming exactly which variables are missing or empty
_required_env = {
    'POSTGRES_USER': user,
    'POSTGRES_PASSWORD': password,
    'POSTGRES_HOST': host,
    'POSTGRES_DB': db_name,
}
_missing_env = [name for name, value in _required_env.items() if not value]
if _missing_env:
    raise ValueError(
        "One or more environment variables for the database connection are not set: "
        + ", ".join(_missing_env)
    )

# Construct the connection URI. User and password are URL-escaped so that
# special characters in the credentials do not break URI parsing.
# (An f-string can never be None, so no further check on the URI is needed.)
connection_uri = f"postgresql://{quote_plus(user)}:{quote_plus(password)}@{host}/{db_name}"

### Creating a PostgreSQL Connection Engine with SQLAlchemy

In [None]:
# Build a SQLAlchemy engine, reporting failures on stdout instead of raising
def create_db_engine(connection_uri: str):
    """
    Build a SQLAlchemy engine for the given connection URI.

    Any error raised while building the engine is printed and swallowed, so
    callers must check the return value for ``None``.

    NOTE: per SQLAlchemy's documentation an engine connects lazily, so a
    successful return does not prove the database is reachable.

    Args:
        connection_uri (str): The connection URI for the database.

    Returns:
        Engine | None: The engine on success, ``None`` if creation failed.
    """
    try:
        engine = create_engine(connection_uri)
        print("Database engine created successfully.")
        return engine
    except SQLAlchemyError as err:
        # SQLAlchemy-specific failures (e.g. a URI it cannot parse)
        print(f"Error occurred while creating the database engine: {err!s}")
    except Exception as err:
        # Anything else, e.g. a missing database driver
        print(f"An unexpected error occurred: {err!s}")

    # Reached only when one of the handlers above ran
    return None

In [None]:
# Function to execute create_silver_views.sql
def create_silver_views(connection_uri, silver_schema, script_path):
    """
    Execute the SQL script at ``script_path`` with the search path set to
    ``silver_schema``.

    Both statements run inside a single transaction that is committed on
    success and rolled back on error.

    Args:
        connection_uri (str): SQLAlchemy connection URI for the target database.
        silver_schema (str): Schema name placed verbatim into
            ``SET search_path``; it must come from trusted configuration,
            not from user input.
        script_path (str): Path to the SQL script to execute.

    Returns:
        None. Success and failure are reported with ``print``; exceptions
        are caught and not re-raised.
    """
    engine = None
    try:
        with open(script_path, 'r') as file:
            sql_script = file.read()

        engine = create_db_engine(connection_uri)
        # create_db_engine returns None on failure; stop here rather than
        # failing later with an unrelated AttributeError.
        if engine is None:
            print("Failed to create the database engine.")
            return

        # engine.begin() commits when the block exits cleanly. A plain
        # engine.connect() would roll the DDL back on SQLAlchemy 2.x.
        with engine.begin() as connection:
            # Set the search path to the silver schema
            connection.execute(text(f"SET search_path TO {silver_schema};"))
            # Execute the script to create views
            connection.execute(text(sql_script))

        print("Silver views created successfully.")
    except FileNotFoundError:
        print("create_silver_views.sql not found.")
    except Exception as e:
        print(f"Error executing create_silver_views.sql: {str(e)}")
    finally:
        # The engine is private to this call, so release its pooled connections.
        if engine is not None:
            engine.dispose()

## Transform

**[Based on the CSV Files]**

**table**
* xyz

In [None]:
# # Define function to transform raw data
# def transform_raw(data_frames, date_columns_map):
#     """
#     Transform multiple raw DataFrames extracted from CSV files.
#     Perform cleaning procedures: Convert specified date columns to pandas datetime and 
#     boolean columns to integer.

#     Args:
#         data_frames (dict): A dictionary where keys are table names and values are DataFrames 
#                             containing raw data extracted from CSV files.
#         date_columns_map (dict): A dictionary where keys are table names and values are 
#                                  column names to convert to pandas datetime.
#         boolean_columns_map (dict): (I omitted this argument as there is no boolean column at this time) A dictionary where keys are table names and values are 
#                                     column names to convert from boolean to integer.

#     Returns:
#         dict: A dictionary where keys are table names and values are cleaned DataFrames 
#               with the specified transformations applied.
#     """
#     # Create a dictionary where keys are table names and values are the clean DataFrames after performing specific transformations.
#     cleaned_data_frames = {}

#     # Iterating over all tables. If table is not in the dict returned by the extract() function, then it is skipped for transformation.
#     # df contains the actual data (in the DataFrame format) for each table
#     for table_name, df in data_frames.items():
    
#         try:
#             if table_name in date_columns_map:
#             # we specify the date columns in each table to perform transformations.
#                 # If a value of a key of date_columns_map is a single date column:
#                 #   Ex: date_columns == 'SignupDate', a string.
#                 # If a value of a key of date_columns_map is a list of date columns:
#                 #   Ex: date_columns == ['StartDate', 'EndDate'], a list.
#                 date_columns = date_columns_map[table_name] # accessing the value of the key 'table_name'
            
#             # Note: the transform_raw() can receive a list of date columns, so we need to ensure the date_columns variable
#             # is always treated as a list, even if a single date column name is provided.
#                 # If date_columns is a list, the condition is True
#                 # If date_columns is not a list, the condition is False, then it transforms it into a list.
#                 if not isinstance(date_columns, list):
#                     date_columns = [date_columns]

#                 # Iterate over a potential list of columns (either single or multiple), one by one making the transformation.
#                 for date_column in date_columns:
#                     # Check if the date column exists in the DataFrames that correspond to the data of each table. 
#                     if date_column not in df.columns:
#                         raise ValueError(f"Column '{date_column}' does not exist in the DataFrame for table '{table_name}'.")
                    
#                     # Format the date column to 'YYYY-MM-DD' format
#                     df[date_column] = pd.to_datetime(df[date_column]).dt.strftime('%Y-%m-%d')
#                     print(f"Successfully converted column '{date_column}' to 'YYYY-MM-DD' format for table '{table_name}'.")
#                     print(f"Data type after conversion: {df[date_column].dtype}")
                
#                 # Builds a DataFrame where date columns have been cleaned for each table, which is a key of this dict.
#                 # Each cleaned DataFrame is stored as a value of each table.
            
#             cleaned_data_frames[table_name] = df

#         except ValueError as ve:
#             print(ve)
#             # Indicates that an error occurred during the processing of the DataFrame for table_name and it
#             # sets to None to signify that the data transformation or cleaning for that table was unsuccessful.
#             cleaned_data_frames[table_name] = None
#         except Exception as e:
#             print(f"An error occurred when converting the date for table '{table_name}': {e}")
#             cleaned_data_frames[table_name] = None
            
#     # Returns a clean DataFrame when dates have been treated.
#     return cleaned_data_frames

## Load

In [None]:
# Function for ingesting from Bronze to Silver
def ingest_bronze_to_silver(connection_uri, bronze_schema, silver_schema, transformed_data_dfs):
    """
    Write transformed DataFrames into tables of the silver schema.

    The function first verifies that both schemas exist, then writes each
    DataFrame with an added ``inserted_at`` timestamp column. Each target
    table is fully replaced (``if_exists='replace'``).

    Args:
        connection_uri (str): SQLAlchemy connection URI for the target database.
        bronze_schema (str): Name of the bronze schema; only its existence is checked.
        silver_schema (str): Name of the schema the tables are written to.
        transformed_data_dfs (dict): Maps table names to DataFrames. Entries
            whose value is ``None`` are skipped. The DataFrames passed in
            are not modified.

    Returns:
        None. Progress and errors are reported with ``print``; exceptions
        are caught and not re-raised.
    """
    try:
        # Create database engine
        engine = create_db_engine(connection_uri)
        if engine is None:
            print("Failed to create the database engine.")
            return

        # Verify connection and schema existence
        with engine.connect() as connection:
            # Check if the bronze and silver schemas exist
            for schema_name in [bronze_schema, silver_schema]:
                result = connection.execute(
                    text("SELECT schema_name FROM information_schema.schemata WHERE schema_name = :schema"),
                    {"schema": schema_name}
                )
                schema_exists = result.fetchone() is not None
                if not schema_exists:
                    raise ValueError(f"Schema '{schema_name}' does not exist in the database.")
                print(f"Schema '{schema_name}' verified to exist.")

            # Set the search path to the silver schema. This affects only
            # `connection`; to_sql below is given `engine` and an explicit
            # schema, so it does not depend on it.
            connection.execute(text(f"SET search_path TO {silver_schema};"))
            print(f"Search path set to schema '{silver_schema}'.")

            # Iterate over transformed DataFrames and ingest data into the database
            for table_name, cleaned_data_df in transformed_data_dfs.items():
                if cleaned_data_df is None:
                    print(f"Skipping ingestion for table '{table_name}' due to previous errors.")
                    continue

                print(f"Ingesting data into {silver_schema}.{table_name}...")

                # Add the 'inserted_at' timestamp on a copy so the caller's
                # DataFrame is left untouched.
                stamped_df = cleaned_data_df.assign(
                    inserted_at=datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                )

                # Ingest data into the specified schema and table
                # Note: if_exists='replace' performs a full refresh of the table content (drop the table and ingest last updated data).
                # Note: use if_exists='append' if you want to append data for the specific table.
                stamped_df.to_sql(table_name, engine, schema=silver_schema, if_exists='replace', index=False)

                print(f"Transformed data ingested successfully into {silver_schema}.{table_name}.")

    # Handlers run from most specific to most general; the broad Exception
    # handler must come last or it shadows the ones after it.
    except FileNotFoundError:
        print("Ingest Function: Error - CSV file not found.")
    except ValueError as ve:
        print(f"ValueError: {str(ve)}")
    except SQLAlchemyError as e:
        print(f"Error occurred while connecting to the database or ingesting data: {str(e)}")
    except Exception as e:
        print(f"An unexpected error occurred: {str(e)}")

In [None]:
# Parameters for the Bronze-layer ingestion run
csv_folder_path = '/workspace/data/raw'
schema_name_bronze = 'bronze'
schema_name_silver = 'silver'

# Table names per schema
tables_in_bronze = [
    'customers',
    'dates',
    'product_usage',
    'products',
    'subscriptions',
    'support_interactions',
]
tables_in_silver = [
    'dim_customers',
    'dim_dates',
    'dim_products',
]
# tables_in_gold = ['customer_segmentation', 'churn_prediction']

# Date column(s) per table: a single column name or a list of names.
# Tables without an entry have no date columns listed here.
date_columns_map = dict(
    customers='SignupDate',
    dates='Date',
    subscriptions=['StartDate', 'EndDate'],
)

# Bronze layer: one call handles every table in tables_in_bronze.
# NOTE(review): ingest_csv_to_bronze is not defined in the cells shown here;
# it is assumed to be defined elsewhere - confirm before running.
ingest_csv_to_bronze(
    csv_folder_path,
    connection_uri,
    schema_name_bronze,
    tables_in_bronze,
    date_columns_map,
)
# Earlier per-table variant, kept for reference:
# for table_name in tables_in_bronze:
#     print(f"Ingesting '{table_name}' into {schema_name_bronze}...")
#     ingest_csv_to_bronze(csv_folder_path, connection_uri, schema_name_bronze, table_name, date_columns_map[table_name])

# Silver layer (not enabled yet)
# for table_name in tables_in_silver:
#     print(f"Ingesting '{table_name}' from {schema_name_bronze} to {schema_name_silver}...")
#     ingest_bronze_to_silver(connection_uri, schema_name_bronze, schema_name_silver, date_columns_map[table_name])

# Gold layer (not enabled yet)
# for table_name in tables_in_gold:
#     print(f"Ingesting '{table_name}' from {schema_name_silver} to {schema_name_gold}...")
#     ingest_silver_to_gold(connection_uri, schema_name_silver, schema_name_gold, date_columns_map[table_name])

# Printed whenever the ingestion call above returns without raising
print("All data ingested successfully into the Bronze Layer!")