# ETL Pipeline

## Importing Libraries

In [1]:
import pandas as pd # Data Transformation
from datetime import datetime
import os
from dotenv import load_dotenv
from sqlalchemy import create_engine, text
from sqlalchemy.exc import SQLAlchemyError

## Setting Up PostgreSQL Connection

### Loading Environmental Variables

In [2]:
# Load environment variables from .env file
load_dotenv()

# Retrieve individual components from environment variables
user = os.getenv('POSTGRES_USER')
password = os.getenv('POSTGRES_PASSWORD')
host = os.getenv('POSTGRES_HOST')
db_name = os.getenv('POSTGRES_DB')

# Ensure the connection URI is retrieved successfully
if not all([user, password, host, db_name]):
    raise ValueError("One or more environment variables for the database connection are not set")

# Construct the connection URI
connection_uri = f"postgresql://{user}:{password}@{host}/{db_name}"

# Ensure the connection URI is retrieved successfully
if connection_uri is None:
    raise ValueError("DATABASE_URL environment variable is not set")

### Creating a PostgreSQL Connection Engine with SQLAlchemy

In [3]:
# Define function to create an SQLAlchemy engine
def create_db_engine(connection_uri: str):
    """
    Create and return a SQLAlchemy engine based on the provided connection URI.

    Args:
        connection_uri (str): The connection URI for the database.

    Returns:
        Engine: A SQLAlchemy engine connected to the specified database.
    """
    try:
        db_engine = create_engine(connection_uri)
        print("Database engine created successfully.")
    except SQLAlchemyError as e:
        print(f"Error occurred while creating the database engine: {str(e)}")
        return None
    # Log or handle the error as needed
    except Exception as e:
        print(f"An unexpected error occurred: {str(e)}")
        return None
    
    
    return db_engine

## Extract

In [4]:
# Define function to extract data from CSV files
def extract(csv_folder_path):
    """
    Extract data from all CSV files in a folder, one by one.
    
    Args:
    - csv_folder_path (str): Path to the folder containing CSV files.
    
    Returns:
    - dict: A dictionary where keys are table names and values are DataFrames containing data from each CSV file.
    """
    # Test if a folder path exists
    if not os.path.exists(csv_folder_path):
        print(f"Folder '{csv_folder_path}' does not exist.")
        return None
    
    # Create a list of CSV files in the designated folder
    csv_files = [f for f in os.listdir(csv_folder_path) if f.endswith('.csv')]
    if not csv_files:
        print(f"No CSV files found in folder '{csv_folder_path}'.")
        return None
    
    # Create a dictionary where keys are table names and values are DataFrames containing data from each CSV file
    # This allows us to iterate over all the tables and perform specific transformations in the transform_raw() function  
    data_frames = {}

    # Iterating over each CSV file in the folder
    for csv_file in csv_files:
        # Separate the file name from the extension and store it
        table_name = os.path.splitext(csv_file)[0]  # Assuming table name is CSV filename without extension
        # Join CSV folder path with the CSV file name, inserting '/' as needed
        file_path = os.path.join(csv_folder_path, csv_file)
        try:
            df = pd.read_csv(file_path)
            print(f"-> CSV file '{csv_file}' loaded successfully.")
            
            # Add 'extracted_at' column with current timestamp
            df['extracted_at'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            
            # Store the CSV in DataFrame format as a value of the dictionary's key
            data_frames[table_name] = df
        except Exception as e:
            print(f"Error reading CSV file '{csv_file}': {str(e)}")
            data_frames[table_name] = None
    
    # Return the dictionary
    return data_frames

## Transform

**[Based on the CSV Files]**

**Customers**
* SignupDate: will be converted to datetime (to be used in PostgreSQL).

**Dates**
* DateID : will be converted from boolean to integer (to be used in PostgreSQL).

**Product Usage**
* Date column not available.

**Subscriptions**
* StartDate,EndDate: will be converted to datetime (to be used in PostgreSQL).
* Status: will be converted from boolean to integer (to be used in PostgreSQL).

**Support Interactions**
* Date column not available.

In [6]:
# Define function to transform raw data
def transform_raw(data_frames, date_columns_map):
    """
    Transform multiple raw DataFrames extracted from CSV files.
    Perform cleaning procedures: Convert specified date columns to pandas datetime and 
    boolean columns to integer.

    Args:
        data_frames (dict): A dictionary where keys are table names and values are DataFrames 
                            containing raw data extracted from CSV files.
        date_columns_map (dict): A dictionary where keys are table names and values are 
                                 column names to convert to pandas datetime.
        boolean_columns_map (dict): (I ommited this argument as there is no boolean column at this time) A dictionary where keys are table names and values are 
                                    column names to convert from boolean to integer.

    Returns:
        dict: A dictionary where keys are table names and values are cleaned DataFrames 
              with the specified transformations applied.
    """
    # Create a dictionary where keys are table names and values are the clean DataFrames after performing specific transformations.
    cleaned_data_frames = {}

    # Iterating over all tables. If table is not in the dict returned by the extract() function, then it is skipped for transformation.
    # df contains the actual data (in the DataFrame format) for each table
    for table_name, df in data_frames.items():
    
        try:
            if table_name in date_columns_map:
            # we specify the date columns in each table to perform transformations.
                # If a value of a key of date_columns_map is a single date column:
                #   Ex: date_columns == 'SignupDate', a string.
                # If a value of a key of date_columns_map is a list of date columns:
                #   Ex: date_columns == ['StartDate', 'EndDate'], a list.
                date_columns = date_columns_map[table_name] # accessing the value of the key 'table_name'
            
            # Note: the transform_raw() can receive a list of date columns, so we need to ensure the date_columns variable
            # is always treated as a list, even if a single date column name is provided.
                # If date_columns is a list, the condition is True
                # If date_columns is not a list, the condition is False, then it transforms it into a list.
                if not isinstance(date_columns, list):
                    date_columns = [date_columns]

                # Iterate over a potential list of columns (either single or multiple), one by one making the transformation.
                for date_column in date_columns:
                    # Check if the date column exists in the DataFrames that correspond to the data of each table. 
                    if date_column not in df.columns:
                        raise ValueError(f"Column '{date_column}' does not exist in the DataFrame for table '{table_name}'.")
                    
                    # Format the date column to 'YYYY-MM-DD' format
                    df[date_column] = pd.to_datetime(df[date_column]).dt.strftime('%Y-%m-%d')
                    print(f"Successfully converted column '{date_column}' to 'YYYY-MM-DD' format for table '{table_name}'.")
                    print(f"Data type after conversion: {df[date_column].dtype}")
                
                # Builds a DataFrame where date columns have been cleaned for each table, which is a key of this dict.
                # Each cleaned DataFrame is stored as a value of each table.
            
            cleaned_data_frames[table_name] = df

        except ValueError as ve:
            print(ve)
            # Indicates that an error occurred during the processing of the DataFrame for table_name and it
            # sets to None to signify that the data transformation or cleaning for that table was unsuccessful.
            cleaned_data_frames[table_name] = None
        except Exception as e:
            print(f"An error occurred when converting the date for table '{table_name}': {e}")
            cleaned_data_frames[table_name] = None
            
    # Returns a clean DataFrame when dates have been treated.
    return cleaned_data_frames

## Load

In [7]:
# Define function to ingest CSV data into the Bronze layer using DataFrame.to_sql
# Note: we use if_exists='replace'. This performs a full refresh of the table content (drop the table and ingest last updated data).
# Note: use if_exists='append' is you want to append data for the specific table.
def ingest_csv_to_bronze(csv_folder_path, connection_uri, schema_name, table_name, date_columns_map):
    """
    Extract, transform, and ingest CSV data into the Bronze layer of a PostgreSQL database.

    Args:
        csv_folder_path (str): Path to the folder containing CSV files.
        connection_uri (str): Connection URI for the PostgreSQL database.
        schema_name (str): Name of the schema in which tables exist or will be created.
        table_name (str): Name of the table to ingest data into.
        date_columns (list or str): Name(s) of the column(s) to convert to pandas datetime.

    Returns:
        None
    """

    print("Ingest Function.")

    # Calling the Extract Function for all CSV files
    # Returns: raw_data_dfs, a dict where keys are table names and values are DataFrames with raw data extracted from CSV files.
    print("Extract Function.")
    raw_data_dfs = extract(csv_folder_path)
    if raw_data_dfs is None:
        # Handle case where extraction fails
        print("Extraction failed.")
        return

    # Calling the Transformation Function
    # For each table stored in raw_data_dfs, the transform_raw() perform specific date transformation for each date column.
    # The date_columns_map is a dict where the key is the table name and the values are single date columns (in string format)
    # or multiple columns (in list format). Recall that the transform_raw() also ensures that the date columns are always treated
    # as a list for consistency.
    # It returns transformed_data_dfs, where keys are table names and values are the cleaned DataFrames.
    print("Transformation Function.")
    transformed_data_dfs = transform_raw(raw_data_dfs, date_columns_map)
    if transformed_data_dfs is None:
        # Handle case where transformation fails
        print("Error occurred during transformation. Processing aborted.")
        return

    try:
        # Create the database engine
        db_engine = create_db_engine(connection_uri)
        if db_engine is None:
            print("Failed to create the database engine.")
            return

        # Verify connection and schema existence
        with db_engine.connect() as connection:
            # Check if the schema exists
            result = connection.execute(
                text(f"SELECT schema_name FROM information_schema.schemata WHERE schema_name = :schema"),
                {"schema": schema_name}
            )
            schema_exists = result.fetchone() is not None
            if not schema_exists:
                raise ValueError(f"Schema '{schema_name}' does not exist in the database.")
            print(f"Schema '{schema_name}' verified to exist.")

            # Set the search path to the specified schema
            connection.execute(text(f"SET search_path TO {schema_name};"))
            print(f"Search path set to schema '{schema_name}'.")
            
            # Iterate over transformed DataFrames and ingest data into the database
            for table_name, cleaned_data_df in transformed_data_dfs.items():
                if cleaned_data_df is None:
                    print(f"Skipping ingestion for table '{table_name}' due to previous errors.")
                    continue
                
                print(f"Ingesting data into {schema_name}.{table_name}...")

                # Add 'inserted_at' timestamp column
                cleaned_data_df['inserted_at'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

                # Ingest data into the specified schema and table
                # Note: if_exists='replace' performs a full refresh of the table content (drop the table and ingest last updated data).
                # Note: use if_exists='append' is you want to append data for the specific table.
                cleaned_data_df.to_sql(table_name, db_engine, schema=schema_name, if_exists='replace', index=False)

                print(f"CSV data ingested successfully into {schema_name}.{table_name}.")

    except FileNotFoundError:
        print("Ingest Function: Error - CSV file not found.")
    except SQLAlchemyError as e:
        print(f"Error occurred while connecting to the database or ingesting data: {str(e)}")
    except ValueError as ve:
        print(f"ValueError: {str(ve)}")
    except Exception as e:
        print(f"An unexpected error occurred: {str(e)}")

In [8]:
# Bronze Ingestion parameters
csv_folder_path = '/workspace/data/raw'
schema_name_bronze = 'bronze'

# Tables in each schema
tables_in_bronze = ['customers', 'dates', 'product_usage', 'products', 'subscriptions', 'support_interactions']
# tables_in_silver = ['customer_transactions', 'customer_activity']
# tables_in_gold = ['customer_segmentation', 'churn_prediction']

# Define date columns for each table
date_columns_map = {
    'customers': 'SignupDate',
    'dates': 'Date',
    'subscriptions': ['StartDate', 'EndDate'],
}

# Ingest data into Bronze layer
ingest_csv_to_bronze(csv_folder_path, connection_uri, schema_name_bronze, tables_in_bronze, date_columns_map)
# for table_name in tables_in_bronze:
#     print(f"Ingesting '{table_name}' into {schema_name_bronze}...")
#     ingest_csv_to_bronze(csv_folder_path, connection_uri, schema_name_bronze, table_name, date_columns_map[table_name])

# Ingest data into Silver layer
# for table_name in tables_in_silver:
#     print(f"Ingesting '{table_name}' from {schema_name_bronze} to {schema_name_silver}...")
#     ingest_bronze_to_silver(connection_uri, schema_name_bronze, schema_name_silver, date_columns_map[table_name])

# Ingest data into Gold layer
# for table_name in tables_in_gold:
#     print(f"Ingesting '{table_name}' from {schema_name_silver} to {schema_name_gold}...")
#     ingest_silver_to_gold(connection_uri, schema_name_silver, schema_name_gold, date_columns_map[table_name])

print("All data ingested successfully into the Bronze Layer!")

Ingest Function.
Extract Function.
-> CSV file 'customers.csv' loaded successfully.
-> CSV file 'dates.csv' loaded successfully.
-> CSV file 'products.csv' loaded successfully.
-> CSV file 'product_usage.csv' loaded successfully.
-> CSV file 'subscriptions.csv' loaded successfully.
-> CSV file 'support_interactions.csv' loaded successfully.
Transformation Function.
Successfully converted column 'SignupDate' to 'YYYY-MM-DD' format for table 'customers'.
Data type after conversion: object
Successfully converted column 'Date' to 'YYYY-MM-DD' format for table 'dates'.
Data type after conversion: object
Successfully converted column 'StartDate' to 'YYYY-MM-DD' format for table 'subscriptions'.
Data type after conversion: object
Successfully converted column 'EndDate' to 'YYYY-MM-DD' format for table 'subscriptions'.
Data type after conversion: object
Database engine created successfully.
Schema 'bronze' verified to exist.
Search path set to schema 'bronze'.
Ingesting data into bronze.custom