## Importing Libraries

In [1]:
import os
from urllib.parse import quote_plus

import pandas as pd # Data Transformation
from dotenv import load_dotenv
from sqlalchemy import create_engine, text
from sqlalchemy.exc import SQLAlchemyError

# ETL Pipeline

## Extract

In [2]:
def extract(csv_folder_path):
    """
    Extract data from all CSV files in a folder, one by one.

    Args:
    - csv_folder_path (str): Path to the folder containing CSV files.

    Returns:
    - dict or None: A dictionary where keys are table names (the CSV file name
      without its extension) and values are DataFrames containing the data from
      each CSV file. A file that cannot be read is kept in the dictionary with
      the value None so callers can see which table failed. Returns None when
      the path is not an existing folder or the folder holds no '.csv' files.
    """
    print("Extract Function.")
    # isdir (not exists): a path pointing at a regular file would otherwise
    # pass the check and make os.listdir raise NotADirectoryError.
    if not os.path.isdir(csv_folder_path):
        print(f"Folder '{csv_folder_path}' does not exist.")
        return None

    # os.listdir returns entries in arbitrary order; sort them so the log output
    # and the dictionary order are the same on every run.
    csv_files = sorted(f for f in os.listdir(csv_folder_path) if f.endswith('.csv'))
    if not csv_files:
        print(f"No CSV files found in folder '{csv_folder_path}'.")
        return None

    data_frames = {}

    for csv_file in csv_files:
        # The table name is the CSV file name without its extension
        table_name = os.path.splitext(csv_file)[0]
        file_path = os.path.join(csv_folder_path, csv_file)
        try:
            data_frames[table_name] = pd.read_csv(file_path)
            print(f"CSV file '{csv_file}' loaded successfully.")
        except Exception as e:
            # Best effort: report the failure, mark the table as None and keep going
            print(f"Error reading CSV file '{csv_file}': {str(e)}")
            data_frames[table_name] = None

    return data_frames

In [3]:
# # Test Extraction
# csv_folder_path = '/workspace/data/raw'
# raw_data_dfs = extract(csv_folder_path)

# if raw_data_dfs is None:
#     # Handle case where extraction fails
#     print("Extraction failed.")
# else:
#     # Process extracted DataFrames
#     for table_name, df in raw_data_dfs.items():
#         if df is None:
#             print(f"Error: DataFrame for '{table_name}' is None.")
#         else:
#             print(f"\nDataFrame for '{table_name}':")
#             print(df.head(2))  # Display first few rows as a check

## Transform

**[CSV Files]**

**Customers**
* SignupDate: will be converted to datetime (to be used in PostgreSQL).

**Subscriptions**
* StartDate,EndDate: will be converted to datetime (to be used in PostgreSQL).
* Status: will be converted from boolean to integer (to be used in PostgreSQL).

**Product Usage**
* DateID: does not need to be converted to datetime for PostgreSQL because it is not used in the transformation layer.

**Support Interactions**
* DateID: does not need to be converted to datetime for PostgreSQL because it is not used in the transformation layer.

**Dates**
* DateID: does not need to be converted to datetime for PostgreSQL because it is not used in the transformation layer.

In [4]:
def transform_raw(data_frames, date_columns_map):
    """
    Transform multiple raw DataFrames extracted from CSV files.

    For every table that has an entry in ``date_columns_map``, the listed date
    columns are parsed with ``pd.to_datetime`` and re-formatted as 'YYYY-MM-DD'
    strings (the resulting column dtype is ``object``, not datetime). The
    conversion is applied to a copy, so the DataFrames in ``data_frames`` are
    never modified, even when a later column fails to convert.

    Args:
        data_frames (dict): A dictionary where keys are table names and values are
                            DataFrames containing raw data extracted from CSV files.
                            A value may be None (a table whose CSV could not be read).
        date_columns_map (dict): A dictionary where keys are table names and values are
                                 the date column name (str) or names (list of str)
                                 to convert.

    Returns:
        dict: A dictionary where keys are table names and values are cleaned DataFrames.
              - Tables without an entry in ``date_columns_map`` are skipped (absent).
              - Tables that raise ValueError (missing column, or a ValueError raised
                by ``pd.to_datetime``) are reported and absent from the result.
              - Tables whose input DataFrame is None, or that fail with any other
                exception, are present with the value None.
    """
    cleaned_data_frames = {}

    print("Transform Function.")
    for table_name, df in data_frames.items():
        if table_name not in date_columns_map:
            print(f"Skipping table '{table_name}' due to missing date column mapping.")
            continue

        if df is None:
            # Nothing to convert; keep the None marker so the caller can see
            # that this table has no usable data.
            print(f"No DataFrame available for table '{table_name}'; nothing to transform.")
            cleaned_data_frames[table_name] = None
            continue

        date_columns = date_columns_map[table_name]
        # Accept a single column name as well as a list of names
        if not isinstance(date_columns, list):
            date_columns = [date_columns]

        try:
            # Work on a copy so a failure half-way through the column list does
            # not leave the caller's raw DataFrame partially converted.
            cleaned_df = df.copy()
            for date_column in date_columns:
                if date_column not in cleaned_df.columns:
                    raise ValueError(f"Column '{date_column}' does not exist in the DataFrame for table '{table_name}'.")

                # Format the date column to 'YYYY-MM-DD' format
                cleaned_df[date_column] = pd.to_datetime(cleaned_df[date_column]).dt.strftime('%Y-%m-%d')
                print(f"Successfully converted column '{date_column}' to 'YYYY-MM-DD' format for table '{table_name}'.")
                print(f"Data type after conversion: {cleaned_df[date_column].dtype}")

            cleaned_data_frames[table_name] = cleaned_df

        except ValueError as ve:
            # Reported only; the table is left out of the result
            print(ve)
        except Exception as e:
            print(f"An error occurred when converting the date for table '{table_name}': {e}")
            cleaned_data_frames[table_name] = None

    return cleaned_data_frames

In [5]:
# # Test the transform function

# csv_folder_path = '/workspace/data/raw'
# raw_data_dfs = extract(csv_folder_path)
# date_columns_map = {'customers': 'SignupDate', 'subscriptions': ['StartDate', 'EndDate']}
# # There is no boolean column at this time
# # boolean_columns_map = {'subscriptions': 'Status'}

# transformed_data_dfs = transform_raw(raw_data_dfs, date_columns_map)

# if transformed_data_dfs is None:
#     print("Error occurred during transformation. Processing aborted.")
# else:
#     for table_name, df in transformed_data_dfs.items():
#         print(f"Transformed DataFrame for table '{table_name}':")
#         print(df.head(2))  # Display first few rows as a check


## Load

In [6]:
# Load environment variables from .env file
load_dotenv()

# Retrieve individual components from environment variables
user = os.getenv('POSTGRES_USER')
password = os.getenv('POSTGRES_PASSWORD')
host = os.getenv('POSTGRES_HOST')
db_name = os.getenv('POSTGRES_DB')

# Fail fast and name the variables that are missing or empty, so the problem
# can be fixed without guessing which of the four is at fault.
_required_env = {
    'POSTGRES_USER': user,
    'POSTGRES_PASSWORD': password,
    'POSTGRES_HOST': host,
    'POSTGRES_DB': db_name,
}
_missing_env = [name for name, value in _required_env.items() if not value]
if _missing_env:
    raise ValueError(
        "One or more environment variables for the database connection are not set: "
        + ", ".join(_missing_env)
    )

# Construct the connection URI.
# The user and password are URL-encoded because characters such as '@', ':' or
# '/' inside them would otherwise be parsed as URI delimiters.
# NOTE: the values in .env must be stored raw (not already URL-encoded).
connection_uri = f"postgresql://{quote_plus(user)}:{quote_plus(password)}@{host}/{db_name}"

In [7]:
# Factory for the SQLAlchemy engine used by the load step
def create_db_engine(connection_uri: str):
    """
    Build a SQLAlchemy engine for the given connection URI.

    Failures are reported by printing a message; no exception is propagated.

    Args:
        connection_uri (str): The connection URI for the database.

    Returns:
        Engine or None: The engine returned by ``create_engine`` on success,
        or None when creating it raised an exception.
    """
    engine = None
    try:
        engine = create_engine(connection_uri)
        print("Database engine created successfully.")
    except SQLAlchemyError as sa_error:
        # Errors raised by SQLAlchemy itself
        print(f"Error occurred while creating the database engine: {str(sa_error)}")
        engine = None
    except Exception as unexpected:
        # Anything else is reported the same way instead of being raised
        print(f"An unexpected error occurred: {str(unexpected)}")
        engine = None

    return engine

In [8]:
# def ingest_csv_to_bronze(csv_folder_path, connection_uri, schema_name, table_name, date_columns):
#     """
#     Extract, transform, and ingest CSV data into the Bronze layer of a PostgreSQL database.

#     Args:
#         csv_folder_path (str): Path to the folder containing CSV files.
#         connection_uri (str): Connection URI for the PostgreSQL database.
#         schema_name (str): Name of the schema in which tables exist or will be created.
#         table_name (str): Name of the table to ingest data into.
#         date_columns (list or str): Name(s) of the column(s) to convert to pandas datetime.

#     Returns:
#         None
#     """
#     # Calling the Extract Function for all CSV files
#     print("Ingest Function.")
#     raw_data_dfs = extract(csv_folder_path)

#     if raw_data_dfs is None:
#         # Handle case where extraction fails
#         print("Extraction failed.")
#     else:
#         # Process extracted DataFrames
#         for table_name, df in raw_data_dfs.items():
#             if df is None:
#                 print(f"Error: DataFrame for '{table_name}' is None.")
#             else:
#                 print(f"\n (CSV) Extracted DataFrame for '{table_name}':")
#                 print(df.head(2))  # Display first few rows as a check

#     # Calling the Transformation Function
#     print("Transformation Function.")
#     # There is no boolean column at this time
#     # boolean_columns_map = {'subscriptions': 'Status'}
    
#     transformed_data_dfs = transform_raw(raw_data_dfs, {table_name: date_columns})

#     if transformed_data_dfs is None:
#         print("Error occurred during transformation. Processing aborted.")
#     else:
#         for table_name, df in transformed_data_dfs.items():
#             print(f"Transformed DataFrame for table '{table_name}':")
#             print(df.head(2))  # Display first few rows as a check

#     try:  
#         # Create the database engine
#         db_engine = create_db_engine(connection_uri)
#         if db_engine is None:
#             print("Failed to create the database engine.")
#             return
        
#         # Verify connection and schema existence
#         with db_engine.connect() as connection:
#             # Check if the schema exists
#             result = connection.execute(
#                 text(f"SELECT schema_name FROM information_schema.schemata WHERE schema_name = :schema"),
#                 {"schema": schema_name}
#             )
#             schema_exists = result.fetchone() is not None
#             if not schema_exists:
#                 raise ValueError(f"Schema '{schema_name}' does not exist in the database.")
#             print(f"Schema '{schema_name}' verified to exist.")

#             # Set the search path to the specified schema
#             connection.execute(text(f"SET search_path TO {schema_name};"))
#             print(f"Search path set to schema '{schema_name}'.")

#             # Iterate over transformed DataFrames and ingest data into the database
#             for table_name, cleaned_data_df in transformed_data_dfs.items():
#                 print(f"Ingesting data into {schema_name}.{table_name}...")
            
#             # Generate and execute SQL insert statements for each row in the DataFrame
#             for index, row in cleaned_data_df.iterrows():
#                 values = ", ".join([f"'{value}'" if isinstance(value, str) else str(value) for value in row.values])
#                 insert_statement = f"INSERT INTO {table_name} ({', '.join(cleaned_data_df.columns)}) VALUES ({values});"
#                 connection.execute(text(insert_statement))

#             print(f"CSV data ingested successfully into {schema_name}.{table_name}.")

#             # Commit the transaction
#             connection.commit()
#             print("Transaction committed successfully!")

#             # Query to verify the data was inserted
#             for table_name in transformed_data_dfs.keys():
#                 verification_query = f"SELECT * FROM {table_name} LIMIT 2;"
#                 result = connection.execute(text(verification_query))
#                 data = result.fetchall()
#                 print(f"Verification Query Result for table '{table_name}': {data}")

#     except FileNotFoundError:
#         print("Ingest Function: Error - CSV file not found.")
#     except SQLAlchemyError as e:
#         print(f"Error occurred while connecting to the database or ingesting data: {str(e)}")
#     except ValueError as ve:
#         print(f"ValueError: {str(ve)}")
#     except Exception as e:
#         print(f"An unexpected error occurred: {str(e)}")

In [9]:
def ingest_csv_to_bronze(csv_folder_path, connection_uri, schema_name, table_name, date_columns):
    """
    Extract, transform, and ingest CSV data into the Bronze layer of a PostgreSQL database.

    All CSV files in ``csv_folder_path`` are extracted, but only ``table_name`` is
    passed on to the database: ``transform_raw`` is called with the single-entry
    mapping ``{table_name: date_columns}`` and skips every other table.

    Rows are inserted with bound parameters (one executemany call per table), so
    values containing quotes are stored verbatim and missing values (NaN/NaT/None)
    are sent as SQL NULL.

    Args:
        csv_folder_path (str): Path to the folder containing CSV files.
        connection_uri (str): Connection URI for the PostgreSQL database.
        schema_name (str): Name of an existing schema that contains the target table.
        table_name (str): Name of the table (and of the CSV file without extension) to ingest.
        date_columns (list or str): Name(s) of the column(s) to format as 'YYYY-MM-DD'.

    Returns:
        None. Every failure is reported by printing a message; no exception is propagated.
    """
    try:
        # Calling the Extract Function for all CSV files
        print("Ingest Function.")
        raw_data_dfs = extract(csv_folder_path)

        if raw_data_dfs is None:
            # Handle case where extraction fails
            print("Extraction failed.")
            return

        # Calling the Transformation Function
        print("Transformation Function.")
        transformed_data_dfs = transform_raw(raw_data_dfs, {table_name: date_columns})

        if transformed_data_dfs is None:
            print("Error occurred during transformation. Processing aborted.")
            return

        # Create the database engine
        db_engine = create_db_engine(connection_uri)
        if db_engine is None:
            print("Failed to create the database engine.")
            return

        with db_engine.connect() as connection:
            # Check if the schema exists
            result = connection.execute(
                text("SELECT schema_name FROM information_schema.schemata WHERE schema_name = :schema"),
                {"schema": schema_name}
            )
            schema_exists = result.fetchone() is not None
            if not schema_exists:
                raise ValueError(f"Schema '{schema_name}' does not exist in the database.")
            print(f"Schema '{schema_name}' verified to exist.")

            # Set the search path to the specified schema (identifiers cannot be
            # bound as parameters; the schema was verified to exist just above)
            connection.execute(text(f"SET search_path TO {schema_name};"))
            print(f"Search path set to schema '{schema_name}'.")

            # Iterate over transformed DataFrames and ingest data into the database.
            # The loop variable is deliberately not called `table_name`, so the
            # function argument is not overwritten.
            for target_table, cleaned_data_df in transformed_data_dfs.items():
                if cleaned_data_df is None:
                    # transform_raw stores None for a table it could not clean
                    print(f"Skipping '{target_table}': no cleaned DataFrame is available.")
                    continue

                print(f"Ingesting data into {schema_name}.{target_table}...")

                column_names = [str(column) for column in cleaned_data_df.columns]
                # Positional bind names (p0, p1, ...) keep the statement valid even
                # when a column name is not usable as a bind-parameter name.
                bind_names = [f"p{position}" for position in range(len(column_names))]
                insert_statement = text(
                    f"INSERT INTO {target_table} ({', '.join(column_names)}) "
                    f"VALUES ({', '.join(':' + name for name in bind_names)})"
                )

                # itertuples keeps each column's own type (iterrows would upcast
                # mixed rows); missing values become None so they are sent as NULL.
                records = [
                    {name: (None if pd.isna(value) else value) for name, value in zip(bind_names, row)}
                    for row in cleaned_data_df.itertuples(index=False, name=None)
                ]

                # An empty parameter list must not be executed: the statement would
                # then run once without values for its bind parameters.
                if records:
                    connection.execute(insert_statement, records)

                print(f"CSV data ingested successfully into {schema_name}.{target_table}.")

                # Commit the transaction
                connection.commit()
                print("Transaction committed successfully!")

                # Query to verify the data was inserted
                verification_query = f"SELECT * FROM {target_table} LIMIT 2;"
                result = connection.execute(text(verification_query))
                data = result.fetchall()
                print(f"Verification Query Result for table '{target_table}': {data}")

    except FileNotFoundError:
        print("Ingest Function: Error - CSV file not found.")
    except SQLAlchemyError as e:
        print(f"Error occurred while connecting to the database or ingesting data: {str(e)}")
    except ValueError as ve:
        print(f"ValueError: {str(ve)}")
    except Exception as e:
        print(f"An unexpected error occurred: {str(e)}")

In [10]:
# Bronze Ingestion parameters
csv_folder_path = '/workspace/data/raw'
schema_name_bronze = 'bronze'
# schema_name_silver = 'silver'
# schema_name_gold = 'gold'

# Tables in each schema
tables_in_bronze = ['customers', 'subscriptions']
# tables_in_silver = ['customer_transactions', 'customer_activity']
# tables_in_gold = ['customer_segmentation', 'churn_prediction']

# Define date columns for each table
# (every table listed in tables_in_bronze needs an entry here; the loop below
# looks the table up directly and raises KeyError otherwise)
date_columns_map = {
    'customers': 'SignupDate',
    'subscriptions': ['StartDate', 'EndDate']
}

# Ingest data into Bronze layer
for table_name in tables_in_bronze:
    print(f"Ingesting '{table_name}' into {schema_name_bronze}...")
    ingest_csv_to_bronze(csv_folder_path, connection_uri, schema_name_bronze, table_name, date_columns_map[table_name])

# Ingest data into Silver layer
# for table_name in tables_in_silver:
#     print(f"Ingesting '{table_name}' from {schema_name_bronze} to {schema_name_silver}...")
#     ingest_bronze_to_silver(connection_uri, schema_name_bronze, schema_name_silver, date_columns_map[table_name])

# Ingest data into Gold layer
# for table_name in tables_in_gold:
#     print(f"Ingesting '{table_name}' from {schema_name_silver} to {schema_name_gold}...")
#     ingest_silver_to_gold(connection_uri, schema_name_silver, schema_name_gold, date_columns_map[table_name])

# ingest_csv_to_bronze reports failures only by printing and returns None in
# every case, so this cell cannot know whether all tables were loaded. The
# closing message therefore does not claim success.
print("Bronze layer ingestion run finished. Review the log above for any reported errors.")

Ingesting 'customers' into bronze...
Ingest Function.
Extract Function.
CSV file 'customers.csv' loaded successfully.
CSV file 'dates.csv' loaded successfully.
CSV file 'product_usage.csv' loaded successfully.
CSV file 'subscriptions.csv' loaded successfully.
CSV file 'support_interactions.csv' loaded successfully.
Transformation Function.
Transform Function.
Successfully converted column 'SignupDate' to 'YYYY-MM-DD' format for table 'customers'.
Data type after conversion: object
Skipping table 'dates' due to missing date column mapping.
Skipping table 'product_usage' due to missing date column mapping.
Skipping table 'subscriptions' due to missing date column mapping.
Skipping table 'support_interactions' due to missing date column mapping.
Database engine created successfully.
Schema 'bronze' verified to exist.
Search path set to schema 'bronze'.
Ingesting data into bronze.customers...
CSV data ingested successfully into bronze.customers.
Transaction committed successfully!
Verifica