## Importing Libraries

In [7]:
import os
from urllib.parse import quote_plus

import pandas as pd # Data Transformation
from dotenv import load_dotenv
from sqlalchemy import create_engine, text
from sqlalchemy.exc import SQLAlchemyError

## Extract

In [8]:
def extract(csv_file_path):
    """
    Read a raw CSV file into a pandas DataFrame.

    Args:
        csv_file_path: Local path to the CSV file.

    Returns:
        DataFrame with the raw CSV contents, or None when the file does not
        exist (an error message is printed in that case).
    """
    print("Extract Function.")
    try:
        # Load CSV data into a pandas DataFrame
        df = pd.read_csv(csv_file_path)
    except FileNotFoundError:
        print("Error: CSV file not found.")
        # Return explicitly: without this, `df` would be unbound below and the
        # function would raise UnboundLocalError instead of signalling failure.
        return None

    print("CSV file loaded successfully.")
    return df

## Transform

In [9]:
def transform_raw(df, date_column, boolean_column):
    """
    Create and return a clean DataFrame based on the raw DataFrame extracted from a CSV file.
    The dataset will be loaded in the first layer of a medallion architecture, the bronze layer.

    Two cleaning procedures are applied, in order:
      1. The date column is parsed and re-formatted as 'YYYY-MM-DD' strings.
      2. The boolean column is converted to integers (True -> 1, False -> 0).

    The input DataFrame is not modified; the transformations run on a copy.

    Args:
        df (DataFrame): The raw DataFrame extracted from a CSV file.
        date_column (str): The name of the column to format as 'YYYY-MM-DD' strings.
        boolean_column (str): The name of the column to convert from boolean to integer.

    Returns:
        DataFrame: A cleaned copy with both transformations applied, or None if
        either step fails (the reason is printed).
    """

    # Inner function to format the date column as 'YYYY-MM-DD' strings
    def convert_date_to_string(df, date_column):
        """
        Convert the specified date column in the DataFrame to a string format
        suitable for PostgreSQL date insertion.

        Args:
            df (DataFrame): The DataFrame containing the date column (modified in place).
            date_column (str): The name of the date column to be converted.

        Returns:
            df (DataFrame): The DataFrame with the date column converted to string
            format, or None if the column is missing or cannot be parsed.
        """
        try:
            if date_column not in df.columns:
                raise ValueError(f"Column '{date_column}' does not exist in the DataFrame.")

            # Format the date column to 'YYYY-MM-DD' format
            df[date_column] = pd.to_datetime(df[date_column]).dt.strftime('%Y-%m-%d')
            print(f"Successfully converted column '{date_column}' to 'YYYY-MM-DD' format.")
            print(f"Data type after conversion: {df[date_column].dtype}")

        except ValueError as ve:
            print(ve)
            # Signal failure so the caller's abort check actually triggers.
            return None
        except Exception as e:
            print(f"An error occurred when converting the date: {e}")
            return None
        return df

    # Inner function to convert boolean to integer
    def convert_boolean_to_integer(df, boolean_column):
        """
        Convert the specified boolean column to integers.

        Args:
            df (DataFrame): The DataFrame containing the boolean column (modified in place).
            boolean_column (str): The name of the boolean column to be converted.

        Returns:
            df (DataFrame): The DataFrame with the column converted to int, or None
            if the column is missing, is not boolean, or the cast fails.
        """
        try:
            if boolean_column not in df.columns:
                raise ValueError(f"Column '{boolean_column}' does not exist in the DataFrame.")

            # is_bool_dtype accepts both numpy bool and pandas' nullable boolean dtype.
            if not pd.api.types.is_bool_dtype(df[boolean_column]):
                raise TypeError(f"Column '{boolean_column}' is not of boolean type.")

            df[boolean_column] = df[boolean_column].astype(int)
            print(f"Successfully converted column '{boolean_column}' from boolean to integer.")

        except ValueError as ve:
            print(ve)
            return None
        except TypeError as te:
            print(te)
            return None
        except Exception as e:
            print(f"An unexpected error occurred during boolean to integer conversion: {e}")
            return None

        return df

    print("Transform Function.")

    if df is None:
        print("No DataFrame provided. Cleaning process aborted.")
        return None

    # Work on a copy so the caller's raw DataFrame stays untouched and a
    # failed run cannot leave it half-transformed.
    working_df = df.copy()

    # Call the convert_date_to_string function
    cleaned_data_df_1 = convert_date_to_string(working_df, date_column)
    if cleaned_data_df_1 is None:
        print("Error occurred during date conversion. Cleaning process aborted.")
        return None

    # Call the convert_boolean_to_integer function
    cleaned_data_final_df = convert_boolean_to_integer(cleaned_data_df_1, boolean_column)
    if cleaned_data_final_df is None:
        print("Error occurred during boolean to integer conversion. Cleaning process aborted.")
        return None

    # Only inspect dtypes once we know the conversion succeeded.
    print(cleaned_data_final_df.dtypes)

    return cleaned_data_final_df

## Load

In [10]:
# Load environment variables from .env file
load_dotenv()

# Retrieve individual components from environment variables
user = os.getenv('POSTGRES_USER')
password = os.getenv('POSTGRES_PASSWORD')
host = os.getenv('POSTGRES_HOST')
db_name = os.getenv('POSTGRES_DB')

# Fail fast and name the settings that are missing or empty, so the problem
# can be fixed without guessing which variable was forgotten.
required_settings = {
    'POSTGRES_USER': user,
    'POSTGRES_PASSWORD': password,
    'POSTGRES_HOST': host,
    'POSTGRES_DB': db_name,
}
missing_settings = [name for name, value in required_settings.items() if not value]
if missing_settings:
    raise ValueError(
        "One or more environment variables for the database connection are not set: "
        + ", ".join(missing_settings)
    )

# Construct the connection URI.
# User and password are URL-encoded because characters such as '@', ':' or '/'
# would otherwise be parsed as URL delimiters. Store the raw (unencoded)
# values in the .env file.
connection_uri = f"postgresql://{quote_plus(user)}:{quote_plus(password)}@{host}/{db_name}"

In [11]:
# Function that creates the SQLAlchemy database engine
def create_db_engine(connection_uri: str):
    """
    Build a SQLAlchemy engine for the given connection URI.

    Args:
        connection_uri (str): The connection URI for the database.

    Returns:
        Engine: The SQLAlchemy engine created from the URI, or None when engine
        creation raises (the error is printed rather than propagated).
    """
    try:
        engine = create_engine(connection_uri)
        print("Database engine created successfully.")
        return engine
    except Exception as exc:
        # SQLAlchemy's own errors get a dedicated message; anything else is
        # reported as unexpected. Both cases fall through to returning None.
        if isinstance(exc, SQLAlchemyError):
            message = f"Error occurred while creating the database engine: {str(exc)}"
        else:
            message = f"An unexpected error occurred: {str(exc)}"
        print(message)

    return None

In [12]:
def ingest_csv_to_bronze(csv_file_path, connection_uri, schema_name, table_name):
    """
    Run the extract -> transform -> load pipeline for one CSV file and append
    its rows to an existing table in the given schema.

    Errors are printed rather than raised; the function returns early when
    extraction, transformation or engine creation fails.

    Args:
        csv_file_path (str): Local path to the CSV file.
        connection_uri (str): The connection URI for the database.
        schema_name (str): Schema that must already exist in the database.
        table_name (str): Target table inside that schema; its columns must
            match the CSV columns.

    Returns:
        None
    """
    print("Ingest Function.")

    # Calling the Extract Function
    raw_data_df = extract(csv_file_path)
    if raw_data_df is None:
        return

    # Calling the Transformation Function
    cleaned_data_df = transform_raw(raw_data_df, 'last_purchase_date', 'churn_status')
    if cleaned_data_df is None:
        return

    # Create the database engine
    db_engine = create_db_engine(connection_uri)
    if db_engine is None:
        print("Failed to create the database engine.")
        return

    try:
        # Verify connection and schema existence
        with db_engine.connect() as connection:
            # Check if the schema exists
            result = connection.execute(
                text("SELECT schema_name FROM information_schema.schemata WHERE schema_name = :schema"),
                {"schema": schema_name}
            )
            schema_exists = result.fetchone() is not None
            if not schema_exists:
                raise ValueError(f"Schema '{schema_name}' does not exist in the database.")
            print(f"Schema '{schema_name}' verified to exist.")

            # Set the search path to the specified schema.
            # Identifiers cannot be bound parameters; schema_name was matched
            # against information_schema just above.
            connection.execute(text(f"SET search_path TO {schema_name};"))
            print(f"Search path set to schema '{schema_name}'.")

            # Build one parameterized INSERT and send all rows in a single
            # executemany call. Bound parameters let the driver handle quoting
            # (e.g. apostrophes in strings) and NULLs.
            # NOTE(review): table and column names are still interpolated as
            # identifiers - they must come from trusted configuration/data.
            column_names = [str(column) for column in cleaned_data_df.columns]
            placeholders = ", ".join(f":p{position}" for position in range(len(column_names)))
            insert_statement = text(
                f"INSERT INTO {table_name} ({', '.join(column_names)}) VALUES ({placeholders})"
            )

            # Box values as plain Python objects and map missing values
            # (NaN/NaT) to None so they are inserted as SQL NULL.
            boxed_df = cleaned_data_df.astype(object)
            rows = [
                {
                    f"p{position}": (None if pd.isna(value) else value)
                    for position, value in enumerate(record)
                }
                for record in boxed_df.itertuples(index=False, name=None)
            ]
            if rows:
                connection.execute(insert_statement, rows)

            print(f"CSV data ingested successfully into {schema_name}.{table_name}.")

            # Commit the transaction
            connection.commit()

            print("Transaction committed successfully!")

            # Query to verify the data was inserted
            verification_query = f"SELECT * FROM {table_name} LIMIT 5;"
            result = connection.execute(text(verification_query))
            data = result.fetchall()
            print(f"Verification Query Result: {data}")

    except SQLAlchemyError as e:
        print(f"Error occurred while connecting to the database or ingesting data: {str(e)}")
    except ValueError as ve:
        print(f"ValueError: {str(ve)}")
    except Exception as e:
        print(f"An unexpected error occurred: {str(e)}")
    finally:
        # Release pooled connections; the engine is local to this call.
        db_engine.dispose()

# Usage example: ingest the customer churn CSV into the bronze-layer table.
schema_name = 'churn_bronze'
table_name = 'customer_data'
csv_file_path = '/workspace/data/customer_churn.csv'

ingest_csv_to_bronze(
    csv_file_path=csv_file_path,
    connection_uri=connection_uri,
    schema_name=schema_name,
    table_name=table_name,
)

Ingest Function.
Extract Function.
CSV file loaded successfully.
Transform Function.
Successfully converted column 'last_purchase_date' to 'YYYY-MM-DD' format.
Data type after conversion: object
Successfully converted column 'churn_status' from boolean to integer.
customer_id            int64
age                    int64
gender                object
total_transactions     int64
last_purchase_date    object
churn_status           int64
dtype: object
Database engine created successfully.
Schema 'churn_bronze' verified to exist.
Search path set to schema 'churn_bronze'.
CSV data ingested successfully into churn_bronze.customer_data.
Transaction committed successfully!
Verification Query Result: [(1, 25, 'Male', 10, datetime.date(2022, 5, 15), 0), (2, 35, 'Female', 15, datetime.date(2022, 6, 20), 1), (3, 40, 'Male', 20, datetime.date(2022, 7, 10), 0), (4, 30, 'Female', 8, datetime.date(2022, 8, 5), 1), (5, 45, 'Male', 12, datetime.date(2022, 9, 12), 0)]
