## Data Ingestion

After our data is loaded into the database, the next step is to ingest it. To simplify the workflow I will ingest the whole database, but you could also ingest only what was loaded by the load component of the ETL pipeline; this approach would allow you to perform online learning. With the approach I will implement, we can train several models with different amounts of data.

In [1]:
import os
# Move the working directory one level up. Presumably this makes the project
# root the current directory so that the relative config paths and the `src`
# package imports used in later cells resolve -- TODO confirm.
# NOTE(review): re-running this cell moves up one more level each time.
os.chdir("../")

### Create the entity configuration class for data ingestion

In [2]:
from dataclasses import dataclass
from pathlib import Path

@dataclass
class DataIngestionConfig:
    """
    Settings consumed by the data ingestion step.

    Attributes:
        root_dir (Path): Folder that holds the ingestion artifacts.
        local_data_file (Path): Location of the file the ingested data is
            written to.
    """

    root_dir: Path
    local_data_file: Path


### Modify configuration manager so that it handles ingestion configurations

In [3]:
from src.datascience.constants import * 
from src.datascience.utils.common import read_yaml, create_directories
from src.datascience import logger

class ConfigurationManager:
    """
    Configuration manager for handling YAML configuration files.

    This class loads the configuration, parameters, and schema files and
    provides methods that build specific configuration objects from them.
    """

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH,
    ):
        """
        Initialize the ConfigurationManager.

        Reads the three YAML files and creates the artifacts root directory
        named by the main configuration.

        Args:
            config_filepath (Path): Path to the main configuration file
            params_filepath (Path): Path to the parameters file
            schema_filepath (Path): Path to the schema file

        Raises:
            Exception: Any error raised while reading the files or creating
                the directory is logged and re-raised unchanged.
        """
        try:
            self.config = read_yaml(config_filepath)
            self.params = read_yaml(params_filepath)
            self.schema = read_yaml(schema_filepath)

            # Create artifacts root directory
            create_directories([self.config.artifacts_root])
            logger.info("ConfigurationManager initialized successfully")

        except Exception as e:
            logger.error(f"Error initializing ConfigurationManager: {e}")
            raise

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """
        Build the data ingestion configuration.

        Reads the ``data_ingestion`` section of the main configuration,
        creates its ``root_dir`` directory, and returns the settings as a
        DataIngestionConfig.

        Returns:
            DataIngestionConfig: Ingestion settings with both fields
                converted to ``Path`` objects.
        """
        config = self.config.data_ingestion
        create_directories([config.root_dir])

        # DataIngestionConfig declares both fields as Path; convert here so
        # the declared types hold regardless of how the YAML values were
        # loaded.
        return DataIngestionConfig(
            root_dir=Path(config.root_dir),
            local_data_file=Path(config.local_data_file),
        )

[2025-08-05 16:06:08,039: INFO: __init__: Logger initialized for the datascience package.]


### Create the data ingestion Component

In [4]:
## Component-Data Ingestion
import urllib.request as request
import pandas as pd
import psycopg2
import os
from dotenv import load_dotenv
from src.datascience import logger

class DataIngestion:
    """
    Component for data ingestion operations including querying data from the database and
    saving it as a csv file.
    """

    def __init__(self, config: DataIngestionConfig):
        """
        Initialize DataIngestion with configuration.

        Args:
            config (DataIngestionConfig): Configuration object containing data ingestion parameters
        """
        self.config = config

    def ingest_data(self):
        """
        Read the whole ``weather_data`` table and save it to a csv file.

        Connection settings are taken from the ``POSTGRES_HOST``,
        ``POSTGRES_DB``, ``POSTGRES_USER`` and ``POSTGRES_PASSWORD``
        environment variables, after loading a ``.env`` file via
        ``load_dotenv``. The result is written, without the index column, to
        ``self.config.local_data_file``.

        Raises:
            Exception: Any error raised while connecting, querying, or
                writing is logged and re-raised unchanged.
        """
        load_dotenv()

        conn = None
        try:
            conn = psycopg2.connect(
                host=os.getenv("POSTGRES_HOST"),
                database=os.getenv("POSTGRES_DB"),
                user=os.getenv("POSTGRES_USER"),
                password=os.getenv("POSTGRES_PASSWORD"),
            )

            # NOTE(review): pandas may warn when handed a raw DBAPI connection
            # instead of a SQLAlchemy connectable -- confirm whether that
            # warning matters for this project.
            df = pd.read_sql("SELECT * FROM weather_data", conn)  # Get data from database
            df.to_csv(self.config.local_data_file, index=False)  # Save data to csv

        except Exception as e:
            logger.error(f"Error  ingesting data: {e}")
            raise

        finally:
            # Always release the database connection, on success or failure;
            # conn stays None when connect() itself failed.
            if conn is not None:
                conn.close()

In [5]:
# Run the ingestion step end to end: load the configuration, build the
# component, and write the csv file. Errors are already logged where they
# occur and propagate on their own, so no try/except wrapper is needed here.
config = ConfigurationManager()
data_ingestion_config = config.get_data_ingestion_config()
data_ingestion = DataIngestion(config=data_ingestion_config)
data_ingestion.ingest_data()

[2025-08-05 16:06:09,425: INFO: common: YAML file: config/config.yaml loaded successfully]
[2025-08-05 16:06:09,426: INFO: common: YAML file: params.yaml loaded successfully]
[2025-08-05 16:06:09,427: INFO: common: YAML file: schema.yaml loaded successfully]
[2025-08-05 16:06:09,428: INFO: common: Created directory at artifacts]
[2025-08-05 16:06:09,429: INFO: 3601399614: ConfigurationManager initialized successfully]
[2025-08-05 16:06:09,429: INFO: common: Created directory at artifacts/data_ingestion]


  df = pd.read_sql("SELECT * FROM weather_data", conn) # Get data from databases
