In [1]:
import os

In [2]:
# Show the kernel's current working directory before it is changed below.
%pwd

'/home/armando-albornoz/Desktop/ml/MLOPS_course/project1/datascienceendtoend1/research'

In [3]:
os.chdir("../")
%pwd

'/home/armando-albornoz/Desktop/ml/MLOPS_course/project1/datascienceendtoend1'

In [4]:
import pandas as pd

# Load the ingested CSV (path is relative to the project root) and preview the
# first rows to see which columns the validation step has to deal with.
data = pd.read_csv("artifacts/data_ingestion/Employers_data.csv")
data.head()

Unnamed: 0,Employee_ID,Name,Age,Gender,Department,Job_Title,Experience_Years,Education_Level,Location,Salary
0,1,Merle Ingram,24,Female,Engineering,Engineer,1,Master,Austin,90000
1,2,John Mayes,56,Male,Sales,Executive,33,Master,Seattle,195000
2,3,Carlos Wille,21,Male,Engineering,Intern,1,Bachelor,New York,35000
3,4,Michael Bryant,30,Male,Finance,Analyst,9,Bachelor,New York,75000
4,5,Paula Douglas,25,Female,HR,Analyst,2,Master,Seattle,70000


In [5]:
# Summarise column names, non-null counts and dtypes of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Employee_ID       10000 non-null  int64 
 1   Name              10000 non-null  object
 2   Age               10000 non-null  int64 
 3   Gender            10000 non-null  object
 4   Department        10000 non-null  object
 5   Job_Title         10000 non-null  object
 6   Experience_Years  10000 non-null  int64 
 7   Education_Level   10000 non-null  object
 8   Location          10000 non-null  object
 9   Salary            10000 non-null  int64 
dtypes: int64(4), object(6)
memory usage: 781.4+ KB


In [6]:
# Count missing values per column.
data.isnull().sum()

Employee_ID         0
Name                0
Age                 0
Gender              0
Department          0
Job_Title           0
Experience_Years    0
Education_Level     0
Location            0
Salary              0
dtype: int64

In [7]:
# (rows, columns) of the loaded frame.
data.shape

(10000, 10)

In [8]:
from dataclasses import dataclass
from pathlib import Path

@dataclass
class DataValidationConfig:

    """
    Configuration class for data validation operations.

    Attributes:
        root_dir: Directory where validation artifacts will be stored
        STATUS_FILE: Path of the text file that receives the validation result
            (opened for writing by DataValidation.validate)
        unzip_data_dir: Location of the ingested data to validate. Despite the
            name, DataValidation passes this value directly to pd.read_csv, so
            it has to point at a CSV file rather than a directory
        all_schema: Mapping whose keys are the expected column names
    """
    root_dir: Path
    STATUS_FILE: str
    unzip_data_dir: Path
    all_schema: dict

In [9]:
from src.datascience.constants import * 
from src.datascience.utils.common import read_yaml, create_directories
from src.datascience import logger
from src.datascience.config.configuration import DataIngestionConfig

class ConfigurationManager:
    """
    Central access point for the project's YAML-driven settings.

    The config, params and schema files are read once at construction time;
    the getter methods then turn individual sections of the loaded config
    into dedicated config objects.
    """

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH,
    ):
        """
        Read the three YAML files and create the artifacts root directory.

        Args:
            config_filepath (Path): Path to the main configuration file
            params_filepath (Path): Path to the parameters file
            schema_filepath (Path): Path to the schema file

        Raises:
            Exception: Any error raised while reading the files or creating
                the directory is logged and then re-raised unchanged.
        """
        try:
            self.config = read_yaml(config_filepath)
            self.params = read_yaml(params_filepath)
            self.schema = read_yaml(schema_filepath)

            # Every stage writes below this directory, so create it up front.
            artifacts_root = self.config.artifacts_root
            create_directories([artifacts_root])
            logger.info("ConfigurationManager initialized successfully")
        except Exception as err:
            logger.error(f"Error initializing ConfigurationManager: {err}")
            raise

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """
        Build the data-ingestion settings from the `data_ingestion` section.

        The section's root directory is created as a side effect.

        Returns:
            DataIngestionConfig: Settings populated from the loaded config
        """
        section = self.config.data_ingestion
        create_directories([section.root_dir])

        return DataIngestionConfig(
            root_dir=section.root_dir,
            source_URL=section.source_URL,
            local_data_file=section.local_data_file,
            unzip_dir=section.unzip_dir,
        )

    def get_data_validation_config(self) -> DataValidationConfig:
        """
        Build the data-validation settings from the `data_validation` section.

        The expected columns come from the `COLUMNS` entry of the schema file,
        and the section's root directory is created as a side effect.

        Returns:
            DataValidationConfig: Settings populated from config and schema
        """
        section = self.config.data_validation
        expected_columns = self.schema.COLUMNS

        create_directories([section.root_dir])

        return DataValidationConfig(
            root_dir=section.root_dir,
            STATUS_FILE=section.STATUS_FILE,
            unzip_data_dir=section.unzip_data_dir,
            all_schema=expected_columns,
        )

[2025-08-02 15:13:23,905: INFO: __init__: Logger initialized for the datascience package.]


In [10]:
import os
from src.datascience import logger

class DataValidation:
    """
    Data validation component for validating dataset schema and structure.

    The check is limited to column names: the columns of the CSV are compared
    with the keys of the configured schema. Column dtypes are not inspected.
    """

    def __init__(self, config: DataValidationConfig):
        """
        Initialize DataValidation with configuration.

        Args:
            config (DataValidationConfig): Configuration object for data validation
        """
        self.config = config
        logger.info("DataValidation component initialized")

    def _write_error_status(self, error_message: str) -> None:
        """
        Record a failed run in the status file.

        Called from the exception handlers of validate(). Writing is
        best-effort: if the status file itself cannot be written, the problem
        is logged and swallowed so that the caller's `raise` still propagates
        the original validation error instead of a secondary one.

        Args:
            error_message (str): Description of the failure to store
        """
        try:
            with open(self.config.STATUS_FILE, 'w') as f:
                # The first line keeps the same format as the success path.
                f.write(f"Validation status: False\n{error_message}")
        except OSError as write_error:
            logger.error(
                f"Could not write status file {self.config.STATUS_FILE}: {write_error}"
            )

    def validate(self) -> bool:
        """
        Validate that all columns in the dataset match the expected schema.

        The result is also written to the configured status file. Both
        missing and unexpected columns make the validation fail.

        Returns:
            bool: True if validation passes, False otherwise

        Raises:
            FileNotFoundError: If the data file doesn't exist
            Exception: If there's an error during validation
        """
        try:
            # Read the CSV file (relies on the notebook-level `import pandas as pd`)
            logger.info(f"Reading CSV file from: {self.config.unzip_data_dir}")
            data = pd.read_csv(self.config.unzip_data_dir)

            actual_columns = list(data.columns)
            expected_columns = self.config.all_schema.keys()

            # Expected by the schema but absent from the file
            missing_columns = [col for col in expected_columns if col not in actual_columns]

            # Present in the file but unknown to the schema
            extra_columns = [col for col in actual_columns if col not in expected_columns]

            if missing_columns:
                logger.error(f"Missing columns: {missing_columns}")

            if extra_columns:
                logger.warning(f"Extra columns found: {extra_columns}")

            validation_status = not (missing_columns or extra_columns)

            with open(self.config.STATUS_FILE, 'w') as f:
                f.write(f"Validation status: {validation_status}")

            logger.info(f"Validation completed. Status: {validation_status}")

            return validation_status
        except FileNotFoundError as e:
            logger.error(f"CSV file not found: {e}")
            self._write_error_status(f"File not found: {e}")
            raise
        except Exception as e:
            logger.error(f"Error during column validation: {e}")
            self._write_error_status(f"Validation error: {e}")
            raise


In [11]:
# Run the data-validation stage end to end.
# No try/except here: a handler that only re-raises adds nothing, so errors
# propagate with their original traceback. Leaving validate() as the last
# expression also lets the notebook display the boolean result.
config = ConfigurationManager()
data_validation_config = config.get_data_validation_config()
data_validation = DataValidation(config=data_validation_config)
data_validation.validate()

[2025-08-02 15:13:24,869: INFO: common: YAML file: config/config.yaml loaded successfully]
[2025-08-02 15:13:24,949: INFO: common: YAML file: params.yaml loaded successfully]
[2025-08-02 15:13:25,027: INFO: common: YAML file: schema.yaml loaded successfully]
[2025-08-02 15:13:25,028: INFO: common: Created directory at artifacts]
[2025-08-02 15:13:25,029: INFO: 4097994236: ConfigurationManager initialized successfully]
[2025-08-02 15:13:25,030: INFO: common: Created directory at artifacts/data_validation]
[2025-08-02 15:13:25,030: INFO: 1695794247: DataValidation component initialized]
[2025-08-02 15:13:25,031: INFO: 1695794247: Reading CSV file from: artifacts/data_ingestion/Employers_data.csv]
[2025-08-02 15:13:25,053: INFO: 1695794247: Validation completed. Status: True]
