In [13]:
import os

In [14]:
%pwd

'c:\\Users\\Yashar\\End-to-End-Employee-Classification-with-MLOPs'

In [3]:
os.chdir('../')

In [15]:
%pwd

'c:\\Users\\Yashar\\End-to-End-Employee-Classification-with-MLOPs'

## config_entity.py

In [16]:
# Updating the Entity 
# The same variables from the config.yaml is here like root_dir, unzip_dir, STATUS_FILE

from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataValidationConfig:
  
    root_dir: Path
    unzip_data_dir: Path
    STATUS_FILE: str
    all_schema: dict

## configuration.py

In [17]:
# Updating configuration manager in src config
# Next we need to read the Yaml file.then we will write some functions in the constants folder. we define that 3 functionto read the yaml files.
# after that we will import functions like read_yaml, create_directories from utils.common

from mlProject.constants import *
from mlProject.utils.common import read_yaml,create_directories

class ConfigurationManager:
    def __init__(self,
                 config_filepath= CONFIG_FILE_PATH,
                 schema_filepath= SCHEMA_FILE_PATH,
                 params_filepath= PARAMS_FILE_PATH):
        
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])
    

    def get_data_validation_config(self) -> DataValidationConfig:
        config = self.config.data_validation
        schema = self.schema.COLUMNS

        create_directories([config.root_dir])

        data_validation_config = DataValidationConfig(
            unzip_data_dir = config.unzip_data_dir,
            root_dir = config.root_dir,
            STATUS_FILE= config.STATUS_FILE,
            all_schema= schema

        )

        return data_validation_config

    

## components/data_validation.py

In [7]:
import pandas as pd
data = pd.read_csv('artifacts/data_ingestion/Employee.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4653 entries, 0 to 4652
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Education                  4653 non-null   object
 1   JoiningYear                4653 non-null   int64 
 2   City                       4653 non-null   object
 3   PaymentTier                4653 non-null   int64 
 4   Age                        4653 non-null   int64 
 5   Gender                     4653 non-null   object
 6   EverBenched                4653 non-null   object
 7   ExperienceInCurrentDomain  4653 non-null   int64 
 8   LeaveOrNot                 4653 non-null   int64 
dtypes: int64(5), object(4)
memory usage: 327.3+ KB


In [8]:
data.isnull().sum()

Education                    0
JoiningYear                  0
City                         0
PaymentTier                  0
Age                          0
Gender                       0
EverBenched                  0
ExperienceInCurrentDomain    0
LeaveOrNot                   0
dtype: int64

In [9]:
data.columns

Index(['Education', 'JoiningYear', 'City', 'PaymentTier', 'Age', 'Gender',
       'EverBenched', 'ExperienceInCurrentDomain', 'LeaveOrNot'],
      dtype='object')

In [18]:
# Updating the components
import os
from mlProject import logger

class DataValidation:
    def __init__(self, config: DataValidationConfig):
        self.config = config

    def validate_all_columns(self) -> bool:
        try:
            # Read the CSV file
            data = pd.read_csv(self.config.unzip_data_dir)
            
            # Get columns from CSV and schema
            all_cols = set(data.columns)
            schema_cols = set(self.config.all_schema.keys())
            
            # Check if columns match exactly
            validation_status = (all_cols == schema_cols)
            
            # If columns match, check data types
            if validation_status:
                for col in all_cols:
                    expected_dtype = self.config.all_schema[col]
                    actual_dtype = data.dtypes[col].name
                    if actual_dtype != expected_dtype:
                        validation_status = False
                        break
            
            # Write validation status to file
            with open(self.config.STATUS_FILE, 'w') as f:
                f.write(f"Validation status: {validation_status}")
            
            return validation_status
        
        except Exception as e:
            raise e
            


## pipeline/stage_02_data_validation.py

In [19]:
# Update the Pipeline

try:
    config = ConfigurationManager() # configuration manager
    data_validation_config = config.get_data_validation_config() # configuration manager
    data_validation = DataValidation(config=data_validation_config) # components
    data_validation.validate_all_columns() # components
   
except Exception as e:
    raise e

[2025-07-12 15:52:38,969: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-07-12 15:52:38,971: INFO: common: yaml file: params.yaml loaded successfully]
[2025-07-12 15:52:38,973: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-07-12 15:52:38,974: INFO: common: created directory at: artifacts]
[2025-07-12 15:52:38,975: INFO: common: created directory at: artifacts/data_validation]
