In [1]:
import os

%pwd

'/Users/a/Documents/DataScience_World/ML10_end_to_end/dsproject/CompleteDSproject/research'

In [2]:
os.chdir("../")
%pwd

'/Users/a/Documents/DataScience_World/ML10_end_to_end/dsproject/CompleteDSproject'

In [3]:
os.chdir("/Users/a/Documents/DataScience_World/ML10_end_to_end/dsproject/CompleteDSproject/")
%pwd

'/Users/a/Documents/DataScience_World/ML10_end_to_end/dsproject/CompleteDSproject'

In [4]:
import pandas as pd

# Load the CSV written by the data-ingestion stage (path is relative to the
# project root) and preview the first rows.
data = pd.read_csv("artifacts/data_ingestion/data.csv")
data.head()

Unnamed: 0,adulteration_id,product_name,brand,category,adulterant,detection_date,detection_method,severity,health_risk,action_taken
0,1,Butter,BrandB,Meat,Artificial sweeteners,5/11/2024,Microbiological Analysis,Moderate,Low,Product Recall
1,2,Chicken,BrandC,Dairy,Coloring agents,5/23/2024,Sensory Evaluation,Severe,Medium,Warning Issued
2,3,Yogurt,BrandC,Meat,Artificial sweeteners,2/17/2024,Sensory Evaluation,Severe,High,Investigation Launched
3,4,Wine,BrandB,Beverages,Coloring agents,5/16/2024,Spectroscopy,Minor,Medium,Product Recall
4,5,Bread,BrandD,Dairy,Water,6/6/2024,Chemical Analysis,Severe,Medium,Warning Issued


In [5]:
# Summarize each column's dtype and non-null count
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   adulteration_id   1000 non-null   int64 
 1   product_name      1000 non-null   object
 2   brand             1000 non-null   object
 3   category          1000 non-null   object
 4   adulterant        1000 non-null   object
 5   detection_date    1000 non-null   object
 6   detection_method  1000 non-null   object
 7   severity          1000 non-null   object
 8   health_risk       1000 non-null   object
 9   action_taken      1000 non-null   object
dtypes: int64(1), object(9)
memory usage: 78.2+ KB


In [6]:
# Count missing values per column
data.isnull().sum()

adulteration_id     0
product_name        0
brand               0
category            0
adulterant          0
detection_date      0
detection_method    0
severity            0
health_risk         0
action_taken        0
dtype: int64

In [7]:
# (rows, columns) of the loaded frame
data.shape

(1000, 10)

In [8]:
from dataclasses import dataclass
from pathlib import Path
import pandas as pd
from src.datascience import logger
from src.datascience.constants import *
from src.datascience.utils.common import read_yaml, create_directories

@dataclass
class DataValidationConfig:
    """Settings consumed by the data-validation step.

    Instances are built by ``ConfigurationManager.get_data_validation_config``
    and passed to ``DataValidation``.
    """
    # Directory for validation artifacts; DataValidation creates it on init
    root_dir: Path
    # File the validation results are written to
    STATUS_FILE: Path
    # Schema 'COLUMNS' section: maps column name -> expected type name
    # (DataValidation.validate_schema recognises "int" and "str")
    all_schema: dict
    # CSV file to validate (taken from data_ingestion.local_data_file)
    data_path: Path

In [9]:
from src.datascience.constants import *
from src.datascience.utils.common import read_yaml, create_directories

In [10]:

class ConfigurationManager:
    """Loads the project's YAML settings and builds per-stage config objects.

    On construction the config, params and schema files are read, a schema
    that is empty or lacks a ``COLUMNS`` key is rejected with ``ValueError``,
    and the artifacts root directory is created.
    """

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH,
    ):
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        # Fail fast: every later step relies on the schema's COLUMNS section.
        schema_usable = bool(self.schema) and "COLUMNS" in self.schema
        if not schema_usable:
            raise ValueError(f"Schema file at {schema_filepath} is missing or invalid. Ensure it contains a 'COLUMNS' key.")

        create_directories([self.config.artifacts_root])

    def get_data_validation_config(self) -> DataValidationConfig:
        """Assemble the ``DataValidationConfig`` from the loaded YAML files.

        Returns:
            DataValidationConfig whose paths come from the ``data_validation``
            and ``data_ingestion`` config sections and whose schema is the
            ``COLUMNS`` section of the schema file.

        Raises:
            KeyError: if the schema has no ``COLUMNS`` key.
        """
        if "COLUMNS" not in self.schema:
            raise KeyError("The schema file does not contain a 'COLUMNS' key.")

        validation_section = self.config.data_validation
        return DataValidationConfig(
            root_dir=Path(validation_section.root_dir),
            STATUS_FILE=Path(validation_section.STATUS_FILE),
            all_schema=self.schema.COLUMNS,
            # The CSV to validate is the file produced by the ingestion stage.
            data_path=Path(self.config.data_ingestion.local_data_file),
        )



In [11]:
import os
from src.datascience import logger

In [12]:
class DataValidation:
    """Validates an ingested CSV against the column schema and records the outcome.

    Two checks are offered: ``validate_all_columns`` (every schema column is
    present in the CSV) and ``validate_schema`` (every schema column has the
    expected dtype). Both store their result in ``self.validation_status`` and
    write a human-readable report to ``config.STATUS_FILE``.
    """

    # The annotation is quoted so the class can be defined without the config
    # class already being present in the namespace.
    def __init__(self, config: "DataValidationConfig"):
        """Store the config and make sure the validation directory exists.

        Args:
            config: object exposing ``root_dir``, ``STATUS_FILE``,
                ``all_schema`` and ``data_path``.
        """
        self.config = config
        self.validation_status = False

        # Ensure the validation directory exists
        create_directories([self.config.root_dir])

    def _write_status(self, text: str, mode: str = "w") -> None:
        """Write ``text`` to the status file; ``mode`` 'w' overwrites, 'a' appends."""
        # Explicit encoding so the report does not depend on the platform default.
        with open(self.config.STATUS_FILE, mode, encoding="utf-8") as status_file:
            status_file.write(text)

    def validate_all_columns(self) -> bool:
        """Check that every column named in the schema exists in the CSV.

        Overwrites the status file with the result (and the missing column
        names, if any).

        Returns:
            True when no schema column is missing, otherwise False.

        Raises:
            Exception: any error from reading the CSV or writing the status
                file is logged and re-raised unchanged.
        """
        try:
            # Only the header row is needed to compare column names.
            data_cols = set(pd.read_csv(self.config.data_path, nrows=0).columns)
            missing_cols = [
                col for col in self.config.all_schema.keys() if col not in data_cols
            ]

            self.validation_status = not missing_cols
            if missing_cols:
                # str() guards against non-string schema keys breaking the join.
                detail = f"Missing columns: {', '.join(map(str, missing_cols))}"
            else:
                detail = "All required columns present"

            self._write_status(f"Validation status: {self.validation_status}\n{detail}")
            return self.validation_status

        except Exception as e:
            logger.error(f"Error in validate_all_columns: {str(e)}")
            raise

    def validate_schema(self, data: pd.DataFrame) -> bool:
        """Check the dtype of every schema column in ``data``.

        Supported schema type names are ``"int"`` and ``"str"``. A column that
        is absent from ``data``, has an unsupported type name, or has a
        mismatching dtype produces exactly one error line. The report is
        appended to the status file.

        Args:
            data: DataFrame to validate.

        Returns:
            True when no validation error was recorded, otherwise False.

        Raises:
            Exception: unexpected errors are logged and re-raised unchanged.
        """
        try:
            validation_errors = []

            for column, expected_type in self.config.all_schema.items():
                if column not in data.columns:
                    # Report the gap instead of failing with a bare KeyError.
                    validation_errors.append(
                        f"Missing column {column}: expected {expected_type}"
                    )
                    continue

                if expected_type == "int":
                    valid = pd.api.types.is_integer_dtype(data[column])
                elif expected_type == "str":
                    valid = pd.api.types.is_string_dtype(data[column])
                else:
                    # One message only: the dtype was never checked, so an
                    # additional "Invalid datatype" line would be misleading.
                    validation_errors.append(
                        f"Unsupported type {expected_type} for column {column}"
                    )
                    continue

                if not valid:
                    validation_errors.append(
                        f"Invalid datatype for {column}: expected {expected_type}"
                    )

            # Every failure path above records an error, so the list is the status.
            self.validation_status = not validation_errors

            report = "\n=== Schema Validation Results ===\n"
            if validation_errors:
                report += "\n".join(validation_errors)
            report += f"\nSchema validation status: {self.validation_status}"
            self._write_status(report, mode="a")

            return self.validation_status

        except Exception as e:
            logger.error(f"Error in validate_schema: {str(e)}")
            raise


In [15]:
try:
    logger.info("Starting data validation")
    config_manager = ConfigurationManager()
    data_validation_config = config_manager.get_data_validation_config()
    data_validation = DataValidation(data_validation_config)

    # Stage 1: every schema column must be present in the CSV.
    logger.info("Validating columns...")
    columns_valid = data_validation.validate_all_columns()
    if not columns_valid:
        raise ValueError("Column validation failed")

    # Stage 2: the dtypes of those columns must match the schema.
    logger.info("Validating schema...")
    data = pd.read_csv(data_validation_config.data_path)
    schema_valid = data_validation.validate_schema(data)
    if not schema_valid:
        raise ValueError("Schema validation failed")

    logger.info("Data validation completed successfully")

except Exception as err:
    # Log the failure, then let it propagate so the cell visibly fails.
    logger.error(f"Data validation failed: {str(err)}")
    raise err

[2025-01-08 09:28:38,684: INFO: 1615405803: Starting data validation]
[2025-01-08 09:28:38,688: INFO: common: yaml file: config/config.yaml loaded successfully]
[2025-01-08 09:28:38,693: INFO: common: yaml file: params.yaml loaded successfully]
[2025-01-08 09:28:38,696: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-01-08 09:28:38,698: INFO: common: created directory at: artifacts]
[2025-01-08 09:28:38,699: INFO: common: created directory at: artifacts/data_validation]
[2025-01-08 09:28:38,700: INFO: 1615405803: Validating columns...]
[2025-01-08 09:28:38,707: INFO: 1615405803: Validating schema...]
[2025-01-08 09:28:38,713: INFO: 1615405803: Data validation completed successfully]
