In [2]:
import os

In [3]:
%pwd

'd:\\Sem_6\\6_Intel\\00\\Intel-MLflow-Project\\research'

In [4]:
# Step from the notebook's own folder ("research") up to the project root so the
# relative paths used below resolve. The guard makes the cell safe to re-run:
# an unconditional chdir("../") would climb one more directory on every execution.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [5]:
%pwd

'd:\\Sem_6\\6_Intel\\00\\Intel-MLflow-Project'

In [6]:
import pandas as pd

In [21]:
# EDA: read the extracted CSV, parsing the Date column into datetimes
df = pd.read_csv("Dairy_Supply_Demand_20000.csv", parse_dates=['Date'])

# Expected dtype for each column (hard-coded here; replace with real schema loading if needed)
schema = dict(
    Date='datetime64[ns]',
    Milk_Supply_Liters='int64',
    Downtime_Hours='float64',
    Milk_500ml_Demand='int64',
    Milk_500ml_Inventory='int64',
    Milk_1L_Demand='int64',
    Milk_1L_Inventory='int64',
    Butter_Demand='int64',
    Butter_Inventory='int64',
    Cheese_Demand='int64',
    Cheese_Inventory='int64',
    Yogurt_Demand='int64',
    Yogurt_Inventory='int64',
)

# Data type validation: report each column that is present but has an unexpected dtype.
# Columns listed in the schema but absent from the frame are skipped silently.
for col, dtype in schema.items():
    if col not in df.columns:
        continue
    if str(df[col].dtype) == dtype:
        continue
    print(f"Type mismatch for {col}: expected {dtype}, got {df[col].dtype}")
df.head()

Unnamed: 0,Date,Milk_Supply_Liters,Downtime_Hours,Milk_500ml_Demand,Milk_500ml_Inventory,Milk_1L_Demand,Milk_1L_Inventory,Butter_Demand,Butter_Inventory,Cheese_Demand,Cheese_Inventory,Yogurt_Demand,Yogurt_Inventory
0,2020-01-01,10025,0.4,2137,2677,1891,2256,2816,3386,3523,4055,3989,4738
1,2020-01-02,10276,0.05,2115,2641,2633,2962,2620,3242,3081,3253,3910,4205
2,2020-01-03,8487,2.82,1458,2052,2128,2266,3193,3449,2992,3441,4276,4602
3,2020-01-04,9260,0.59,1684,1827,2689,3319,2871,3480,3419,4252,4262,4811
4,2020-01-05,9549,1.71,1933,2479,2535,2893,3159,3667,3764,3961,4314,4959


In [22]:
# Date sanity checks: chronological order first, then repeated dates
date_series = df['Date']

dates_sorted = date_series.is_monotonic_increasing
if not dates_sorted:
    print("Warning: Dates are not sorted!")

has_duplicate_dates = date_series.duplicated().any()
if has_duplicate_dates:
    print("Warning: Duplicate dates found!")

In [23]:
# Missing value handling: warn when any cell in the frame is null
has_missing_values = df.isna().any().any()
if has_missing_values:
    print("Missing values detected! Consider handling them before modeling.")

In [10]:
# Summarise the frame: index range, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   Date                  20000 non-null  datetime64[ns]
 1   Milk_Supply_Liters    20000 non-null  int64         
 2   Downtime_Hours        20000 non-null  float64       
 3   Milk_500ml_Demand     20000 non-null  int64         
 4   Milk_500ml_Inventory  20000 non-null  int64         
 5   Milk_1L_Demand        20000 non-null  int64         
 6   Milk_1L_Inventory     20000 non-null  int64         
 7   Butter_Demand         20000 non-null  int64         
 8   Butter_Inventory      20000 non-null  int64         
 9   Cheese_Demand         20000 non-null  int64         
 10  Cheese_Inventory      20000 non-null  int64         
 11  Yogurt_Demand         20000 non-null  int64         
 12  Yogurt_Inventory      20000 non-null  int64         
dtypes: datetime64[ns](1), float64(1), int64(11)

In [11]:
# Count the null entries in each column.
df.isnull().sum()

Date                    0
Milk_Supply_Liters      0
Downtime_Hours          0
Milk_500ml_Demand       0
Milk_500ml_Inventory    0
Milk_1L_Demand          0
Milk_1L_Inventory       0
Butter_Demand           0
Butter_Inventory        0
Cheese_Demand           0
Cheese_Inventory        0
Yogurt_Demand           0
Yogurt_Inventory        0
dtype: int64

In [12]:
# (number of rows, number of columns)
df.shape

(20000, 13)

In [13]:
# Column labels of the loaded frame.
df.columns

Index(['Date', 'Milk_Supply_Liters', 'Downtime_Hours', 'Milk_500ml_Demand',
       'Milk_500ml_Inventory', 'Milk_1L_Demand', 'Milk_1L_Inventory',
       'Butter_Demand', 'Butter_Inventory', 'Cheese_Demand',
       'Cheese_Inventory', 'Yogurt_Demand', 'Yogurt_Inventory'],
      dtype='object')

In [14]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataValidationConfig:
    """Immutable bundle of settings consumed by the data-validation step.

    Instances are built by ``ConfigurationManager.get_data_validation_config``
    from the ``data_validation`` section of the config file and the ``COLUMNS``
    section of the schema file. The values are passed through as loaded; the
    annotations below are not enforced at runtime.
    """
    # Directory created for this stage before validation runs.
    root_dir: Path
    # Path of the text file that receives the "Validation status: ..." line.
    STATUS_FILE: str
    # Path handed to pd.read_csv, i.e. the CSV file to validate
    # (despite the "_dir" suffix in the name).
    unzip_data_dir: Path
    # Mapping of expected column name -> expected dtype string.
    all_schema: dict

In [15]:
from MLproject.constants import *
from MLproject.utils.common import read_yaml, create_directories

In [16]:
class ConfigurationManager:
    """Load the project's YAML files and hand out per-stage config objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):
        """Read the config, params and schema YAML files and create the artifacts root.

        Args:
            config_filepath: Location of the main config YAML.
            params_filepath: Location of the params YAML.
            schema_filepath: Location of the schema YAML.
        """
        # Files are read in this fixed order: config, params, schema.
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_validation_config(self) -> DataValidationConfig:
        """Build the data-validation settings and create that stage's root directory.

        Returns:
            DataValidationConfig populated from the ``data_validation`` section of
            the config file and the ``COLUMNS`` section of the schema file.
        """
        validation_section = self.config.data_validation
        # Resolve the schema before touching the filesystem, so a schema file
        # without COLUMNS fails before any directory is created.
        column_schema = self.schema.COLUMNS

        create_directories([validation_section.root_dir])

        return DataValidationConfig(
            root_dir=validation_section.root_dir,
            STATUS_FILE=validation_section.STATUS_FILE,
            unzip_data_dir=validation_section.unzip_data_dir,
            all_schema=column_schema,
        )

In [17]:
import os
from MLproject import logger

In [24]:
class DataValiadtion:
    """Validate the ingested CSV against the configured column schema.

    The checks cover column names, dtypes, negative values, unsorted or duplicate
    dates and missing values. Outliers are reported but never fail validation.
    The overall verdict is written to ``config.STATUS_FILE`` and returned.

    Note: the class name carries a historical typo and is kept for existing
    callers; ``DataValidation`` (defined right after the class) is a correctly
    spelled alias.
    """

    DATE_COLUMN = 'Date'

    # Quantities for which a negative value is always a data error.
    NON_NEGATIVE_COLUMNS = (
        'Milk_Supply_Liters', 'Milk_500ml_Demand', 'Milk_500ml_Inventory',
        'Milk_1L_Demand', 'Milk_1L_Inventory', 'Butter_Demand', 'Butter_Inventory',
        'Cheese_Demand', 'Cheese_Inventory', 'Yogurt_Demand', 'Yogurt_Inventory',
    )

    # Columns scanned for outliers (informational only).
    OUTLIER_COLUMNS = (
        'Milk_Supply_Liters', 'Downtime_Hours', 'Milk_500ml_Demand', 'Milk_1L_Demand',
        'Butter_Demand', 'Cheese_Demand', 'Yogurt_Demand',
    )

    # A value further than this many standard deviations from the column mean
    # is reported as an outlier.
    OUTLIER_STD_THRESHOLD = 3

    def __init__(self, config: "DataValidationConfig"):
        """Store the validation settings (CSV path, status file path, schema)."""
        self.config = config

    @staticmethod
    def _is_numeric_column(data, col) -> bool:
        """Return True when ``col`` exists in ``data`` and has a numeric dtype."""
        return col in data.columns and pd.api.types.is_numeric_dtype(data[col])

    def validate_all_columns(self) -> bool:
        """Run every validation check and record the verdict.

        Returns:
            True when all checks pass, False otherwise. The same verdict is
            written to ``config.STATUS_FILE`` as ``Validation status: <bool>``.

        Raises:
            Any exception raised while reading the CSV or writing the status
            file propagates unchanged.
        """
        csv_path = self.config.unzip_data_dir
        all_schema = self.config.all_schema
        validation_status = True

        # Peek at the header first: asking read_csv to parse a date column that
        # does not exist raises, which would prevent the missing column from
        # being reported and the status file from being written.
        header = pd.read_csv(csv_path, nrows=0)
        date_columns = [self.DATE_COLUMN] if self.DATE_COLUMN in header.columns else False
        data = pd.read_csv(csv_path, parse_dates=date_columns)

        # 1. Column name check (in both directions)
        for col in data.columns:
            if col not in all_schema:
                print(f"Unexpected column: {col}")
                validation_status = False

        for col in all_schema:
            if col not in data.columns:
                print(f"Missing column: {col}")
                validation_status = False

        # 2. Data type check
        for col, dtype in all_schema.items():
            if col in data.columns and str(data[col].dtype) != dtype:
                print(f"Type mismatch for {col}: expected {dtype}, got {data[col].dtype}")
                validation_status = False

        # 3. Non-negative value check. Non-numeric columns are skipped here:
        # comparing them with 0 would raise, and they are already handled by
        # the dtype check above.
        for col in self.NON_NEGATIVE_COLUMNS:
            if self._is_numeric_column(data, col) and (data[col] < 0).any():
                print(f"Negative values found in {col}")
                validation_status = False

        # 4. Outlier detection (simple: values beyond N std devs). Reported
        # only; outliers do not change the validation verdict.
        for col in self.OUTLIER_COLUMNS:
            if not self._is_numeric_column(data, col):
                continue
            values = data[col]
            deviation = (values - values.mean()).abs()
            outlier_count = int((deviation > self.OUTLIER_STD_THRESHOLD * values.std()).sum())
            if outlier_count:
                print(f"Outliers detected in {col}: {outlier_count} rows")

        # 5. Date uniqueness and sorting
        if self.DATE_COLUMN in data.columns:
            dates = data[self.DATE_COLUMN]
            if not dates.is_monotonic_increasing:
                print("Warning: Dates are not sorted!")
                validation_status = False
            if dates.duplicated().any():
                print("Warning: Duplicate dates found!")
                validation_status = False

        # 6. Missing value check
        if data.isnull().any().any():
            print("Missing values detected! Consider handling them before modeling.")
            validation_status = False

        # Write status to file
        with open(self.config.STATUS_FILE, 'w') as f:
            f.write(f"Validation status: {validation_status}")

        return validation_status


# Correctly spelled alias; the original (misspelled) name stays available.
DataValidation = DataValiadtion

In [20]:
# Run the data-validation stage end to end. Exceptions propagate unchanged, so the
# notebook shows the real traceback; a try/except that only re-raises adds nothing.
config = ConfigurationManager()
data_validation_config = config.get_data_validation_config()
data_validation = DataValiadtion(config=data_validation_config)
validation_passed = data_validation.validate_all_columns()
validation_passed  # last expression: show the verdict as the cell output

[2025-07-03 00:52:59,667: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-07-03 00:52:59,672: INFO: common: yaml file: params.yaml loaded successfully]
[2025-07-03 00:52:59,673: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-07-03 00:52:59,675: INFO: common: created directory at: artifacts]
[2025-07-03 00:52:59,676: INFO: common: created directory at: artifacts/data_validation]
