In [24]:
import os

In [25]:
os.chdir('../')

## Training Pipeline Config

In [26]:
import numpy as np
import pandas as pd

In [27]:
from networksecurity.constant import traning_pipline


## Config Entity

In [28]:
from datetime import datetime

class TraningPiplineConfig:
    """Pipeline-wide settings copied from the ``traning_pipline`` constants module."""

    def __init__(self) -> None:
        """Read the pipeline name and the artifact directory name from the constants."""
        constants = traning_pipline
        self.pipline_name = constants.PIPELINE_NAME
        self.artifact_name = constants.ARTIFACT_DIR
        # Single-argument join: the artifact directory is just the artifact name.
        artifact_root = os.path.join(self.artifact_name)
        self.artifact_dir = artifact_root
      
class DataValidationConfig:
    """Filesystem layout used by the data-validation stage.

    Every attribute is a path string assembled from the pipeline's artifact
    directory and constants of the ``traning_pipline`` module. Nothing is
    created on disk by this class.
    """

    def __init__(self, traning_pipline_config: TraningPiplineConfig) -> None:
        """Derive all data-validation paths.

        Args:
            traning_pipline_config: Pipeline-level configuration; only its
                ``artifact_dir`` attribute is read.
        """
        artifact_dir = traning_pipline_config.artifact_dir

        # <artifacts>/<data validation dir>
        self.data_validation_dir: str = os.path.join(
            artifact_dir, traning_pipline.DATA_VALIDATION_DIR_NAME
        )
        # Folder for data that passed validation.
        self.valid_dir_name: str = os.path.join(
            self.data_validation_dir, traning_pipline.DATA_VALIDATION_VALID_DIR
        )
        # Folder for data that failed validation.
        self.invalid_dir_name: str = os.path.join(
            self.data_validation_dir, traning_pipline.DATA_VALIDATION_INVALID_DIR
        )
        # Despite the "_dir" suffix this is the full path of the drift report FILE:
        # <data validation dir>/<drift report dir>/<report file name>
        self.drift_report_dir: str = os.path.join(
            self.data_validation_dir,
            traning_pipline.DATA_VALIDATION_DRIFT_REPORT_DIR,
            traning_pipline.DATA_VALIDATION_DRIFT_REPORT_FILE_NAME,
        )

        # The valid/invalid copies live in their own folders under the validation
        # directory. Previously all four paths pointed at the ingestion files, so
        # "valid" and "invalid" were the same file and writing validated data
        # would have overwritten the ingested input.
        self.valid_traning_data_store_path: str = os.path.join(
            self.valid_dir_name, traning_pipline.TRAIN_FILE_NAME
        )
        self.valid_test_data_store_path: str = os.path.join(
            self.valid_dir_name, traning_pipline.TEST_FILE_NAME
        )
        self.invalid_traning_data_store_path: str = os.path.join(
            self.invalid_dir_name, traning_pipline.TRAIN_FILE_NAME
        )
        self.invalid_test_data_store_path: str = os.path.join(
            self.invalid_dir_name, traning_pipline.TEST_FILE_NAME
        )

## Components Output Artifacts Entity

In [29]:
from dataclasses import dataclass
from typing import Optional


@dataclass
class DataValidationArtifact:
    """Record describing the outcome of the data-validation stage."""

    # Overall validation flag.
    validation_status: bool
    # Locations of the train/test files that passed validation.
    valid_train_path: str
    valid_test_path: str
    # Locations of rejected train/test files; None when nothing was rejected.
    invalid_train_path: Optional[str]
    invalid_test_path: Optional[str]
    # Location of the written drift report.
    drift_report_path: str

## Data Validation Component

In [31]:
from networksecurity.logging.logger import logging
from networksecurity.exception.exception import CustomException
import os
import sys
import numpy as np
import pandas as pd
from typing import List
from scipy.stats import ks_2samp
from networksecurity.utils.utills import read_yaml,write_yaml_file
from networksecurity.logging.logger import logging
from networksecurity.exception.exception import CustomException
from networksecurity.entity.artifact_entity import DataIngestionArtifact

ImportError: cannot import name 'write_yaml_file' from 'networksecurity.utils.utills' (d:\MLOPS\NetWork Security\networksecurity\utils\utills.py)

In [None]:
class DataValidation:
    """Column-count and drift checks for the train/test files produced by data ingestion."""

    def __init__(self, training_pipeline_config: "DataValidationConfig", data_ingestion_artifacts: "DataIngestionArtifact") -> None:
        """Store the stage inputs and load the schema via ``read_yaml``.

        Args:
            training_pipeline_config: Despite its name this is the data-validation
                configuration; it is kept as ``self.data_validation_config``.
            data_ingestion_artifacts: Output of the ingestion stage; its
                ``trained_file_path`` and ``test_file_path`` are read by
                ``initiate_data_validation``.

        Raises:
            AttributeError: If the schema location is found neither on the config
                object nor in the ``traning_pipline`` constants module.
        """
        self.data_ingestion_artifacts = data_ingestion_artifacts
        self.data_validation_config = training_pipeline_config
        schema_file_path = os.path.join(
            self._schema_setting(training_pipeline_config, "SCHEMA_FILE_DIR"),
            self._schema_setting(training_pipeline_config, "SCHEMA_FILE_NAME"),
        )
        self.schema_config = read_yaml(schema_file_path)

    @staticmethod
    def _schema_setting(config, name: str):
        """Return schema setting ``name`` from ``config``, else from the constants module."""
        value = getattr(config, name, None)
        if value is None:
            # DataValidationConfig as defined in this notebook has no schema
            # attributes, so reading them from the config alone always failed.
            # NOTE(review): presumably these constants live in the
            # `traning_pipline` module - confirm they are defined there.
            value = getattr(traning_pipline, name)
        return value

    @staticmethod
    def read_data(filepath: str) -> pd.DataFrame:
        """Load a CSV file into a DataFrame.

        Raises:
            CustomException: Wrapping any error raised by ``pd.read_csv``.
        """
        try:
            return pd.read_csv(filepath)
        except Exception as e:
            raise CustomException(e, sys)

    def validate_num_of_cols(self, df: pd.DataFrame) -> bool:
        """Check that ``df`` has as many columns as the schema's ``'columns'`` entry.

        Only the COUNT is compared; column names are not checked.

        Raises:
            CustomException: Wrapping any error (e.g. a schema without ``'columns'``).
        """
        try:
            expected_num_of_cols = len(self.schema_config['columns'])
            actual_num_of_cols = len(df.columns)
            logging.info(f'Expected number of columns: {expected_num_of_cols}')
            logging.info(f'Actual number of columns: {actual_num_of_cols}')

            return expected_num_of_cols == actual_num_of_cols
        except Exception as e:
            raise CustomException(e, sys)

    @staticmethod
    def _compute_drift_report(base_df: pd.DataFrame, current_df: pd.DataFrame, threshold: float):
        """Run a two-sample KS test per column of ``base_df``; return ``(status, report)``.

        ``status`` is False as soon as one column's p-value is below ``threshold``.
        ``report`` maps each column to its p-value and drift flag, using plain
        Python ``float``/``bool`` values.
        """
        status = True
        report = {}
        for col in base_df.columns:
            drift_test = ks_2samp(data1=base_df[col], data2=current_df[col])
            # The comparison yields a NumPy boolean; convert it so the report
            # holds only built-in types, as is already done for the p-value.
            is_drifted = bool(drift_test.pvalue < threshold)
            report[col] = {
                "p_value": float(drift_test.pvalue),
                "drift_status": is_drifted
            }
            if is_drifted:
                status = False
        return status, report

    def detect_data_drift(self, base_df: pd.DataFrame, current_df: pd.DataFrame, threshold: float = 0.05) -> bool:
        """Compare column distributions of two frames and write a drift report.

        Args:
            base_df: Reference data; its columns drive the comparison.
            current_df: Data compared against ``base_df``; must contain the same columns.
            threshold: A column counts as drifted when its KS p-value is below this.

        Returns:
            True when no column drifted, False otherwise.

        Raises:
            CustomException: Wrapping any error from the test or from writing the report.
        """
        try:
            status, report = self._compute_drift_report(base_df, current_df, threshold)

            # `drift_report_dir` holds the report FILE path, hence the dirname().
            drift_report_path = self.data_validation_config.drift_report_dir
            os.makedirs(os.path.dirname(drift_report_path), exist_ok=True)
            write_yaml_file(file_path=drift_report_path, content=report)

            logging.info('Data drift detection completed and report generated.')
            return status
        except Exception as e:
            raise CustomException(e, sys)

    def initiate_data_validation(self) -> "DataValidationArtifact":
        """Run the validation stage end to end.

        Reads the ingested train/test files, checks their column counts, runs the
        drift check (train as base, test as current), writes both frames to the
        configured "valid" locations and returns the resulting artifact.

        Returns:
            A ``DataValidationArtifact`` whose ``validation_status`` is the drift
            result and whose invalid paths are ``None``.

        Raises:
            CustomException: On any failure, including a column-count mismatch.
        """
        try:
            train_file_path = self.data_ingestion_artifacts.trained_file_path
            test_file_path = self.data_ingestion_artifacts.test_file_path

            # Read training and test data
            train_df = self.read_data(train_file_path)
            test_df = self.read_data(test_file_path)
            logging.info('Data read from training and test files completed.')

            # Validate number of columns. A plain ValueError is raised here so the
            # handler below wraps it exactly once, like every other failure.
            if not self.validate_num_of_cols(df=train_df):
                raise ValueError("Training data does not contain the expected columns.")
            if not self.validate_num_of_cols(df=test_df):
                raise ValueError("Testing data does not contain the expected columns.")

            # Check for data drift
            drift_status = self.detect_data_drift(base_df=train_df, current_df=test_df)

            # Save validated data. The attribute names follow the spelling used by
            # DataValidationConfig ("traning").
            valid_train_path = self.data_validation_config.valid_traning_data_store_path
            valid_test_path = self.data_validation_config.valid_test_data_store_path
            os.makedirs(os.path.dirname(valid_train_path), exist_ok=True)
            os.makedirs(os.path.dirname(valid_test_path), exist_ok=True)
            train_df.to_csv(valid_train_path, index=False, header=True)
            test_df.to_csv(valid_test_path, index=False, header=True)

            # Create DataValidationArtifact
            data_validation_artifact = DataValidationArtifact(
                validation_status=drift_status,
                valid_train_path=valid_train_path,
                valid_test_path=valid_test_path,
                invalid_train_path=None,
                invalid_test_path=None,
                drift_report_path=self.data_validation_config.drift_report_dir
            )

            logging.info('Data validation completed successfully.')
            return data_validation_artifact
        except CustomException:
            # Already wrapped by a helper method; do not wrap a second time.
            raise
        except Exception as e:
            raise CustomException(e, sys)


## Execute Pipeline

In [None]:
from networksecurity.components.data_ingestion import DataIngestion
from networksecurity.entity.artifact_entity import DataIngestionArtifact
from networksecurity.entity.config_entity import DataIngestionConfig

In [None]:
# Run ingestion followed by validation; any failure is surfaced as CustomException.
try:
    traning_pipline_config = TraningPiplineConfig()

    # Data ingestion: build its config from the pipeline config, then run it once.
    data_ingestion_config = DataIngestionConfig(traning_pipline_config=traning_pipline_config)
    data_ingestion = DataIngestion(data_ingestion_config=data_ingestion_config)
    data_ingestion_artifacts = data_ingestion.initiate_data_ingestion()

    # Data validation: consumes the ingestion artifacts produced above.
    data_validation_config = DataValidationConfig(traning_pipline_config=traning_pipline_config)
    data_validation = DataValidation(
        training_pipeline_config=data_validation_config,
        data_ingestion_artifacts=data_ingestion_artifacts,
    )
    # Previously ingestion was run a second time here and validation never ran.
    data_validation_artifact = data_validation.initiate_data_validation()
except Exception as e:
    raise CustomException(e, sys)

   having_IP_Address  URL_Length  Shortining_Service  having_At_Symbol  \
0                 -1           1                   1                 1   
1                  1           1                   1                 1   
2                  1           0                   1                 1   
3                  1           0                   1                 1   
4                  1           0                  -1                 1   

   double_slash_redirecting  Prefix_Suffix  having_Sub_Domain  SSLfinal_State  \
0                        -1             -1                 -1              -1   
1                         1             -1                  0               1   
2                         1             -1                 -1              -1   
3                         1             -1                 -1              -1   
4                         1             -1                  1               1   

   Domain_registeration_length  Favicon  ...  popUpWidnow  Iframe  \

CustomException: Error occurred python script name [C:\Users\www58\AppData\Local\Temp\ipykernel_23828\2982643942.py] line number [8] error message [module 'networksecurity.constant.traning_pipline' has no attribute 'DATA_VALIDATION_DIR_NAME']