In [1]:
import os

In [None]:
# Show the kernel's working directory before the next cell changes it.
%pwd

'd:\\HeartAttackAnalysis\\research'

In [3]:
os.chdir("../")
%pwd

'd:\\HeartAttackAnalysis'

In [4]:
import pandas as pd

# Load the dataset with a path relative to the working directory, which is the
# project root at this point in the notebook. The previous absolute D:/ path
# pointed at the same file but only resolved on the author's machine.
df = pd.read_csv("heart_attack_analysis.csv")
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [5]:
# Summarise the frame: per-column non-null counts and dtypes, plus memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 606 entries, 0 to 605
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       606 non-null    int64  
 1   sex       606 non-null    int64  
 2   cp        606 non-null    int64  
 3   trestbps  606 non-null    int64  
 4   chol      606 non-null    int64  
 5   fbs       606 non-null    int64  
 6   restecg   606 non-null    int64  
 7   thalach   606 non-null    int64  
 8   exang     606 non-null    int64  
 9   oldpeak   606 non-null    float64
 10  slope     606 non-null    int64  
 11  ca        606 non-null    int64  
 12  thal      606 non-null    int64  
 13  target    606 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 66.4 KB


In [None]:
# Count missing values per column.
df.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [7]:
# (rows, columns) of the loaded frame.
df.shape

(606, 14)

In [8]:
from dataclasses import dataclass
from pathlib import Path

@dataclass
class DataValidationConfig:
    """Settings consumed by the data-validation step in this notebook.

    Instances are built by ``ConfigurationManger.get_data_validation_config``
    from values read out of the project's YAML files. The dataclass performs
    no conversion or checking, so each field holds whatever the configuration
    loader returned (presumably plain strings rather than ``Path`` objects --
    TODO confirm against ``read_yaml``).
    """
    # Directory for this step; the configuration manager creates it before
    # building the config object.
    root_dir: Path
    # File that DataValidation.validate_all_columns overwrites with its report.
    STATUS_FILE: Path
    # CSV file that DataValidation.validate_all_columns loads with pandas.
    DATA_FILE: Path
    # Mapping whose keys are compared against the data file's column names.
    all_schema: dict

In [9]:
from src.HeartAttackAnalysis.constants import *
from src.HeartAttackAnalysis.utils.common import read_yaml, create_directories

In [10]:
class ConfigurationManger:
    """Loads the project's YAML files and hands out per-step config objects.

    Note that, despite their ``*_filepath`` names, the three instance
    attributes hold the *parsed contents* returned by ``read_yaml``, not the
    paths themselves. The names are kept because other code may read them.
    """

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH,
    ):
        """Parse the three YAML files and create the artifacts root directory.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
            schema_filepath: Location of the schema YAML.
        """
        self.config_filepath = read_yaml(config_filepath)
        self.params_filepath = read_yaml(params_filepath)
        self.schema_filepath = read_yaml(schema_filepath)

        # The top-level artifacts directory must exist before any step writes to it.
        artifacts_root = self.config_filepath.artifacts_root
        create_directories([artifacts_root])

    def get_data_validation_config(self) -> DataValidationConfig:
        """Build the settings object for the data-validation step.

        Reads the ``data_validation`` section of the main configuration and the
        ``COLUMNS`` section of the schema, creates the step's root directory,
        and packs the values into a ``DataValidationConfig``.

        Returns:
            DataValidationConfig: The populated settings for the step.
        """
        validation_section = self.config_filepath.data_validation
        column_schema = self.schema_filepath.COLUMNS

        create_directories([validation_section.root_dir])

        return DataValidationConfig(
            root_dir=validation_section.root_dir,
            STATUS_FILE=validation_section.STATUS_FILE,
            DATA_FILE=validation_section.DATA_FILE,
            all_schema=column_schema,
        )

In [11]:
import os
from src.HeartAttackAnalysis.logging.logger import  logging
from src.HeartAttackAnalysis.exception.exception import HeartAttackAnalysisException

In [12]:
class DataValidation:
    """Checks that the columns of the configured data file match the schema."""

    # The annotation is quoted so this class can be defined even if the cell
    # declaring DataValidationConfig has not been executed yet.
    def __init__(self, config: "DataValidationConfig"):
        """Store the validation settings.

        Args:
            config: Object exposing ``DATA_FILE``, ``STATUS_FILE`` and
                ``all_schema`` (a mapping keyed by expected column name).
        """
        self.config = config

    def validate_all_columns(self) -> bool:
        """Compare the data file's columns with the schema and record the result.

        The data file is read with pandas and its column names are compared,
        as sets, with the schema keys. The status file is overwritten with the
        overall result plus, when applicable, the names that differ.

        Returns:
            bool: True when the two sets of column names are identical.

        Raises:
            HeartAttackAnalysisException: Wraps any error raised while reading
                the data, comparing columns or writing the status file.
        """
        try:
            df = pd.read_csv(self.config.DATA_FILE)
            data_columns = set(df.columns)
            schema_columns = set(self.config.all_schema.keys())

            # "Missing" = required by the schema but absent from the data;
            # "extra"   = present in the data but unknown to the schema.
            # (These two differences were previously reported under each
            # other's label.)
            missing_columns = schema_columns - data_columns
            extra_columns = data_columns - schema_columns

            validation_status = not missing_columns and not extra_columns

            with open(self.config.STATUS_FILE, 'w') as f:
                f.write(f"Validation status: {validation_status}\n")
                # Sets have no stable order; sort so the report is reproducible.
                if missing_columns:
                    f.write(f"Missing columns: {', '.join(sorted(map(str, missing_columns)))}\n")
                if extra_columns:
                    f.write(f"Extra columns: {', '.join(sorted(map(str, extra_columns)))}\n")

            return validation_status
        except Exception as e:
            # Keep the original exception attached as the explicit cause.
            raise HeartAttackAnalysisException(e) from e

In [13]:
# Run the column-validation step end to end: load the configuration, build the
# validator, and let it write the status file.
try:
    config = ConfigurationManger()
    data_validation_config = config.get_data_validation_config()
    data_validation = DataValidation(data_validation_config)
    data_validation.validate_all_columns()

except HeartAttackAnalysisException:
    # validate_all_columns already wraps its failures in the project
    # exception; re-raise it unchanged instead of wrapping the wrapper.
    raise
except Exception as e:
    # Anything else (e.g. a failure while loading the configuration) is
    # wrapped once, keeping the original exception as the explicit cause.
    raise HeartAttackAnalysisException(e) from e