In [1]:
import os

In [2]:
%pwd

'c:\\Users\\ASUS\\Desktop\\loan-pay-back\\research'

In [3]:
# Step up one directory (from research/ to the project root) so that the
# relative paths used in later cells resolve.
# NOTE: not idempotent - re-running this cell moves up another level.
os.chdir('../')

In [4]:
%pwd

'c:\\Users\\ASUS\\Desktop\\loan-pay-back'

In [5]:
import pandas as pd 
# Load the raw training CSV; the path is relative to the current working directory.
data = pd.read_csv('artifacts/raw_data/train.csv')

In [6]:
# Preview the first rows to sanity-check how the columns were parsed.
data.head()

Unnamed: 0,id,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,gender,marital_status,education_level,employment_status,loan_purpose,grade_subgrade,loan_paid_back
0,0,29367.99,0.084,736,2528.42,13.67,Female,Single,High School,Self-employed,Other,C3,1.0
1,1,22108.02,0.166,636,4593.1,12.92,Male,Married,Master's,Employed,Debt consolidation,D3,0.0
2,2,49566.2,0.097,694,17005.15,9.76,Male,Single,High School,Employed,Debt consolidation,C5,1.0
3,3,46858.25,0.065,533,4682.48,16.1,Female,Single,High School,Employed,Debt consolidation,F1,1.0
4,4,25496.7,0.053,665,12184.43,10.21,Male,Married,High School,Employed,Other,D1,1.0


In [7]:
# List the column names present in the loaded frame.
data.columns

Index(['id', 'annual_income', 'debt_to_income_ratio', 'credit_score',
       'loan_amount', 'interest_rate', 'gender', 'marital_status',
       'education_level', 'employment_status', 'loan_purpose',
       'grade_subgrade', 'loan_paid_back'],
      dtype='object')

In [8]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
data.describe()

Unnamed: 0,id,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,loan_paid_back
count,593994.0,593994.0,593994.0,593994.0,593994.0,593994.0,593994.0
mean,296996.5,48212.202976,0.120696,680.916009,15020.297629,12.356345,0.79882
std,171471.442236,26711.942078,0.068573,55.424956,6926.530568,2.008959,0.400883
min,0.0,6002.43,0.011,395.0,500.09,3.2,0.0
25%,148498.25,27934.4,0.072,646.0,10279.62,10.99,1.0
50%,296996.5,46557.68,0.096,682.0,15000.22,12.37,1.0
75%,445494.75,60981.32,0.156,719.0,18858.58,13.68,1.0
max,593993.0,393381.74,0.627,849.0,48959.95,20.99,1.0


In [9]:
# Per-column dtype and non-null counts, plus the frame's memory footprint.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 593994 entries, 0 to 593993
Data columns (total 13 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    593994 non-null  int64  
 1   annual_income         593994 non-null  float64
 2   debt_to_income_ratio  593994 non-null  float64
 3   credit_score          593994 non-null  int64  
 4   loan_amount           593994 non-null  float64
 5   interest_rate         593994 non-null  float64
 6   gender                593994 non-null  object 
 7   marital_status        593994 non-null  object 
 8   education_level       593994 non-null  object 
 9   employment_status     593994 non-null  object 
 10  loan_purpose          593994 non-null  object 
 11  grade_subgrade        593994 non-null  object 
 12  loan_paid_back        593994 non-null  float64
dtypes: float64(5), int64(2), object(6)
memory usage: 58.9+ MB


In [10]:
# Count missing values in each column.
data.isna().sum()

id                      0
annual_income           0
debt_to_income_ratio    0
credit_score            0
loan_amount             0
interest_rate           0
gender                  0
marital_status          0
education_level         0
employment_status       0
loan_purpose            0
grade_subgrade          0
loan_paid_back          0
dtype: int64

In [11]:
# (rows, columns) of the loaded frame.
data.shape

(593994, 13)

In [12]:
from dataclasses import dataclass
from pathlib import Path

In [13]:
@dataclass
class DataValidationConfig:
    """Settings consumed by the data-validation stage.

    Attributes:
        root_dir: Output directory of the validation stage.
        data_dir: Path handed to ``pd.read_csv`` by the validator, i.e. the
            CSV file to validate (despite the ``_dir`` suffix).
        status_file: File the validator appends missing-column messages to.
        all_schema: Mapping whose keys are the column names the data must
            contain.
    """

    root_dir: Path
    data_dir: Path
    status_file: Path
    all_schema: dict
    

In [14]:
from src.loan_payment_prediction.constants import*
from src.loan_payment_prediction.utils.common import read_yaml, create_directories

In [15]:
class ConfigurationManager:
    """Reads the project's YAML files and builds per-stage config objects."""

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH,
    ):
        """Load the config, params and schema YAML files.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
            schema_filepath: Location of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        # Ensure the top-level artifacts directory exists up front.
        create_directories([self.config.artifacts_root])

    def get_data_validation_config(self) -> DataValidationConfig:
        """Build the configuration for the data-validation stage.

        Reads the ``data_validation`` section of the config and the
        ``COLUMNS`` section of the schema, and ensures the stage's root
        directory exists.

        Returns:
            A ``DataValidationConfig`` with path fields wrapped in ``Path``.
        """
        stage_cfg = self.config.data_validation
        column_schema = self.schema.COLUMNS

        create_directories([stage_cfg.root_dir])

        return DataValidationConfig(
            root_dir=Path(stage_cfg.root_dir),
            data_dir=Path(stage_cfg.data_dir),
            status_file=Path(stage_cfg.status_file),
            all_schema=column_schema,
        )

        
        
    

In [16]:
import os
from src.loan_payment_prediction import logger   

In [17]:
class DataValidation:
    """Checks that a CSV file contains every column named in the schema."""

    def __init__(self, config: DataValidationConfig):
        """Store the validation settings.

        Args:
            config: Provides ``data_dir`` (CSV to check), ``status_file``
                (where problems are recorded) and ``all_schema`` (mapping
                keyed by required column name).
        """
        self.config = config

    def validate_all_columns(self) -> bool:
        """Verify that every schema column is present in the data file.

        Only the CSV header is parsed, so the cost does not grow with the
        number of rows; malformed data rows are therefore not detected here.

        For each missing column a line is appended to ``status_file`` and the
        same message is logged. The status file is not touched when nothing
        is missing.

        Returns:
            True if all schema columns are present, False otherwise.

        Raises:
            Exception: Any error from reading the CSV or writing the status
                file is logged and re-raised unchanged.
        """
        try:
            # nrows=0 reads just the header row; column names are all we need.
            header = pd.read_csv(self.config.data_dir, nrows=0)
            present_columns = set(header.columns)

            # Keep schema order so messages are emitted in a stable order.
            missing_columns = [
                column
                for column in self.config.all_schema.keys()
                if column not in present_columns
            ]

            if missing_columns:
                # Append mode: existing status-file content is preserved.
                with open(self.config.status_file, "a") as f:
                    for column in missing_columns:
                        message = f"Column {column} is missing in the data"
                        f.write(f"{message}\n")
                        logger.info(message)

            return not missing_columns

        except Exception as e:
            logger.exception(e)
            # Bare raise re-raises the active exception with its traceback.
            raise

In [18]:
# Run the data-validation stage end to end: load the configuration, build the
# validator and report whether every schema column is present.
# Errors are left to propagate on their own; a handler that only re-raises
# adds nothing, and the notebook shows the full traceback either way.
config = ConfigurationManager()
data_validation_config = config.get_data_validation_config()

validator = DataValidation(data_validation_config)

status = validator.validate_all_columns()
print(f"Data validation status: {status}")

[2025-12-11 09:20:31,963]: INFO: YAML file config\config.yaml loaded successfully.
[2025-12-11 09:20:31,966]: INFO: YAML file params.yaml loaded successfully.
[2025-12-11 09:20:31,972]: INFO: YAML file schema.yaml loaded successfully.
[2025-12-11 09:20:31,976]: INFO: Directory created at: artifacts
[2025-12-11 09:20:31,978]: INFO: Directory created at: artifacts/data_validation
Data validation status: True
