In [1]:
import os

In [2]:
# Show the kernel's working directory before it is changed below.
%pwd

'f:\\End-to-End-DS-Projects\\Bank Churn Prediction\\research'

In [4]:
# Step up from the notebook's `research/` folder to the project root so that
# relative paths such as `artifacts/...` resolve.
# The guard keeps this cell idempotent: an unconditional `os.chdir("../")`
# climbs one more level every time the cell is re-run.
# NOTE(review): relies on the notebook folder being named `research`.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [5]:
# Confirm the working directory is now the project root.
%pwd

'f:\\End-to-End-DS-Projects\\Bank Churn Prediction'

In [6]:
# Location of the ingested churn CSV, relative to the current working directory.
DATA_PATH = "artifacts/data_ingestion/Churn_Modelling.csv"

In [7]:
import pandas as pd

In [8]:
# Load the ingested CSV into a DataFrame and preview its first rows.
data = pd.read_csv(DATA_PATH)
data.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [9]:
# Per-column dtypes and non-null counts, plus overall memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB


In [10]:
# Number of missing values in each column.
data.isnull().sum()

RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [11]:
# (rows, columns) of the raw dataset.
data.shape

(10000, 14)

In [12]:
# Column labels of the raw dataset.
data.columns

Index(['RowNumber', 'CustomerId', 'Surname', 'CreditScore', 'Geography',
       'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited'],
      dtype='object')

In [13]:
# Dtype of each raw column (useful when writing the schema's expected dtypes).
data.dtypes

RowNumber            int64
CustomerId           int64
Surname             object
CreditScore          int64
Geography           object
Gender              object
Age                  int64
Tenure               int64
Balance            float64
NumOfProducts        int64
HasCrCard            int64
IsActiveMember       int64
EstimatedSalary    float64
Exited               int64
dtype: object

#### 4. Update Entity

In [1]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataValidationConfig:
    """Immutable bundle of settings handed to the data-validation stage."""

    # Directory that is created for this stage before the config is built.
    root_dir: Path
    # Location of the CSV file that the validation component loads.
    unzip_data_dir: Path
    # File to which the validation verdict is written.
    STATUS_FILE: str
    # Mapping of column name -> expected dtype (the schema's COLUMNS section).
    all_schema: dict
    # NOTE(review): not read by any code visible in this notebook - confirm its purpose.
    unzip_dir: Path

#### 5. Update Config manager

In [4]:
import os

# The cells below need the project root as working directory (`src.BankChurn`
# import, relative `artifacts/` paths). Only step up while the kernel is still
# inside `research/`: an unconditional `os.chdir("../")` climbs past the
# project root when an earlier cell has already moved there, which breaks a
# top-to-bottom "Restart & Run All".
# NOTE(review): relies on the notebook folder being named `research`.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [5]:
from src.BankChurn.constants import *
from src.BankChurn.utils.common import read_yaml, create_directories

In [6]:
class ConfigurationManager:
    """Loads the project's YAML files and builds per-stage config objects."""

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH,
    ):
        """Read config, params and schema, then create the artifacts root.

        Args:
            config_filepath: YAML file exposing ``artifacts_root`` and a
                ``data_validation`` section.
            params_filepath: YAML file with parameters (not used further in
                this class).
            schema_filepath: YAML file exposing a ``COLUMNS`` section.
        """
        # All three attributes exist (as None) before the first file is read.
        self.config = self.params = self.schema = None

        yaml_sources = (
            ("config", config_filepath),
            ("params", params_filepath),
            ("schema", schema_filepath),
        )
        # Same read order as the attribute order above: config, params, schema.
        for attribute, filepath in yaml_sources:
            setattr(self, attribute, read_yaml(filepath))

        create_directories([self.config.artifacts_root])

    def get_data_validation_config(self) -> DataValidationConfig:
        """Create the stage directory and return the data-validation settings.

        Values are passed through exactly as read from the YAML files; no
        conversion to ``Path`` is performed here.
        """
        section = self.config.data_validation
        expected_columns = self.schema.COLUMNS

        create_directories([section.root_dir])

        return DataValidationConfig(
            root_dir=section.root_dir,
            STATUS_FILE=section.STATUS_FILE,
            unzip_data_dir=section.unzip_data_dir,
            all_schema=expected_columns,
            unzip_dir=section.unzip_dir,
        )

#### 6. Update Components

In [44]:
import os
import pandas as pd
import pickle
from src.BankChurn import logger
from sklearn.preprocessing import LabelEncoder

class DataValidation:
    """Preprocess the ingested churn CSV and validate it against the schema.

    The config object must expose ``unzip_data_dir`` (CSV to read),
    ``all_schema`` (mapping of column name -> expected dtype) and
    ``STATUS_FILE`` (file that receives the validation verdict).
    """

    def __init__(self, config: "DataValidationConfig"):
        """Store the stage configuration; no data is read yet."""
        self.config = config
        logger.info('Initialized DataValidation Class')

    def load_data(self):
        """Read the CSV at ``config.unzip_data_dir`` into ``self.df`` and return it."""
        source = self.config.unzip_data_dir
        logger.info('Loading data from {}'.format(source))
        self.df = pd.read_csv(source)
        logger.info(f"Loaded data from {source}")
        logger.info('Loaded data with shape {}'.format(self.df.shape))
        return self.df

    def preprocess(self):
        """Reload the CSV, clean it, pickle the result and return the frame.

        Steps: drop ``RowNumber``/``CustomerId``/``Surname``, cast
        ``EstimatedSalary``/``Balance`` to integers, label-encode
        ``Geography``/``Gender``, drop ``HasCrCard``, then write the frame to
        ``artifacts/data_preprocessed/preprocessed_data.pkl``. Columns that
        are absent from the data are skipped silently.
        """
        self.load_data()

        # Drop unnecessary columns
        for col in ('RowNumber', 'CustomerId', 'Surname'):
            if col in self.df.columns:
                self.df = self.df.drop(columns=[col])
                logger.info(f"Dropped column {col}")

        # Convert columns to integer type
        for col in ('EstimatedSalary', 'Balance'):
            if col in self.df.columns:
                # errors='ignore' hands back the column unchanged when the cast
                # fails, so check the resulting dtype before reporting success.
                self.df[col] = self.df[col].astype(int, errors='ignore')
                if pd.api.types.is_integer_dtype(self.df[col]):
                    logger.info(f"Converted column {col} to integer type")
                else:
                    logger.warning(
                        f"Could not convert column {col} to integer type; "
                        f"dtype is still {self.df[col].dtype}"
                    )

        # Encode categorical columns using LabelEncoder.
        # fit_transform refits on every call, so a fresh encoder per column is
        # equivalent to reusing one. The fitted mappings are not persisted.
        for col in ('Geography', 'Gender'):
            if col in self.df.columns:
                self.df[col] = LabelEncoder().fit_transform(self.df[col])
                logger.info(f"Encoded categorical column {col}")

        # Drop additional column
        if 'HasCrCard' in self.df.columns:
            self.df = self.df.drop('HasCrCard', axis=1)
            logger.info("Dropped column HasCrCard")

        # Export preprocessed data
        validated_data_path = os.path.join('artifacts/data_preprocessed', 'preprocessed_data.pkl')
        os.makedirs(os.path.dirname(validated_data_path), exist_ok=True)
        self.df.to_pickle(validated_data_path, protocol=pickle.HIGHEST_PROTOCOL)
        logger.info(f"Exported preprocessed data to {validated_data_path}")

        return self.df

    def validate_columns(self, preprocessed_data: pd.DataFrame) -> bool:
        """Check the frame against ``config.all_schema`` and record the verdict.

        Validation fails when
          * a data column is not declared in the schema,
          * a declared column's dtype differs from the schema's dtype, or
          * a column declared in the schema is missing from the data.

        The verdict is written to ``config.STATUS_FILE`` and returned.

        Raises:
            Exception: any error raised during the check is logged with its
                traceback and re-raised unchanged.
        """
        try:
            validation_status = True
            schema = self.config.all_schema
            schema_columns = schema.keys()
            data_columns = list(preprocessed_data.columns)

            for col in data_columns:
                if col not in schema_columns:
                    validation_status = False
                    logger.error(f"Column {col} not found in schema")
                    continue

                schema_data_type = schema[col]
                data_data_type = preprocessed_data[col].dtype
                if schema_data_type != data_data_type:
                    validation_status = False
                    logger.error(f"Data type mismatch for column {col}: schema type {schema_data_type}, data type {data_data_type}")

            # A frame that lacks a declared column must not pass validation.
            present = set(data_columns)
            for col in schema_columns:
                if col not in present:
                    validation_status = False
                    logger.error(f"Column {col} declared in schema is missing from data")

            with open(self.config.STATUS_FILE, 'w') as f:
                f.write(f"Validation status: {validation_status}")

            return validation_status
        except Exception as e:
            logger.exception(f"Error in validating columns: {e}")
            raise

    def validate_data(self):
        """Run ``preprocess`` followed by ``validate_columns``; return the verdict."""
        preprocessed_data = self.preprocess()
        validation_status = self.validate_columns(preprocessed_data)
        logger.info(f"Validation status: {validation_status}")
        return validation_status

#### 7. Update Pipeline

In [45]:
# Run the validation stage end to end: build the stage config, preprocess the
# ingested CSV and check the result against the schema.
try:
    config = ConfigurationManager()
    data_validation_config = config.get_data_validation_config()
    data_validation = DataValidation(data_validation_config)
    validation_status = data_validation.validate_data()
    print(f"Validation status: {validation_status}")
except Exception as e:
    # logger.exception records the traceback with the message; the bare
    # `raise` re-raises the same exception so the failure is not hidden.
    logger.exception(f"Error occurred: {e}")
    raise

[2024-06-05 19:32:41,428: 31 - Bank Churn Project Logger: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-06-05 19:32:41,432: 31 - Bank Churn Project Logger: INFO: common: yaml file: params.yaml loaded successfully]
[2024-06-05 19:32:41,439: 31 - Bank Churn Project Logger: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-06-05 19:32:41,441: 51 - Bank Churn Project Logger: INFO: common: created directory at: artifacts]
[2024-06-05 19:32:41,444: 51 - Bank Churn Project Logger: INFO: common: created directory at: artifacts/data_validation]
[2024-06-05 19:32:41,445: 11 - Bank Churn Project Logger: INFO: 3070188584: Initialized DataValidation Class]
[2024-06-05 19:32:41,449: 15 - Bank Churn Project Logger: INFO: 3070188584: Loading data from artifacts/data_ingestion/Churn_Modelling.csv]
[2024-06-05 19:32:41,482: 17 - Bank Churn Project Logger: INFO: 3070188584: Loaded data from artifacts/data_ingestion/Churn_Modelling.csv]
[2024-06-05 19:32:41,484: 18 -

[2024-06-05 19:32:41,498: 30 - Bank Churn Project Logger: INFO: 3070188584: Dropped column Surname]
[2024-06-05 19:32:41,504: 36 - Bank Churn Project Logger: INFO: 3070188584: Converted column EstimatedSalary to integer type]
[2024-06-05 19:32:41,524: 36 - Bank Churn Project Logger: INFO: 3070188584: Converted column Balance to integer type]
[2024-06-05 19:32:41,536: 44 - Bank Churn Project Logger: INFO: 3070188584: Encoded categorical column Geography]
[2024-06-05 19:32:41,543: 44 - Bank Churn Project Logger: INFO: 3070188584: Encoded categorical column Gender]
[2024-06-05 19:32:41,551: 49 - Bank Churn Project Logger: INFO: 3070188584: Dropped column HasCrCard]
[2024-06-05 19:32:41,557: 55 - Bank Churn Project Logger: INFO: 3070188584: Exported preprocessed data to artifacts/data_preprocessed\preprocessed_data.pkl]
[2024-06-05 19:32:41,560: 86 - Bank Churn Project Logger: INFO: 3070188584: Validation status: True]
Validation status: True
