In [1]:
import os

In [2]:
%pwd

'f:\\MLOps\\DataScienceProjectWithDeployment\\research'

In [3]:
os.chdir("../")

In [4]:
%pwd

'f:\\MLOps\\DataScienceProjectWithDeployment'

In [5]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataTransformationConfig:
    root_dir: Path
    data_path: Path

In [6]:
from src.datascience.constants import *
from src.datascience.utils.common import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])


    
    def get_data_transformation_config(self) -> DataTransformationConfig:
        config = self.config.data_transformation

        create_directories([config.root_dir])

        data_transformation_config = DataTransformationConfig(
            root_dir=config.root_dir,
            data_path=config.data_path,
        )

        return data_transformation_config

In [8]:
import os
from src.datascience import logger
from sklearn.model_selection import train_test_split
import pandas as pd

In [9]:
data=pd.read_csv("artifacts/data_ingestion/credit_risk_dataset.csv")
print(data.shape)
data.head()

(32581, 12)


Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4


In [10]:
pd.crosstab(data['loan_intent'], data['loan_grade'])

loan_grade,A,B,C,D,E,F,G
loan_intent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
DEBTCONSOLIDATION,1753,1620,1042,600,144,43,10
EDUCATION,2174,2026,1356,657,185,46,9
HOMEIMPROVEMENT,1126,1115,693,487,143,32,9
MEDICAL,1959,2006,1217,658,167,52,12
PERSONAL,1832,1813,1046,643,147,30,10
VENTURE,1933,1871,1104,581,178,38,14


In [11]:
data['cb_person_default_on_file'].value_counts(dropna=False)

cb_person_default_on_file
N    26836
Y     5745
Name: count, dtype: int64

In [12]:
mapping_home_ownership = {'RENT': 1, 'MORTGAGE': 2, 'OWN': 3, 'OTHER': 0}
mapping_loan_grade = {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F":6, "G":6}
mapping_cb_person_default_on_file = {"N": 1, "Y": 0}

data['person_home_ownership'] = data['person_home_ownership'].map(mapping_home_ownership).fillna(0)
data['loan_grade'] = data['loan_grade'].map(mapping_loan_grade).fillna(0)
data['cb_person_default_on_file'] = data['cb_person_default_on_file'].map(mapping_home_ownership).fillna(0)

In [13]:
data = pd.get_dummies(data, columns=['loan_intent'], drop_first=True)

In [14]:
data

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_intent_EDUCATION,loan_intent_HOMEIMPROVEMENT,loan_intent_MEDICAL,loan_intent_PERSONAL,loan_intent_VENTURE
0,22,59000,1,123.0,4,35000,16.02,1,0.59,0.0,3,False,False,False,True,False
1,21,9600,3,5.0,2,1000,11.14,0,0.10,0.0,2,True,False,False,False,False
2,25,9600,2,1.0,3,5500,12.87,1,0.57,0.0,3,False,False,True,False,False
3,23,65500,1,4.0,3,35000,15.23,1,0.53,0.0,2,False,False,True,False,False
4,24,54400,1,8.0,3,35000,14.27,1,0.55,0.0,4,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32576,57,53000,2,1.0,3,5800,13.16,0,0.11,0.0,30,False,False,False,True,False
32577,54,120000,2,4.0,1,17625,7.49,0,0.15,0.0,19,False,False,False,True,False
32578,65,76000,1,3.0,2,35000,10.99,1,0.46,0.0,28,False,True,False,False,False
32579,56,150000,2,5.0,2,15000,11.48,0,0.10,0.0,26,False,False,False,True,False


In [None]:
class DataTransformation:
    def __init__(self, config: DataTransformationConfig):
        self.config = config
        self.data = None
    
    def categorical_to_numerical(self):
        data = pd.read_csv(self.config.data_path)
        mapping_home_ownership = {'RENT': 1, 'MORTGAGE': 2, 'OWN': 3, 'OTHER': 0}
        mapping_loan_grade = {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F":6, "G":6}
        mapping_cb_person_default_on_file = {"N": 1, "Y": 0}

        data['person_home_ownership'] = data['person_home_ownership'].map(mapping_home_ownership).fillna(0)
        data['loan_grade'] = data['loan_grade'].map(mapping_loan_grade).fillna(0)
        data['cb_person_default_on_file'] = data['cb_person_default_on_file'].map(mapping_cb_person_default_on_file).fillna(0)

        # # one-hot incoding for loan_intent as can't specify order/let the model decide
        # data = pd.get_dummies(data, columns=['loan_intent'], drop_first=True)

        data = data.drop('loan_intent', axis=1)

        self.data = data  # make it available for other methods
        logger.info("Categorical variables are sucessfully converted to numerical.")

    def train_test_splitting(self):
        if self.data is None:
            raise ValueError("Data not found! Run categorical_to_numerical() first.")
        
        train, test = train_test_split(self.data, test_size=0.2, random_state=42)

        train.to_csv(os.path.join(self.config.root_dir, "train.csv"),index = False)
        test.to_csv(os.path.join(self.config.root_dir, "test.csv"),index = False)

        logger.info("Splited data into training and test sets")
        logger.info(train.shape)
        logger.info(test.shape)

        print(train.shape)
        print(test.shape)
        

In [18]:
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    data_transformation.categorical_to_numerical()
    data_transformation.train_test_splitting()
except Exception as e:
    raise e

[2025-10-19 19:39:01,575: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-10-19 19:39:01,582: INFO: common: yaml file: params.yaml loaded successfully]
[2025-10-19 19:39:01,590: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-10-19 19:39:01,597: INFO: common: Created directory at: artifacts]
[2025-10-19 19:39:01,599: INFO: common: Created directory at: artifacts/data_transformation]
[2025-10-19 19:39:01,773: INFO: 2683885595: Categorical variables are sucessfully converted to numerical.]
[2025-10-19 19:39:02,404: INFO: 2683885595: Splited data into training and test sets]
[2025-10-19 19:39:02,404: INFO: 2683885595: (26064, 16)]
[2025-10-19 19:39:02,404: INFO: 2683885595: (6517, 16)]
(26064, 16)
(6517, 16)


In [19]:
with open(Path("artifacts/data_validation/status.txt"),'r') as f:
                status=f.read().split(" ")[-1]
                print(status)

True


In [21]:
# to check
train_data=pd.read_csv("artifacts/data_transformation/train.csv")
print(train_data.shape)
train_data.head()

(26064, 16)


Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_intent_EDUCATION,loan_intent_HOMEIMPROVEMENT,loan_intent_MEDICAL,loan_intent_PERSONAL,loan_intent_VENTURE
0,64,46000,1,2.0,3,4800,11.09,0,0.1,0,24,False,False,False,True,False
1,26,26000,3,0.0,5,8500,16.45,1,0.33,1,3,False,False,False,False,False
2,23,51000,2,3.0,3,16000,13.11,0,0.31,0,3,False,False,False,True,False
3,22,56004,2,6.0,1,6000,7.88,0,0.11,1,4,False,False,True,False,False
4,24,79000,1,3.0,3,7000,12.54,0,0.09,1,3,False,False,False,True,False
