In [4]:
import os

In [5]:
%pwd

'c:\\Users\\ASUS\\Desktop\\loan-pay-back\\research'

In [6]:
os.chdir("../")
%pwd

'c:\\Users\\ASUS\\Desktop\\loan-pay-back'

In [7]:
import pandas as pd
import numpy as np

In [8]:
from dataclasses import dataclass
from pathlib import Path

In [9]:
@dataclass
class DataTransformationConfig:
    """Paths used by the data-transformation stage.

    Attributes:
        root_dir: Directory the stage writes into (transformed CSVs and the
            fitted pipeline are saved here by ``DataTransformation``).
        data_dir: Location handed to ``pd.read_csv`` by
            ``DataTransformation.load_data``; despite the name it is read as
            a CSV file.
        schema_file_path: YAML schema file from which the target column name
            is read.
    """

    root_dir: Path
    data_dir: Path
    schema_file_path: Path
    
    

In [11]:
from src.loan_payment_prediction.constants import*
from src.loan_payment_prediction.utils.common import read_yaml, create_directories

In [13]:
class ConfigurationManager:
    """Reads the project's YAML files and hands out per-stage config objects."""

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH,
    ):
        """Load config, params and schema, then create the artifacts root.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
            schema_filepath: Path of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        # The artifacts root must exist before any stage writes into it.
        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the config object for the data-transformation stage.

        Creates the stage's ``root_dir`` as a side effect.

        Returns:
            A ``DataTransformationConfig`` filled from the
            ``data_transformation`` section of the loaded configuration.
        """
        section = self.config.data_transformation
        create_directories([section.root_dir])

        return DataTransformationConfig(
            root_dir=section.root_dir,
            data_dir=section.data_dir,
            schema_file_path=section.schema_file_path,
        )
                


   

In [14]:
import os
from src.loan_payment_prediction import logger
from sklearn.model_selection import train_test_split


In [15]:
import os
import yaml
import joblib
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler




In [22]:
class DataTransformation:
    """Split, preprocess and persist a tabular dataset for model training.

    The target column name is read from the YAML schema referenced by the
    config. Features are scaled (numerical) or one-hot encoded (categorical)
    by a scikit-learn pipeline that is fitted on the training split only.
    """

    def __init__(self, config):
        """Store the config and resolve the target column from the schema.

        Args:
            config: Object exposing ``root_dir``, ``data_dir`` and
                ``schema_file_path`` attributes.
        """
        self.config = config
        self.pipeline = None
        self.target_column = self.load_target_column()

    def load_target_column(self) -> str:
        """Return the target column name stored under ``TARGET_COLUMN.name``.

        Raises:
            KeyError: If the schema has no ``TARGET_COLUMN.name`` entry.
        """
        # Explicit UTF-8 so parsing does not depend on the platform's default
        # text encoding.
        with open(self.config.schema_file_path, "r", encoding="utf-8") as f:
            schema = yaml.safe_load(f)

        try:
            return schema["TARGET_COLUMN"]["name"]
        except (KeyError, TypeError) as exc:
            # TypeError covers an empty schema (parsed as None) or a
            # TARGET_COLUMN entry that is not a mapping.
            raise KeyError(
                "'TARGET_COLUMN.name' not found in schema file "
                f"'{self.config.schema_file_path}'"
            ) from exc

    def load_data(self) -> pd.DataFrame:
        """Read the CSV located at ``config.data_dir`` and log its shape."""
        df = pd.read_csv(self.config.data_dir)
        logger.info(f"Data loaded with shape {df.shape}")
        return df

    def split_features_target(self, df):
        """Separate ``df`` into features and target.

        Args:
            df: DataFrame that contains the target column.

        Returns:
            Tuple ``(X, y)``: ``df`` without the target column, and the
            target column as a Series.

        Raises:
            ValueError: If the target column is not present in ``df``.
        """
        target_column = self.target_column

        if target_column not in df.columns:
            raise ValueError(
                f"Target column '{target_column}' not found in dataset"
            )

        X = df.drop(columns=[target_column])
        y = df[target_column]

        return X, y

    @staticmethod
    def _is_categorical_dtype(dtype) -> bool:
        """Return True for dtypes that must be one-hot encoded, not scaled."""
        return (
            isinstance(dtype, pd.CategoricalDtype)
            or pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
        )

    def build_pipeline(self, X: pd.DataFrame):
        """Create the (unfitted) preprocessing pipeline for ``X``.

        Object, pandas string and ``category`` columns are one-hot encoded;
        every other column is standardised. The pipeline is stored on
        ``self.pipeline``.

        Args:
            X: Feature frame whose dtypes decide the column grouping.

        Returns:
            Tuple ``(categorical_cols, numerical_cols)`` of column-name lists,
            each in the original column order.
        """
        # Selecting only "object" would route `category` and pandas string
        # columns to StandardScaler, which cannot scale text values.
        categorical_cols = [
            col
            for col, dtype in X.dtypes.items()
            if self._is_categorical_dtype(dtype)
        ]
        categorical_set = set(categorical_cols)
        numerical_cols = [col for col in X.columns if col not in categorical_set]

        # NOTE(review): every non-categorical column is scaled as a feature,
        # including identifier-like ones (the recorded run lists 'id' among
        # the numerical columns) -- confirm this is intended.
        logger.info(f"Categorical columns: {categorical_cols}")
        logger.info(f"Numerical columns: {numerical_cols}")

        preprocessor = ColumnTransformer(
            transformers=[
                ("num", StandardScaler(), numerical_cols),
                (
                    "cat",
                    OneHotEncoder(handle_unknown="ignore", sparse_output=False),
                    categorical_cols,
                ),
            ]
        )

        self.pipeline = Pipeline(steps=[("preprocessor", preprocessor)])

        return categorical_cols, numerical_cols

    def initiate_data_transformation(self, test_size=0.2, random_state=42):
        """Run the full transformation and persist its artifacts.

        Steps: load the data, split it into train/test, fit the preprocessing
        pipeline on the training features, transform both splits, then write
        ``train_transformed.csv``, ``test_transformed.csv`` and
        ``pipeline.joblib`` into ``config.root_dir``.

        Args:
            test_size: Forwarded to ``train_test_split`` (default 0.2).
            random_state: Forwarded to ``train_test_split`` (default 42).

        Returns:
            Tuple ``(X_train, X_test, y_train, y_test)``: the transformed
            feature arrays as produced by the pipeline, and the target values
            of each split.
        """
        # 1. Load data
        data = self.load_data()

        # 2. Train-test split
        train, test = train_test_split(
            data, test_size=test_size, random_state=random_state
        )

        # 3. Split X & y
        X_train, y_train = self.split_features_target(train)
        X_test, y_test = self.split_features_target(test)
        logger.info(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")

        # 4. Build the pipeline and capture the column grouping
        categorical_cols, num_features = self.build_pipeline(X_train)

        # 5. Fit on train only, then apply the same transform to test
        X_train = self.pipeline.fit_transform(X_train)
        X_test = self.pipeline.transform(X_test)

        # Recover output column names: numerical columns first, then the
        # one-hot columns, matching the transformer order in the pipeline.
        preprocessor = self.pipeline.named_steps["preprocessor"]
        if categorical_cols:
            ohe_encoder = preprocessor.named_transformers_["cat"]
            ohe_cols = list(ohe_encoder.get_feature_names_out(categorical_cols))
        else:
            # With no categorical columns the encoder is never fitted, so
            # asking it for feature names would raise.
            ohe_cols = []
        all_columns = num_features + ohe_cols

        # Convert transformed arrays back to DataFrames and re-attach target
        train_transformed_df = pd.DataFrame(X_train, columns=all_columns)
        train_transformed_df[self.target_column] = y_train.values

        test_transformed_df = pd.DataFrame(X_test, columns=all_columns)
        test_transformed_df[self.target_column] = y_test.values

        # 6. Save transformed CSVs
        os.makedirs(self.config.root_dir, exist_ok=True)
        train_path = os.path.join(self.config.root_dir, "train_transformed.csv")
        test_path = os.path.join(self.config.root_dir, "test_transformed.csv")

        train_transformed_df.to_csv(train_path, index=False)
        test_transformed_df.to_csv(test_path, index=False)

        logger.info(f"Transformed train data saved at: {train_path}")
        logger.info(f"Transformed test data saved at: {test_path}")

        # 7. Save pipeline
        pipeline_path = os.path.join(self.config.root_dir, "pipeline.joblib")
        joblib.dump(self.pipeline, pipeline_path)

        logger.info("Data transformation completed successfully")

        return (
            X_train,
            X_test,
            y_train.values,
            y_test.values,
        )



In [23]:
# Drive the stage end to end: build the config, run the transformation and
# keep the resulting splits as notebook-level variables.
try:
    config_manager = ConfigurationManager()
    data_transformation_config = config_manager.get_data_transformation_config()

    data_transformation = DataTransformation(
        config=data_transformation_config
    )

    X_train, X_test, y_train, y_test = (
        data_transformation.initiate_data_transformation()
    )

except Exception as e:
    # Log with traceback, then re-raise with a bare `raise` so the original
    # traceback is propagated unchanged.
    logger.exception(e)
    raise


[2025-12-22 15:53:29,911]: INFO: YAML file config\config.yaml loaded successfully.
[2025-12-22 15:53:29,917]: INFO: YAML file params.yaml loaded successfully.
[2025-12-22 15:53:29,921]: INFO: YAML file schema.yaml loaded successfully.
[2025-12-22 15:53:29,924]: INFO: Directory created at: artifacts
[2025-12-22 15:53:29,927]: INFO: Directory created at: artifacts/data_transformation
[2025-12-22 15:53:30,722]: INFO: Data loaded with shape (593994, 13)
[2025-12-22 15:53:31,100]: INFO: Train shape: (475195, 12), Test shape: (118799, 12)
[2025-12-22 15:53:31,157]: INFO: Categorical columns: ['gender', 'marital_status', 'education_level', 'employment_status', 'loan_purpose', 'grade_subgrade']
[2025-12-22 15:53:31,158]: INFO: Numerical columns: ['id', 'annual_income', 'debt_to_income_ratio', 'credit_score', 'loan_amount', 'interest_rate']
[2025-12-22 15:53:54,072]: INFO: Transformed train data saved at: artifacts/data_transformation\train_transformed.csv
[2025-12-22 15:53:54,073]: INFO: Trans