In [1]:
import os
import pandas as pd

In [2]:
%pwd

'c:\\Users\\ASUS\\Desktop\\loan-pay-back\\research'

In [3]:
# Move from the notebook folder ("research") up to the project root so the
# relative paths used later (e.g. "artifacts/raw_data/test.csv") resolve.
# Guarded on the current folder name so that re-running this cell does not
# climb one directory higher on every execution.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [4]:
%pwd

'c:\\Users\\ASUS\\Desktop\\loan-pay-back'

In [5]:
# Load the raw test split for a quick manual inspection. The path is relative,
# so this only works when the working directory is the project root.
test=pd.read_csv("artifacts/raw_data/test.csv")

In [6]:
test.head(10)

Unnamed: 0,id,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,gender,marital_status,education_level,employment_status,loan_purpose,grade_subgrade
0,593994,28781.05,0.049,626,11461.42,14.73,Female,Single,High School,Employed,Other,D5
1,593995,46626.39,0.093,732,15492.25,12.85,Female,Married,Master's,Employed,Other,C1
2,593996,54954.89,0.367,611,3796.41,13.29,Male,Single,Bachelor's,Employed,Debt consolidation,D1
3,593997,25644.63,0.11,671,6574.3,9.57,Female,Single,Bachelor's,Employed,Debt consolidation,C3
4,593998,25169.64,0.081,688,17696.89,12.8,Female,Married,PhD,Employed,Business,C1
5,593999,45302.9,0.06,675,8106.78,13.74,Female,Married,High School,Employed,Vacation,C3
6,594000,27676.47,0.061,714,8242.26,13.87,Female,Single,High School,Employed,Debt consolidation,C4
7,594001,38216.91,0.095,719,3765.5,15.1,Male,Single,High School,Employed,Other,C5
8,594002,25650.59,0.101,664,20310.64,11.74,Male,Single,High School,Employed,Education,D4
9,594003,62497.03,0.207,651,5177.58,13.9,Female,Divorced,High School,Unemployed,Car,D2


In [7]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 254569 entries, 0 to 254568
Data columns (total 12 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    254569 non-null  int64  
 1   annual_income         254569 non-null  float64
 2   debt_to_income_ratio  254569 non-null  float64
 3   credit_score          254569 non-null  int64  
 4   loan_amount           254569 non-null  float64
 5   interest_rate         254569 non-null  float64
 6   gender                254569 non-null  object 
 7   marital_status        254569 non-null  object 
 8   education_level       254569 non-null  object 
 9   employment_status     254569 non-null  object 
 10  loan_purpose          254569 non-null  object 
 11  grade_subgrade        254569 non-null  object 
dtypes: float64(4), int64(2), object(6)
memory usage: 23.3+ MB


In [8]:
from dataclasses import dataclass
from pathlib import Path

In [22]:
@dataclass
class PredictionConfig:
    """Settings consumed by the batch-prediction stage.

    Instances are built by ``ConfigurationManager.get_prediction_config`` from
    the ``prediction`` section of the loaded configuration.
    """

    # Directory created (if missing) before predictions are written.
    root_dir: str
    # Path handed to joblib.load for the model; despite the name this is used
    # as a file path, not a directory.
    model_dir: str
    # CSV file read with pandas.read_csv as the prediction input.
    input_file: str
    # CSV file the predictions are written to.
    output_file: str
    # Path handed to joblib.load for the preprocessing pipeline.
    pipeline_path: str

In [27]:
from src.loan_payment_prediction.constants import*
from src.loan_payment_prediction.utils.common import read_yaml, create_directories, save_json  

In [24]:
class ConfigurationManager:
    """Loads the project configuration and builds per-stage config objects."""

    def __init__(self, config_filepath=CONFIG_FILE_PATH):
        """Read the configuration file and prepare the artifacts root.

        Args:
            config_filepath: Path passed to ``read_yaml``; defaults to the
                ``CONFIG_FILE_PATH`` constant.
        """
        loaded = read_yaml(config_filepath)
        self.config = loaded
        create_directories([loaded.artifacts_root])

    def get_prediction_config(self) -> PredictionConfig:
        """Build the ``PredictionConfig`` from the ``prediction`` section.

        Calls ``create_directories`` on the section's ``root_dir`` before
        returning.

        Returns:
            A ``PredictionConfig`` whose fields are copied one-to-one from the
            ``prediction`` section of the loaded configuration.
        """
        section = self.config.prediction
        create_directories([section.root_dir])

        settings = {
            "root_dir": section.root_dir,
            "model_dir": section.model_dir,
            "input_file": section.input_file,
            "output_file": section.output_file,
            "pipeline_path": section.pipeline_path,
        }
        return PredictionConfig(**settings)
    


In [29]:
import joblib
import logging

In [32]:
import os
import pandas as pd
import joblib
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class PredictionPipeline:
    """Batch inference: load artifacts, read the input CSV, transform, predict, save.

    The ``config`` object must expose ``root_dir``, ``model_dir``,
    ``input_file``, ``output_file`` and ``pipeline_path`` attributes.
    ``run()`` executes all steps in order; the individual steps can also be
    called one by one for inspection.
    """

    # The annotation is a string so defining this class does not require the
    # PredictionConfig cell to have been executed first.
    def __init__(self, config: "PredictionConfig"):
        self.config = config
        # Filled in by load_artifacts(); None means "not loaded yet".
        self.model = None
        self.pipeline = None

    def load_artifacts(self):
        """Load the trained model and the fitted preprocessing pipeline.

        Raises:
            FileNotFoundError: If either artifact path is not an existing file.
        """
        logger.info("Loading saved model and preprocessing pipeline...")

        # isfile rather than exists: a directory at either path would pass an
        # existence check and then fail inside joblib with an unrelated error.
        if not os.path.isfile(self.config.model_dir):
            raise FileNotFoundError(f"Model not found at {self.config.model_dir}")

        if not os.path.isfile(self.config.pipeline_path):
            raise FileNotFoundError(f"Pipeline not found at {self.config.pipeline_path}")

        # NOTE(security): joblib.load is pickle-based and can execute arbitrary
        # code while loading. Only point these paths at trusted artifacts.
        self.model = joblib.load(self.config.model_dir)
        self.pipeline = joblib.load(self.config.pipeline_path)

        logger.info(f"Model loaded from: {self.config.model_dir}")
        logger.info(f"Pipeline loaded from: {self.config.pipeline_path}")

    def load_input_data(self):
        """Read the input CSV named by ``config.input_file``.

        Returns:
            The file contents as a ``pandas.DataFrame``.

        Raises:
            FileNotFoundError: If ``config.input_file`` is not an existing file.
        """
        if not os.path.isfile(self.config.input_file):
            raise FileNotFoundError(f"Input file not found at {self.config.input_file}")

        logger.info(f"Loading test data from {self.config.input_file}")
        df = pd.read_csv(self.config.input_file)

        logger.info(f"Test data shape: {df.shape}")
        return df

    def transform_data(self, df: pd.DataFrame):
        """Apply the loaded preprocessing pipeline to ``df``.

        Args:
            df: Raw input frame as returned by ``load_input_data``.

        Returns:
            Whatever ``pipeline.transform`` returns (logged via its ``.shape``).

        Raises:
            RuntimeError: If ``load_artifacts`` has not been called yet.
        """
        if getattr(self, "pipeline", None) is None:
            raise RuntimeError(
                "Preprocessing pipeline is not loaded; call load_artifacts() first."
            )

        logger.info("Applying saved transformations to test data...")

        # Per the original author's note this step performs one-hot encoding
        # and scaling; the actual steps live in the saved pipeline object.
        transformed_array = self.pipeline.transform(df)

        logger.info(f"Transformed test data shape: {transformed_array.shape}")
        return transformed_array

    def predict(self, transformed_data):
        """Run the loaded model on already-transformed features.

        Args:
            transformed_data: Output of ``transform_data``.

        Returns:
            The result of ``model.predict(transformed_data)``.

        Raises:
            RuntimeError: If ``load_artifacts`` has not been called yet.
        """
        if getattr(self, "model", None) is None:
            raise RuntimeError("Model is not loaded; call load_artifacts() first.")

        logger.info("Generating predictions...")
        predictions = self.model.predict(transformed_data)
        return predictions

    def save_predictions(self, predictions, df):
        """Write ``id`` / ``loan_paid_back`` pairs to ``config.output_file``.

        Args:
            predictions: One prediction per row of ``df``.
            df: The raw input frame; must contain an ``id`` column.

        Raises:
            KeyError: If ``df`` has no ``id`` column.
        """
        if "id" not in df.columns:
            raise KeyError("Input data must contain an 'id' column to save predictions")

        os.makedirs(self.config.root_dir, exist_ok=True)

        # output_file is configured independently of root_dir, so make sure its
        # own parent directory exists too; otherwise to_csv fails.
        output_parent = os.path.dirname(self.config.output_file)
        if output_parent:
            os.makedirs(output_parent, exist_ok=True)

        result_df = pd.DataFrame({
            "id": df["id"].values,
            "loan_paid_back": predictions,
        })

        result_df.to_csv(self.config.output_file, index=False)
        logger.info(f"Predictions saved to {self.config.output_file}")

    def run(self):
        """Execute the full pipeline and return the predictions.

        Steps: load artifacts, read the input CSV, transform it, predict, and
        save the predictions to ``config.output_file``.
        """
        self.load_artifacts()
        df = self.load_input_data()
        transformed_data = self.transform_data(df)
        predictions = self.predict(transformed_data)
        self.save_predictions(predictions, df)

        return predictions


In [33]:
# Build the prediction config, run the whole pipeline, and surface failures
# both in the log (with traceback) and as a raised exception.
try:
    config_manager = ConfigurationManager()
    config = config_manager.get_prediction_config()

    pipeline = PredictionPipeline(config)
    predictions = pipeline.run()
    print("Done! Check predicted_output.csv")

except Exception:
    logger.exception("Error during prediction pipeline execution")
    # Bare raise re-raises the active exception with its original traceback.
    raise


[2025-12-29 13:41:48,570]: INFO: YAML file config\config.yaml loaded successfully.
[2025-12-29 13:41:48,603]: INFO: Directory created at: artifacts
[2025-12-29 13:41:48,612]: INFO: Directory created at: artifacts/prediction
[2025-12-29 13:41:48,629]: INFO: Loading saved model and preprocessing pipeline...
[2025-12-29 13:41:48,695]: INFO: Model loaded from: artifacts/model_trainer/model.joblib
[2025-12-29 13:41:48,697]: INFO: Pipeline loaded from: artifacts/data_transformation/pipeline.joblib
[2025-12-29 13:41:48,700]: INFO: Loading test data from artifacts/raw_data/test.csv
[2025-12-29 13:41:49,451]: INFO: Test data shape: (254569, 12)
[2025-12-29 13:41:49,452]: INFO: Applying saved transformations to test data...
[2025-12-29 13:41:50,458]: INFO: Transformed test data shape: (254569, 61)
[2025-12-29 13:41:50,461]: INFO: Generating predictions...




[2025-12-29 13:41:51,454]: INFO: Predictions saved to artifacts/prediction/predicted_output.csv
Done! Check predicted_output.csv
