# MODEL 2 FEATURES ENGINEERING : 

In [33]:
# ==============================================
# Model 2 Feature Engineering â€“ Loan Amount Regression with Log-Transform
# Corrected version
# ==============================================

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Step 1: Load dataset
df = pd.read_csv("D:/LOAN APP PROJECT/data/cleaned_loan_approval_dataset.csv")

# Step 2: Filter only approved loans
df_approved = df[df['loan_status'] == 'approved'].copy()

# Step 3: Define target and feature columns
target = 'loan_amount'

# Exclude target and loan_status from features
feature_cols = df_approved.drop(columns=[target, 'loan_status', 'loan_id']).columns.tolist()

# Step 4: Create engineered features
# Total assets
asset_cols = ['residential_assets_value_clean', 'commercial_assets_value',
              'luxury_assets_value', 'bank_asset_value']
df_approved['total_assets'] = df_approved[asset_cols].sum(axis=1)

# Add engineered column to feature_cols
feature_cols.append('total_assets')

# Identify numeric and categorical features
numeric_cols = df_approved[feature_cols].select_dtypes(include=np.number).columns.tolist()
categorical_cols = df_approved[feature_cols].select_dtypes(include='object').columns.tolist()

# Step 5: Preprocessing pipelines
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)

# Step 6: Fit transform features
X = preprocessor.fit_transform(df_approved[feature_cols])

# Step 7: Log-transform target
y_original = df_approved[target].values
y_log = np.log1p(y_original)  # log(loan_amount + 1)

# Step 8: Convert back to DataFrame (optional)
cat_features = preprocessor.named_transformers_['cat']['encoder'].get_feature_names_out(categorical_cols)
all_features = numeric_cols + cat_features.tolist()
df_model2_features = pd.DataFrame(X, columns=all_features)

# Save both original and log-transformed target
df_model2_features['loan_amount'] = y_original
df_model2_features['loan_amount_log'] = y_log

# Step 9: Save dataset
df_model2_features.to_csv("D:/LOAN APP PROJECT/data/df_model2_features.csv", index=False)
df_model2_features.to_csv("df_model2_features.csv", index=False)
print("Model 2 feature dataset with log-transformed target saved successfully!")


Model 2 feature dataset with log-transformed target saved successfully!


In [38]:
df_model2_features = pd.read_csv("df_model2_features.csv")
df_model2_features = df_model2_features.drop(columns=['Unnamed: 0'], errors='ignore')

In [39]:
print(df_model2_features.shape)
df_model2_features.head(2)

(12559, 22)


Unnamed: 0,no_of_dependents,income_annum,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,residential_assets_value_clean,total_assets,...,self_employed_no,self_employed_yes,income_bin_high,income_bin_low,income_bin_mid,loan_bin_high,loan_bin_low,loan_bin_mid,loan_amount,loan_amount_log
0,-0.290982,1.583968,0.20908,0.73364,-0.771562,2.919512,0.848391,0.937558,-0.79039,0.953694,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,29900000,17.213369
1,1.480728,1.265503,-1.17695,0.093466,2.352682,2.264235,1.554831,-0.197584,2.364595,2.001098,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,33000000,17.312018


# MODEL 2 : MODEL DEVELOPMENT CODE : 

In [54]:
# ==============================================
# Model 2 Development â€“ Loan Amount Regression
# Models: RandomForestRegressor + LinearRegression
# Dataset: df_model2_features.csv
# ==============================================

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

# -----------------------------
# Step 1: Load Model 2 features
# -----------------------------
df = pd.read_csv("D:/LOAN APP PROJECT/data/df_model2_features.csv")
print("âœ… Dataset shape:", df.shape)

# -----------------------------
# Step 2: Prepare features (X) and target (y)
# -----------------------------
drop_cols = ['loan_amount', 'loan_amount_log', 'loan_id']  # drop identifiers + targets
X = df.drop(columns=[col for col in drop_cols if col in df.columns])
y = df['loan_amount_log']  # target: log-transformed loan_amount

print("âœ… Features shape:", X.shape)
print("âœ… Target shape:", y.shape)

# -----------------------------
# Step 3: Train-Test Split
# -----------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print("âœ… Train set:", X_train.shape)
print("âœ… Test set:", X_test.shape)

# -----------------------------
# Step 4A: Train Random Forest Model
# -----------------------------
rf_model = RandomForestRegressor(
    n_estimators=200,
    max_depth=10,
    random_state=42,
    n_jobs=-1
)
rf_model.fit(X_train, y_train)
print("âœ… RandomForestRegressor trained.")

# -----------------------------
# Step 4B: Train Linear Regression Model
# -----------------------------
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
print("âœ… LinearRegression model trained.")

# -----------------------------
# Step 5: Predict and Evaluate Both Models
# -----------------------------
def evaluate_model(name, model, X_test, y_test):
    y_pred_log = model.predict(X_test)
    y_pred_rup = np.expm1(y_pred_log)
    y_test_rup = np.expm1(y_test)

    r2_log = r2_score(y_test, y_pred_log)
    mae_log = mean_absolute_error(y_test, y_pred_log)
    rmse_log = np.sqrt(mean_squared_error(y_test, y_pred_log))

    r2_rup = r2_score(y_test_rup, y_pred_rup)
    mae_rup = mean_absolute_error(y_test_rup, y_pred_rup)
    rmse_rup = np.sqrt(mean_squared_error(y_test_rup, y_pred_rup))

    print(f"\nðŸ“Š {name} â€” Evaluation on Log Scale")
    print(f"RÂ²: {r2_log:.4f}, MAE: {mae_log:.4f}, RMSE: {rmse_log:.4f}")

    print(f"\nðŸ’° {name} â€” Evaluation on Original Scale (â‚¹)")
    print(f"RÂ²: {r2_rup:.4f}, MAE: â‚¹{mae_rup:,.2f}, RMSE: â‚¹{rmse_rup:,.2f}")

evaluate_model("Random Forest", rf_model, X_test, y_test)
evaluate_model("Linear Regression", lr_model, X_test, y_test)


# Evaluate both models
print()
print("Evaluating models...")

# Evaluate Random Forest
r2_rf = r2_score(y_test, rf_model.predict(X_test))
print(f"Random Forest R2: {r2_rf:.4f}")

# Evaluate Linear Regression
r2_lr = r2_score(y_test, lr_model.predict(X_test))
print(f"Linear Regression R2: {r2_lr:.4f}")

# Save only the best model based on R2
if r2_rf >= r2_lr:
    joblib.dump(rf_model, "model2_best_regressor.pkl")
    print()
    print("âœ… Random Forest saved as best model.")
else:
    joblib.dump(lr_model, "model2_best_regressor.pkl")
    print()
    print("âœ… Linear Regression saved as best model.")

# Save feature names always
joblib.dump(X.columns.tolist(), "model2_feature_names.pkl")
print("âœ… Feature names saved successfully.")


âœ… Dataset shape: (12559, 23)
âœ… Features shape: (12559, 21)
âœ… Target shape: (12559,)
âœ… Train set: (10047, 21)
âœ… Test set: (2512, 21)
âœ… RandomForestRegressor trained.
âœ… LinearRegression model trained.

ðŸ“Š Random Forest â€” Evaluation on Log Scale
RÂ²: 0.9104, MAE: 0.1783, RMSE: 0.2544

ðŸ’° Random Forest â€” Evaluation on Original Scale (â‚¹)
RÂ²: 0.9229, MAE: â‚¹1,953,723.18, RMSE: â‚¹2,531,925.72

ðŸ“Š Linear Regression â€” Evaluation on Log Scale
RÂ²: 0.8314, MAE: 0.2476, RMSE: 0.3491

ðŸ’° Linear Regression â€” Evaluation on Original Scale (â‚¹)
RÂ²: 0.8533, MAE: â‚¹2,673,830.57, RMSE: â‚¹3,492,879.31

Evaluating models...
Random Forest R2: 0.9104
Linear Regression R2: 0.8314

âœ… Random Forest saved as best model.
âœ… Feature names saved successfully.


# MODEL 2 : EVALUATION CODE :

In [56]:
# -----------------------------
# Evaluate Random Forest on Test Data
# -----------------------------
evaluate_random_forest(rf_model, X_test, y_test)

# -----------------------------
# : Save Model and Feature Names
# -----------------------------
print()
joblib.dump(rf_model, "model2_best_regressor.pkl")
print("âœ… Random Forest model saved in current directory.")

joblib.dump(X.columns.tolist(), "model2_feature_names.pkl")
print("âœ… Feature names saved in current directory.")



ðŸ“Š Random Forest â€” Evaluation on Log Scale
RÂ²: 0.9104, MAE: 0.1783, RMSE: 0.2544

ðŸ’° Random Forest â€” Evaluation on Original Scale (â‚¹)
RÂ²: 0.9229, MAE: â‚¹1,953,723.18, RMSE: â‚¹2,531,925.72

âœ… Random Forest model saved in current directory.
âœ… Feature names saved in current directory.
