In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier

#  Detect & Remove Outliers using IQR

In [2]:
def remove_outliers_iqr(data, cols, factor=1.5):
    """Drop rows whose values lie outside the IQR fences of the given columns.

    Columns are processed one after another in the order given. The quartiles
    of each column are computed on the rows that survived the previous
    columns, so the result can depend on the order of ``cols``.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame. It is never modified.
    cols : iterable of str
        Names of the numeric columns to filter on.
    factor : float, default 1.5
        Multiplier applied to the IQR to build the fences
        ``[Q1 - factor * IQR, Q3 + factor * IQR]`` (both ends inclusive).

    Returns
    -------
    pandas.DataFrame
        A new, independent frame holding only the rows that lie inside the
        fences of every listed column. Original index labels are kept.
        Rows with NaN in a filtered column are dropped, because NaN never
        satisfies the range check.

    Raises
    ------
    KeyError
        If a name in ``cols`` is not a column of ``data``.
    """
    cleaned_data = data.copy()
    for col in cols:
        q1 = cleaned_data[col].quantile(0.25)
        q3 = cleaned_data[col].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - factor * iqr
        upper_bound = q3 + factor * iqr
        # Keep only rows inside the inclusive [lower_bound, upper_bound] range
        inside_fences = cleaned_data[col].between(lower_bound, upper_bound)
        cleaned_data = cleaned_data[inside_fences]
    # Hand back an explicit copy: callers assign new columns into the result,
    # and on pandas versions without copy-on-write a boolean-filtered frame
    # can trigger SettingWithCopyWarning on such assignments.
    return cleaned_data.copy()


# Load dataset.
# The headers are whitespace-stripped BEFORE dropping "loan_id" so that the
# drop (and every later column lookup) works on exact names, whether or not
# a header in the CSV carries stray leading/trailing spaces.
df = (
    pd.read_csv("loan_approval_dataset.csv")
    .rename(columns=str.strip)
    .drop(columns="loan_id")
)

# Numerical columns that are checked for outliers
numerical_cols = ['cibil_score', 'income_annum', 'loan_term', 'loan_amount', 'residential_assets_value', 'commercial_assets_value', 'luxury_assets_value', 'bank_asset_value']

# Apply outlier removal; df itself is left untouched so the two shapes can be compared
df_clean = remove_outliers_iqr(df, numerical_cols)

print(f"Original shape: {df.shape}, After outlier removal: {df_clean.shape}")


Original shape: (4269, 12), After outlier removal: (4173, 12)


# Encode Categorical Variables

In [3]:
categorical_cols = ['education','self_employed','loan_status']

# One fitted encoder is kept per column. Reusing a single LabelEncoder would
# refit it on every column and keep only the last column's label <-> code
# mapping, making the other columns impossible to decode later.
# LabelEncoder numbers the classes in sorted order of their string labels;
# inspect label_encoders[col].classes_ to see which label each code stands for.
label_encoders = {}

# Defined up front so `encoder` exists even if none of the columns is present;
# after the loop it refers to the encoder of the last column that was encoded.
encoder = LabelEncoder()
for col in categorical_cols:
    if col in df_clean.columns:
        encoder = LabelEncoder()
        df_clean[col] = encoder.fit_transform(df_clean[col].astype(str))
        label_encoders[col] = encoder

# Feature Scaling

In [4]:
# Standardise the numerical columns: fit the scaler on them and write the
# scaled values back into the same columns of df_clean.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df_clean[numerical_cols])
df_clean[numerical_cols] = scaled_values


# Feature Importance

In [5]:
# Rank the features by Random Forest importance, using loan_status as the target.
if 'loan_status' not in df_clean.columns:
    print(" Target column 'loan_status' not found. Cannot compute feature importance.")
else:
    # Every column except the target is used as a predictor
    y = df_clean['loan_status']
    X = df_clean.drop(columns='loan_status')

    # Fixed seed so the fitted forest (and its importances) are reproducible
    model = RandomForestClassifier(random_state=42)
    model.fit(X, y)
    importances = model.feature_importances_

    # (feature name, importance) pairs, most important first
    feature_importance = sorted(
        zip(X.columns, importances),
        key=lambda pair: pair[1],
        reverse=True,
    )

    print("\n Feature Importance (Random Forest):")
    for feat, imp in feature_importance:
        print(f"{feat}: {imp:.4f}")


 Feature Importance (Random Forest):
cibil_score: 0.8110
loan_term: 0.0610
loan_amount: 0.0298
luxury_assets_value: 0.0189
income_annum: 0.0187
residential_assets_value: 0.0183
commercial_assets_value: 0.0159
bank_asset_value: 0.0142
no_of_dependents: 0.0076
self_employed: 0.0023
education: 0.0023


# Class-Imbalance

In [6]:
from collections import Counter
from imblearn.over_sampling import SMOTE

# Name of the label column (change it if the dataset uses another name,
# e.g. 'approval_status').
target_col = "loan_status"

# Show how many rows each class has before any balancing
print("\nClass distribution before balancing:")
class_counts = df_clean[target_col].value_counts()
print(class_counts)

# Sizes of the largest and the smallest class
majority = class_counts.max()
minority = class_counts.min()

# The data counts as imbalanced when the smallest class has fewer than 40%
# of the rows of the largest class.
is_imbalanced = minority / majority < 0.4

if not is_imbalanced:
    print("\n Dataset is already balanced. No action taken.")
    df_balanced = df_clean.copy()
else:
    print("\n Imbalance detected! Applying SMOTE oversampling...")

    # Split into predictors and label for the resampler
    y = df_clean[target_col]
    X = df_clean.drop(columns=[target_col])

    # Fixed seed so the synthetic samples are reproducible
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X, y)

    # Put predictors and label back together in one balanced DataFrame
    features_part = pd.DataFrame(X_resampled, columns=X.columns)
    target_part = pd.DataFrame(y_resampled, columns=[target_col])
    df_balanced = pd.concat([features_part, target_part], axis=1)

    print("\nClass distribution after balancing:")
    print(df_balanced[target_col].value_counts())


Class distribution before balancing:
loan_status
0    2599
1    1574
Name: count, dtype: int64

 Dataset is already balanced. No action taken.


In [7]:
# Save the final preprocessed dataset to CSV.
# df_balanced is the output of the class-balancing cell: the SMOTE-resampled
# frame when imbalance was detected, otherwise a copy of df_clean. Writing
# df_clean here would silently discard any balancing that was applied.
# NOTE(review): if the saved file is split into train/test downstream,
# oversampled rows would end up in both parts - confirm the intended use.
df_balanced.to_csv("loan_preprocess_dataset.csv", index=False)

print("✅ Preprocessed dataset saved as 'loan_preprocess_dataset.csv'")


✅ Preprocessed dataset saved as 'loan_preprocess_dataset.csv'
