In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingClassifier, BaggingClassifier, GradientBoostingClassifier, RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, classification_report

In [3]:
file_path = r"C:\Users\User\Downloads\loan_prediction.csv"
df = pd.read_csv(file_path)

print(df.head())
print(df.info())
print("Number of Missing Data:\n", df.isnull().sum())

    Loan_ID Gender Married Dependents     Education Self_Employed  \
0  LP001002   Male      No          0      Graduate            No   
1  LP001003   Male     Yes          1      Graduate            No   
2  LP001005   Male     Yes          0      Graduate           Yes   
3  LP001006   Male     Yes          0  Not Graduate            No   
4  LP001008   Male      No          0      Graduate            No   

   ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
0             5849                0.0         NaN             360.0   
1             4583             1508.0       128.0             360.0   
2             3000                0.0        66.0             360.0   
3             2583             2358.0       120.0             360.0   
4             6000                0.0       141.0             360.0   

   Credit_History Property_Area Loan_Status  
0             1.0         Urban           Y  
1             1.0         Rural           N  
2             1.0   

In [5]:
df

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


In [7]:
df = df.copy()

df.update(df[["Gender", "Married", "Education", "Self_Employed", "Property_Area"]].fillna(df.mode().iloc[0]))

df["LoanAmount"] = df["LoanAmount"].fillna(df["LoanAmount"].median())
df["Loan_Amount_Term"] = df["Loan_Amount_Term"].fillna(df["Loan_Amount_Term"].median())
df["Credit_History"] = df["Credit_History"].fillna(df["Credit_History"].mode()[0])

for col in ["Gender", "Married", "Education", "Self_Employed", "Property_Area", "Loan_Status"]:
    df[col] = LabelEncoder().fit_transform(df[col])

X = df.drop(columns=["Loan_ID", "Loan_Status"])
y = df["Loan_Status"]

y = LabelEncoder().fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=11)

print("Training Set Size:", X_train.shape)
print("Test Set Size:", X_test.shape)

Training Set Size: (491, 11)
Test Set Size: (123, 11)


In [11]:
X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test)

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"{name}: {acc:.4f}")

Random Forest (Bagging): 0.8537
Gradient Boosting: 0.8374
XGBoost: 0.8130
LightGBM: 0.8049
CatBoost: 0.8374
Voting Classifier: 0.8211


In [13]:
rf = RandomForestClassifier(n_estimators=100, random_state=11)
gb = GradientBoostingClassifier(n_estimators=100, random_state=11)
xgb = XGBClassifier(n_estimators=100, eval_metric="logloss", random_state=11)
lgbm = LGBMClassifier(n_estimators=100, random_state=11, verbose=-1)
catboost = CatBoostClassifier(iterations=100, depth=6, learning_rate=0.1, verbose=0, random_state=11)
bagging = BaggingClassifier(estimator=rf, n_estimators=10, random_state=11)
voting = VotingClassifier(estimators=[
    ('rf', rf), ('gb', gb), ('xgb', xgb), ('lgbm', lgbm), ('catboost', catboost)
], voting='hard')

models = {
    "Random Forest (Bagging)": bagging,
    "Gradient Boosting": gb,
    "XGBoost": xgb,
    "LightGBM": lgbm,
    "CatBoost": catboost,
    "Voting Classifier": voting
}

results = []
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    results.append({"Model": name, "Accuracy": acc})
    print(f"\n {name} The results:\n")
    print(classification_report(y_test, y_pred))

results_df = pd.DataFrame(results).sort_values(by="Accuracy", ascending=False)
print("\n Comparison of All Models \n")
print(results_df)


 Random Forest (Bagging) The results:

              precision    recall  f1-score   support

           0       0.92      0.60      0.73        40
           1       0.84      0.98      0.90        83

    accuracy                           0.85       123
   macro avg       0.88      0.79      0.81       123
weighted avg       0.86      0.85      0.84       123


 Gradient Boosting The results:

              precision    recall  f1-score   support

           0       0.86      0.60      0.71        40
           1       0.83      0.95      0.89        83

    accuracy                           0.84       123
   macro avg       0.84      0.78      0.80       123
weighted avg       0.84      0.84      0.83       123


 XGBoost The results:

              precision    recall  f1-score   support

           0       0.77      0.60      0.68        40
           1       0.83      0.92      0.87        83

    accuracy                           0.81       123
   macro avg       0.80      0