<a href="https://colab.research.google.com/github/Zuhair0000/Loan_Approval_Prediction/blob/main/loan_approval_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Import Libraries**

In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# **Load Dataset**

In [25]:
df = pd.read_csv("loan_approval_dataset.csv")
df.columns = df.columns.str.strip()

In [26]:
df.isnull().sum()

Unnamed: 0,0
loan_id,0
no_of_dependents,0
education,0
self_employed,0
income_annum,0
loan_amount,0
loan_term,0
cibil_score,0
residential_assets_value,0
commercial_assets_value,0


In [27]:
df.columns

Index(['loan_id', 'no_of_dependents', 'education', 'self_employed',
       'income_annum', 'loan_amount', 'loan_term', 'cibil_score',
       'residential_assets_value', 'commercial_assets_value',
       'luxury_assets_value', 'bank_asset_value', 'loan_status'],
      dtype='object')

In [28]:
X = df.drop(columns=["loan_id", "loan_status"])
y = df["loan_status"]

# **Train-Test Split**

In [29]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# **Data Preprocessing**

In [30]:
categorical_features = ["education", "self_employed" ]

numerical_features = ["no_of_dependents", "income_annum", "loan_amount",
                      "loan_term", "cibil_score", "residential_assets_value",
                      "commercial_assets_value", "luxury_assets_value", "bank_asset_value"]

In [31]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [32]:
categorical_enc = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown='ignore'))
])

numerical_enc = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

preprocessor = ColumnTransformer(transformers=[
    ("cat", categorical_enc, categorical_features),
    ("num", numerical_enc, numerical_features)
])

In [33]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

# **Model Training**

### **Logistic Regression**

In [34]:
from sklearn.linear_model import LogisticRegression

lr = Pipeline([
    ("preprocessing", preprocessor),
    ("model", LogisticRegression(random_state=42))
])
lr.fit(X_train, y_train)

In [35]:
lr_pred = lr.predict(X_test)

### **Random Forest**

In [36]:
from sklearn.ensemble import RandomForestClassifier
rf = Pipeline([
    ("preprocessing", preprocessor),
    ("model", RandomForestClassifier(n_estimators=200, criterion='entropy', random_state=42))
])
rf.fit(X_train, y_train)

In [37]:
rf_pred = rf.predict(X_test)

In [38]:
rf_model = rf.named_steps["model"]

In [39]:
rf_importances = rf_model.feature_importances_

In [40]:
rf_feature_names = rf.named_steps["preprocessing"].get_feature_names_out()

In [41]:
rf_importance_df = pd.DataFrame({
    "Features": rf_feature_names,
    "Importance": rf_importances
}).sort_values(by="Importance", ascending=False)

In [42]:
rf_importance_df

Unnamed: 0,Features,Importance
8,num__cibil_score,0.768493
7,num__loan_term,0.0769
6,num__loan_amount,0.031837
9,num__residential_assets_value,0.023792
11,num__luxury_assets_value,0.02125
5,num__income_annum,0.021206
10,num__commercial_assets_value,0.019877
12,num__bank_asset_value,0.018333
4,num__no_of_dependents,0.009558
2,cat__self_employed_ No,0.00235


### **XGB model**

In [43]:
from xgboost import XGBClassifier

xgb = Pipeline([
    ("preprocessing", preprocessor),
    ("model", XGBClassifier(
        n_estimators=300,
        max_depth=5,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        eval_metric="logloss",
        random_state=42
    ))
])

xgb.fit(X_train, y_train)

In [44]:
xgb_pred = xgb.predict(X_test)

In [45]:
xgb_model = rf.named_steps["model"]

In [46]:
xgb_importances = rf_model.feature_importances_

In [47]:
xgb_feature_names = rf.named_steps["preprocessing"].get_feature_names_out()

In [48]:
xgb_importance_df = pd.DataFrame({
    "Features": xgb_feature_names,
    "Importance": xgb_importances
}).sort_values(by="Importance", ascending=False)

In [49]:
xgb_importance_df

Unnamed: 0,Features,Importance
8,num__cibil_score,0.768493
7,num__loan_term,0.0769
6,num__loan_amount,0.031837
9,num__residential_assets_value,0.023792
11,num__luxury_assets_value,0.02125
5,num__income_annum,0.021206
10,num__commercial_assets_value,0.019877
12,num__bank_asset_value,0.018333
4,num__no_of_dependents,0.009558
2,cat__self_employed_ No,0.00235


# **Evaluation & Comparison**

In [50]:
from sklearn.metrics import(
    accuracy_score,
    f1_score,
    confusion_matrix,
    classification_report,
    roc_auc_score
    )

In [51]:
def evaluate_model(model, X_test,y_pred, y_test):
    y_proba = model.predict_proba(X_test)[:, 1]
    return{
        "Accuracy Score": accuracy_score(y_test, y_pred),
        "F1_score": f1_score(y_test, y_pred),
        "ROC-AUC": roc_auc_score(y_test, y_proba),
        "Confusion": confusion_matrix(y_test, y_pred)
    }

In [52]:
lr_result = evaluate_model(lr, X_test, lr_pred, y_test)
lr_result

{'Accuracy Score': 0.905152224824356,
 'F1_score': 0.8716323296354992,
 'ROC-AUC': np.float64(0.9674915516755843),
 'Confusion': array([[498,  38],
        [ 43, 275]])}

In [53]:
rf_result = evaluate_model(rf, X_test, rf_pred, y_test)
rf_result

{'Accuracy Score': 0.9800936768149883,
 'F1_score': 0.9732283464566929,
 'ROC-AUC': np.float64(0.9987474185675397),
 'Confusion': array([[528,   8],
        [  9, 309]])}

In [54]:
xgb_result = evaluate_model(xgb, X_test, xgb_pred, y_test)
xgb_result

{'Accuracy Score': 0.9789227166276346,
 'F1_score': 0.9715189873417721,
 'ROC-AUC': np.float64(0.9983279357927344),
 'Confusion': array([[529,   7],
        [ 11, 307]])}

In [55]:
results_df = pd.DataFrame({
    "Logistic Regression": lr_result,
    "Random Forest": rf_result,
    "Gradient Boosting": xgb_result
}).T

results_df

Unnamed: 0,Accuracy Score,F1_score,ROC-AUC,Confusion
Logistic Regression,0.905152,0.871632,0.967492,"[[498, 38], [43, 275]]"
Random Forest,0.980094,0.973228,0.998747,"[[528, 8], [9, 309]]"
Gradient Boosting,0.978923,0.971519,0.998328,"[[529, 7], [11, 307]]"


In [56]:
import joblib
joblib.dump(rf, "loan_approval_model.pkl")

['loan_approval_model.pkl']