In [2]:
import pandas as pd
import numpy as np

### Loading dataset

In [3]:
data = pd.read_csv("dataset/loan_approval_dataset.csv")
data.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


## Data Preprocessing

### Missing value

In [4]:
# Check for missing value
missing_values = data.isnull().sum()
print("Missing Values:\n", missing_values)

Missing Values:
 loan_id                     0
no_of_dependents            0
education                   0
self_employed               0
income_annum                0
loan_amount                 0
loan_term                   0
cibil_score                 0
residential_assets_value    0
commercial_assets_value     0
luxury_assets_value         0
bank_asset_value            0
loan_status                 0
dtype: int64


### Feature Engineering

In [5]:
# Feature Engineering
# Create new feature: total_assets_value
data['total_assets_value'] = data['residential_assets_value'] + data['commercial_assets_value'] + data['luxury_assets_value'] + data['bank_asset_value']
data.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status,total_assets_value
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved,50700000
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected,17000000
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected,57700000
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected,52700000
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected,55000000


In [6]:
data = data[['loan_id', 'no_of_dependents', 'education', 'self_employed',
    'income_annum', 'loan_amount', 'loan_term', 'cibil_score',
    'residential_assets_value', 'commercial_assets_value', 'luxury_assets_value',
    'bank_asset_value', 'total_assets_value', 'loan_status']]
data.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,total_assets_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,50700000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,17000000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,57700000,Rejected
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,52700000,Rejected
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,55000000,Rejected


### Encoding Categorical Feature

In [7]:
print(data['education'].unique())

[' Graduate' ' Not Graduate']


In [8]:
# Replace 'Graduate' with 1 and 'Not Graduate' with 0 in the 'education' column
data['education'] = data['education'].replace({' Graduate': 1, ' Not Graduate': 0})

# Replace 'Yes' with 1 and 'No' with 0 in the 'education' column
data['self_employed'] = data['self_employed'].replace({' Yes': 1, ' No': 0})

# Replace 'Approved' with 1 and 'Rejected' with 0 in the 'loan_status' column
data['loan_status'] = data['loan_status'].replace({' Approved' : 1, ' Rejected' : 0})

In [9]:
data.tail()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,total_assets_value,loan_status
4264,4265,5,1,1,1000000,2300000,12,317,2800000,500000,3300000,800000,7400000,0
4265,4266,0,0,1,3300000,11300000,20,559,4200000,2900000,11000000,1900000,20000000,1
4266,4267,2,0,0,6500000,23900000,18,457,1200000,12400000,18100000,7300000,39000000,0
4267,4268,1,0,0,4100000,12800000,8,780,8200000,700000,14100000,5800000,28800000,1
4268,4269,1,1,0,9200000,29700000,10,607,17800000,11800000,35700000,12000000,77300000,1


## Model Building - Random Forest

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [11]:
# Splitting the data into features and target variable
X = data.drop(["loan_id", "loan_status"], axis=1)
y = data["loan_status"]

In [12]:
# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [13]:
# Model Training
model_rf = RandomForestClassifier(random_state=42)
model_rf.fit(X_train, y_train)

## Model Evaluation - Random Forest

In [14]:
# Model Evaluation
y_pred = model_rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("\nAccuracy:", accuracy)
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.9789227166276346

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.96      0.97       163
           1       0.98      0.99      0.98       264

    accuracy                           0.98       427
   macro avg       0.98      0.98      0.98       427
weighted avg       0.98      0.98      0.98       427



## Model Saving - Random Forest

In [15]:
from joblib import dump

# Save the trained model to a file
dump(model_rf, 'trained_models\model_rf.joblib')

['model_rf.joblib']

## Model Building - SVM (Support Vector Machine)

In [16]:
from sklearn.svm import SVC

In [17]:
# Model Training with Support Vector Machine
model_svm = SVC(random_state=42)
model_svm.fit(X_train, y_train)

## Model Evaluation - SVM (Support Vector Machine)

In [18]:
# Model Evaluation
y_pred = model_svm.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("\nAccuracy:", accuracy)
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.6182669789227166

Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00       163
           1       0.62      1.00      0.76       264

    accuracy                           0.62       427
   macro avg       0.31      0.50      0.38       427
weighted avg       0.38      0.62      0.47       427



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Model Saving - SVM (Support Vector Machine)

In [19]:
from joblib import dump

# Save the trained model to a file
dump(model_svm, 'trained_models\model_svm.joblib')

['model_svm.joblib']

## Model Building - Gradient Boosting Classifier

In [20]:
from sklearn.ensemble import GradientBoostingClassifier

In [21]:
# Model Training with Gradient Boosting Classifier
model_gbc = GradientBoostingClassifier(random_state=42)
model_gbc.fit(X_train, y_train)

## Model Evaluation - Gradient Boosting Classifier

In [22]:
# Model Evaluation
y_pred = model_gbc.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("\nAccuracy:", accuracy)
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.9742388758782201

Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.96      0.97       163
           1       0.97      0.98      0.98       264

    accuracy                           0.97       427
   macro avg       0.97      0.97      0.97       427
weighted avg       0.97      0.97      0.97       427



## Model Saving - Gradient Boosting Classifier

In [23]:
from joblib import dump

# Save the trained model to a file
dump(model_gbc, 'trained_models\model_gbc.joblib')

['model_gbc.joblib']

## Model Building - CatBoost

In [25]:
pip install catboost

Collecting catboostNote: you may need to restart the kernel to use updated packages.

  Obtaining dependency information for catboost from https://files.pythonhosted.org/packages/e2/63/379617e3d982e8a66c9d66ebf4621d3357c7c18ad356473c335bffd5aba6/catboost-1.2.2-cp311-cp311-win_amd64.whl.metadata
  Downloading catboost-1.2.2-cp311-cp311-win_amd64.whl.metadata (1.2 kB)
Collecting graphviz (from catboost)
  Downloading graphviz-0.20.1-py3-none-any.whl (47 kB)
     ---------------------------------------- 0.0/47.0 kB ? eta -:--:--
     -------- ------------------------------- 10.2/47.0 kB ? eta -:--:--
     -------- ------------------------------- 10.2/47.0 kB ? eta -:--:--
     ---------------- --------------------- 20.5/47.0 kB 165.2 kB/s eta 0:00:01
     --------------------------------- ---- 41.0/47.0 kB 219.4 kB/s eta 0:00:01
     --------------------------------- ---- 41.0/47.0 kB 219.4 kB/s eta 0:00:01
     -------------------------------------- 47.0/47.0 kB 157.3 kB/s eta 0:00:00
Do

In [26]:
from catboost import CatBoostClassifier

In [30]:
# Initialize the CatBoostClassifier
model_catboost = CatBoostClassifier(iterations=1000, learning_rate=0.1, depth=6, random_seed=42)

# Train the model on the training data
model_catboost.fit(X_train, y_train, verbose=100)


0:	learn: 0.4894722	total: 4.76ms	remaining: 4.75s
100:	learn: 0.0197596	total: 456ms	remaining: 4.06s
200:	learn: 0.0091233	total: 928ms	remaining: 3.69s
300:	learn: 0.0064580	total: 1.4s	remaining: 3.26s
400:	learn: 0.0058052	total: 1.84s	remaining: 2.75s
500:	learn: 0.0053583	total: 2.25s	remaining: 2.24s
600:	learn: 0.0050203	total: 2.65s	remaining: 1.76s
700:	learn: 0.0045324	total: 3.05s	remaining: 1.3s
800:	learn: 0.0043207	total: 3.44s	remaining: 856ms
900:	learn: 0.0041414	total: 3.84s	remaining: 422ms
999:	learn: 0.0039400	total: 4.24s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x1a4070b3d10>

In [31]:
# Make predictions on the test data
catboost_predictions = model_catboost.predict(X_test)

# Optionally, you can also obtain predicted probabilities
catboost_probs = model_catboost.predict_proba(X_test)[:, 1]


## Model Evaluation - CatBoost

In [32]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Calculate accuracy
accuracy = accuracy_score(y_test, catboost_predictions)
print("Accuracy:", accuracy)

# Print classification report
print(classification_report(y_test, catboost_predictions))

# Print confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, catboost_predictions))


Accuracy: 0.9765807962529274
              precision    recall  f1-score   support

           0       0.97      0.97      0.97       163
           1       0.98      0.98      0.98       264

    accuracy                           0.98       427
   macro avg       0.98      0.98      0.98       427
weighted avg       0.98      0.98      0.98       427

Confusion Matrix:
[[158   5]
 [  5 259]]


## Model Saving - CatBoost

In [34]:
from joblib import dump

# Save the trained model to a file
dump(model_catboost, 'trained_models\model_catboost.joblib')

['trained_models\\model_catboost.joblib']

### Storing Test data to CSV

In [24]:
import pandas as pd

# Concatenate X_test and y_test horizontally
test_data = pd.concat([X_test, y_test], axis=1)

# Specify the file path where you want to save the CSV file
csv_file_path = 'test_data.csv'

# Store the test data DataFrame into a CSV file
test_data.to_csv(csv_file_path, index=False)
