In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix,f1_score
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression 
from xgboost import XGBClassifier
import pickle

In [27]:
data_frame = pd.read_csv(r'../dataset/loan_approval_dataset.csv')

In [28]:
data_frame.drop(['loan_id'],axis=1,inplace=True)

In [29]:
data_frame

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected
...,...,...,...,...,...,...,...,...,...,...,...,...
4264,5,Graduate,Yes,1000000,2300000,12,317,2800000,500000,3300000,800000,Rejected
4265,0,Not Graduate,Yes,3300000,11300000,20,559,4200000,2900000,11000000,1900000,Approved
4266,2,Not Graduate,No,6500000,23900000,18,457,1200000,12400000,18100000,7300000,Rejected
4267,1,Not Graduate,No,4100000,12800000,8,780,8200000,700000,14100000,5800000,Approved


In [30]:
encoder = LabelEncoder()
scaler = StandardScaler()
data_frame[' self_employed'] = encoder.fit_transform(data_frame[' self_employed'].str.lower())


In [31]:
data_frame[' education'] = encoder.fit_transform(data_frame[' education'].str.lower())
data_frame[' education']=data_frame[' education'].map({0:1,1:0})

In [32]:
data_frame[' loan_status'] = encoder.fit_transform(data_frame[' loan_status'].str.lower())
data_frame[' loan_status'] = data_frame[' loan_status'].map({0:1,1:0})

In [33]:
data_frame.columns=['no_of_dependents', 'education', 'self_employed',
       'income_annum', 'loan_amount', 'loan_term', 'cibil_score',
       'residential_assets_value', 'commercial_assets_value',
       ' luxury_assets_value', 'bank_asset_value', 'loan_status']

In [34]:
data_frame

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,2,1,0,9600000,29900000,12,778,2400000,17600000,22700000,8000000,1
1,0,0,1,4100000,12200000,8,417,2700000,2200000,8800000,3300000,0
2,3,1,0,9100000,29700000,20,506,7100000,4500000,33300000,12800000,0
3,3,1,0,8200000,30700000,8,467,18200000,3300000,23300000,7900000,0
4,5,0,1,9800000,24200000,20,382,12400000,8200000,29400000,5000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...
4264,5,1,1,1000000,2300000,12,317,2800000,500000,3300000,800000,0
4265,0,0,1,3300000,11300000,20,559,4200000,2900000,11000000,1900000,1
4266,2,0,0,6500000,23900000,18,457,1200000,12400000,18100000,7300000,0
4267,1,0,0,4100000,12800000,8,780,8200000,700000,14100000,5800000,1


In [65]:
data_frame['debt_to_income'] = data_frame['income_annum']/data_frame['loan_amount']
data_frame['income_ratio'] = (data_frame['income_annum']*data_frame['loan_term'])/data_frame['loan_amount']

In [66]:
x = data_frame.drop(['loan_status'],axis=1)
x

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,debt_to_income,income_ratio
0,2,1,0,9600000,29900000,12,778,2400000,17600000,22700000,8000000,0.321070,3.852843
1,0,0,1,4100000,12200000,8,417,2700000,2200000,8800000,3300000,0.336066,2.688525
2,3,1,0,9100000,29700000,20,506,7100000,4500000,33300000,12800000,0.306397,6.127946
3,3,1,0,8200000,30700000,8,467,18200000,3300000,23300000,7900000,0.267101,2.136808
4,5,0,1,9800000,24200000,20,382,12400000,8200000,29400000,5000000,0.404959,8.099174
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4264,5,1,1,1000000,2300000,12,317,2800000,500000,3300000,800000,0.434783,5.217391
4265,0,0,1,3300000,11300000,20,559,4200000,2900000,11000000,1900000,0.292035,5.840708
4266,2,0,0,6500000,23900000,18,457,1200000,12400000,18100000,7300000,0.271967,4.895397
4267,1,0,0,4100000,12800000,8,780,8200000,700000,14100000,5800000,0.320312,2.562500


In [67]:
print(data_frame.debt_to_income.values.tolist())

[0.3210702341137124, 0.3360655737704918, 0.3063973063973064, 0.2671009771986971, 0.4049586776859504, 0.35555555555555557, 0.2636363636363636, 0.38, 0.36363636363636365, 0.2558139534883721, 0.25892857142857145, 0.29515418502202645, 0.43103448275862066, 0.28888888888888886, 0.25675675675675674, 0.4392523364485981, 0.3125, 0.30851063829787234, 0.2621359223300971, 0.4315068493150685, 0.25773195876288657, 0.4142857142857143, 0.2529182879377432, 0.35714285714285715, 0.5, 0.3263157894736842, 0.2918149466192171, 0.42857142857142855, 0.2916666666666667, 0.2857142857142857, 0.38735177865612647, 0.475, 0.5, 0.38181818181818183, 0.4117647058823529, 0.5, 0.25806451612903225, 0.29411764705882354, 0.4675324675324675, 0.29411764705882354, 0.430939226519337, 0.3654618473895582, 0.4117647058823529, 0.34782608695652173, 0.48507462686567165, 0.28776978417266186, 0.2876712328767123, 0.25654450261780104, 0.2682926829268293, 0.4973821989528796, 0.2874015748031496, 0.3522267206477733, 0.5, 0.3652173913043478,

In [57]:
y = data_frame['loan_status'].values
y

array([1, 0, 0, ..., 0, 1, 1], dtype=int64)

In [58]:
x_train,x_test,y_train,y_test = train_test_split(x,y,train_size=0.8,random_state=41)

In [59]:
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled =scaler.transform(x_test)

# Using support Vector Machine

In [60]:
svm_model = SVC(kernel='linear', random_state=41)
svm_model.fit(x_train_scaled, y_train)
#print(f"Fitting score of the model is {svm_model.score(x_train_scaled,y)}")
#predicting from the model

y_pred = svm_model.predict(x_test_scaled)

accuracy = accuracy_score(y_test,y_pred)
print(f"accuracy score is {accuracy*100:.3f}%")

score= f1_score(y_test,y_pred)
print(f'f1 score is {score}')

accuracy score is 90.398%
f1 score is 0.9232209737827715


#using random forest classifier

In [61]:
models = {
    "Logistic Regression": LogisticRegression(random_state=41),
    "SVM": SVC(kernel='linear', random_state=41),
    "Random Forest": RandomForestClassifier(random_state=41),
    "XGBoost": XGBClassifier(random_state=41)
}

# Train and evaluate each model
for name, model in models.items():
    model.fit(x_train_scaled, y_train)
    y_pred = model.predict(x_test_scaled)
    accuracy = accuracy_score(y_test,y_pred)
    print(y_pred.shape)
    print(y_test.shape)
    f1 = f1_score(y_test,y_pred)
    print(f"{name} - Accuracy: {accuracy * 100:.2f}%, F1 Score: {f1:.2f}")

(854,)
(854,)
Logistic Regression - Accuracy: 90.87%, F1 Score: 0.93
(854,)
(854,)
SVM - Accuracy: 90.40%, F1 Score: 0.92
(854,)
(854,)
Random Forest - Accuracy: 99.88%, F1 Score: 1.00
(854,)
(854,)
XGBoost - Accuracy: 99.88%, F1 Score: 1.00


In [62]:
rf_model = RandomForestClassifier(random_state=41)
rf_model.fit(x_train_scaled, y_train)
#print(f"Fitting score of the model is {svm_model.score(x_train_scaled,y)}")
#predicting from the model

y_pred = rf_model.predict(x_test_scaled)

accuracy = accuracy_score(y_test,y_pred)
print(f"accuracy score is {accuracy*100:.3f}%")

score= f1_score(y_test,y_pred)
print(f'f1 score is {score}')

accuracy score is 99.883%
f1 score is 0.9990627928772258


#using xgboost classifier

In [63]:

xgboost_model = XGBClassifier(random_state=41)
xgboost_model.fit(x_train_scaled, y_train)
#print(f"Fitting score of the model is {svm_model.score(x_train_scaled,y)}")
#predicting from the model

y_pred = xgboost_model.predict(x_test_scaled)

accuracy = accuracy_score(y_test,y_pred)
print(f"accuracy score is {accuracy*100:.3f}%")

score= f1_score(y_test,y_pred)
print(f'f1 score is {score}')

accuracy score is 99.883%
f1 score is 0.9990627928772258


In [70]:
#Test case 1
# Dependents- 8
# Not graduated 
# Self employed 
# Income - 300000
# Loan amt - 5000000
# Term - 4
# Cibil - 350
# Assets 10000, 50000, 20000, 100000
first_instance=np.array([8,0,1,300000,5000000,4,350,10000,50000,20000,100000,0.06,0.24])
first = first_instance.reshape(-1,1).T
first_scaled =  scaler.transform(first)
print(first_scaled)
prediction = xgboost_model.predict(first_scaled)
if(first_scaled[0][6]<300):
    prediction[0] = 0

if prediction[0]==1:
    print(f"Loan approved")
else:
    print("loan rejected")


[[ 3.2355641  -1.01326513  0.98402221 -1.69867972 -1.12115195 -1.20516453
  -1.45617254 -1.14788815 -1.11820773 -1.6588257  -1.50507951 -3.90803947
  -1.62529432]]
loan rejected




In [71]:
# #Test case 2
# Dependents- 4
#  graduated 
# Self employed 
# Income - 3000000
# Loan amt - 10000000
# Term - 6
# Cibil - 800
# Assets 100000, 500000, 200000, 1000000
second_instance=np.array([4,1,1,3000000,10000000,6,800,100000,500000,200000,1000000,0.3,1.8])
second = second_instance.reshape(-1,1).T
second_scaled =  scaler.transform(second)
print(second_scaled)
prediction = xgboost_model.predict(second_scaled)
print(type(prediction[0]))
if prediction[0] == 1:
    print(f"Loan approved")
else:
    print("loan rejected")


[[ 0.88123578  0.98690853  0.98402221 -0.7362205  -0.56757712 -0.8554036
   1.16337934 -1.1339795  -1.01627087 -1.63905557 -1.22656916 -0.67568412
  -0.91517696]]
<class 'numpy.int32'>
Loan approved




In [72]:
# #Test case 3
# Dependents- 3
# graduated 
# employed 
# Income - 1500000
# Loan amt - 5000000
# Term - 5
# Cibil - 600
# Assets 200000, 600000, 100000, 1500000
third_instance=np.array([3,1,1,1500000,5000000,5,600,200000,600000,100000,1500000,0.3,1.5])
second = second_instance.reshape(-1,1).T
second_scaled =  scaler.transform(second)
print(second_scaled)
prediction = xgboost_model.predict(second_scaled)
print(type(prediction[0]))
if prediction[0] == 1:
    print(f"Loan approved")
else:
    print("loan rejected")


[[ 0.88123578  0.98690853  0.98402221 -0.7362205  -0.56757712 -0.8554036
   1.16337934 -1.1339795  -1.01627087 -1.63905557 -1.22656916 -0.67568412
  -0.91517696]]
<class 'numpy.int32'>
Loan approved




In [73]:
#test case from dataset
print(x.iloc[4264,:])
print(f" 1:approved , 0:rejected | actual value is {y[4264]}")
test = np.array([5,1,1,1000000,2300000,12,317,2800000,500000,3300000,800000,0.4347,5.217])
test= test.reshape(-1,1).T
test_scaled =  scaler.transform(test)
print(test_scaled)
prediction = xgboost_model.predict(test_scaled)
print(prediction.shape)
if prediction[0] == 1:
    print(f"Loan approved")
else:
    print("Prediced is loan rejected")


no_of_dependents            5.000000e+00
education                   1.000000e+00
self_employed               1.000000e+00
income_annum                1.000000e+06
loan_amount                 2.300000e+06
loan_term                   1.200000e+01
cibil_score                 3.170000e+02
residential_assets_value    2.800000e+06
commercial_assets_value     5.000000e+05
 luxury_assets_value        3.300000e+06
bank_asset_value            8.000000e+05
debt_to_income              4.347826e-01
income_ratio                5.217391e+00
Name: 4264, dtype: float64
 1:approved , 0:rejected | actual value is 0
[[ 1.46981786  0.98690853  0.98402221 -1.44915325 -1.42008237  0.19387919
  -1.64827301 -0.7167199  -1.01627087 -1.29856996 -1.28846035  1.13847532
   0.6402532 ]]
(1,)
Prediced is loan rejected




In [74]:
pipeline = {
    'xgboost_model':xgboost_model,
    'label_encoder':encoder,
    'standardscaler':scaler
}

In [75]:
with open("../loan_approval_model.pkl",'wb') as file:
    pickle.dump(pipeline,file)