Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings 
warnings.filterwarnings("ignore")

In [2]:
# importing dataset in dataframe


data = pd.read_csv('loan_prediction.csv')

In [3]:
# copy data in df

df = data.copy()

## Display Top 5 Rows of The Dataset

In [4]:
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [5]:
# we get following information 

# Loan_ID : Unique Loan ID

# Gender : Male/ Female

# Married : Applicant married (Y/N)

# Dependents : Number of dependents

# Education : Applicant Education (Graduate/ Under Graduate)

# Self_Employed : Self employed (Y/N)

# ApplicantIncome : Applicant income

# CoapplicantIncome : Coapplicant income

# LoanAmount : Loan amount in thousands of dollars

# Loan_Amount_Term : Term of loan in months

# Credit_History : Credit history meets guidelines yes or no

# Property_Area : Urban/ Semi Urban/ Rural

# Loan_Status : Loan approved (Y/N) this is the target variable

###  Check Last 5 Rows of The Dataset

In [6]:
df.tail()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y
613,LP002990,Female,No,0,Graduate,Yes,4583,0.0,133.0,360.0,0.0,Semiurban,N


###  Shape of Our Dataset (Number of Rows And Number of Columns)

In [7]:
df.shape

(614, 13)

## Number of Rows are 614
## Number of Columns  are 13

###  Get Information About Our Dataset Like Total Number Rows, Total Number of Columns, Datatypes of Each Column And Memory Requirement

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [None]:
## The dataset contains information 

# 1) Loan_ID: A unique identifier for each loan application.
# 2) Gender: The gender of the applicant, with 601 non-null values available.
# 3) Married: Indicates whether the applicant is married or not, with 611 non-null values present.
# 4) Dependents: Represents the number of dependents the applicant has, with 599 non-null values.
# 5) Education: Denotes the level of education of the applicant, with no missing values.
# 6) Self_Employed: Indicates whether the applicant is self-employed or not, with 582 non-null values.
# 7) ApplicantIncome: The income of the primary applicant, given as an integer.
# 8) CoapplicantIncome: The income of the co-applicant, if any, provided as a float.
# 9) LoanAmount: The amount of the loan applied for, with 592 non-null values.
# 10) Loan_Amount_Term: Specifies the term of the loan in months, with 600 non-null values.
# 11) Credit_History: Represents the credit history of the applicant, with 564 non-null values.
# 12) Property_Area: Describes the location of the property involved in the loan application, with no missing values.
# 13) Loan_Status: Indicates whether the loan was approved or not, with no missing values.

In [9]:
df.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


In [None]:
# 1) The average applicant income is approximately 5403.46, with a standard deviation of 6109.04.
# 2) Coapplicant income averages around 1621.25, with a standard deviation of 2926.25.
# 3) The mean loan amount is 146.41 thousand, with a standard deviation of 85.59 thousand.
# 4) The average loan term is 342 months (approximately 28.5 years), with a standard deviation of 65.12 months.
# 5) Credit history is predominantly positive, with an average score of 0.84 and a standard deviation of 0.36.

### Check Null Values In The Dataset

In [10]:
df.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [None]:
##  We get the null values in our data set. below columns found null values
## Gender
## Married
## Dependents
## Self employed
## Loan amount
## Loan amount term
## Credict Historyr

## other columns observed have no any null values    

# we need to check null values in percentage 

In [11]:
df.isnull().sum()*100 / len(df)

Loan_ID              0.000000
Gender               2.117264
Married              0.488599
Dependents           2.442997
Education            0.000000
Self_Employed        5.211726
ApplicantIncome      0.000000
CoapplicantIncome    0.000000
LoanAmount           3.583062
Loan_Amount_Term     2.280130
Credit_History       8.143322
Property_Area        0.000000
Loan_Status          0.000000
dtype: float64

# above found self employed and credict history columns more percentage of null values
# and remaining columns less percentage of null values

###  Handling The missing Values

In [12]:
# we didnt need to loan id column so we droping them

df = df.drop('Loan_ID',axis=1)


In [13]:
df.head(1)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y


In [14]:
# below columns have null value so we need to droping null values

columns = ['Gender','Dependents','LoanAmount','Loan_Amount_Term']

In [15]:
df = df.dropna(subset=columns)

In [16]:
df.isnull().sum()*100 / len(df)

Gender               0.000000
Married              0.000000
Dependents           0.000000
Education            0.000000
Self_Employed        5.424955
ApplicantIncome      0.000000
CoapplicantIncome    0.000000
LoanAmount           0.000000
Loan_Amount_Term     0.000000
Credit_History       8.679928
Property_Area        0.000000
Loan_Status          0.000000
dtype: float64

## in column Self employed and credit history more percentage of null value so
## we need to fill up them

In [17]:
df['Self_Employed'].unique()


array(['No', 'Yes', nan], dtype=object)

In [18]:
df['Self_Employed'].mode()[0]

'No'

## in self employed column we applied mode to fill up null values

In [19]:
df['Self_Employed'] =df['Self_Employed'].fillna(df['Self_Employed'].mode()[0])

In [20]:
df['Credit_History'].mode()[0]

1.0

In [21]:
df['Credit_History'] =df['Credit_History'].fillna(df['Credit_History'].mode()[0])

# above column credict history filled null values with using mode 

In [22]:
df.isnull().sum()*100 / len(df)

Gender               0.0
Married              0.0
Dependents           0.0
Education            0.0
Self_Employed        0.0
ApplicantIncome      0.0
CoapplicantIncome    0.0
LoanAmount           0.0
Loan_Amount_Term     0.0
Credit_History       0.0
Property_Area        0.0
Loan_Status          0.0
dtype: float64

## we observed didnt have any null values in our dataset

### Handling Categorical Columns

In [23]:
df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
5,Male,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban,Y


In [24]:
## we need to checked Dependents column unique value


df['Dependents'].unique()

array(['1', '0', '2', '3+'], dtype=object)

In [25]:
## we get 4 types of value counts 0,1,2 & 3+
## we need to replace 3+ to 3 & also we need change datatype object to integer

In [26]:
df['Dependents'] =df['Dependents'].replace("3+","3").astype('int')

In [27]:
df['Dependents'].unique()

array([1, 0, 2, 3])

## after replacing the we get above unique value

In [28]:
df['Gender'].unique()


array(['Male', 'Female'], dtype=object)

## we need find all columns unique values

In [29]:
df['Married'].unique()

array(['Yes', 'No'], dtype=object)

In [30]:
df['Education'].unique()

array(['Graduate', 'Not Graduate'], dtype=object)

In [31]:
df['Self_Employed'].unique()

array(['No', 'Yes'], dtype=object)

In [32]:
df['Property_Area'].unique()

array(['Rural', 'Urban', 'Semiurban'], dtype=object)

In [33]:
df['Loan_Status'].unique()

array(['N', 'Y'], dtype=object)

In [34]:
## we find all unique values but we need replace these unique values and data type
## so we using lambda and map function

In [35]:
df['Gender'] = list(map(lambda x: 0 if x=='Female' else 1, df['Gender']))

In [36]:
df['Married'] = list(map(lambda x: 0 if x=='No' else 1, df['Married']))

In [37]:
df['Education'] = list(map(lambda x: 0 if x=='Not Graduate' else 1, df['Education']))

In [38]:
df['Self_Employed'] = list(map(lambda x: 0 if x=='No' else 1, df['Self_Employed']))

In [39]:
df['Property_Area'] = list(map(lambda x: 0 if x=='Rural' else (1 if x=='Semiurban' else 2), df['Property_Area']))

In [40]:
df['Loan_Status'] = list(map(lambda x: 0 if x=='N' else 1, df['Loan_Status']))

In [41]:
## after replacing and changing data type we get

In [42]:
df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
1,1,1,1,1,0,4583,1508.0,128.0,360.0,1.0,0,0
2,1,1,0,1,1,3000,0.0,66.0,360.0,1.0,2,1
3,1,1,0,0,0,2583,2358.0,120.0,360.0,1.0,2,1
4,1,0,0,1,0,6000,0.0,141.0,360.0,1.0,2,1
5,1,1,2,1,1,5417,4196.0,267.0,360.0,1.0,2,1


### we need to Store Feature Matrix In X And Response (Target) In Vector y

In [43]:
X = df.drop('Loan_Status',axis=1)

In [44]:
y = df['Loan_Status']

In [45]:
y

1      0
2      1
3      1
4      1
5      1
      ..
609    1
610    1
611    1
612    1
613    0
Name: Loan_Status, Length: 553, dtype: int64

### We need Feature Scaling bacause some columns values are very large

In [46]:
df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
1,1,1,1,1,0,4583,1508.0,128.0,360.0,1.0,0,0
2,1,1,0,1,1,3000,0.0,66.0,360.0,1.0,2,1
3,1,1,0,0,0,2583,2358.0,120.0,360.0,1.0,2,1
4,1,0,0,1,0,6000,0.0,141.0,360.0,1.0,2,1
5,1,1,2,1,1,5417,4196.0,267.0,360.0,1.0,2,1


In [47]:
# below columns values are very large 

cols = ['ApplicantIncome','CoapplicantIncome','LoanAmount','Loan_Amount_Term']

In [48]:
## importing standard scaler

from sklearn.preprocessing import StandardScaler


In [49]:
scaler= StandardScaler()


In [50]:
##  now data transform in X

X[cols]=scaler.fit_transform(X[cols])

In [51]:
X

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
1,1,1,1,1,0,-0.128694,-0.049699,-0.214368,0.279961,1.0,0
2,1,1,0,1,1,-0.394296,-0.545638,-0.952675,0.279961,1.0,2
3,1,1,0,0,0,-0.464262,0.229842,-0.309634,0.279961,1.0,2
4,1,0,0,1,0,0.109057,-0.545638,-0.059562,0.279961,1.0,2
5,1,1,2,1,1,0.011239,0.834309,1.440866,0.279961,1.0,2
...,...,...,...,...,...,...,...,...,...,...,...
609,0,0,0,1,0,-0.411075,-0.545638,-0.893134,0.279961,1.0,0
610,1,1,3,1,0,-0.208727,-0.545638,-1.262287,-2.468292,1.0,0
611,1,1,1,1,0,0.456706,-0.466709,1.274152,0.279961,1.0,2
612,1,1,2,1,0,0.374659,-0.545638,0.488213,0.279961,1.0,2


In [52]:
from sklearn.model_selection import train_test_split


In [53]:
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=24)

In [54]:
## importing alagorithms 

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [55]:
lr = LogisticRegression()
svc = SVC(kernel="linear")
dts = DecisionTreeClassifier()
rfc =RandomForestClassifier()
gbc =GradientBoostingClassifier()



In [56]:
from sklearn.metrics import accuracy_score

d = []
for i in [lr, svc, dts, rfc, gbc]:
    i.fit(X_tr, y_tr)
    y_pr = i.predict(X_te)
    acc_score = accuracy_score(y_te, y_pr)  # Renamed variable
    training_score = i.score(X_tr, y_tr)
    
    d.append({"Model": i, "accuracy_Score": acc_score, "training_score": training_score})

In [57]:
d

[{'Model': LogisticRegression(),
  'accuracy_Score': 0.7567567567567568,
  'training_score': 0.8212669683257918},
 {'Model': SVC(kernel='linear'),
  'accuracy_Score': 0.7657657657657657,
  'training_score': 0.8167420814479638},
 {'Model': DecisionTreeClassifier(),
  'accuracy_Score': 0.7117117117117117,
  'training_score': 1.0},
 {'Model': RandomForestClassifier(),
  'accuracy_Score': 0.7117117117117117,
  'training_score': 1.0},
 {'Model': GradientBoostingClassifier(),
  'accuracy_Score': 0.7207207207207207,
  'training_score': 0.918552036199095}]

In [58]:
model_details = pd.DataFrame(d)

In [59]:
model_details

Unnamed: 0,Model,accuracy_Score,training_score
0,LogisticRegression(),0.756757,0.821267
1,SVC(kernel='linear'),0.765766,0.816742
2,DecisionTreeClassifier(),0.711712,1.0
3,"(DecisionTreeClassifier(max_features='sqrt', r...",0.711712,1.0
4,([DecisionTreeRegressor(criterion='friedman_ms...,0.720721,0.918552


##  we get SVC model more accuracy score

In [60]:
# model save

In [61]:
# importing joblib
import joblib

In [62]:
joblib.dump(svc, "Loan_status_prediction_joblib") 

['Loan_status_prediction_joblib']

## for model predict loading file

In [63]:
model_svc = joblib.load("Loan_status_prediction_joblib")

In [64]:
model_svc.predict(X_te)

array([1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1], dtype=int64)

# making a dataframe and predicting model

In [65]:

new_model = pd.DataFrame({
    'Gender':1,
    'Married':1,
    'Dependents':2,
    'Education':0,
    'Self_Employed':0,
    'ApplicantIncome':2889,
    'CoapplicantIncome':0.0,
    'LoanAmount':45,
    'Loan_Amount_Term':180,
    'Credit_History':0,
    'Property_Area':1
},index=[0])

In [66]:
new_model

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,1,1,2,0,0,2889,0.0,45,180,0,1


In [67]:
result = svc.predict(new_model)


In [68]:
if result==1:
    print("Loan Approved")
else:
    print("Loan Not Approved")

Loan Not Approved
