In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier


In [11]:
df = pd.read_csv('loan_prediction.csv')
print(df.head())

   Loan_ID    Gender Married Dependents Education Self_Employed  \
0        0  LP001002    Male         No         0      Graduate   
1        1  LP001003    Male        Yes         1      Graduate   
2        2  LP001005    Male        Yes         0      Graduate   
3        3  LP001006    Male        Yes         0  Not Graduate   
4        4  LP001008    Male         No         0      Graduate   

  ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
0              No               5849         0.0               NaN   
1              No               4583      1508.0             128.0   
2             Yes               3000         0.0              66.0   
3              No               2583      2358.0             120.0   
4              No               6000         0.0             141.0   

   Credit_History  Property_Area Loan_Status Unnamed: 13  
0           360.0            1.0       Urban           Y  
1           360.0            1.0       Rural           N  

In [12]:
#I’ll drop the loan id column and move further
df = df.drop('Loan_ID', axis=1)

In [13]:
#Now let’s have a look if the data has missing values or not
df.isnull().sum()

Gender                0
Married              13
Dependents            3
Education            15
Self_Employed         0
ApplicantIncome      32
CoapplicantIncome     0
LoanAmount            0
Loan_Amount_Term     22
Credit_History       14
Property_Area        50
Loan_Status           0
Unnamed: 13           0
dtype: int64

In [14]:
#The data has missing values in some of the categorical columns and some numerical columns. Let’s have a look at the descriptive statistics of the dataset before filling in the missing values:
print(df.describe())


       CoapplicantIncome    LoanAmount  Loan_Amount_Term  Credit_History  \
count         614.000000    614.000000        592.000000       600.00000   
mean         5403.459283   1621.245798        146.412162       342.00000   
std          6109.041673   2926.248369         85.587325        65.12041   
min           150.000000      0.000000          9.000000        12.00000   
25%          2877.500000      0.000000        100.000000       360.00000   
50%          3812.500000   1188.500000        128.000000       360.00000   
75%          5795.000000   2297.250000        168.000000       360.00000   
max         81000.000000  41667.000000        700.000000       480.00000   

       Property_Area  
count     564.000000  
mean        0.842199  
std         0.364878  
min         0.000000  
25%         1.000000  
50%         1.000000  
75%         1.000000  
max         1.000000  


In [15]:
#Now let’s fill in the missing values. In categorical columns, we can fill in missing values with the mode of each column. The mode represents the value that appears most often in the column and is an appropriate choice when dealing with categorical data:
# Fill missing values in categorical columns with mode
df['Gender'].fillna(df['Gender'].mode()[0], inplace=True)
df['Married'].fillna(df['Married'].mode()[0], inplace=True)
df['Dependents'].fillna(df['Dependents'].mode()[0], inplace=True)
df['Self_Employed'].fillna(df['Self_Employed'].mode()[0], inplace=True)


In [16]:
# Fill missing values in LoanAmount with the median
df['LoanAmount'].fillna(df['LoanAmount'].median(), inplace=True)

# Fill missing values in Loan_Amount_Term with the mode
df['Loan_Amount_Term'].fillna(df['Loan_Amount_Term'].mode()[0], inplace=True)

# Fill missing values in Credit_History with the mode
df['Credit_History'].fillna(df['Credit_History'].mode()[0], inplace=True)

In [17]:
#Now let’s have a look at the distribution of the loan status column:
import plotly.express as px

loan_status_count = df['Loan_Status'].value_counts()
fig_loan_status = px.pie(loan_status_count,
                         names=loan_status_count.index,
                         title='Loan Approval Status')
fig_loan_status.show()


In [18]:
#Now let’s have a look at the distribution of the gender column:
gender_count = df['Gender'].value_counts()
fig_gender = px.bar(gender_count,
                    x=gender_count.index,
                    y=gender_count.values,
                    title='Gender Distribution')
fig_gender.show()
