**Check File Exists**



In [7]:
import os

os.listdir()


['.config', 'loan_approval_dataset.csv', '.ipynb_checkpoints', 'sample_data']

**Load Data using Pandas**

In [8]:
import pandas as pd

data = pd.read_csv("loan_approval_dataset.csv")


**View the Data**

In [3]:
data.head()


Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


**And shape:**

In [5]:
data.shape


(4269, 13)

**And columns:**

In [6]:
data.columns


Index(['loan_id', ' no_of_dependents', ' education', ' self_employed',
       ' income_annum', ' loan_amount', ' loan_term', ' cibil_score',
       ' residential_assets_value', ' commercial_assets_value',
       ' luxury_assets_value', ' bank_asset_value', ' loan_status'],
      dtype='object')

**Check Dataset Info**

In [9]:
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4269 entries, 0 to 4268
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   loan_id                    4269 non-null   int64 
 1    no_of_dependents          4269 non-null   int64 
 2    education                 4269 non-null   object
 3    self_employed             4269 non-null   object
 4    income_annum              4269 non-null   int64 
 5    loan_amount               4269 non-null   int64 
 6    loan_term                 4269 non-null   int64 
 7    cibil_score               4269 non-null   int64 
 8    residential_assets_value  4269 non-null   int64 
 9    commercial_assets_value   4269 non-null   int64 
 10   luxury_assets_value       4269 non-null   int64 
 11   bank_asset_value          4269 non-null   int64 
 12   loan_status               4269 non-null   object
dtypes: int64(10), object(3)
memory usage: 433.7+ KB


**Check Missing Values**

In [10]:
data.isnull().sum()


Unnamed: 0,0
loan_id,0
no_of_dependents,0
education,0
self_employed,0
income_annum,0
loan_amount,0
loan_term,0
cibil_score,0
residential_assets_value,0
commercial_assets_value,0


**Drop loan_id**


In [11]:
data.drop('loan_id', axis=1, inplace=True)


In [12]:
data.head()


Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


In [13]:
data.columns


Index([' no_of_dependents', ' education', ' self_employed', ' income_annum',
       ' loan_amount', ' loan_term', ' cibil_score',
       ' residential_assets_value', ' commercial_assets_value',
       ' luxury_assets_value', ' bank_asset_value', ' loan_status'],
      dtype='object')

**Feature Encoding (Convert Text → Numbers)**

**Check Data Types**

In [14]:
data.dtypes


Unnamed: 0,0
no_of_dependents,int64
education,object
self_employed,object
income_annum,int64
loan_amount,int64
loan_term,int64
cibil_score,int64
residential_assets_value,int64
commercial_assets_value,int64
luxury_assets_value,int64


**(Safety) Clean Column Names**

In [16]:
data.columns = data.columns.str.strip()


**See What Values Exist**

In [17]:
print(data['education'].unique())
print(data['self_employed'].unique())
print(data['loan_status'].unique())


[' Graduate' ' Not Graduate']
[' No' ' Yes']
[' Approved' ' Rejected']


**Encode Using LabelEncoder**

In [18]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

data['education'] = le.fit_transform(data['education'])
data['self_employed'] = le.fit_transform(data['self_employed'])
data['loan_status'] = le.fit_transform(data['loan_status'])


**Verify Encoding**

In [19]:
data.dtypes


Unnamed: 0,0
no_of_dependents,int64
education,int64
self_employed,int64
income_annum,int64
loan_amount,int64
loan_term,int64
cibil_score,int64
residential_assets_value,int64
commercial_assets_value,int64
luxury_assets_value,int64


In [20]:
data.head()


Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,2,0,0,9600000,29900000,12,778,2400000,17600000,22700000,8000000,0
1,0,1,1,4100000,12200000,8,417,2700000,2200000,8800000,3300000,1
2,3,0,0,9100000,29700000,20,506,7100000,4500000,33300000,12800000,1
3,3,0,0,8200000,30700000,8,467,18200000,3300000,23300000,7900000,1
4,5,1,1,9800000,24200000,20,382,12400000,8200000,29400000,5000000,1


**Split Data into X (Features) and y (Target)**

X → Inputs (customer data)

y → Output (loan_status)

Create X and y

In [21]:
X = data.drop('loan_status', axis=1)
y = data['loan_status']


**Check Shapes**

In [22]:
print(X.shape)
print(y.shape)


(4269, 11)
(4269,)


**Train-Test Split**

**Import Function**

In [23]:
from sklearn.model_selection import train_test_split


**Split Data**

In [24]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


**Check Shapes**

In [25]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)


(3415, 11)
(854, 11)
(3415,)
(854,)


**Train Your First Model (Logistic Regression)**

Import Model

In [26]:
from sklearn.linear_model import LogisticRegression


Create Model

In [27]:
model = LogisticRegression(max_iter=1000)


Train Model

In [28]:
model.fit(X_train, y_train)


Make Predictions

In [29]:
y_pred = model.predict(X_test)


Check Accuracy

In [30]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.7985948477751756


**Make One Real Prediction**

Pick One Test Sample

In [31]:
X_test.iloc[0]


Unnamed: 0,1703
no_of_dependents,5
education,0
self_employed,0
income_annum,5400000
loan_amount,19700000
loan_term,20
cibil_score,423
residential_assets_value,6500000
commercial_assets_value,10000000
luxury_assets_value,15700000


Predict It

In [32]:
sample = X_test.iloc[0:1]
prediction = model.predict(sample)
print(prediction)


[1]


Interpret

[1] → Approved  
[0] → Rejected


In [34]:
sample = X_test.iloc[0:1]
prediction = model.predict(sample)
print("Prediction:", prediction)
print("Actual:", y_test.iloc[0])


Prediction: [1]
Actual: 1
