## Import Dependencies


In [12]:
import numpy as np
import pandas as pd 
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import svm 

## Data Collection and Processing

In [15]:
# Load the loan applications from a CSV file; the path is relative to the
# current working directory.
loan_data = pd.read_csv('loan_dataset.csv')

In [17]:
# Preview the first rows to sanity-check that the file was parsed as expected.
loan_data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [19]:
# Column dtypes and non-null counts: shows which columns contain missing values.
loan_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [27]:
# Summary statistics for the numeric columns.
loan_data.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


# Handling null values in the dataset

In [81]:
# Work on a copy of the data without the Loan_ID column.
df = loan_data.drop('Loan_ID', axis=1)

# Number of missing entries in each remaining column.
print(df.isna().sum())

Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64


In [83]:
# Share of missing entries per column, expressed as a percentage.
missing_percent = df.isna().mean().mul(100)
print(missing_percent)

Gender               2.117264
Married              0.488599
Dependents           2.442997
Education            0.000000
Self_Employed        5.211726
ApplicantIncome      0.000000
CoapplicantIncome    0.000000
LoanAmount           3.583062
Loan_Amount_Term     2.280130
Credit_History       8.143322
Property_Area        0.000000
Loan_Status          0.000000
dtype: float64



##### The null check above shows that the share of missing values in each feature is low (about 8% at most), so imputing the missing values should be fine.


In [86]:
categorical_cols = ['Gender', 'Married', 'Dependents', 'Self_Employed']
numeric_cols = ['LoanAmount', 'Loan_Amount_Term', 'Credit_History']

# Build one fill value per column: the most frequent value for the categorical
# columns and the median for the numeric ones.
fill_values = {name: df[name].mode()[0] for name in categorical_cols}
fill_values.update({name: df[name].median() for name in numeric_cols})

# Apply every replacement in a single pass.
df = df.fillna(fill_values)

In [88]:
# Re-check the non-null counts to confirm the imputation left no missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             614 non-null    object 
 1   Married            614 non-null    object 
 2   Dependents         614 non-null    object 
 3   Education          614 non-null    object 
 4   Self_Employed      614 non-null    object 
 5   ApplicantIncome    614 non-null    int64  
 6   CoapplicantIncome  614 non-null    float64
 7   LoanAmount         614 non-null    float64
 8   Loan_Amount_Term   614 non-null    float64
 9   Credit_History     614 non-null    float64
 10  Property_Area      614 non-null    object 
 11  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(7)
memory usage: 57.7+ KB


## Encoding Categorical Data

In [91]:
from sklearn.preprocessing import LabelEncoder

In [93]:
# Target: turn the Loan_Status labels into integer codes.
le = LabelEncoder()
target_codes = le.fit_transform(df['Loan_Status'])

# Predictors: expand each remaining categorical column into indicator columns.
categorical_features = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area']
df = pd.get_dummies(
    df.assign(Loan_Status=target_codes),
    columns=categorical_features,
)

In [95]:
# Column names after encoding.
print(list(df.columns))

['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History', 'Loan_Status', 'Gender_Female', 'Gender_Male', 'Married_No', 'Married_Yes', 'Dependents_0', 'Dependents_1', 'Dependents_2', 'Dependents_3+', 'Education_Graduate', 'Education_Not Graduate', 'Self_Employed_No', 'Self_Employed_Yes', 'Property_Area_Rural', 'Property_Area_Semiurban', 'Property_Area_Urban']


In [101]:
# (rows, columns) of the encoded frame.
df.shape

(614, 21)

## Split data into train/test

In [120]:
from sklearn.model_selection import train_test_split

# Target vector and feature matrix
y = df['Loan_Status']
X = df.drop('Loan_Status', axis=1)

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

### Logistic Regression

In [122]:
# LogisticRegression and classification_report are imported here so the cell
# runs on a fresh kernel instead of relying on names left over from other cells.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Pipeline: scaling + logistic regression.
# The scaler is fitted as part of the pipeline, i.e. on the training split only.
pipe = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
pipe.fit(X_train, y_train)

# Predict on the held-out split and report per-class metrics
y_pred_scaled = pipe.predict(X_test)
print("Logistic Regression WITH scaling:")
print(classification_report(y_test, y_pred_scaled))

Logistic Regression WITH scaling:
              precision    recall  f1-score   support

           0       0.95      0.42      0.58        43
           1       0.76      0.99      0.86        80

    accuracy                           0.79       123
   macro avg       0.85      0.70      0.72       123
weighted avg       0.83      0.79      0.76       123



### SVM

In [132]:
# Pipeline: scale + SVM with default RBF kernel

from sklearn.metrics import classification_report
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Build the pipeline and fit it on the training split (fit returns the pipeline)
svm_pipe = make_pipeline(StandardScaler(), SVC()).fit(X_train, y_train)

# Score the held-out rows
y_pred_svm = svm_pipe.predict(X_test)

# Heading and report, one per line
print("SVM with scaling:", classification_report(y_test, y_pred_svm), sep="\n")

SVM with scaling:
              precision    recall  f1-score   support

           0       1.00      0.42      0.59        43
           1       0.76      1.00      0.86        80

    accuracy                           0.80       123
   macro avg       0.88      0.71      0.73       123
weighted avg       0.85      0.80      0.77       123



## SVM with balanced class weight

In [137]:
from sklearn.metrics import classification_report
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Same scaler + SVC pipeline, this time with class_weight='balanced'
svm_balanced_pipe = make_pipeline(StandardScaler(), SVC(class_weight='balanced'))
svm_balanced_pipe.fit(X_train, y_train)

# Score the held-out rows
y_pred_balanced = svm_balanced_pipe.predict(X_test)

# Heading and report, one per line
print(
    "SVM with balanced class weights:",
    classification_report(y_test, y_pred_balanced),
    sep="\n",
)


SVM with balanced class weights:
              precision    recall  f1-score   support

           0       0.58      0.44      0.50        43
           1       0.73      0.82      0.78        80

    accuracy                           0.69       123
   macro avg       0.65      0.63      0.64       123
weighted avg       0.68      0.69      0.68       123



## Feature importance with random forest

In [139]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

# Fit a seeded random forest on the training split (fit returns the estimator)
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
importances = rf.feature_importances_

# Pair each feature name with its importance, highest first
feat_importance = (
    pd.DataFrame({'Feature': X_train.columns, 'Importance': importances})
    .sort_values('Importance', ascending=False)
)

print(feat_importance)

                    Feature  Importance
4            Credit_History    0.237304
0           ApplicantIncome    0.191275
2                LoanAmount    0.180331
1         CoapplicantIncome    0.110729
3          Loan_Amount_Term    0.046437
18  Property_Area_Semiurban    0.025175
17      Property_Area_Rural    0.019688
9              Dependents_0    0.018773
19      Property_Area_Urban    0.018279
10             Dependents_1    0.018136
8               Married_Yes    0.015942
14   Education_Not Graduate    0.015156
13       Education_Graduate    0.014684
7                Married_No    0.014533
6               Gender_Male    0.014056
5             Gender_Female    0.013443
11             Dependents_2    0.012596
15         Self_Employed_No    0.012300
16        Self_Employed_Yes    0.011521
12            Dependents_3+    0.009644


## Feature Engineering

In [143]:
from sklearn.model_selection import train_test_split

# Add TotalIncome and drop the two original income columns.
# The guard keeps the cell re-runnable: after the first run df no longer has the
# raw income columns (it already carries TotalIncome), so the step is skipped
# instead of raising a KeyError.
income_cols = ['ApplicantIncome', 'CoapplicantIncome']
if all(name in df.columns for name in income_cols):
    df = df.assign(
        TotalIncome=df['ApplicantIncome'] + df['CoapplicantIncome']
    ).drop(columns=income_cols)

# List of low-importance features to drop (based on previous importance)
low_importance_features = [
    'Dependents_3+',
    'Self_Employed_Yes',
    'Self_Employed_No',
    'Gender_Female',
    'Gender_Male',
    'Married_No',
    'Education_Graduate',
    'Education_Not Graduate',
    # Add any other very low importance features you want to drop
]

# Drop the listed columns in one call; errors='ignore' skips any name that is
# not present in df, matching the previous per-column existence check.
df = df.drop(columns=low_importance_features, errors='ignore')

# Re-create the feature matrix, target and train/test split from the reduced
# frame (same 80/20 split and seed as before).
X = df.drop(columns=['Loan_Status'])
y = df['Loan_Status']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


## SVM Algorithm with Feature Engineering

In [147]:
# Refit the scaler + default SVC pipeline on the current training split
svm_pipe = make_pipeline(StandardScaler(), SVC()).fit(X_train, y_train)

# Score the held-out rows
y_pred_svm = svm_pipe.predict(X_test)

# Heading and report, one per line
print(
    "SVM with feature importance and scaling:",
    classification_report(y_test, y_pred_svm),
    sep="\n",
)

SVM with feature importance and scaling:
              precision    recall  f1-score   support

           0       0.95      0.42      0.58        43
           1       0.76      0.99      0.86        80

    accuracy                           0.79       123
   macro avg       0.85      0.70      0.72       123
weighted avg       0.83      0.79      0.76       123

