In [142]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score

# Business Scenario
A fintech company provides instant personal loans through a mobile application.

To reduce risk, the company wants an AI-based decision system that predicts whether a loan should be Approved or Rejected.
## Challenges:
- Customer data is not linearly separable
- Simple linear rules fail
- Decision boundaries may be curved or complex
- The solution must be interactive and deployable

You are hired as a Machine Learning Engineer to build and deploy this system.
 
1. Load the dataset and study applicant attributes such as:
    - Applicant income
    - Loan amount
    - Credit history
    - Employment status
2. Identify features that may influence loan approval.
3. Build three different SVM models using:
    - Linear kernel
    - Polynomial kernel
    - RBF kernel
4. Train each model and evaluate performance using:
    - Accuracy
    - Precision / Recall
5. Compare results and identify:
    - Which kernel handles non-linear patterns better
    - Which kernel generalizes best on unseen data

# Loading Dataset

In [143]:
# Locations of the two Kaggle loan-prediction CSV files, relative to this notebook.
path_train = '../Data_Source/Kaggle/Loan_Prediction_Problem_Datasets/train_u6lujuX_CVtuZ9i.csv'
path_test = '../Data_Source/Kaggle/Loan_Prediction_Problem_Datasets/test_Y3wMUE5_7gLdaTN.csv'
# Read both files with default parsing and bind them to `train` and `test`.
train, test = (pd.read_csv(csv_path) for csv_path in (path_train, path_test))

# Data Inspection

In [144]:
# Dimensions of the training frame as a (rows, columns) tuple.
train.shape

(614, 13)

In [145]:
# Dimensions of the test frame as a (rows, columns) tuple.
test.shape

(367, 12)

In [146]:
# Print the column labels of both frames so their schemas can be compared side by side.
print('Train: \n', train.columns)
print('Test: \n', test.columns)

Train: 
 Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')
Test: 
 Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area'],
      dtype='object')


In [147]:
# Per-column summary of the training frame: non-null counts, dtypes and memory usage.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [148]:
# Number of missing values in each column of the training frame.
train.isna().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [149]:
# Number of missing values in each column of the test frame.
test.isna().sum()

Loan_ID               0
Gender               11
Married               0
Dependents           10
Education             0
Self_Employed        23
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            5
Loan_Amount_Term      6
Credit_History       29
Property_Area         0
dtype: int64

# Feature Selection

In [150]:
# Predictor columns shared by both frames.
selected_features_test = ['ApplicantIncome', 'LoanAmount', 'Credit_History', 'Self_Employed']
# The training selection is the same predictors followed by the target column.
selected_features_train = selected_features_test + ['Loan_Status']

In [151]:
# Restrict both frames to the selected columns.
# .copy() makes each result an independent frame, so the column assignments
# performed by later cells modify it directly instead of writing into a slice
# of the originally loaded data (which raises SettingWithCopyWarning).
train = train[selected_features_train].copy()
test = test[selected_features_test].copy()

# Data Preprocessing

## Handling Missing Values

In [152]:
# Missing-value counts for the training frame, now limited to the selected columns.
train.isna().sum()

ApplicantIncome     0
LoanAmount         22
Credit_History     50
Self_Employed      32
Loan_Status         0
dtype: int64

In [153]:
# Missing-value counts for the test frame, now limited to the selected columns.
test.isna().sum()

ApplicantIncome     0
LoanAmount          5
Credit_History     29
Self_Employed      23
dtype: int64

In [154]:
# Dtype of each remaining training column (numeric vs. object) ahead of imputation.
train.dtypes

ApplicantIncome      int64
LoanAmount         float64
Credit_History     float64
Self_Employed       object
Loan_Status         object
dtype: object

In [155]:
# LoanAmount - mean
# The fill value is computed from the training frame only and reused for the
# test frame, so both sets receive the same preprocessing and no statistic of
# the data being scored influences its own imputation.
loan_amount_mean = train['LoanAmount'].mean()
train['LoanAmount'] = train['LoanAmount'].fillna(loan_amount_mean)
test['LoanAmount'] = test['LoanAmount'].fillna(loan_amount_mean)

In [156]:
# Credit_History - median
# The median is taken from the training frame only and applied to both frames,
# so the test set is imputed with the same value the model saw during fitting.
credit_history_median = train['Credit_History'].median()
train['Credit_History'] = train['Credit_History'].fillna(credit_history_median)
test['Credit_History'] = test['Credit_History'].fillna(credit_history_median)

In [157]:
# Self_Employed - mode
# The most frequent category is determined on the training frame only and used
# to fill gaps in both frames, keeping train and test preprocessing identical.
# mode() returns a Series (ties are possible); [0] takes its first entry.
self_employed_mode = train['Self_Employed'].mode()[0]
train['Self_Employed'] = train['Self_Employed'].fillna(self_employed_mode)
test['Self_Employed'] = test['Self_Employed'].fillna(self_employed_mode)

In [158]:
# Verify that no missing values remain in the training frame after imputation.
train.isna().sum()

ApplicantIncome    0
LoanAmount         0
Credit_History     0
Self_Employed      0
Loan_Status        0
dtype: int64

In [159]:
# Verify that no missing values remain in the test frame after imputation.
test.isna().sum()

ApplicantIncome    0
LoanAmount         0
Credit_History     0
Self_Employed      0
dtype: int64

## Handling Duplicates

In [160]:
# Count training rows that repeat an earlier row on every remaining column.
train.duplicated().sum()

2

In [161]:
# Count test rows that repeat an earlier row on every remaining column.
test.duplicated().sum()

1

In [162]:
# Keep the first occurrence of each fully identical row; rebind the name
# rather than mutating the frame in place.
train = train.drop_duplicates()

In [163]:
# Keep the first occurrence of each fully identical row; rebind the name
# rather than mutating the frame in place.
# NOTE(review): Loan_ID is no longer among the columns at this point, so rows
# that match on the four selected features may belong to different applicants.
# Dropping them means fewer predictions than test applicants - confirm intended.
test = test.drop_duplicates()

# Encoding

In [164]:
# Check which training columns are still non-numeric (object) before encoding.
train.dtypes

ApplicantIncome      int64
LoanAmount         float64
Credit_History     float64
Self_Employed       object
Loan_Status         object
dtype: object

In [165]:
# Check which test columns are still non-numeric (object) before encoding.
test.dtypes

ApplicantIncome      int64
LoanAmount         float64
Credit_History     float64
Self_Employed       object
dtype: object

In [166]:
# Label Encoding
# A dedicated encoder is used for the Self_Employed feature so that its fitted
# category -> integer mapping is not lost when the target is encoded below,
# and can be reused on new applicant data.
self_employed_encoder = LabelEncoder()
train['Self_Employed'] = self_employed_encoder.fit_transform(train['Self_Employed'])
# transform (not fit_transform): the test set must use the mapping learned on train.
test['Self_Employed'] = self_employed_encoder.transform(test['Self_Employed'])

# `le` ends up fitted on Loan_Status, so le.inverse_transform can map
# predicted integers back to the original labels.
le = LabelEncoder()
train['Loan_Status'] = le.fit_transform(train['Loan_Status'])

In [167]:
# Distinct encoded values of Self_Employed in the training frame.
train.Self_Employed.unique()

array([0, 1])

In [168]:
# Distinct encoded values of Self_Employed in the test frame.
test.Self_Employed.unique()

array([0, 1])

# Feature Separation

In [169]:
# Separate the label column from the predictor columns of the training frame.
y_train = train['Loan_Status']
X_train = train.drop(columns='Loan_Status')

# Standardization

In [170]:
# Standard Scaler
# The scaler learns each feature's mean and standard deviation from the
# training features only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
# Apply the already-fitted scaler to the test set. Calling fit_transform here
# would refit on the test data, leaving the test features on a different scale
# from the one the models are trained on and overwriting the fitted statistics.
test_scaled = scaler.transform(test)

# Model Building

## Linear Kernel

In [171]:
# Fit a linear-kernel SVM (C=1) on the standardised training features,
# then predict a label for every row of the scaled test set.
svm_linear = SVC(kernel='linear', C=1).fit(X_train_scaled, y_train)
y_pred_linear = svm_linear.predict(test_scaled)

In [172]:
# Mean accuracy of the linear-kernel model on the same data it was fitted on.
# This is training accuracy, not an estimate of performance on unseen data.
svm_linear.score(X_train_scaled, y_train)

0.8104575163398693

## Polynomial Kernel

In [173]:
# Fit a degree-3 polynomial-kernel SVM (C=1) on the standardised training
# features, then predict a label for every row of the scaled test set.
svm_poly = SVC(kernel='poly', degree=3, C=1).fit(X_train_scaled, y_train)
y_pred_poly = svm_poly.predict(test_scaled)

In [174]:
# Mean accuracy of the polynomial-kernel model on the same data it was fitted on.
# This is training accuracy, not an estimate of performance on unseen data.
svm_poly.score(X_train_scaled, y_train)

0.8120915032679739

## RBF Kernel

In [175]:
# Fit an RBF-kernel SVM (C=1, gamma='scale') on the standardised training
# features, then predict a label for every row of the scaled test set.
svm_rbf = SVC(kernel='rbf', C=1, gamma='scale').fit(X_train_scaled, y_train)
y_pred_rbf = svm_rbf.predict(test_scaled)

In [176]:
# Mean accuracy of the RBF-kernel model on the same data it was fitted on.
# This is training accuracy, not an estimate of performance on unseen data.
svm_rbf.score(X_train_scaled, y_train)

0.8137254901960784