# Support Vector Machine (SVM)

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [2]:
# Load the loan-application records from the CSV file into a DataFrame.
dataset = pd.read_csv(filepath_or_buffer='train_u6lujuX_CVtuZ9i (1).csv')

## Data cleaning

In [3]:
# Per-column count of missing entries, to see how much cleaning is needed.
dataset.isna().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [4]:
# Discard every row that has at least one missing field.
dataset = dataset.dropna(axis=0, how='any')

In [5]:
# Re-run the missing-value count after dropping rows; every column should report 0.
dataset.isna().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
dataset.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,480.0,480.0,480.0,480.0,480.0
mean,5364.23125,1581.093583,144.735417,342.05,0.854167
std,5668.251251,2617.692267,80.508164,65.212401,0.353307
min,150.0,0.0,9.0,36.0,0.0
25%,2898.75,0.0,100.0,360.0,1.0
50%,3859.0,1084.5,128.0,360.0,1.0
75%,5852.5,2253.25,170.0,360.0,1.0
max,81000.0,33837.0,600.0,480.0,1.0


## Label Encoding

In [7]:
# Encode the target and the categorical feature columns as integer codes.
# One nested-dict replace covers all of them: {column: {old_value: new_value}}.
dataset = dataset.replace({
    "Loan_Status": {'N': 0, 'Y': 1},
    "Self_Employed": {'No': 0, 'Yes': 1},
    "Married": {'No': 0, 'Yes': 1},
    "Gender": {'Male': 1, 'Female': 0},
    "Education": {'Graduate': 1, 'Not Graduate': 0},
    'Property_Area': {'Rural': 0, 'Semiurban': 1, 'Urban': 2},
})

In [8]:
# Preview the first rows to confirm the categorical columns now hold integer codes.
dataset.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
1,LP001003,1,1,1,1,0,4583,1508.0,128.0,360.0,1.0,0,0
2,LP001005,1,1,0,1,1,3000,0.0,66.0,360.0,1.0,2,1
3,LP001006,1,1,0,0,0,2583,2358.0,120.0,360.0,1.0,2,1
4,LP001008,1,0,0,1,0,6000,0.0,141.0,360.0,1.0,2,1
5,LP001011,1,1,2,1,1,5417,4196.0,267.0,360.0,1.0,2,1


In [9]:
# Frequency of each Dependents category; '3+' is the only value that is not a plain integer.
dataset['Dependents'].value_counts()

Dependents
0     274
2      85
1      80
3+     41
Name: count, dtype: int64

In [10]:
# Encode Dependents as integers: the open-ended '3+' category is mapped to 4.
# The replace is scoped to this one column (a frame-wide replace would also
# rewrite a literal '3+' in any other column), and the result is cast to int
# so the column no longer mixes the strings '0'/'1'/'2' with the integer 4.
dataset = dataset.assign(
    Dependents=dataset['Dependents'].replace('3+', 4).astype(int)
)

## Splitting the dataset into the Training set and Test set

In [11]:
# Features: every column except the row identifier and the target.
X = dataset.drop(['Loan_ID', 'Loan_Status'], axis=1)
# Target: the encoded loan decision.
y = dataset.loc[:, 'Loan_Status']

In [12]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for testing. stratify=y keeps the class ratio of
# Loan_Status the same in both parts; random_state fixes the shuffle so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, stratify = y, random_state = 2)

In [13]:
# Inspect the training features (still unscaled at this point).
print(X_train)

     Gender  Married Dependents  Education  Self_Employed  ApplicantIncome  \
391       1        1          4          1              0             9504   
207       1        0          0          0              0             3975   
2         1        1          0          1              1             3000   
58        1        1          1          1              0             8080   
276       1        1          0          1              0             3993   
..      ...      ...        ...        ...            ...              ...   
459       1        1          0          1              0             8334   
94        1        0          0          0              0             3620   
76        1        0          0          1              0             3750   
286       0        0          0          1              1             2600   
208       1        0          0          1              0             2479   

     CoapplicantIncome  LoanAmount  Loan_Amount_Term  Credit_Hi

In [14]:
# Inspect the training labels.
print(y_train)

391    1
207    1
2      1
58     1
276    1
      ..
459    0
94     1
76     0
286    0
208    1
Name: Loan_Status, Length: 360, dtype: int64


In [15]:
# Inspect the held-out test features (still unscaled at this point).
print(X_test)

     Gender  Married Dependents  Education  Self_Employed  ApplicantIncome  \
356       1        1          2          1              0             8333   
159       1        1          0          1              0             4583   
300       1        1          0          0              0             1800   
442       1        0          4          0              0             4707   
253       1        1          1          0              0             2661   
..      ...      ...        ...        ...            ...              ...   
585       1        1          1          1              0             4283   
480       0        0          0          0              0             4350   
398       1        0          0          0              0             3902   
215       1        1          4          0              0             3850   
22        1        1          0          0              0             2600   

     CoapplicantIncome  LoanAmount  Loan_Amount_Term  Credit_Hi

In [16]:
# Inspect the held-out test labels.
print(y_test)

356    1
159    1
300    0
442    1
253    1
      ..
585    0
480    1
398    1
215    1
22     0
Name: Loan_Status, Length: 120, dtype: int64


## Feature Scaling

In [17]:
from sklearn.preprocessing import StandardScaler
# Learn the per-feature mean and standard deviation from the training set only,
# so no information from the test set leaks into the scaling.
sc = StandardScaler()
sc.fit(X_train)
# Apply the same training-set statistics to both partitions.
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [18]:
# Inspect the standardized training features (now a NumPy array).
print(X_train)

[[ 0.46499055  0.72486118  2.56884302 ...  0.27707981  0.39223227
  -1.27625842]
 [ 0.46499055 -1.3795745  -0.71053105 ...  0.27707981  0.39223227
  -1.27625842]
 [ 0.46499055  0.72486118 -0.71053105 ...  0.27707981  0.39223227
   1.24820879]
 ...
 [ 0.46499055 -1.3795745  -0.71053105 ...  0.27707981  0.39223227
   1.24820879]
 [-2.15058132 -1.3795745  -0.71053105 ... -0.69135449  0.39223227
  -0.01402482]
 [ 0.46499055 -1.3795745  -0.71053105 ...  0.27707981  0.39223227
   1.24820879]]


In [19]:
# Inspect the test features after applying the training-set scaling.
print(X_test)

[[ 0.46499055  0.72486118  0.92915598 ...  0.27707981  0.39223227
  -1.27625842]
 [ 0.46499055  0.72486118 -0.71053105 ...  0.27707981  0.39223227
  -0.01402482]
 [ 0.46499055  0.72486118 -0.71053105 ...  0.27707981 -2.54950976
   1.24820879]
 ...
 [ 0.46499055 -1.3795745  -0.71053105 ...  0.27707981  0.39223227
  -1.27625842]
 [ 0.46499055  0.72486118  2.56884302 ...  0.27707981  0.39223227
  -0.01402482]
 [ 0.46499055  0.72486118 -0.71053105 ...  0.27707981 -2.54950976
  -0.01402482]]


## Training the SVM model on the Training set

In [20]:
from sklearn.svm import SVC
# Support vector classifier with a linear kernel, trained on the scaled features.
classifier = SVC(kernel = 'linear', random_state = 0)
# Fit on the training partition only; the test partition stays unseen until evaluation.
classifier.fit(X_train, y_train)

## Predicting the Test set results

In [21]:
from sklearn.metrics import accuracy_score, confusion_matrix
# Predict the loan decision for the held-out test rows.
y_pred = classifier.predict(X_test)
# Confusion matrix of true labels (y_test) against predictions (y_pred);
# labels are 0 = 'N' and 1 = 'Y' as encoded earlier.
cm = confusion_matrix(y_test, y_pred)
print(cm)
# accuracy_score is documented as (y_true, y_pred); pass the arguments in that
# order so this call matches the confusion_matrix call above. The value is
# unchanged because accuracy is symmetric in its two arguments.
print(f"accuracy of the test set: %{accuracy_score(y_test, y_pred) * 100}")


[[19 18]
 [ 3 80]]
accuracy of the test set: %82.5
