# Support Vector Machine (SVM)

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
# Load the training and test CSV files into DataFrames.
# NOTE(review): these are absolute, machine-specific paths, so this cell only
# runs on a machine where the files exist at exactly these locations.
train_csv = r"D:\data science\Python - Anaconda\SVM\Risk Analytics\risk_analytics_train.csv"
test_csv = r"D:\data science\Python - Anaconda\SVM\Risk Analytics\risk_analytics_test.csv"

# The first column (Loan_ID) becomes the row index and the first row supplies
# the column names.
train_data = pd.read_csv(train_csv, header=0, index_col=0)
test_data = pd.read_csv(test_csv, header=0, index_col=0)

In [3]:
# Preview the first rows to confirm the training DataFrame was loaded as expected.
train_data.head()

Unnamed: 0_level_0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
LP001002,Male,No,0.0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
LP001003,Male,Yes,1.0,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
LP001005,Male,Yes,0.0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
LP001006,Male,Yes,0.0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
LP001008,Male,No,0.0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [4]:
# Preview the first rows of the test DataFrame (it has no Loan_Status column).
test_data.head()

Unnamed: 0_level_0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
LP001015,Male,Yes,0.0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
LP001022,Male,Yes,1.0,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
LP001031,Male,Yes,2.0,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
LP001035,Male,Yes,2.0,Graduate,No,2340,2546,100.0,360.0,,Urban
LP001051,Male,No,0.0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban


In [5]:
# Number of observations (rows) and variables (columns) in the training data.
train_data.shape

(614, 12)

In [6]:
# Number of observations (rows) and variables (columns) in the test data.
test_data.shape

(367, 11)

### Pre-Processing the Training Data set.

#### Feature Selection

In [7]:
# Based on the given data and business-domain knowledge, all the variables are considered important for model building.
# Thus we will not remove any variable.

#### Handling the Missing values

##### For Training Data

In [8]:
# Count the missing values in each column of the training data.
train_data.isnull().sum()

Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [9]:
# Impute the missing values of the columns treated as categorical in the
# training data with each column's most frequent value (its mode).

colname1 = ["Gender","Married","Dependents","Self_Employed","Loan_Amount_Term"]

for col in colname1:
    # Assign the filled column back to the frame. Calling
    # fillna(..., inplace=True) on the `train_data[col]` selection is a chained
    # in-place operation: with pandas Copy-on-Write it only modifies a
    # temporary object and leaves `train_data` unchanged.
    train_data[col] = train_data[col].fillna(train_data[col].mode()[0])

In [10]:
# Re-count missing values per column to confirm the mode imputation worked.
train_data.isnull().sum()

Gender                0
Married               0
Dependents            0
Education             0
Self_Employed         0
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term      0
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [11]:
# Impute the missing numerical "LoanAmount" values with the column mean.
# The result is assigned back rather than using fillna(..., inplace=True) on
# the column selection, which under pandas Copy-on-Write would not update
# `train_data`.
train_data["LoanAmount"] = train_data["LoanAmount"].fillna(train_data["LoanAmount"].mean())

In [12]:
# Re-count missing values per column to confirm the "LoanAmount" imputation worked.
train_data.isnull().sum()

Gender                0
Married               0
Dependents            0
Education             0
Self_Employed         0
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            0
Loan_Amount_Term      0
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [13]:
# Why "Credit_History" is treated differently from the other categorical variables:
# according to the problem statement, "Credit_History" represents whether the customer has ever taken a loan.
# If a customer has taken a loan in the past, it means they were eligible to receive a loan at that time,
# and this fact increases the customer's odds of getting the loan.

# If we imputed the missing values with the mode, and the mode value is "1", we would be asserting that
# all the customers with a missing "Credit_History" had received a loan in the past.
# That would increase their chances of getting the loan now, even if in reality they have never taken a loan before.

# To avoid this business-domain problem,
# and so that missing values do not increase anyone's chances of getting a loan,
# we will impute all the missing values with "0",
# as that gives the customer no advantage in getting the loan.

In [14]:
# Check the most frequent value (mode) of "Credit_History" in the training data.
train_data["Credit_History"].mode()

0    1.0
dtype: float64

In [15]:
# "Credit_History" mode is "1" but by above business domain logic we will replace it with "0"

In [16]:
# Impute all the missing "Credit_History" values with 0.
# The filled column is assigned back instead of using fillna(..., inplace=True)
# on the column selection, which under pandas Copy-on-Write would not update
# `train_data`.
train_data["Credit_History"] = train_data["Credit_History"].fillna(value = 0)

In [17]:
# Re-count missing values per column; every column should now report 0.
train_data.isnull().sum()

Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [18]:
# So now we have imputed all the missing values in the Training Data.

##### For Testing Data

In [19]:
# Count the missing values in each column of the test data.
test_data.isnull().sum()

Gender               11
Married               0
Dependents           10
Education             0
Self_Employed        23
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            5
Loan_Amount_Term      6
Credit_History       29
Property_Area         0
dtype: int64

In [20]:
# Impute the missing values of the columns treated as categorical in the test
# data with each column's most frequent value (its mode).
# NOTE(review): the fill values are the modes of the test data itself, not the
# modes of the training data; confirm that this is intended.

colname2 = ["Gender","Dependents","Self_Employed","Loan_Amount_Term"]

for col in colname2:
    # Assign the filled column back to the frame. Calling
    # fillna(..., inplace=True) on the `test_data[col]` selection is a chained
    # in-place operation: with pandas Copy-on-Write it only modifies a
    # temporary object and leaves `test_data` unchanged.
    test_data[col] = test_data[col].fillna(test_data[col].mode()[0])

In [21]:
# Re-count missing values per column to confirm the mode imputation worked.
test_data.isnull().sum()

Gender                0
Married               0
Dependents            0
Education             0
Self_Employed         0
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            5
Loan_Amount_Term      0
Credit_History       29
Property_Area         0
dtype: int64

In [22]:
# Impute the missing numerical "LoanAmount" values in the test data with the
# column mean. The result is assigned back rather than using
# fillna(..., inplace=True) on the column selection, which under pandas
# Copy-on-Write would not update `test_data`.
# NOTE(review): the mean is computed from the test data itself, not from the
# training data; confirm that this is intended.
test_data["LoanAmount"] = test_data["LoanAmount"].fillna(test_data["LoanAmount"].mean())

In [23]:
# Re-count missing values per column to confirm the "LoanAmount" imputation worked.
test_data.isnull().sum()

Gender                0
Married               0
Dependents            0
Education             0
Self_Employed         0
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            0
Loan_Amount_Term      0
Credit_History       29
Property_Area         0
dtype: int64

In [24]:
# Impute the missing "Credit_History" values in the test data with 0, the same
# fill value used for the training data.
# The filled column is assigned back instead of using fillna(..., inplace=True)
# on the column selection, which under pandas Copy-on-Write would not update
# `test_data`.
test_data["Credit_History"] = test_data["Credit_History"].fillna(value = 0)

In [25]:
# Re-count missing values per column; every column should now report 0.
test_data.isnull().sum()

Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
dtype: int64

In [26]:
# Thus we have now imputed all the missing values in the testing data.

#### Outlier Handling

In [27]:
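# A soft-margin Support Vector Machine (SVM) is relatively tolerant of outliers,
# because its decision boundary is determined by the support vectors rather than by every observation.
# Thus we do not remove the outliers here (note that extreme values can still influence the scaling step and the fit).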

### Converting Categorical data into Numerical Data

#### For Training data

In [28]:
# Inspect the data type of every column in the training data; the `object`
# columns are the ones that still hold text and need to be encoded.
train_data.dtypes

Gender                object
Married               object
Dependents           float64
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object

In [29]:
from sklearn.preprocessing import LabelEncoder

# Text columns of the training data that are converted to integer codes.
colname3 = ["Gender","Married","Education","Self_Employed","Property_Area","Loan_Status"]

# Keep one fitted encoder per column. Re-fitting a single shared encoder for
# every column discards each earlier mapping, so the text -> code mapping could
# neither be inspected (via `classes_`) nor re-applied later.
label_encoders = {}

for col in colname3:
    # `le` stays bound after the loop because a later cell calls
    # `le.fit_transform` on the test data.
    le = LabelEncoder()
    train_data[col] = le.fit_transform(train_data[col])
    label_encoders[col] = le

In [30]:
# Preview the training data to confirm the text columns now hold integer codes.
train_data.head()

Unnamed: 0_level_0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
LP001002,1,0,0.0,0,0,5849,0.0,146.412162,360.0,1.0,2,1
LP001003,1,1,1.0,0,0,4583,1508.0,128.0,360.0,1.0,0,0
LP001005,1,1,0.0,0,1,3000,0.0,66.0,360.0,1.0,2,1
LP001006,1,1,0.0,1,0,2583,2358.0,120.0,360.0,1.0,2,1
LP001008,1,0,0.0,0,0,6000,0.0,141.0,360.0,1.0,2,1


In [31]:
# For "Loan_Status"
# Y --> 1 --> Eligible
# N --> 0 --> Not Eligible

#### For Testing Data

In [32]:
# Inspect the data type of every column in the test data; the `object`
# columns are the ones that still hold text and need to be encoded.
test_data.dtypes

Gender                object
Married               object
Dependents           float64
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome      int64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
dtype: object

In [33]:
# Text columns of the test data that are converted to integer codes.
colname4 = ["Gender","Married","Education","Self_Employed","Property_Area"]

# NOTE(review): the encoder is re-fitted on each test column, so the integer
# codes only agree with the training encoding if the test column contains the
# same set of categories as the training column; confirm that this holds.
for col in colname4:
    test_data[col] = le.fit_transform(test_data[col])

In [34]:
# Preview the test data to confirm the text columns now hold integer codes.
test_data.head()

Unnamed: 0_level_0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
LP001015,1,1,0.0,0,0,5720,0,110.0,360.0,1.0,2
LP001022,1,1,1.0,0,0,3076,1500,126.0,360.0,1.0,2
LP001031,1,1,2.0,0,0,5000,1800,208.0,360.0,1.0,2
LP001035,1,1,2.0,0,0,2340,2546,100.0,360.0,0.0,2
LP001051,1,0,0.0,1,0,3276,0,78.0,360.0,1.0,2


In [35]:
# Now we are done with the conversion of categorical data into numerical data.

### Create X & Y

In [36]:
# Split the training frame into features and target: the last column
# (Loan_Status) is the label, and every column before it is a feature.
train_matrix = train_data.values
X_train = train_matrix[:, :-1]
Y_train = train_matrix[:, -1].astype(int)

In [37]:
# The test frame has no Loan_Status column, so every column is used as a feature.
X_test = test_data.values

### Scaling the data

In [38]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then apply that same fitted
# transformation to both the training and the test features.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [39]:
# Display the scaled training matrix to confirm that scaling was applied.
X_train

array([[ 0.47234264, -1.37208932, -0.73780632, ...,  0.2732313 ,
         0.54095432,  1.22329839],
       [ 0.47234264,  0.72881553,  0.25346957, ...,  0.2732313 ,
         0.54095432, -1.31851281],
       [ 0.47234264,  0.72881553, -0.73780632, ...,  0.2732313 ,
         0.54095432,  1.22329839],
       ...,
       [ 0.47234264,  0.72881553,  0.25346957, ...,  0.2732313 ,
         0.54095432,  1.22329839],
       [ 0.47234264,  0.72881553,  1.24474546, ...,  0.2732313 ,
         0.54095432,  1.22329839],
       [-2.11710719, -1.37208932, -0.73780632, ...,  0.2732313 ,
        -1.84858491, -0.04760721]])

In [40]:
# Display the scaled test matrix to confirm that scaling was applied.
X_test

array([[ 0.47234264,  0.72881553, -0.73780632, ...,  0.2732313 ,
         0.54095432,  1.22329839],
       [ 0.47234264,  0.72881553,  0.25346957, ...,  0.2732313 ,
         0.54095432,  1.22329839],
       [ 0.47234264,  0.72881553,  1.24474546, ...,  0.2732313 ,
         0.54095432,  1.22329839],
       ...,
       [ 0.47234264, -1.37208932, -0.73780632, ...,  0.2732313 ,
        -1.84858491, -0.04760721],
       [ 0.47234264,  0.72881553, -0.73780632, ...,  0.2732313 ,
         0.54095432, -1.31851281],
       [ 0.47234264, -1.37208932, -0.73780632, ..., -2.52283563,
         0.54095432, -1.31851281]])

### Building the Model

In [41]:
from sklearn.svm import SVC

# Fit an RBF-kernel support vector classifier on the scaled training data,
# then predict the Loan_Status code (1 or 0) for every row of the test data.
svc_model = SVC(C=1.0, kernel="rbf", gamma=0.1).fit(X_train, Y_train)
Y_pred = svc_model.predict(X_test)
print(Y_pred)

[1 1 1 0 1 1 1 0 1 1 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 1 1 1 0 1
 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 1 1 0 1 1 0 0 1 0 1 1 1 1
 1 1 1 1 1 1 0 1 0 1 0 1 1 1 1 1 0 1 1 1 0 1 1 1 1 0 1 0 1 1 0 1 0 1 1 1 1
 1 1 1 1 0 1 0 0 0 1 1 1 0 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 0 1 1 1 1 0
 1 1 1 1 1 0 1 1 1 1 1 1 1 0 1 1 0 0 0 1 1 1 1 1 1 0 0 1 1 0 1 0 1 1 1 1 1
 0 1 1 1 1 1 1 0 0 1 1 0 1 0 1 1 1 0 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 0 1
 1 1 0 1 1 1 1 0 1 1 1 1 1 0 0 1 1 0 1 0 1 0 1 0 1 1 1 1 0 1 1 1 1 0 1 1 1
 1 1 1 0 1 1 0 0 1 0 1 1 1 1 0 0 1 1 1 0 1 1 1 0 1 1 1 0 1 1 1 1 1 1 0 1 1
 1 1 1 1 1 0 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1
 1 1 1 0 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 0 1 1 1 0 1 0 1 1 1 0 1 1]


#### Now we create the Output Excel Sheet to provide to the client

In [42]:
# Re-read the original test file so that the client output shows the raw
# values rather than the imputed and encoded values held in `test_data`.
new_test_data = pd.read_csv(
    r"D:\data science\Python - Anaconda\SVM\Risk Analytics\risk_analytics_test.csv",
    header=0,
    index_col=0,
)

In [43]:
# Add the predictions as a new column. `Y_pred` is matched to the rows by
# position, which relies on `new_test_data` having the same row order as the
# data the predictions were made from.
new_test_data = new_test_data.assign(Y_Predictions=Y_pred)
new_test_data.head()

Unnamed: 0_level_0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Y_Predictions
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
LP001015,Male,Yes,0.0,Graduate,No,5720,0,110.0,360.0,1.0,Urban,1
LP001022,Male,Yes,1.0,Graduate,No,3076,1500,126.0,360.0,1.0,Urban,1
LP001031,Male,Yes,2.0,Graduate,No,5000,1800,208.0,360.0,1.0,Urban,1
LP001035,Male,Yes,2.0,Graduate,No,2340,2546,100.0,360.0,,Urban,0
LP001051,Male,No,0.0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban,1


In [44]:
# Now "Y_Predictions" are in 1 and 0.
# The client won't understand the meaning of 1 and 0.
# Thus we have to convert the "0" and "1" into something meaningful.

In [45]:
# Translate the numeric predictions into client-friendly text.
# Re-running this cell would map the already-translated text to NaN, because
# the text values are not keys of the mapping.
eligibility_labels = {1: "Eligible", 0: "Not Eligible"}
new_test_data["Y_Predictions"] = new_test_data["Y_Predictions"].map(eligibility_labels)

In [46]:
# Preview the output frame to confirm the predictions now read as text.
new_test_data.head()

Unnamed: 0_level_0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Y_Predictions
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
LP001015,Male,Yes,0.0,Graduate,No,5720,0,110.0,360.0,1.0,Urban,Eligible
LP001022,Male,Yes,1.0,Graduate,No,3076,1500,126.0,360.0,1.0,Urban,Eligible
LP001031,Male,Yes,2.0,Graduate,No,5000,1800,208.0,360.0,1.0,Urban,Eligible
LP001035,Male,Yes,2.0,Graduate,No,2340,2546,100.0,360.0,,Urban,Not Eligible
LP001051,Male,No,0.0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban,Eligible


In [47]:
# Write the test data together with its predictions to an Excel workbook,
# keeping both the Loan_ID index and the column headers.
output_path = r"D:\data science\Python - Anaconda\SVM\Risk Analytics\predicted_test_risk.xlsx"
new_test_data.to_excel(output_path, header=True, index=True)

In [48]:
# Thus we have created the output excel sheet which can be provided to the client.

In [49]:
# Count how many customers were predicted "Eligible" versus "Not Eligible".
new_test_data["Y_Predictions"].value_counts()

Eligible        285
Not Eligible     82
Name: Y_Predictions, dtype: int64

### Evaluating the model

In [50]:
# Score the fitted model on the same training data it was fitted on.
# This is a training-set score, not an estimate of performance on unseen data.
svc_model.score(X_train, Y_train)

0.7947882736156352

In [51]:
# Thus from the above we can say that the model has an accuracy of about 79.48% on the training data it was fitted on.

### Cross Validation

In [52]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold, cross_val_score

# Candidate classifiers; only one is active at a time. The percentages are the
# scores noted next to each candidate (presumably the mean cross-validation
# score of an earlier run):
#   SVC(kernel = "rbf", gamma = 0.1, C = 1.0)                     -> 75.89%
#   SVC(kernel = "rbf", gamma = 0.001, C = 10.0)                  -> 77.03%
#   LogisticRegression()                                          -> 77.20%
#   DecisionTreeClassifier(min_samples_leaf=3, random_state=10)   -> not recorded
classifier = KNeighborsClassifier(metric="euclidean", n_neighbors=11)  # 75.07%

# K-fold cross-validation with 10 splits; printing the splitter shows its settings.
kfold_cv = KFold(n_splits=10)
print(kfold_cv)

# No `scoring` argument is passed, so each fold is scored with the
# classifier's own default score.
kfold_cv_result = cross_val_score(classifier, X_train, Y_train, cv=kfold_cv)
print(kfold_cv_result)

# Mean score across the folds.
print(kfold_cv_result.mean())

KFold(n_splits=10, random_state=None, shuffle=False)
[0.75806452 0.82258065 0.75806452 0.69354839 0.75409836 0.6557377
 0.75409836 0.7704918  0.7704918  0.7704918 ]
0.7507667900581703


## How to save a model to a file so you can give it to a developer?

In [53]:
import pickle

In [54]:
# Save the fitted model to disk.
# NOTE: only the classifier is saved. It was fitted on features transformed by
# `scaler`, so whoever loads it must apply the same scaling before predicting.
filename = r"D:\data science\Python - Anaconda\SVM\Risk Analytics\svc_model.sav"

# The `with` block closes the file handle even if pickling fails; passing a
# bare open(...) to pickle.dump leaves the handle open.
with open(filename, "wb") as model_file:
    pickle.dump(svc_model, model_file)

## How to load the model from the disk?

In [55]:
# Load the model back from disk and confirm that it can still predict.
# NOTE: unpickling can execute arbitrary code, so only load files you trust.
# The `with` block makes sure the file handle is closed after loading.
with open(filename, "rb") as model_file:
    loaded_model = pickle.load(model_file)

Y_pred = loaded_model.predict(X_test)
Y_pred

array([1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1,
       1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1,
       0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,