In [1]:
# Import libraries
import numpy as np 
import pandas as pd
from sklearn import preprocessing
from sklearn import decomposition

from flp_dual_svm_ls import FlpDualLSSVM

In [2]:
# Read dataset

dataset = pd.read_csv("heart.csv")
dataset

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [3]:
# Data separation from tarjet variable

X = dataset.iloc[:, :dataset.shape[1] - 1]
y = dataset.iloc[:, dataset.shape[1] - 1]

In [4]:
# One hot encoding to variables cp and test_ecg

cp_dummies = pd.get_dummies(X.cp, prefix="cp")
X = pd.concat([X, cp_dummies], axis=1)
X = X.drop(["cp"], axis=1)

restecg_dummies = pd.get_dummies(X.restecg, prefix="restecg")
X = pd.concat([X, restecg_dummies], axis=1)
X = X.drop(["restecg"], axis=1)

X

Unnamed: 0,age,sex,trtbps,chol,fbs,thalachh,exng,oldpeak,slp,caa,thall,cp_0,cp_1,cp_2,cp_3,restecg_0,restecg_1,restecg_2
0,63,1,145,233,1,150,0,2.3,0,0,1,0,0,0,1,1,0,0
1,37,1,130,250,0,187,0,3.5,0,0,2,0,0,1,0,0,1,0
2,41,0,130,204,0,172,0,1.4,2,0,2,0,1,0,0,1,0,0
3,56,1,120,236,0,178,0,0.8,2,0,2,0,1,0,0,0,1,0
4,57,0,120,354,0,163,1,0.6,2,0,2,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,140,241,0,123,1,0.2,1,0,3,1,0,0,0,0,1,0
299,45,1,110,264,0,132,0,1.2,1,0,3,0,0,0,1,0,1,0
300,68,1,144,193,1,141,0,3.4,1,2,3,1,0,0,0,0,1,0
301,57,1,130,131,0,115,1,1.2,1,1,3,1,0,0,0,0,1,0


In [5]:
# Scaling data matrix

scaler = preprocessing.StandardScaler()
X = scaler.fit_transform(X)
X

array([[ 0.9521966 ,  0.68100522,  0.76395577, ...,  1.03015751,
        -1.00330579, -0.11566299],
       [-1.91531289,  0.68100522, -0.09273778, ..., -0.97072534,
         0.9967051 , -0.11566299],
       [-1.47415758, -1.46841752, -0.09273778, ...,  1.03015751,
        -1.00330579, -0.11566299],
       ...,
       [ 1.50364073,  0.68100522,  0.70684287, ..., -0.97072534,
         0.9967051 , -0.11566299],
       [ 0.29046364,  0.68100522, -0.09273778, ..., -0.97072534,
         0.9967051 , -0.11566299],
       [ 0.29046364, -1.46841752, -0.09273778, ...,  1.03015751,
        -1.00330579, -0.11566299]])

In [6]:
# PCA application

pca = decomposition.PCA(n_components=10)
pca.fit(X)
print("Explained variance =", sum(pca.explained_variance_ratio_))

X = pca.transform(X)
X

Explained variance = 0.8083603217581656


array([[ 1.25553604,  2.65571067,  1.55807171, ..., -1.42075039,
        -0.53499617, -0.28448603],
       [-1.07638438, -0.93312063,  1.47398236, ...,  1.10866222,
        -1.7581646 ,  0.58030621],
       [-1.91922965,  1.57762633, -1.60605915, ...,  0.7407146 ,
        -0.87326308,  0.24630816],
       ...,
       [ 2.02291247, -1.50139988,  1.66371979, ...,  0.44751016,
        -0.41793094,  0.45030942],
       [ 1.48732588, -2.93400775, -0.26700887, ..., -0.24279765,
         0.14563255, -0.43569588],
       [-1.1246483 ,  1.92927186, -0.88011566, ...,  1.02149572,
        -0.81785933, -0.64405812]])

In [9]:
# y mapping to -1 and 1. Also, we extend de dimension of y

y = pd.Series(y).map({0: -1, 1: 1}).values
y = np.expand_dims(y, axis=1)
y

ValueError: Data must be 1-dimensional

In [10]:
# Train test split

def split_dataset(X, y, train_percentage):
    size_train = int(train_percentage * X.shape[0])

    indices = np.random.permutation(X.shape[0])
    training_idx, test_idx = indices[:size_train], indices[size_train:]
    X_train, X_test = X[training_idx,:], X[test_idx,:]
    y_train, y_test = y[training_idx,:], y[test_idx,:]
    return X_train, X_test, y_train, y_test


train_percentage = 0.7
X_train, X_test, y_train, y_test = split_dataset(X, y, train_percentage)

In [99]:
# Clean SVM training

for lamb in range(1, 20):
    for deg in range(1, 8):
        svm = FlpDualLSSVM(lr=1e-2, lambd=lamb, kernel="poly", degree=deg)
        svm.fit(X_train, y_train)

        print("lamb =", lamb)
        print("deg =", deg)
        print("Train score =", svm.score(X_train, y_train))
        print("Test score =", svm.score(X_test, y_test))
        print("---")

lamb = 1
deg = 1
Train score = 0.8632075471698113
Test score = 0.7692307692307693
---
lamb = 1
deg = 2
Train score = 0.6273584905660378
Test score = 0.5274725274725275
---
lamb = 1
deg = 3
Train score = 0.7264150943396226
Test score = 0.6263736263736264
---
lamb = 1
deg = 4
Train score = 0.8726415094339622
Test score = 0.8131868131868132
---
lamb = 1
deg = 5
Train score = 0.8726415094339622
Test score = 0.7912087912087912
---
lamb = 1
deg = 6
Train score = 0.8820754716981132
Test score = 0.7582417582417582
---
lamb = 1
deg = 7
Train score = 0.8915094339622641
Test score = 0.7912087912087912
---
lamb = 2
deg = 1
Train score = 0.8632075471698113
Test score = 0.7692307692307693
---
lamb = 2
deg = 2
Train score = 0.8584905660377359
Test score = 0.8131868131868132
---
lamb = 2
deg = 3
Train score = 0.7169811320754716
Test score = 0.6593406593406593
---
lamb = 2
deg = 4
Train score = 0.8915094339622641
Test score = 0.7472527472527473
---
lamb = 2
deg = 5
Train score = 0.8679245283018868
Test

In [33]:
svm = FlpDualLSSVM(lr=1e-2, lambd=3, kernel="linear")
svm.fit(X_train, y_train)
print("Train score =", svm.score(X_train, y_train))
print("Test score =", svm.score(X_test, y_test))

Train score = 0.8490566037735849
Test score = 0.7912087912087912
