# Clean parameter tunning

In [9]:
# Import libraries
import numpy as np 
import pandas as pd
from sklearn import preprocessing
from sklearn import decomposition
from sklearn.model_selection import KFold

from flp_dual_svm_ls import FlpDualLSSVM

In [10]:
# Read dataset

dataset = pd.read_csv("heart.csv")
dataset

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [11]:
# Data separation from tarjet variable

X = dataset.iloc[:, :dataset.shape[1] - 1]
y = dataset.iloc[:, dataset.shape[1] - 1]

## One-hot encoding

In [12]:
# One hot encoding to variables cp and test_ecg

cp_dummies = pd.get_dummies(X.cp, prefix="cp")
X = pd.concat([X, cp_dummies], axis=1)
X = X.drop(["cp"], axis=1)

restecg_dummies = pd.get_dummies(X.restecg, prefix="restecg")
X = pd.concat([X, restecg_dummies], axis=1)
X = X.drop(["restecg"], axis=1)

X

Unnamed: 0,age,sex,trtbps,chol,fbs,thalachh,exng,oldpeak,slp,caa,thall,cp_0,cp_1,cp_2,cp_3,restecg_0,restecg_1,restecg_2
0,63,1,145,233,1,150,0,2.3,0,0,1,0,0,0,1,1,0,0
1,37,1,130,250,0,187,0,3.5,0,0,2,0,0,1,0,0,1,0
2,41,0,130,204,0,172,0,1.4,2,0,2,0,1,0,0,1,0,0
3,56,1,120,236,0,178,0,0.8,2,0,2,0,1,0,0,0,1,0
4,57,0,120,354,0,163,1,0.6,2,0,2,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,140,241,0,123,1,0.2,1,0,3,1,0,0,0,0,1,0
299,45,1,110,264,0,132,0,1.2,1,0,3,0,0,0,1,0,1,0
300,68,1,144,193,1,141,0,3.4,1,2,3,1,0,0,0,0,1,0
301,57,1,130,131,0,115,1,1.2,1,1,3,1,0,0,0,0,1,0


## Scaling

In [13]:
# Scaling data matrix

scaler = preprocessing.StandardScaler()
X = scaler.fit_transform(X)
X

array([[ 0.9521966 ,  0.68100522,  0.76395577, ...,  1.03015751,
        -1.00330579, -0.11566299],
       [-1.91531289,  0.68100522, -0.09273778, ..., -0.97072534,
         0.9967051 , -0.11566299],
       [-1.47415758, -1.46841752, -0.09273778, ...,  1.03015751,
        -1.00330579, -0.11566299],
       ...,
       [ 1.50364073,  0.68100522,  0.70684287, ..., -0.97072534,
         0.9967051 , -0.11566299],
       [ 0.29046364,  0.68100522, -0.09273778, ..., -0.97072534,
         0.9967051 , -0.11566299],
       [ 0.29046364, -1.46841752, -0.09273778, ...,  1.03015751,
        -1.00330579, -0.11566299]])

## Dimensionality reduction

In [14]:
# PCA application

pca = decomposition.PCA(n_components=10)
pca.fit(X)
print("Explained variance =", sum(pca.explained_variance_ratio_))

X = pca.transform(X)
X

Explained variance = 0.8083603217581655


array([[ 1.25553604,  2.65571067,  1.55807171, ..., -1.42075039,
        -0.53499617, -0.28448603],
       [-1.07638438, -0.93312063,  1.47398236, ...,  1.10866222,
        -1.7581646 ,  0.58030621],
       [-1.91922965,  1.57762633, -1.60605915, ...,  0.7407146 ,
        -0.87326308,  0.24630816],
       ...,
       [ 2.02291247, -1.50139988,  1.66371979, ...,  0.44751016,
        -0.41793094,  0.45030942],
       [ 1.48732588, -2.93400775, -0.26700887, ..., -0.24279765,
         0.14563255, -0.43569588],
       [-1.1246483 ,  1.92927186, -0.88011566, ...,  1.02149572,
        -0.81785933, -0.64405812]])

In [15]:
# y mapping to -1 and 1. Also, we extend de dimension of y

y = pd.Series(y).map({0: -1, 1: 1}).values
y = np.expand_dims(y, axis=1)
y

array([[ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],

## Parameter selection

In [22]:
# Clean SVM training

lambda_values = [2, 4, 8, 16]
deg_values = [1, 2, 3, 4, 5]
 
for deg in deg_values:
    for lamb in lambda_values:
        kfold = KFold(n_splits=10)
        train_scores = []
        test_scores = []
        for train_idx, test_idx in kfold.split(X):
            X_train, X_test, y_train, y_test = X[train_idx, :], X[test_idx, :], y[train_idx, :], y[test_idx, :]
            svm = FlpDualLSSVM(lr=1e-2, lambd=lamb, kernel="poly", degree=deg)
            svm.fit(X_train, y_train)
            train_scores.append(svm.score(X_train, y_train))
            test_scores.append(svm.score(X_test, y_test))

        print("lamb =", lamb)
        print("deg =", deg)
        print("Train mean =", np.mean(train_scores))
        print("Test mean =", np.mean(test_scores))
        print("---")

lamb = 2
deg = 1
Train mean = 0.8412087912087911
Test mean = 0.8010752688172044
---
lamb = 4
deg = 1
Train mean = 0.8415750915750916
Test mean = 0.7978494623655913
---
lamb = 8
deg = 1
Train mean = 0.839746283128636
Test mean = 0.8076344086021505
---
lamb = 16
deg = 1
Train mean = 0.8294925662572721
Test mean = 0.7980645161290323
---
lamb = 2
deg = 2
Train mean = 0.6853089312648136
Test mean = 0.6743010752688172
---
lamb = 4
deg = 2
Train mean = 0.6988849385908209
Test mean = 0.6409677419354838
---
lamb = 8
deg = 2
Train mean = 0.7129363283775049
Test mean = 0.6636559139784947
---
lamb = 16
deg = 2
Train mean = 0.6908707713854774
Test mean = 0.6305376344086022
---
lamb = 2
deg = 3
Train mean = 0.8151651045033397
Test mean = 0.7189247311827958
---
lamb = 4
deg = 3
Train mean = 0.7767197263520794
Test mean = 0.7156989247311829
---
lamb = 8
deg = 3
Train mean = 0.7946186166774402
Test mean = 0.7220430107526882
---
lamb = 16
deg = 3
Train mean = 0.7888022516698987
Test mean = 0.68182795698

In [None]:
# Clean SVM training

lambda_values = [2, 4, 8, 16]
deg_values = [1, 2, 3, 4, 5]
 
for deg in deg_values:
    for lamb in lambda_values:
        kfold = KFold(n_splits=10)
        train_scores = []
        test_scores = []
        for train_idx, test_idx in kfold.split(X):
            X_train, X_test, y_train, y_test = X[train_idx, :], X[test_idx, :], y[train_idx, :], y[test_idx, :]
            svm = FlpDualLSSVM(lr=1e-2, lambd=lamb, kernel="poly", degree=deg)
            svm.fit(X_train, y_train)
            train_scores.append(svm.score(X_train, y_train))
            test_scores.append(svm.score(X_test, y_test))

        print("lamb =", lamb)
        print("deg =", deg)
        print("Train mean =", np.mean(train_scores))
        print("Test mean =", np.mean(test_scores))
        print("---")

In [19]:

kfold = KFold(n_splits=10)
train_scores = []
test_scores = []
for train_idx, test_idx in kfold.split(X):
    X_train, X_test, y_train, y_test = X[train_idx, :], X[test_idx, :], y[train_idx, :], y[test_idx, :]
    svm = FlpDualLSSVM(lr=1e-2, lambd=1, kernel="sigmoidal", max_iter=80, gamma=1/X_train.shape[1], r=0)
    svm.fit(X_train, y_train)
    train_scores.append(svm.score(X_train, y_train))
    test_scores.append(svm.score(X_test, y_test))

print("Train mean =", np.mean(train_scores))
print("Test mean =", np.mean(test_scores))
print("---")

Train mean = 0.7876562163326868
Test mean = 0.7458064516129033
---


### Selection test

This evidence shows that a linear kernel with max_iter = 80, lr = 1e-2 and lambda = 8 is a good selection of parameters.

# Dataset preparation for MPC

## Utilities

In [10]:
# Train test split for one train-test experiment 

def split_dataset(X, y, train_percentage):
    size_train = int(train_percentage * X.shape[0])

    indices = np.random.permutation(X.shape[0])
    training_idx, test_idx = indices[:size_train], indices[size_train:]
    X_train, X_test = X[training_idx,:], X[test_idx,:]
    y_train, y_test = y[training_idx,:], y[test_idx,:]
    return X_train, X_test, y_train, y_test


def select_subset(X, y, size_train):
    indices = np.random.permutation(size_train)
    X_subset = X[indices, :]
    y_subset = y[indices, :]
    return X_subset, y_subset


def save_dataset_csv(X, y, label):
    df_save = pd.DataFrame(data=np.append(X, y, axis=1))
    file_name = "real_dataset_" + label + ".csv"
    df_save.to_csv(file_name, index=False, columns=None)


def save_dataset_parties(X, y, n_parties):
    n_rows = X.shape[0]
    n_cols = X.shape[1]
    rows_per_party = n_rows // n_parties
    last_party = 0 
    if n_rows % n_parties != 0:
        last_party = rows_per_party + (n_rows % n_parties)
    else:
        last_party = rows_per_party
    
    party_info_X = []
    party_info_y = []
    for i in range(n_parties - 1):
        party_X_rows = []
        party_y_rows = []
        for j in range(rows_per_party):
            party_X_rows.append(X[j + i * rows_per_party].tolist())
            party_y_rows.append(y[j + i * rows_per_party][0])
        party_info_X.append(party_X_rows)
        party_info_y.append(party_y_rows)

    # Last party
    party_X_rows = []
    party_y_rows = []
    for j in range(last_party):
        party_X_rows.append(X[j + rows_per_party * (n_parties - 1)].tolist())
        party_y_rows.append(y[j + rows_per_party * (n_parties - 1)][0])
    party_info_X.append(party_X_rows)
    party_info_y.append(party_y_rows)

    for i in range(n_parties - 1):
        file_name = "Input-P" + str(i) + "-0"
        file = open(file_name, "w")
        file_str = ""
        for j in range(rows_per_party):
            for k in range(n_cols):
                file_str += str(party_info_X[i][j][k]) + " "
            file_str = file_str.strip()
            file_str += "\n"
        
        for j in range(rows_per_party):
            file_str += str(party_info_y[i][j]) + "\n"
        
        file.write(file_str)
        file.close()
    
    # Last party write
    file_name = "Input-P" + str(n_parties - 1) + "-0"
    file = open(file_name, "w")
    file_str = ""
    for j in range(last_party):
        for k in range(n_cols):
            file_str += str(party_info_X[n_parties - 1][j][k]) + " "
        file_str = file_str.strip()
        file_str += "\n"
    
    for j in range(last_party):
        file_str += str(party_info_y[n_parties - 1][j]) + "\n"
    
    file.write(file_str)
    file.close()

In [17]:
# Parameters
test_percentage = 0.3
n_parties = 4

# Data selection
train_percentage_temp = 1 - test_percentage
X_train, X_test, y_train, y_test = split_dataset(X, y, train_percentage_temp)

# Data saving to .csv
save_dataset_csv(X_train, y_train, "train")
save_dataset_csv(X_test, y_test, "test")

# Save data for MPC
save_dataset_parties(X_train, y_train, n_parties)

In [19]:
X_train.shape

(212, 10)