# Clean parameter tunning

In [8]:
# Import libraries
import numpy as np 
import pandas as pd
from sklearn import preprocessing
from sklearn import decomposition
from sklearn.model_selection import KFold

from flp_dual_svm_ls import FlpDualLSSVM

In [9]:
# Read dataset

dataset = pd.read_csv("datasets/heart.csv")
dataset

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [10]:
# Data separation from tarjet variable

X = dataset.iloc[:, :dataset.shape[1] - 1]
y = dataset.iloc[:, dataset.shape[1] - 1]

## One-hot encoding

In [11]:
# One hot encoding to variables cp and test_ecg

cp_dummies = pd.get_dummies(X.cp, prefix="cp")
X = pd.concat([X, cp_dummies], axis=1)
X = X.drop(["cp"], axis=1)

restecg_dummies = pd.get_dummies(X.restecg, prefix="restecg")
X = pd.concat([X, restecg_dummies], axis=1)
X = X.drop(["restecg"], axis=1)

slp_dummies = pd.get_dummies(X.slp, prefix="slp")
X = pd.concat([X, slp_dummies], axis=1)
X = X.drop(["slp"], axis=1)

thall_dummies = pd.get_dummies(X.thall, prefix="thall")
X = pd.concat([X, thall_dummies], axis=1)
X = X.drop(["thall"], axis=1)

X

Unnamed: 0,age,sex,trtbps,chol,fbs,thalachh,exng,oldpeak,caa,cp_0,...,restecg_0,restecg_1,restecg_2,slp_0,slp_1,slp_2,thall_0,thall_1,thall_2,thall_3
0,63,1,145,233,1,150,0,2.3,0,0,...,1,0,0,1,0,0,0,1,0,0
1,37,1,130,250,0,187,0,3.5,0,0,...,0,1,0,1,0,0,0,0,1,0
2,41,0,130,204,0,172,0,1.4,0,0,...,1,0,0,0,0,1,0,0,1,0
3,56,1,120,236,0,178,0,0.8,0,0,...,0,1,0,0,0,1,0,0,1,0
4,57,0,120,354,0,163,1,0.6,0,1,...,0,1,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,140,241,0,123,1,0.2,0,1,...,0,1,0,0,1,0,0,0,0,1
299,45,1,110,264,0,132,0,1.2,0,0,...,0,1,0,0,1,0,0,0,0,1
300,68,1,144,193,1,141,0,3.4,2,1,...,0,1,0,0,1,0,0,0,0,1
301,57,1,130,131,0,115,1,1.2,1,1,...,0,1,0,0,1,0,0,0,0,1


## Scaling

In [12]:
# Scaling data matrix

scaler = preprocessing.StandardScaler()
X = scaler.fit_transform(X)
X

array([[ 0.9521966 ,  0.68100522,  0.76395577, ...,  3.97911213,
        -1.10076284, -0.79311554],
       [-1.91531289,  0.68100522, -0.09273778, ..., -0.25131234,
         0.9084609 , -0.79311554],
       [-1.47415758, -1.46841752, -0.09273778, ..., -0.25131234,
         0.9084609 , -0.79311554],
       ...,
       [ 1.50364073,  0.68100522,  0.70684287, ..., -0.25131234,
        -1.10076284,  1.26085034],
       [ 0.29046364,  0.68100522, -0.09273778, ..., -0.25131234,
        -1.10076284,  1.26085034],
       [ 0.29046364, -1.46841752, -0.09273778, ..., -0.25131234,
         0.9084609 , -0.79311554]])

## Dimensionality reduction

In [13]:
# PCA application

pca = decomposition.PCA(n_components=14)
pca.fit(X)
print("Explained variance =", sum(pca.explained_variance_ratio_))

X = pca.transform(X)
X

Explained variance = 0.8831147319297772


array([[ 1.59935937,  2.27084405, -1.6483641 , ..., -1.12343698,
        -0.76456805,  1.5849352 ],
       [-1.45620051, -0.66850965,  0.48936168, ..., -0.29633124,
        -0.03418751,  1.15907944],
       [-2.61612244,  1.36498586, -0.76279887, ...,  0.45085843,
         0.39423145, -0.78109535],
       ...,
       [ 2.80278142, -1.46042039,  0.14376551, ...,  0.84401062,
         1.3468743 , -0.4030251 ],
       [ 2.46051197, -2.7806753 ,  0.43588033, ...,  0.81287506,
         0.07189738, -1.67941185],
       [-1.18794755,  1.91801721,  0.73819437, ...,  0.68789131,
         0.42197561, -0.4677488 ]])

In [14]:
# y mapping to 0 and 1. Also, we extend de dimension of y
y = pd.Series(y).map({0: -1, 1: 1}).values
y = np.expand_dims(y, axis=1)
y

array([[ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],

In [26]:

kfold = KFold(n_splits=10)
train_scores = []
test_scores = []
for train_idx, test_idx in kfold.split(X):
    X_train, X_test, y_train, y_test = X[train_idx, :], X[test_idx, :], y[train_idx, :], y[test_idx, :]
    svm = FlpDualLSSVM(lr=1e-2, lambd=1, kernel="sigmoidal", max_iter=80, gamma=1/X_train.shape[1], r=0)
    svm.fit(X_train, y_train)
    train_scores.append(svm.score(X_train, y_train))
    test_scores.append(svm.score(X_test, y_test))

print("Train mean =", np.mean(train_scores))
print("Test mean =", np.mean(test_scores))
print("---")

Train mean = 0.8052682611506141
Test mean = 0.7786021505376345
---


## Parameter selection

In [27]:
# Clean SVM training

lambda_values = [2, 4, 8, 16]
deg_values = [1, 2, 3, 4, 5]
deg_values = [1, 2]
 
for deg in deg_values:
    for lamb in lambda_values:
        kfold = KFold(n_splits=10)
        train_scores = []
        test_scores = []
        for train_idx, test_idx in kfold.split(X):
            X_train, X_test, y_train, y_test = X[train_idx, :], X[test_idx, :], y[train_idx, :], y[test_idx, :]
            svm = FlpDualLSSVM(lr=1e-2, lambd=lamb, kernel="poly", degree=deg)
            svm.fit(X_train, y_train)
            train_scores.append(svm.score(X_train, y_train))
            test_scores.append(svm.score(X_test, y_test))

        print("lamb =", lamb)
        print("deg =", deg)
        print("Train mean =", np.mean(train_scores))
        print("Test mean =", np.mean(test_scores))
        print("---")

lamb = 2
deg = 1
Train mean = 0.8503757272139627
Test mean = 0.801505376344086
---


KeyboardInterrupt: 

### Selection test

This evidence shows that a linear kernel with max_iter = 80, lr = 1e-2, degree = 3 and lambda = 2 is a good selection of parameters.

# Dataset preparation for MPC

## Utilities

In [15]:
# Train test split for one train-test experiment 

def split_dataset(X, y, train_percentage):
    size_train = int(train_percentage * X.shape[0])

    indices = np.random.permutation(X.shape[0])
    training_idx, test_idx = indices[:size_train], indices[size_train:]
    X_train, X_test = X[training_idx,:], X[test_idx,:]
    y_train, y_test = y[training_idx,:], y[test_idx,:]
    return X_train, X_test, y_train, y_test


def select_subset(X, y, size_train):
    indices = np.random.permutation(size_train)
    X_subset = X[indices, :]
    y_subset = y[indices, :]
    return X_subset, y_subset


def save_dataset_csv(X, y, label):
    df_save = pd.DataFrame(data=np.append(X, y, axis=1))
    file_name = "real_dataset_" + label + ".csv"
    df_save.to_csv(file_name, index=False, columns=None)


def save_dataset_parties(X, y, n_parties):
    n_rows = X.shape[0]
    n_cols = X.shape[1]
    rows_per_party = n_rows // n_parties
    last_party = 0 
    if n_rows % n_parties != 0:
        last_party = rows_per_party + (n_rows % n_parties)
    else:
        last_party = rows_per_party
    
    party_info_X = []
    party_info_y = []
    for i in range(n_parties - 1):
        party_X_rows = []
        party_y_rows = []
        for j in range(rows_per_party):
            party_X_rows.append(X[j + i * rows_per_party].tolist())
            party_y_rows.append(y[j + i * rows_per_party][0])
        party_info_X.append(party_X_rows)
        party_info_y.append(party_y_rows)

    # Last party
    party_X_rows = []
    party_y_rows = []
    for j in range(last_party):
        party_X_rows.append(X[j + rows_per_party * (n_parties - 1)].tolist())
        party_y_rows.append(y[j + rows_per_party * (n_parties - 1)][0])
    party_info_X.append(party_X_rows)
    party_info_y.append(party_y_rows)

    for i in range(n_parties - 1):
        file_name = "Input-P" + str(i) + "-0"
        file = open(file_name, "w")
        file_str = ""
        for j in range(rows_per_party):
            for k in range(n_cols):
                file_str += str(party_info_X[i][j][k]) + " "
            file_str = file_str.strip()
            file_str += "\n"
        
        for j in range(rows_per_party):
            file_str += str(party_info_y[i][j]) + "\n"
        
        file.write(file_str)
        file.close()
    
    # Last party write
    file_name = "Input-P" + str(n_parties - 1) + "-0"
    file = open(file_name, "w")
    file_str = ""
    for j in range(last_party):
        for k in range(n_cols):
            file_str += str(party_info_X[n_parties - 1][j][k]) + " "
        file_str = file_str.strip()
        file_str += "\n"
    
    for j in range(last_party):
        file_str += str(party_info_y[n_parties - 1][j]) + "\n"
    
    file.write(file_str)
    file.close()

In [16]:
# Parameters
test_percentage = 0.3
n_parties = 4

# Data selection
train_percentage_temp = 1 - test_percentage
X_train, X_test, y_train, y_test = split_dataset(X, y, train_percentage_temp)

# Data saving to .csv
save_dataset_csv(X_train, y_train, "train")
save_dataset_csv(X_test, y_test, "test")

# Save data for MPC
save_dataset_parties(X_train, y_train, n_parties)

In [17]:
X_train.shape

(212, 14)