In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import csv
from sklearn import metrics

from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

In [2]:
# read in data
df = pd.read_csv('../Data/Cluster0', delimiter=',')
print(df.head())

   Unnamed: 0  id    age  gender  height  weight  ap_hi  ap_lo  cholesterol  \
0           1   1  20228       1     156    85.0    140     90            3   
1           2   2  18857       1     165    64.0    130     70            3   
2           6   9  22113       1     157    93.0    130     80            3   
3          22  32  23046       1     158    90.0    145     85            2   
4          24  35  16608       1     170    68.0    150     90            3   

   gluc  smoke  alco  active  cardio  
0     1      0     0       1       1  
1     1      0     0       0       1  
2     1      0     0       1       0  
3     2      0     0       1       1  
4     1      0     0       1       1  


In [3]:
df_feat = df.iloc[:,2:-1] # eventhing but last column
df_feat = df_feat.drop(columns=['smoke', 'alco'])

target = df.iloc[:,-1] #last column

df_feat.head() # age in days

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,active
0,20228,1,156,85.0,140,90,3,1,1
1,18857,1,165,64.0,130,70,3,1,0
2,22113,1,157,93.0,130,80,3,1,1
3,23046,1,158,90.0,145,85,2,2,1
4,16608,1,170,68.0,150,90,3,1,1


In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

ct = ColumnTransformer([
    ('nonBinaryTransformer', StandardScaler(), ['age', 'height', 'weight', 'ap_hi', 'ap_lo', 'cholesterol', 'gluc'])
], remainder='passthrough')

scaled_features = ct.fit_transform(df_feat)
print(scaled_features)

[[-0.23486806 -0.64100667  0.53482332 ... -0.46846662  1.
   1.        ]
 [-0.96390798  0.62270947 -0.84223205 ... -0.46846662  1.
   0.        ]
 [ 0.76749536 -0.50059377  1.05941583 ... -0.46846662  1.
   1.        ]
 ...
 [-0.03226834 -0.21976796 -0.31763953 ...  2.13462381  1.
   1.        ]
 [ 1.02699316 -0.36018086  3.22335997 ...  2.13462381  1.
   1.        ]
 [-0.06895963  1.32477399 -0.31763953 ... -0.46846662  1.
   1.        ]]


In [5]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(scaled_features, target, train_size=0.7)

In [12]:
# Test all combinations of hyperparms including different kernels.
hyperParams_grid = {'C': [.1, .2, .3, .4, .5, .6, .7, .8, .9, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 100, 1000],
                  'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                  'kernel': ['linear', 'rbf', 'sigmoid']}
 
grid = GridSearchCV(SVC(), hyperParams_grid, refit = True, verbose = 2)
 
# Fit all permutations on training set
grid.fit(x_train, y_train)
print(grid.best_params_)

# Predict using best model on test set
grid_predictions = grid.predict(x_test)
print("Accuracy:", metrics.accuracy_score(y_test, grid_predictions))
print(classification_report(y_test, grid_predictions))

Fitting 5 folds for each of 315 candidates, totalling 1575 fits
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   0.9s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   0.6s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   0.4s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   0.7s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   0.7s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   1.1s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   1.0s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   1.0s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   1.2s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   1.0s
[CV] END .....................C=0.1, gamma=1, kernel=sigmoid; total time=   0.9s
[CV] END .....................C=0.1, gamma=1,

[CV] END ...................C=0.2, gamma=0.1, kernel=sigmoid; total time=   0.8s
[CV] END ...................C=0.2, gamma=0.1, kernel=sigmoid; total time=   0.7s
[CV] END ...................C=0.2, gamma=0.1, kernel=sigmoid; total time=   0.7s
[CV] END ...................C=0.2, gamma=0.1, kernel=sigmoid; total time=   0.7s
[CV] END ...................C=0.2, gamma=0.01, kernel=linear; total time=   0.4s
[CV] END ...................C=0.2, gamma=0.01, kernel=linear; total time=   0.4s
[CV] END ...................C=0.2, gamma=0.01, kernel=linear; total time=   0.4s
[CV] END ...................C=0.2, gamma=0.01, kernel=linear; total time=   0.4s
[CV] END ...................C=0.2, gamma=0.01, kernel=linear; total time=   0.4s
[CV] END ......................C=0.2, gamma=0.01, kernel=rbf; total time=   0.8s
[CV] END ......................C=0.2, gamma=0.01, kernel=rbf; total time=   0.8s
[CV] END ......................C=0.2, gamma=0.01, kernel=rbf; total time=   0.8s
[CV] END ...................

[CV] END .....................C=0.3, gamma=0.001, kernel=rbf; total time=   0.8s
[CV] END .....................C=0.3, gamma=0.001, kernel=rbf; total time=   0.8s
[CV] END .................C=0.3, gamma=0.001, kernel=sigmoid; total time=   0.5s
[CV] END .................C=0.3, gamma=0.001, kernel=sigmoid; total time=   0.5s
[CV] END .................C=0.3, gamma=0.001, kernel=sigmoid; total time=   0.5s
[CV] END .................C=0.3, gamma=0.001, kernel=sigmoid; total time=   0.5s
[CV] END .................C=0.3, gamma=0.001, kernel=sigmoid; total time=   0.5s
[CV] END .................C=0.3, gamma=0.0001, kernel=linear; total time=   0.4s
[CV] END .................C=0.3, gamma=0.0001, kernel=linear; total time=   0.4s
[CV] END .................C=0.3, gamma=0.0001, kernel=linear; total time=   0.4s
[CV] END .................C=0.3, gamma=0.0001, kernel=linear; total time=   0.4s
[CV] END .................C=0.3, gamma=0.0001, kernel=linear; total time=   0.4s
[CV] END ...................

[CV] END .........................C=0.5, gamma=1, kernel=rbf; total time=   1.0s
[CV] END .........................C=0.5, gamma=1, kernel=rbf; total time=   1.0s
[CV] END .........................C=0.5, gamma=1, kernel=rbf; total time=   0.9s
[CV] END .........................C=0.5, gamma=1, kernel=rbf; total time=   0.9s
[CV] END .........................C=0.5, gamma=1, kernel=rbf; total time=   1.0s
[CV] END .....................C=0.5, gamma=1, kernel=sigmoid; total time=   0.9s
[CV] END .....................C=0.5, gamma=1, kernel=sigmoid; total time=   1.0s
[CV] END .....................C=0.5, gamma=1, kernel=sigmoid; total time=   1.0s
[CV] END .....................C=0.5, gamma=1, kernel=sigmoid; total time=   0.9s
[CV] END .....................C=0.5, gamma=1, kernel=sigmoid; total time=   0.8s
[CV] END ....................C=0.5, gamma=0.1, kernel=linear; total time=   0.4s
[CV] END ....................C=0.5, gamma=0.1, kernel=linear; total time=   0.5s
[CV] END ...................

[CV] END ...................C=0.6, gamma=0.01, kernel=linear; total time=   0.5s
[CV] END ...................C=0.6, gamma=0.01, kernel=linear; total time=   0.5s
[CV] END ...................C=0.6, gamma=0.01, kernel=linear; total time=   0.5s
[CV] END ......................C=0.6, gamma=0.01, kernel=rbf; total time=   0.8s
[CV] END ......................C=0.6, gamma=0.01, kernel=rbf; total time=   0.8s
[CV] END ......................C=0.6, gamma=0.01, kernel=rbf; total time=   0.8s
[CV] END ......................C=0.6, gamma=0.01, kernel=rbf; total time=   0.8s
[CV] END ......................C=0.6, gamma=0.01, kernel=rbf; total time=   0.8s
[CV] END ..................C=0.6, gamma=0.01, kernel=sigmoid; total time=   0.6s
[CV] END ..................C=0.6, gamma=0.01, kernel=sigmoid; total time=   0.6s
[CV] END ..................C=0.6, gamma=0.01, kernel=sigmoid; total time=   0.6s
[CV] END ..................C=0.6, gamma=0.01, kernel=sigmoid; total time=   0.6s
[CV] END ..................C

[CV] END .................C=0.7, gamma=0.001, kernel=sigmoid; total time=   0.6s
[CV] END .................C=0.7, gamma=0.0001, kernel=linear; total time=   0.5s
[CV] END .................C=0.7, gamma=0.0001, kernel=linear; total time=   0.5s
[CV] END .................C=0.7, gamma=0.0001, kernel=linear; total time=   0.5s
[CV] END .................C=0.7, gamma=0.0001, kernel=linear; total time=   0.5s
[CV] END .................C=0.7, gamma=0.0001, kernel=linear; total time=   0.5s
[CV] END ....................C=0.7, gamma=0.0001, kernel=rbf; total time=   0.8s
[CV] END ....................C=0.7, gamma=0.0001, kernel=rbf; total time=   0.8s
[CV] END ....................C=0.7, gamma=0.0001, kernel=rbf; total time=   0.8s
[CV] END ....................C=0.7, gamma=0.0001, kernel=rbf; total time=   0.8s
[CV] END ....................C=0.7, gamma=0.0001, kernel=rbf; total time=   0.8s
[CV] END ................C=0.7, gamma=0.0001, kernel=sigmoid; total time=   0.6s
[CV] END ................C=0

[CV] END .....................C=0.9, gamma=1, kernel=sigmoid; total time=   1.1s
[CV] END .....................C=0.9, gamma=1, kernel=sigmoid; total time=   1.1s
[CV] END .....................C=0.9, gamma=1, kernel=sigmoid; total time=   1.0s
[CV] END .....................C=0.9, gamma=1, kernel=sigmoid; total time=   0.9s
[CV] END ....................C=0.9, gamma=0.1, kernel=linear; total time=   0.5s
[CV] END ....................C=0.9, gamma=0.1, kernel=linear; total time=   0.5s
[CV] END ....................C=0.9, gamma=0.1, kernel=linear; total time=   0.5s
[CV] END ....................C=0.9, gamma=0.1, kernel=linear; total time=   0.5s
[CV] END ....................C=0.9, gamma=0.1, kernel=linear; total time=   0.5s
[CV] END .......................C=0.9, gamma=0.1, kernel=rbf; total time=   0.8s
[CV] END .......................C=0.9, gamma=0.1, kernel=rbf; total time=   0.9s
[CV] END .......................C=0.9, gamma=0.1, kernel=rbf; total time=   0.9s
[CV] END ...................

[CV] END ........................C=1, gamma=0.01, kernel=rbf; total time=   0.9s
[CV] END ........................C=1, gamma=0.01, kernel=rbf; total time=   0.9s
[CV] END ....................C=1, gamma=0.01, kernel=sigmoid; total time=   0.6s
[CV] END ....................C=1, gamma=0.01, kernel=sigmoid; total time=   0.6s
[CV] END ....................C=1, gamma=0.01, kernel=sigmoid; total time=   0.6s
[CV] END ....................C=1, gamma=0.01, kernel=sigmoid; total time=   0.6s
[CV] END ....................C=1, gamma=0.01, kernel=sigmoid; total time=   0.6s
[CV] END ....................C=1, gamma=0.001, kernel=linear; total time=   0.5s
[CV] END ....................C=1, gamma=0.001, kernel=linear; total time=   0.6s
[CV] END ....................C=1, gamma=0.001, kernel=linear; total time=   0.5s
[CV] END ....................C=1, gamma=0.001, kernel=linear; total time=   0.6s
[CV] END ....................C=1, gamma=0.001, kernel=linear; total time=   0.5s
[CV] END ...................

[CV] END ......................C=2, gamma=0.0001, kernel=rbf; total time=   0.9s
[CV] END ......................C=2, gamma=0.0001, kernel=rbf; total time=   0.8s
[CV] END ......................C=2, gamma=0.0001, kernel=rbf; total time=   0.9s
[CV] END ......................C=2, gamma=0.0001, kernel=rbf; total time=   0.9s
[CV] END ......................C=2, gamma=0.0001, kernel=rbf; total time=   1.0s
[CV] END ..................C=2, gamma=0.0001, kernel=sigmoid; total time=   1.0s
[CV] END ..................C=2, gamma=0.0001, kernel=sigmoid; total time=   0.8s
[CV] END ..................C=2, gamma=0.0001, kernel=sigmoid; total time=   0.6s
[CV] END ..................C=2, gamma=0.0001, kernel=sigmoid; total time=   0.6s
[CV] END ..................C=2, gamma=0.0001, kernel=sigmoid; total time=   0.6s
[CV] END ........................C=3, gamma=1, kernel=linear; total time=   0.7s
[CV] END ........................C=3, gamma=1, kernel=linear; total time=   0.7s
[CV] END ...................

[CV] END ......................C=4, gamma=0.1, kernel=linear; total time=   0.7s
[CV] END ......................C=4, gamma=0.1, kernel=linear; total time=   0.7s
[CV] END ......................C=4, gamma=0.1, kernel=linear; total time=   0.7s
[CV] END .........................C=4, gamma=0.1, kernel=rbf; total time=   0.9s
[CV] END .........................C=4, gamma=0.1, kernel=rbf; total time=   0.9s
[CV] END .........................C=4, gamma=0.1, kernel=rbf; total time=   1.0s
[CV] END .........................C=4, gamma=0.1, kernel=rbf; total time=   0.9s
[CV] END .........................C=4, gamma=0.1, kernel=rbf; total time=   0.9s
[CV] END .....................C=4, gamma=0.1, kernel=sigmoid; total time=   0.5s
[CV] END .....................C=4, gamma=0.1, kernel=sigmoid; total time=   0.6s
[CV] END .....................C=4, gamma=0.1, kernel=sigmoid; total time=   0.6s
[CV] END .....................C=4, gamma=0.1, kernel=sigmoid; total time=   0.5s
[CV] END ...................

[CV] END ....................C=5, gamma=0.01, kernel=sigmoid; total time=   0.6s
[CV] END ....................C=5, gamma=0.001, kernel=linear; total time=   0.8s
[CV] END ....................C=5, gamma=0.001, kernel=linear; total time=   0.7s
[CV] END ....................C=5, gamma=0.001, kernel=linear; total time=   0.8s
[CV] END ....................C=5, gamma=0.001, kernel=linear; total time=   0.9s
[CV] END ....................C=5, gamma=0.001, kernel=linear; total time=   0.8s
[CV] END .......................C=5, gamma=0.001, kernel=rbf; total time=   0.9s
[CV] END .......................C=5, gamma=0.001, kernel=rbf; total time=   0.9s
[CV] END .......................C=5, gamma=0.001, kernel=rbf; total time=   0.9s
[CV] END .......................C=5, gamma=0.001, kernel=rbf; total time=   0.9s
[CV] END .......................C=5, gamma=0.001, kernel=rbf; total time=   0.9s
[CV] END ...................C=5, gamma=0.001, kernel=sigmoid; total time=   0.6s
[CV] END ...................

[CV] END ..................C=6, gamma=0.0001, kernel=sigmoid; total time=   0.7s
[CV] END ..................C=6, gamma=0.0001, kernel=sigmoid; total time=   0.6s
[CV] END ..................C=6, gamma=0.0001, kernel=sigmoid; total time=   0.7s
[CV] END ..................C=6, gamma=0.0001, kernel=sigmoid; total time=   0.9s
[CV] END ........................C=7, gamma=1, kernel=linear; total time=   0.9s
[CV] END ........................C=7, gamma=1, kernel=linear; total time=   0.9s
[CV] END ........................C=7, gamma=1, kernel=linear; total time=   0.8s
[CV] END ........................C=7, gamma=1, kernel=linear; total time=   0.9s
[CV] END ........................C=7, gamma=1, kernel=linear; total time=   0.9s
[CV] END ...........................C=7, gamma=1, kernel=rbf; total time=   1.1s
[CV] END ...........................C=7, gamma=1, kernel=rbf; total time=   1.1s
[CV] END ...........................C=7, gamma=1, kernel=rbf; total time=   1.1s
[CV] END ...................

[CV] END .........................C=8, gamma=0.1, kernel=rbf; total time=   1.5s
[CV] END .........................C=8, gamma=0.1, kernel=rbf; total time=   1.0s
[CV] END .....................C=8, gamma=0.1, kernel=sigmoid; total time=   0.5s
[CV] END .....................C=8, gamma=0.1, kernel=sigmoid; total time=   0.6s
[CV] END .....................C=8, gamma=0.1, kernel=sigmoid; total time=   0.6s
[CV] END .....................C=8, gamma=0.1, kernel=sigmoid; total time=   0.5s
[CV] END .....................C=8, gamma=0.1, kernel=sigmoid; total time=   0.5s
[CV] END .....................C=8, gamma=0.01, kernel=linear; total time=   0.9s
[CV] END .....................C=8, gamma=0.01, kernel=linear; total time=   0.8s
[CV] END .....................C=8, gamma=0.01, kernel=linear; total time=   0.7s
[CV] END .....................C=8, gamma=0.01, kernel=linear; total time=   0.8s
[CV] END .....................C=8, gamma=0.01, kernel=linear; total time=   0.7s
[CV] END ...................

[CV] END .......................C=9, gamma=0.001, kernel=rbf; total time=   0.8s
[CV] END .......................C=9, gamma=0.001, kernel=rbf; total time=   0.8s
[CV] END .......................C=9, gamma=0.001, kernel=rbf; total time=   0.8s
[CV] END .......................C=9, gamma=0.001, kernel=rbf; total time=   0.8s
[CV] END .......................C=9, gamma=0.001, kernel=rbf; total time=   0.8s
[CV] END ...................C=9, gamma=0.001, kernel=sigmoid; total time=   0.5s
[CV] END ...................C=9, gamma=0.001, kernel=sigmoid; total time=   0.6s
[CV] END ...................C=9, gamma=0.001, kernel=sigmoid; total time=   0.6s
[CV] END ...................C=9, gamma=0.001, kernel=sigmoid; total time=   0.6s
[CV] END ...................C=9, gamma=0.001, kernel=sigmoid; total time=   0.6s
[CV] END ...................C=9, gamma=0.0001, kernel=linear; total time=   0.9s
[CV] END ...................C=9, gamma=0.0001, kernel=linear; total time=   0.9s
[CV] END ...................

[CV] END ......................C=100, gamma=1, kernel=linear; total time=  23.7s
[CV] END ......................C=100, gamma=1, kernel=linear; total time=  18.5s
[CV] END ......................C=100, gamma=1, kernel=linear; total time=   3.7s
[CV] END .........................C=100, gamma=1, kernel=rbf; total time=   1.7s
[CV] END .........................C=100, gamma=1, kernel=rbf; total time=   1.5s
[CV] END .........................C=100, gamma=1, kernel=rbf; total time=   1.5s
[CV] END .........................C=100, gamma=1, kernel=rbf; total time=   1.4s
[CV] END .........................C=100, gamma=1, kernel=rbf; total time=   1.5s
[CV] END .....................C=100, gamma=1, kernel=sigmoid; total time=   0.9s
[CV] END .....................C=100, gamma=1, kernel=sigmoid; total time=   1.0s
[CV] END .....................C=100, gamma=1, kernel=sigmoid; total time=   1.0s
[CV] END .....................C=100, gamma=1, kernel=sigmoid; total time=   0.9s
[CV] END ...................

[CV] END ..................C=1000, gamma=0.1, kernel=sigmoid; total time=   0.7s
[CV] END ..................C=1000, gamma=0.01, kernel=linear; total time=  25.1s
[CV] END ..................C=1000, gamma=0.01, kernel=linear; total time=  54.3s
[CV] END ..................C=1000, gamma=0.01, kernel=linear; total time= 1.8min
[CV] END ..................C=1000, gamma=0.01, kernel=linear; total time=  37.7s
[CV] END ..................C=1000, gamma=0.01, kernel=linear; total time=  35.9s
[CV] END .....................C=1000, gamma=0.01, kernel=rbf; total time=   5.1s
[CV] END .....................C=1000, gamma=0.01, kernel=rbf; total time=   5.6s
[CV] END .....................C=1000, gamma=0.01, kernel=rbf; total time=   4.9s
[CV] END .....................C=1000, gamma=0.01, kernel=rbf; total time=   4.4s
[CV] END .....................C=1000, gamma=0.01, kernel=rbf; total time=   4.3s
[CV] END .................C=1000, gamma=0.01, kernel=sigmoid; total time=   0.5s
[CV] END .................C=

In [None]:
# from copy - specific cluster gridsearch test (original - cluster 0)

hyperParams_grid = {'C': [.1, .2, .3, .4, .5, .6, .7, .8, .9, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 100, 1000],
                  'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                  'kernel': ['linear', 'rbf', 'sigmoid']}

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

k=0

path_k = "../Data/Cluster" + str(k)

df = pd.read_csv(path_k, delimiter=',')
df = df.iloc[:,2:] # drop original/new ids

X = df.iloc[:,:-1] # everything but last column
y = df.iloc[:,-1] #last column

scaler = StandardScaler()
scaled_features = scaler.fit_transform(X)
X = pd.DataFrame(scaled_features)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

grid = GridSearchCV(SVC(), hyperParams_grid, refit = True)
 
# Fit all permutations on training set
grid.fit(X_train, y_train)
print(grid.best_params_)

# Predict using best model on test set
grid_predictions = grid.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, grid_predictions))
print(classification_report(y_test, grid_predictions))

In [None]:
# from copy - specific cluster gridsearch test (target cluster 7)

hyperParams_grid = {'C': [.1, 1, 10, 100, 1000],
                  'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                  'kernel': ['linear', 'rbf', 'sigmoid']}

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

k=7

path_k = "../Data/Cluster" + str(k)

df = pd.read_csv(path_k, delimiter=',')
df = df.iloc[:,2:] # drop original/new ids

X = df.iloc[:,:-1] # everything but last column
y = df.iloc[:,-1] #last column

scaler = StandardScaler()
scaled_features = scaler.fit_transform(X)
X = pd.DataFrame(scaled_features)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

grid = GridSearchCV(SVC(), hyperParams_grid, refit = True, n_jobs = -1)
 
# Fit all permutations on training set
grid.fit(X_train, y_train)
print(grid.best_params_)

# Predict using best model on test set
grid_predictions = grid.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, grid_predictions))
print(classification_report(y_test, grid_predictions))

# 0:{'C': 100, 'gamma': 0.01, 'kernel': 'rbf'}

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

k=0

path_k = "../Data/Cluster" + str(k)

df = pd.read_csv(path_k, delimiter=',')
df = df.iloc[:,2:] # drop original/new ids

X = df.iloc[:,:-1] # everything but last column
y = df.iloc[:,-1] #last column

scaler = StandardScaler()
scaled_features = scaler.fit_transform(X)
X = pd.DataFrame(scaled_features)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

svclassifier = SVC(kernel='rbf',C=100, gamma=0.01)
svclassifier.fit(X_train, y_train)
y_pred = svclassifier.predict(X_test)

print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.7757054041128647
              precision    recall  f1-score   support

           0       0.67      0.41      0.51       592
           1       0.80      0.92      0.85      1499

    accuracy                           0.78      2091
   macro avg       0.73      0.66      0.68      2091
weighted avg       0.76      0.78      0.76      2091



In [7]:
# for cluster _
def log_k(k):
    path_k = "../Data/Cluster" + str(k)

    df = pd.read_csv(path_k, delimiter=',')
    df = df.iloc[:,2:] # drop original/new ids

    X = df.iloc[:,:-1] # everything but last column
    y = df.iloc[:,-1] #last column

    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(X)
    X = pd.DataFrame(scaled_features)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

    svclassifier = SVC(kernel='rbf',C=100, gamma=0.01)
    svclassifier.fit(X_train, y_train)
    y_pred = svclassifier.predict(X_test)

    print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    
    return svclassifier.score(X_test, y_test)

In [8]:
acc_list = []
for k in range(9):
    print(k)
    acc_list.append(log_k(k))
from statistics import mean
print(acc_list, mean(acc_list),sep="\n")

0
Accuracy: 0.7757054041128647
              precision    recall  f1-score   support

           0       0.67      0.41      0.51       592
           1       0.80      0.92      0.85      1499

    accuracy                           0.78      2091
   macro avg       0.73      0.66      0.68      2091
weighted avg       0.76      0.78      0.76      2091

1
Accuracy: 0.8101010101010101
              precision    recall  f1-score   support

           0       0.81      0.95      0.87      2413
           1       0.81      0.49      0.61      1052

    accuracy                           0.81      3465
   macro avg       0.81      0.72      0.74      3465
weighted avg       0.81      0.81      0.79      3465

2
Accuracy: 0.7357259380097879
              precision    recall  f1-score   support

           0       0.75      0.79      0.77       674
           1       0.72      0.67      0.70       552

    accuracy                           0.74      1226
   macro avg       0.73      0.73  

In [9]:
log_acc = [0.7680535628885701,
 0.7948051948051948,
 0.7406199021207178,
 0.6933245208195637,
 0.8243727598566308,
 0.7364620938628159,
 0.706140350877193,
 0.6985793699814701,
 0.6788370520622042]
import numpy as np
np.subtract(acc_list,log_acc)

array([ 0.00765184,  0.01529582, -0.00489396, -0.00594845, -0.00716846,
        0.00090253,  0.00740132,  0.00216183, -0.00631057])

In [20]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

k=8

path_k = "../Data/Cluster" + str(k)

df = pd.read_csv(path_k, delimiter=',')
df = df.iloc[:,2:] # drop original/new ids

X = df.iloc[:,:-1] # everything but last column
y = df.iloc[:,-1] #last column

scaler = StandardScaler()
scaled_features = scaler.fit_transform(X)
X = pd.DataFrame(scaled_features)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# svclassifier = SVC(kernel='rbf',C=100, gamma=0.01)
kernels = ['linear', 'rbf', 'sigmoid']
# Cs = [.1, .2, .3, .4, .5, .6, .7, .8, .9, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 100, 1000]
# gammas = [1, 0.1, 0.01, 0.001, 0.0001]

for kern in kernels:
    svclassifier = SVC(kernel=kern,gamma=gam)
    svclassifier.fit(X_train, y_train)
    y_pred = svclassifier.predict(X_test)
    print(kern)
    print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))


Accuracy: 0.6716249718278116
              precision    recall  f1-score   support

           0       0.62      0.81      0.70      2088
           1       0.76      0.55      0.64      2349

    accuracy                           0.67      4437
   macro avg       0.69      0.68      0.67      4437
weighted avg       0.69      0.67      0.67      4437

Accuracy: 0.670272706783863
              precision    recall  f1-score   support

           0       0.62      0.76      0.68      2088
           1       0.73      0.59      0.65      2349

    accuracy                           0.67      4437
   macro avg       0.68      0.68      0.67      4437
weighted avg       0.68      0.67      0.67      4437

Accuracy: 0.5954473743520396
              precision    recall  f1-score   support

           0       0.56      0.61      0.59      2088
           1       0.63      0.58      0.60      2349

    accuracy                           0.60      4437
   macro avg       0.60      0.60      0.6

In [24]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

k=8

path_k = "../Data/Cluster" + str(k)

df = pd.read_csv(path_k, delimiter=',')
df = df.iloc[:,2:] # drop original/new ids

X = df.iloc[:,:-1] # everything but last column
y = df.iloc[:,-1] #last column

scaler = StandardScaler()
scaled_features = scaler.fit_transform(X)
X = pd.DataFrame(scaled_features)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# svclassifier = SVC(kernel='rbf',C=100, gamma=0.01)
kernels = ['linear', 'rbf', 'sigmoid']
# Cs = [.1, .2, .3, .4, .5, .6, .7, .8, .9, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 100, 1000]
gammas = [1, 0.1, 0.01, 0.001, 0.0001]

best_acc = 0
for kern in kernels:
    for gam in gammas:
        svclassifier = SVC(kernel=kern,gamma=gam)
        svclassifier.fit(X_train, y_train)
        y_pred = svclassifier.predict(X_test)
        if metrics.accuracy_score(y_test, y_pred) > best_acc:
            best_acc = metrics.accuracy_score(y_test, y_pred)
            print(kern, gam)
            print("Accuracy:", best_acc)
            print(classification_report(y_test, y_pred))


linear 1
Accuracy: 0.6716249718278116
              precision    recall  f1-score   support

           0       0.62      0.81      0.70      2088
           1       0.76      0.55      0.64      2349

    accuracy                           0.67      4437
   macro avg       0.69      0.68      0.67      4437
weighted avg       0.69      0.67      0.67      4437

rbf 0.01
Accuracy: 0.6741041244083841
              precision    recall  f1-score   support

           0       0.62      0.79      0.70      2088
           1       0.75      0.57      0.65      2349

    accuracy                           0.67      4437
   macro avg       0.69      0.68      0.67      4437
weighted avg       0.69      0.67      0.67      4437



In [25]:
# Entire dataset

df = pd.read_csv('../Data/cardio_train.csv', delimiter=';')
df = df.iloc[:,2:] # drop original/new ids

X = df.iloc[:,:-1] # everything but last column
y = df.iloc[:,-1] #last column

scaler = StandardScaler()
scaled_features = scaler.fit_transform(X)
X = pd.DataFrame(scaled_features)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
svclassifier = SVC(kernel='rbf',C=100, gamma=0.01)
svclassifier.fit(X_train, y_train)
y_pred = svclassifier.predict(X_test)

print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.718
              precision    recall  f1-score   support

           0       0.69      0.81      0.74     10539
           1       0.76      0.63      0.69     10461

    accuracy                           0.72     21000
   macro avg       0.73      0.72      0.72     21000
weighted avg       0.72      0.72      0.72     21000



In [11]:
# Save models

import pickle

svm_best_clusters = [0, 1, 2, 3, 6, 7]

for k in svm_best_clusters:
    inputPath = "../Data/Cluster" + str(k)
    outputPath = "../Final Models/Cluster" + str(k) + "_SVM.pkl"
    
    df = pd.read_csv(inputPath, delimiter=',')
    df = df.iloc[:,2:] # drop original/new ids

    X = df.iloc[:,:-1] # everything but last column
    y = df.iloc[:,-1] #last column

    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(X)
    X = pd.DataFrame(scaled_features)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

    svclassifier = SVC(kernel='rbf',C=100, gamma=0.01)
    svclassifier.fit(X_train, y_train)
    
    # Write model to file
    pickle.dump(svclassifier, open(outputPath, 'wb'))
    
    # Ensure model is being stored properly
    savedModel = pickle.load(open(outputPath, 'rb'))
    print(savedModel.score(X_test, y_test))

0.7757054041128647
0.8101010101010101
0.7357259380097879
0.6873760740251157
0.7135416666666666
0.7007411982705374
