In [135]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
data=pd.read_csv("/content/kidney_disease.csv")  #this csv filed was downloaded from kaggle, as the .arff file (in original link) doesn't load properly because of the typos.

In the dataset, there were a lot of typos, so .arff doesn't load, however .csv load despite of typos. Here is code for correcting most of typos in the dataset

In [136]:
for i in range(data.shape[0]):
    if data.iloc[i,25]=='ckd\t':
        data.iloc[i,25]='ckd'
    if data.iloc[i,20] in [' yes','\tyes']:
        data.iloc[i,20]='yes'
    if data.iloc[i,20]=='\tno':
        data.iloc[i,20]='no'
    if data.iloc[i,21]=='\tno':
        data.iloc[i,21]='no'
    if data.iloc[i,16]=='\t?':
        data.iloc[i,16]=np.nan
    if data.iloc[i,16]=='\t43':
        data.iloc[i,16]='43'
    if data.iloc[i,17]=='\t?':
        data.iloc[i,17]=np.nan
    if data.iloc[i,17]=='\t6200':
        data.iloc[i,17]= '6200'
    if data.iloc[i,17]=='\t8400':
        data.iloc[i,17]= '6200'
    if data.iloc[i,18]=='\t?':
        data.iloc[i,18]=np.nan
    if data.iloc[i,25]=='ckd':
        data.iloc[i,25]='yes'
    if data.iloc[i,25]=='notckd':
        data.iloc[i,25]='no'

In [137]:
from sklearn.preprocessing import LabelEncoder
for i in data.columns:
    if data[i].dtype=='object':
        data[i]=LabelEncoder().fit_transform(data[i])


y=data['classification']
X=data.drop(['classification'],axis=1)

class_counts = y.value_counts()
class_counts

1    250
0    150
Name: classification, dtype: int64

Here, we see that our classes are a little bit unbalanced, 62.5% to 37.5%

The below code changes the missing values to the median. This is needed, as the SVM classifier in sklearn doesn't hanlde those missing values, and will reslut in error

---



In [138]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='median')  # You can choose a different strategy based on your needs
X = imputer.fit_transform(X)

Here I used sklearn StandardScaler function. That standardizes all features using the z = (x - u) / s formula. u-mean s-standard deviation

In [139]:
from sklearn.preprocessing import StandardScaler
X=StandardScaler().fit_transform(X)

Here I divided the dataset to train validation test sets with ratios 80% 10% 10% respectively. As we have small amount of data samples, the training data should be as large as possible, so I gaved minimal 10% splits to validation and test data.

In [140]:
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

print(f"Train set size: {len(X_train)}")
print(f"Validation set size: {len(X_val)}")
print(f"Test set size: {len(X_test)}")

Train set size: 320
Validation set size: 40
Test set size: 40


Sklearn's precision_score and recall_score functions uses the same formula as in the pdf file, so I used them

#### I used 4 candidate values for C, and in the following code we train model I with those different C values. Here also we calculate the error rate, precision and recall for training data

In [141]:
from sklearn import svm
from sklearn.metrics import accuracy_score, precision_score, recall_score
print("model I")
C_values = [0.1, 1, 10, 100]
average_option = 'micro'  # this is needed, as we have binary classification
# Train and evaluate SVM models for different C values
for C_value in C_values:
    # Create and train the SVM model
    model = svm.SVC(kernel='linear', C=C_value)  # here the SVC is for Classification
    model.fit(X_train, y_train)

    # Make predictions on the training set
    y_pred_train = model.predict(X_train)

    # Calculate accuracy on the training set
    error_rate = 1 - accuracy_score(y_train, y_pred_train)

    # (b) Precision and recall
    precision = precision_score(y_train, y_pred_train,average=average_option)
    recall = recall_score(y_train, y_pred_train,average=average_option)

    n_support_vectors = model.n_support_

    print(f"For C = {C_value}:")

    print(f"  (a) Error rate: {error_rate:.2%}")
    print(f"  (b) Precision: {precision:.2f}")
    print(f"  (b)  Recall: {recall:.2f}")
    print(f"  (c) Number of Support Vectors for Class 0: {n_support_vectors[0]}")
    print(f"  (c) Number of Support Vectors for Class 1: {n_support_vectors[1]}")

model I
For C = 0.1:
  (a) Error rate: 0.00%
  (b) Precision: 1.00
  (b)  Recall: 1.00
  (c) Number of Support Vectors for Class 0: 17
  (c) Number of Support Vectors for Class 1: 17
For C = 1:
  (a) Error rate: 0.00%
  (b) Precision: 1.00
  (b)  Recall: 1.00
  (c) Number of Support Vectors for Class 0: 6
  (c) Number of Support Vectors for Class 1: 14
For C = 10:
  (a) Error rate: 0.00%
  (b) Precision: 1.00
  (b)  Recall: 1.00
  (c) Number of Support Vectors for Class 0: 6
  (c) Number of Support Vectors for Class 1: 14
For C = 100:
  (a) Error rate: 0.00%
  (b) Precision: 1.00
  (b)  Recall: 1.00
  (c) Number of Support Vectors for Class 0: 6
  (c) Number of Support Vectors for Class 1: 14


For every C value model I got error rate of 0% for training data. Let's see the results for validation data

In [142]:
C_values = [0.1, 1, 10, 100]
results = []

for C_value in C_values:
    # Create and train the SVM model
    model = svm.SVC(kernel='linear', C=C_value)
    model.fit(X_train, y_train)

    y_pred_val = model.predict(X_val)

    error_rate = 1 - accuracy_score(y_val, y_pred_val)

    precision = precision_score(y_val, y_pred_val,average=average_option)
    recall = recall_score(y_val, y_pred_val,average=average_option)

    beta = 1  # You can adjust the value of beta
    f_beta = (1 + beta**2) * (precision * recall) / (beta**2 * (precision + recall))

    results.append({
        'C_value': C_value,
        'Error Rate': error_rate,
        'Precision': precision,
        'Recall': recall,
        'F1': f_beta })

results_df = pd.DataFrame(results)
print(results_df)

   C_value  Error Rate  Precision  Recall   F1
0      0.1         0.0        1.0     1.0  1.0
1      1.0         0.0        1.0     1.0  1.0
2     10.0         0.0        1.0     1.0  1.0
3    100.0         0.0        1.0     1.0  1.0


So, every C_value got perfect results on validation data set, that's impressive.

### Let's check using model II (polynomial kernel)


In [143]:
print("model II")
C_values = [0.1, 1, 10, 100]
# Train and evaluate SVM models for different C values
for C_value in C_values:
    # Create and train the SVM model
    model = svm.SVC(kernel='poly', C=C_value)  # kernel="poly" is the model II
    model.fit(X_train, y_train)

    # Make predictions on the training set
    y_pred_train = model.predict(X_train)

    # Calculate accuracy on the training set
    error_rate = 1 - accuracy_score(y_train, y_pred_train)

    # (b) Precision and recall
    precision = precision_score(y_train, y_pred_train,average=average_option)
    recall = recall_score(y_train, y_pred_train,average=average_option)

    n_support_vectors = model.n_support_

    print(f"For C = {C_value}:")

    print(f"  (a) Error rate: {error_rate:.2%}")
    print(f"  (b) Precision: {precision:.2f}")
    print(f"  (b)  Recall: {recall:.2f}")
    print(f"  (c) Number of Support Vectors for Class 0: {n_support_vectors[0]}")
    print(f"  (c) Number of Support Vectors for Class 1: {n_support_vectors[1]}")

model II
For C = 0.1:
  (a) Error rate: 35.31%
  (b) Precision: 0.65
  (b)  Recall: 0.65
  (c) Number of Support Vectors for Class 0: 119
  (c) Number of Support Vectors for Class 1: 131
For C = 1:
  (a) Error rate: 0.31%
  (b) Precision: 1.00
  (b)  Recall: 1.00
  (c) Number of Support Vectors for Class 0: 76
  (c) Number of Support Vectors for Class 1: 107
For C = 10:
  (a) Error rate: 0.00%
  (b) Precision: 1.00
  (b)  Recall: 1.00
  (c) Number of Support Vectors for Class 0: 27
  (c) Number of Support Vectors for Class 1: 77
For C = 100:
  (a) Error rate: 0.00%
  (b) Precision: 1.00
  (b)  Recall: 1.00
  (c) Number of Support Vectors for Class 0: 20
  (c) Number of Support Vectors for Class 1: 75


So the model II with C=0.1 got error rate of 35.31%, however with higher C values got almost perfect results on Training data, let's check results on validation data

In [144]:
C_values = [0.1, 1, 10, 100]
results = []

for C_value in C_values:
    # Create and train the SVM model
    model = svm.SVC(kernel='poly', C=C_value)
    model.fit(X_train, y_train)

    y_pred_val = model.predict(X_val)

    error_rate = 1 - accuracy_score(y_val, y_pred_val)

    precision = precision_score(y_val, y_pred_val,average=average_option)
    recall = recall_score(y_val, y_pred_val,average=average_option)

    beta = 1  # You can adjust the value of beta
    f_beta = (1 + beta**2) * (precision * recall) / (beta**2 * (precision + recall))

    results.append({
        'C_value': C_value,
        'Error Rate': error_rate,
        'Precision': precision,
        'Recall': recall,
        'F1': f_beta })
print("model II")
results_df = pd.DataFrame(results)
print(results_df)

model II
   C_value  Error Rate  Precision  Recall     F1
0      0.1       0.375      0.625   0.625  0.625
1      1.0       0.000      1.000   1.000  1.000
2     10.0       0.000      1.000   1.000  1.000
3    100.0       0.000      1.000   1.000  1.000


So, model II got perfect results on validation data for C values 1,10,100. However for C=0.1 got error rate of 37.5%

#### Now we train model III (radial basis function kernel)


In [145]:
print("model III")
C_values = [0.1, 1, 10, 100]
average_option = 'micro'
# Train and evaluate SVM models for different C values
for C_value in C_values:
    # Create and train the SVM model
    model = svm.SVC(kernel='rbf', C=C_value)  # kernel="rbf" is the model III
    model.fit(X_train, y_train)

    # Make predictions on the training set
    y_pred_train = model.predict(X_train)

    # Calculate accuracy on the training set
    error_rate = 1 - accuracy_score(y_train, y_pred_train)

    # (b) Precision and recall
    precision = precision_score(y_train, y_pred_train,average=average_option)
    recall = recall_score(y_train, y_pred_train,average=average_option)

    n_support_vectors = model.n_support_

    print(f"For C = {C_value}:")

    print(f"  (a) Error rate: {error_rate:.2%}")
    print(f"  (b) Precision: {precision:.2f}")
    print(f"  (b)  Recall: {recall:.2f}")
    print(f"  (c) Number of Support Vectors for Class 0: {n_support_vectors[0]}")
    print(f"  (c) Number of Support Vectors for Class 1: {n_support_vectors[1]}")

model III
For C = 0.1:
  (a) Error rate: 2.19%
  (b) Precision: 0.98
  (b)  Recall: 0.98
  (c) Number of Support Vectors for Class 0: 65
  (c) Number of Support Vectors for Class 1: 88
For C = 1:
  (a) Error rate: 0.00%
  (b) Precision: 1.00
  (b)  Recall: 1.00
  (c) Number of Support Vectors for Class 0: 26
  (c) Number of Support Vectors for Class 1: 48
For C = 10:
  (a) Error rate: 0.00%
  (b) Precision: 1.00
  (b)  Recall: 1.00
  (c) Number of Support Vectors for Class 0: 15
  (c) Number of Support Vectors for Class 1: 36
For C = 100:
  (a) Error rate: 0.00%
  (b) Precision: 1.00
  (b)  Recall: 1.00
  (c) Number of Support Vectors for Class 0: 15
  (c) Number of Support Vectors for Class 1: 36


model III got a very littile error rate for C = 0.1 for other C got perfect results on training data. Let's check the validiton data

In [146]:
C_values = [0.1, 1, 10, 100]
results = []

for C_value in C_values:
    # Create and train the SVM model
    model = svm.SVC(kernel='rbf', C=C_value)
    model.fit(X_train, y_train)

    y_pred_val = model.predict(X_val)

    error_rate = 1 - accuracy_score(y_val, y_pred_val)

    precision = precision_score(y_val, y_pred_val,average=average_option)
    recall = recall_score(y_val, y_pred_val,average=average_option)

    beta = 1  # You can adjust the value of beta
    f_beta = (1 + beta**2) * (precision * recall) / (beta**2 * (precision + recall))

    results.append({
        'C_value': C_value,
        'Error Rate': error_rate,
        'Precision': precision,
        'Recall': recall,
        'F1': f_beta })
print("model III")
results_df = pd.DataFrame(results)
print(results_df)

model III
   C_value  Error Rate  Precision  Recall   F1
0      0.1         0.0        1.0     1.0  1.0
1      1.0         0.0        1.0     1.0  1.0
2     10.0         0.0        1.0     1.0  1.0
3    100.0         0.0        1.0     1.0  1.0


model III got perfect results on validation data with all C values

#### Finally, we train the model IV (sigmoid)


In [147]:
print("model IV")
C_values = [0.1, 1, 10, 100]
average_option = 'micro'
# Train and evaluate SVM models for different C values
for C_value in C_values:
    # Create and train the SVM model
    model = svm.SVC(kernel='sigmoid', C=C_value)  # kernel="sigmoid" is the model IV
    model.fit(X_train, y_train)

    # Make predictions on the training set
    y_pred_train = model.predict(X_train)

    # Calculate accuracy on the training set
    error_rate = 1 - accuracy_score(y_train, y_pred_train)

    # (b) Precision and recall
    precision = precision_score(y_train, y_pred_train,average=average_option)
    recall = recall_score(y_train, y_pred_train,average=average_option)

    n_support_vectors = model.n_support_

    print(f"For C = {C_value}:")

    print(f"  (a) Error rate: {error_rate:.2%}")
    print(f"  (b) Precision: {precision:.2f}")
    print(f"  (b)  Recall: {recall:.2f}")
    print(f"  (c) Number of Support Vectors for Class 0: {n_support_vectors[0]}")
    print(f"  (c) Number of Support Vectors for Class 1: {n_support_vectors[1]}")

model IV
For C = 0.1:
  (a) Error rate: 1.25%
  (b) Precision: 0.99
  (b)  Recall: 0.99
  (c) Number of Support Vectors for Class 0: 69
  (c) Number of Support Vectors for Class 1: 69
For C = 1:
  (a) Error rate: 0.94%
  (b) Precision: 0.99
  (b)  Recall: 0.99
  (c) Number of Support Vectors for Class 0: 22
  (c) Number of Support Vectors for Class 1: 25
For C = 10:
  (a) Error rate: 3.44%
  (b) Precision: 0.97
  (b)  Recall: 0.97
  (c) Number of Support Vectors for Class 0: 8
  (c) Number of Support Vectors for Class 1: 11
For C = 100:
  (a) Error rate: 3.75%
  (b) Precision: 0.96
  (b)  Recall: 0.96
  (c) Number of Support Vectors for Class 0: 8
  (c) Number of Support Vectors for Class 1: 11


The model IV got a little error rates for every C value on training data. Now let's check the results in validation data

In [148]:
C_values = [0.1, 1, 10, 100]
results = []

for C_value in C_values:
    # Create and train the SVM model
    model = svm.SVC(kernel='sigmoid', C=C_value)
    model.fit(X_train, y_train)

    y_pred_val = model.predict(X_val)

    error_rate = 1 - accuracy_score(y_val, y_pred_val)

    precision = precision_score(y_val, y_pred_val,average=average_option)
    recall = recall_score(y_val, y_pred_val,average=average_option)

    beta = 1  # You can adjust the value of beta
    f_beta = (1 + beta**2) * (precision * recall) / (beta**2 * (precision + recall))

    results.append({
        'C_value': C_value,
        'Error Rate': error_rate,
        'Precision': precision,
        'Recall': recall,
        'F1': f_beta })
print("model IV")
results_df = pd.DataFrame(results)
print(results_df)

model IV
   C_value  Error Rate  Precision  Recall   F1
0      0.1         0.0        1.0     1.0  1.0
1      1.0         0.0        1.0     1.0  1.0
2     10.0         0.0        1.0     1.0  1.0
3    100.0         0.0        1.0     1.0  1.0


However, the model IV got perfect results on validation data

So, all C values and all model got perfect results on validation data, excepy c=0.1. So, C = 1,10,100 got same results. So, I choose C = 10 for evalution of test data

#### Estimations on test data

In [149]:
C = 10
result = []
models = ['linear', 'poly', 'rbf', 'sigmoid']
i = 0
for kernel in models:
    # Create and train the SVM model
    model = svm.SVC(kernel=kernel, C=C)
    model.fit(X_train, y_train)

    y_pred_test = model.predict(X_test)

    error_rate = 1 - accuracy_score(y_test,y_pred_test)

    precision = precision_score(y_test, y_pred_test,average=average_option)
    recall = recall_score(y_test, y_pred_test,average=average_option)

    beta = 1  # You can adjust the value of beta
    f_beta = (1 + beta**2) * (precision * recall) / (beta**2 * (precision + recall))
    i += 1
    result.append({
        "model": "model"+str(i),
        'C_value': C,
        'Error Rate': error_rate,
        'Precision': precision,
        'Recall': recall,
        'F1': f_beta })

result_df = pd.DataFrame(result)
print(result_df)

    model  C_value  Error Rate  Precision  Recall    F1
0  model1       10        0.00       1.00    1.00  1.00
1  model2       10        0.00       1.00    1.00  1.00
2  model3       10        0.00       1.00    1.00  1.00
3  model4       10        0.05       0.95    0.95  0.95


So, according to the test data results, model I, model II and model III are best, as they have perfect results on test data. model IV has slightly worse results.
Now I need choose one model from 3. As all of them got perfect results, I will choose the one which is simplier. The model I, is uses less amount of support vectors for each class 14 and 6. While model II uses 27 77 support vectors and model III uses 15 36 support vectors. So model I is simplier.

I choose the model I, kernel = linear C = 10