In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import zscore

In [2]:
df = pd.read_csv('archive/kidney_disease_train.csv')

In [3]:
df.head()

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,157,62.0,70.0,1.025,3.0,0.0,normal,abnormal,notpresent,notpresent,...,39.0,7900.0,3.9,yes,yes,no,good,no,no,ckd
1,109,54.0,70.0,,,,,,notpresent,notpresent,...,,,,no,yes,no,good,no,no,ckd
2,17,47.0,80.0,,,,,,notpresent,notpresent,...,,,,yes,no,no,poor,no,no,ckd
3,347,43.0,60.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,...,43.0,7200.0,5.5,no,no,no,good,no,no,notckd
4,24,42.0,100.0,1.015,4.0,0.0,normal,abnormal,notpresent,present,...,39.0,8300.0,4.6,yes,no,no,poor,no,no,ckd


In [4]:
# Map Data untuk tipe Nominal dan Ordinal
string_cols = df.select_dtypes(include='object').columns

label_mapping = {}

for col in string_cols:
    unique_values = df[col].unique()
    label_mapping[col] = {val: i for i, val in enumerate(unique_values)}
    df[col] = df[col].map(label_mapping[col])
    
df.head()

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,157,62.0,70.0,1.025,3.0,0.0,0,0,0,0,...,39.0,0,0,0,0,0,0,0,0,0
1,109,54.0,70.0,,,,1,1,0,0,...,,1,1,1,0,0,0,0,0,0
2,17,47.0,80.0,,,,1,1,0,0,...,,1,1,0,1,0,1,0,0,0
3,347,43.0,60.0,1.025,0.0,0.0,0,2,0,0,...,43.0,2,2,1,1,0,0,0,0,1
4,24,42.0,100.0,1.015,4.0,0.0,0,0,0,1,...,39.0,3,3,0,1,0,1,0,0,0


In [5]:
# Exclude kolom yang tidak akan di preprocess
columns_to_exclude = ['id', 'htn', 'dm', 'cad', 'ane', 'classification']

fitur = df.drop(columns=columns_to_exclude).columns

# Tahapan preprocessing menggunakan Scikit-Learn
preprocessor = ColumnTransformer(
    transformers=[
        ('impute', SimpleImputer(strategy='mean'), fitur)
    ])

# Apply tahapan preprocessing ke semua DataFrame
df_processed_awal = preprocessor.fit_transform(df[fitur])

df_processed = pd.DataFrame(df_processed_awal, columns=fitur)

df_processed.head()

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,bu,sc,sod,pot,hemo,pcv,wc,rc,appet,pe
0,62.0,70.0,1.025,3.0,0.0,0.0,0.0,0.0,0.0,122.0,42.0,1.7,136.0,4.7,12.6,39.0,0.0,0.0,0.0,0.0
1,54.0,70.0,1.017439,1.02449,0.438017,1.0,1.0,0.0,0.0,233.0,50.1,1.9,137.255869,4.754245,11.7,39.165939,1.0,1.0,0.0,0.0
2,47.0,80.0,1.017439,1.02449,0.438017,1.0,1.0,0.0,0.0,114.0,87.0,5.2,139.0,3.7,12.1,39.165939,1.0,1.0,1.0,0.0
3,43.0,60.0,1.025,0.0,0.0,0.0,2.0,0.0,0.0,108.0,25.0,1.0,144.0,5.0,17.8,43.0,2.0,2.0,0.0,0.0
4,42.0,100.0,1.015,4.0,0.0,0.0,0.0,0.0,1.0,150.186235,50.0,1.4,129.0,4.0,11.1,39.0,3.0,3.0,1.0,0.0


In [6]:
columns_to_exclude = ['id', 'htn', 'dm', 'cad', 'ane', 'classification']

# Pemilihan fitur numerik
fitur_numerik = df.select_dtypes(include=['float64']).columns
fitur_cek = [col for col in fitur_numerik if col not in columns_to_exclude]

# Kalkulasi Z-Scores
z_scores = np.abs(zscore(df[fitur_cek]))

# Tetapkan threshold untuk pengecekan outliers
outlier_threshold = 3
outliers = (z_scores > outlier_threshold).any(axis=1)

# Hitung jumlah data yang dihapus
num_removed_data_points = outliers.sum()

# Jumlah data yang dihapus
df_no_outliers = df[~outliers]

print(f"Jumlah outliers yang dihapus: {num_removed_data_points}")

Jumlah outliers yang dihapus: 0


In [7]:
# Scaling Min-Max
scaler = MinMaxScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df_processed), columns=fitur)

# Masukan kembali data yang telah di drop sebelumnya
df_scaled_last = pd.concat([df_scaled, df[columns_to_exclude].reset_index(drop=True)], axis=1)
df_scaled_last = df_scaled_last.drop('id', axis=1)

df_scaled_last.head()

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wc,rc,appet,pe,htn,dm,cad,ane,classification
0,0.681818,0.153846,1.0,0.6,0.0,0.0,0.0,0.0,0.0,0.12381,...,0.666667,0.0,0.0,0.0,0.0,0,0,0,0,0
1,0.590909,0.153846,0.621926,0.204898,0.087603,0.5,0.5,0.0,0.0,0.388095,...,0.670354,0.012821,0.020833,0.0,0.0,1,0,0,0,0
2,0.511364,0.230769,0.621926,0.204898,0.087603,0.5,0.5,0.0,0.0,0.104762,...,0.670354,0.012821,0.020833,1.0,0.0,0,1,0,0,0
3,0.465909,0.076923,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.090476,...,0.755556,0.025641,0.041667,0.0,0.0,1,1,0,0,1
4,0.454545,0.384615,0.5,0.8,0.0,0.0,0.0,0.0,0.5,0.19092,...,0.666667,0.038462,0.0625,1.0,0.0,0,1,0,0,0


In [8]:
# Rumus Euclidean Distance
def euclidean_distance(row1, row2):
    return np.sqrt(np.sum((row1 - row2) ** 2))

# Algoritma KSVNN
class KSVNN:
    def __init__(self, k, threshold):
        self.k = k
        self.threshold = threshold
        self.support_vectors = []

    def fit(self, X, y):
        n_samples = len(X)
        self.X_train = X
        self.y_train = y
        significance_degrees = np.zeros(n_samples)

        # Menghitung SD atau Significant Degree untuk masing-masing data
        for i in range(n_samples):
            distances = [euclidean_distance(X[i], X[j]) for j in range(n_samples) if i != j]
            sorted_indices = np.argsort(distances)[:self.k]

            # Menyesuaikan LV dan RV
            lv = sum(y[sorted_indices] == y[i])
            rv = self.k - lv
            significance_degrees[i] = lv / self.k if self.k != 0 else 0

            # Pemilihan Support Vectors
            if significance_degrees[i] >= self.threshold:
                self.support_vectors.append((X[i], y[i]))

    def predict(self, X):
        predictions = []
        for row in X:
            # Temukan Support Vectors terdekat
            nearest_support_vector = min(self.support_vectors, key=lambda sv: euclidean_distance(row, sv[0]))
            predictions.append(nearest_support_vector[1])
        return np.array(predictions)

In [9]:
from sklearn.feature_selection import VarianceThreshold

target_variable = "classification"
threshold_value = 0.13

# SKLearn - Variance Threshold
def select_features_variance_threshold(df, target_variable, threshold=threshold_value):
    # Pisahkan fitur dengan label
    features = df.drop(target_variable, axis=1)
    
    # Start VarianceThreshold menggunakan threshold yang telah ditentukan
    selector = VarianceThreshold(threshold=threshold)
    
    # Fit dan transform
    selected_features = selector.fit_transform(features)
    
    selected_feature_names = features.columns[selector.get_support()].tolist()
    
    # Print variances of the selected features
    variances = selector.variances_
    print("Variance dari seleksi fitur:")
    for feature, variance in zip(selected_feature_names, variances):
        print(f"{feature}: {variance}")
    
    return selected_feature_names

# Ambil fitur dengan statistik diatas threshold
selected_features_variance_threshold = select_features_variance_threshold(df_scaled_last, target_variable, threshold=threshold_value)

df_selected_variance_threshold = df_scaled_last[selected_features_variance_threshold]

print("\nFitur yang terseleksi menggunakan variance threshold:")
print(selected_features_variance_threshold)

Variance dari seleksi fitur:
pc: 0.03859401832671462
appet: 0.011596634985791953
pe: 0.07164355240046771
htn: 0.0659790087463557
dm: 0.039652892561983476
cad: 0.11229272959183673

Fitur yang terseleksi menggunakan variance threshold:
['pc', 'appet', 'pe', 'htn', 'dm', 'cad']


In [10]:
from sklearn.model_selection import train_test_split

# Pemilihan fitur
selected_features = selected_features_variance_threshold

# Hanya pilih beberapa fitur relevan
selected_data = df_scaled_last[selected_features + ['classification']]

# Pisahkan fitur dan label
features = selected_data.drop(columns=['classification']).to_numpy()
labels = selected_data['classification'].to_numpy()

# Potong data menjadi train dan test
test_size = 0.2
random_state = 42
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=test_size, random_state=random_state)

# Buatkan DataFrame-nya
train_data = pd.DataFrame(X_train, columns=selected_features)
train_data['classification'] = y_train

test_data = pd.DataFrame(X_test, columns=selected_features)
test_data['classification'] = y_test

num_train_data = len(train_data)

train_data = train_data.sample(frac=1).reset_index(drop=True)

print("\nData yang dipakai untuk training: {}".format(num_train_data))

print("\nTraining Data:")
train_data.head()


Data yang dipakai untuk training: 224

Training Data:


Unnamed: 0,pc,appet,pe,htn,dm,cad,classification
0,0.5,1.0,1.0,0.0,0.0,0.0,0
1,0.5,0.0,1.0,0.0,0.0,0.0,0
2,1.0,1.0,0.0,1.0,0.0,0.0,0
3,1.0,0.0,0.0,1.0,1.0,0.0,0
4,1.0,0.0,0.0,1.0,1.0,1.0,0


In [11]:
# Use the selected features from Variance Threshold
selected_features = selected_features_variance_threshold

# Bagi data menjadi fitur dan label
features = train_data[selected_features].to_numpy()
labels = train_data['classification'].to_numpy()

# Training model
ksvnn = KSVNN(k=3, threshold=0.5)
ksvnn.fit(features, labels)

# Persiapan untuk Data Testing
test_features = test_data[selected_features].to_numpy()
test_labels = test_data['classification'].to_numpy()

# Prediksi untuk Data Testing
predictions = ksvnn.predict(test_features)

# DataFrame untuk Data Testing
test_data_with_predictions = test_data.copy()
test_data_with_predictions['prediksi_kelas'] = predictions

# Print Support Vectors
print("\nJumlah Support Vectors:", len(ksvnn.support_vectors))

# Print DataFrame without 'classification' column
print("\nHasil Testing:")
test_data_with_predictions.drop('classification', axis=1).head()


Jumlah Support Vectors: 192

Hasil Testing:


Unnamed: 0,pc,appet,pe,htn,dm,cad,prediksi_kelas
0,1.0,1.0,1.0,1.0,1.0,0.0,0
1,1.0,1.0,0.0,1.0,0.0,0.0,0
2,1.0,0.0,0.0,0.0,1.0,0.0,0
3,1.0,0.0,0.0,1.0,1.0,0.0,1
4,0.0,1.0,1.0,1.0,1.0,0.0,0


In [12]:
from sklearn.metrics import classification_report, accuracy_score

classification_report_result = classification_report(test_labels, predictions)

accuracy_percentage = accuracy_score(test_labels, predictions) * 100

print("Classification Report:\n", classification_report_result)

print("Persentase akurasi model: {:.2f}%".format(accuracy_percentage))

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.90      0.95        29
           1       0.90      1.00      0.95        27

    accuracy                           0.95        56
   macro avg       0.95      0.95      0.95        56
weighted avg       0.95      0.95      0.95        56

Persentase akurasi model: 94.64%
