import library

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
np.random.seed(42)
# ==================================================
# CORE LIBRARIES
# ==================================================
import numpy as np
import pandas as pd

# ==================================================
# PREPROCESSING
# ==================================================
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedKFold

# ==================================================
# BALANCING
# ==================================================
from imblearn.over_sampling import SMOTE

# ==================================================
# MODEL
# ==================================================
from sklearn.neighbors import KNeighborsClassifier

# ==================================================
# EVALUATION
# ==================================================
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# ==================================================
# REPRODUCIBILITY
# ==================================================
np.random.seed(42)

load dataset

In [None]:
data_path = "/content/drive/MyDrive/JST_TB/Dataset1/csv_result-chronic_kidney_disease.csv"
data = pd.read_csv(data_path, on_bad_lines='skip')

data.columns = data.columns.str.strip().str.replace("'", "")
data.head()


Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane,class
0,1,48,80,1.02,1,0,?,normal,notpresent,notpresent,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,2,7,50,1.02,4,0,?,normal,notpresent,notpresent,...,38,6000,?,no,no,no,good,no,no,ckd
2,3,62,80,1.01,2,3,normal,normal,notpresent,notpresent,...,31,7500,?,no,yes,no,poor,no,yes,ckd
3,4,48,70,1.005,4,0,normal,abnormal,present,notpresent,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,5,51,80,1.01,2,0,normal,normal,notpresent,notpresent,...,35,7300,4.6,no,no,no,good,no,no,ckd


ENCODING DATA

In [None]:
categorical_cols = ['rbc','pc','pcc','ba','htn','dm','cad','appet','pe','ane']

value_mapping = {
    'yes':1,'no':0,
    'present':1,'notpresent':0,
    'normal':1,'abnormal':0,
    'good':1,'poor':0,
    'ckd':1,'notckd':0,
    '?':np.nan
}

data['class'] = data['class'].replace(value_mapping)
for col in categorical_cols:
    data[col] = data[col].replace(value_mapping)

data = data.apply(pd.to_numeric, errors='coerce')
data = data.drop(columns=['id'])

X = data.drop(columns=['class']).values
y = data['class'].values

  data['class'] = data['class'].replace(value_mapping)
  data[col] = data[col].replace(value_mapping)


IMPUTASI MISSING VALUE

In [None]:
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(X)
np.isnan(X).sum()
pd.DataFrame(X).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0,48.0,80.0,1.02,1.0,0.0,0.809717,1.0,0.0,0.0,121.0,...,15.4,44.0,7800.0,5.2,1.0,1.0,0.0,1.0,0.0,0.0
1,7.0,50.0,1.02,4.0,0.0,0.809717,1.0,0.0,0.0,147.864407,...,11.3,38.0,6000.0,4.702247,0.0,0.0,0.0,1.0,0.0,0.0
2,62.0,80.0,1.01,2.0,3.0,1.0,1.0,0.0,0.0,423.0,...,9.6,31.0,7500.0,4.702247,0.0,1.0,0.0,0.0,0.0,1.0
3,48.0,70.0,1.005,4.0,0.0,1.0,0.0,1.0,0.0,117.0,...,11.2,32.0,6700.0,3.9,1.0,0.0,0.0,0.0,1.0,1.0
4,51.0,80.0,1.01,2.0,0.0,1.0,1.0,0.0,0.0,106.0,...,11.6,35.0,7300.0,4.6,0.0,0.0,0.0,1.0,0.0,0.0


BALANCING DATA (SMOTE)

In [None]:
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

 NORMALISASI

In [None]:
scaler = MinMaxScaler()
X_resampled = scaler.fit_transform(X_resampled)

STRATIFIED K-FOLD CROSS VALIDATION

In [None]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

acc_scores = []
prec_scores = []
rec_scores = []
f1_scores = []

for fold, (train_idx, test_idx) in enumerate(skf.split(X_resampled, y_resampled), 1):
    print(f"\n===== Fold {fold} =====")

    knn_model = KNeighborsClassifier(
        n_neighbors=5,
        metric='euclidean'
    )

    knn_model.fit(
        X_resampled[train_idx],
        y_resampled[train_idx]
    )

    y_pred = knn_model.predict(X_resampled[test_idx])

    acc_scores.append(accuracy_score(y_resampled[test_idx], y_pred))
    prec_scores.append(precision_score(y_resampled[test_idx], y_pred))
    rec_scores.append(recall_score(y_resampled[test_idx], y_pred))
    f1_scores.append(f1_score(y_resampled[test_idx], y_pred))


===== Fold 1 =====

===== Fold 2 =====

===== Fold 3 =====

===== Fold 4 =====

===== Fold 5 =====


 FINAL CV RESULTS

In [None]:
print("\n===================================")
print("RATA-RATA AKURASI  :", np.mean(acc_scores))
print("STD AKURASI        :", np.std(acc_scores))
print("PRECISION (AVG)    :", np.mean(prec_scores))
print("RECALL (AVG)       :", np.mean(rec_scores))
print("F1-SCORE (AVG)     :", np.mean(f1_scores))
print("===================================")


RATA-RATA AKURASI  : 0.9838585858585859
STD AKURASI        : 0.00809604623859572
PRECISION (AVG)    : 1.0
RECALL (AVG)       : 0.9676734693877551
F1-SCORE (AVG)     : 0.9835003181627474
