In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, LeaveOneOut
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

dataset = pd.read_csv('../files/milk.csv')

# a. Hold-out Method (70%-30%)
train_data, test_data, train_label, test_label = train_test_split(dataset.iloc[:, :-1], dataset.iloc[:, -1], test_size=0.3, random_state=100)

# b. K-Fold Cross Validation (k=10)
kf = KFold(n_splits=10, shuffle=True, random_state=0)

# c. Leave-One-Out (LOO)
loo = LeaveOneOut()

print("Hold-out Method (70%-30%)")
print("Train data:", train_data)
print("Train label:", train_data)
print("Test data:", test_data)
print("Test label:", test_label)

print("\nK-Fold Cross Validation (k=10):\n", kf)

print("\nLeave-One-Out (LOO):\n", loo)

Hold-out Method (70%-30%)
Train data:       pH  Temprature  Taste  Odor  Fat   Turbidity  Colour
255  6.8          40      1     0     1          0     245
662  4.7          38      1     0     1          0     255
899  6.6          43      0     0     1          0     250
380  6.6          38      1     0     1          0     255
954  6.6          37      1     0     1          0     255
..   ...         ...    ...   ...   ...        ...     ...
802  6.6          45      0     1     1          1     250
53   8.1          66      1     0     1          1     255
350  6.5          38      1     0     0          0     255
79   6.8          34      0     0     0          1     240
792  3.0          40      1     1     1          1     255

[741 rows x 7 columns]
Train label:       pH  Temprature  Taste  Odor  Fat   Turbidity  Colour
255  6.8          40      1     0     1          0     245
662  4.7          38      1     0     1          0     255
899  6.6          43      0     0     1 

In [4]:
# Normalisasi Data Pelatihan (train_data)
scaler = MinMaxScaler()
train_data_normalized = scaler.fit_transform(train_data)

# Normalisasi Data Pengujian (test_data)
test_data_normalized = scaler.transform(test_data)

# Langkah 6: Menampilkan hasil
print("Train data ternormalisasi:\n", train_data_normalized)
print("\nTest data ternormalisasi:\n", test_data_normalized)

Train data ternormalisasi:
 [[0.58461538 0.10714286 1.         ... 1.         0.         0.33333333]
 [0.26153846 0.07142857 1.         ... 1.         0.         1.        ]
 [0.55384615 0.16071429 0.         ... 1.         0.         0.66666667]
 ...
 [0.53846154 0.07142857 1.         ... 0.         0.         1.        ]
 [0.58461538 0.         0.         ... 0.         1.         0.        ]
 [0.         0.10714286 1.         ... 1.         1.         1.        ]]

Test data ternormalisasi:
 [[0.55384615 0.19642857 0.         ... 0.         1.         0.66666667]
 [0.4        0.28571429 0.         ... 1.         1.         1.        ]
 [0.58461538 0.19642857 0.         ... 1.         1.         1.        ]
 ...
 [0.58461538 0.19642857 0.         ... 0.         1.         1.        ]
 [0.92307692 0.16071429 1.         ... 1.         1.         0.66666667]
 [0.78461538 0.57142857 1.         ... 1.         1.         1.        ]]


In [5]:
# Klasifikasi dengan k-NN (k=7, weights='distance') untuk setiap pendekatan validasi
# Fungsi untuk klasifikasi k-NN dan menghitung akurasi
def k_nn_classification(train_data, train_label, test_data, test_label, k):
    knn = KNeighborsClassifier(n_neighbors=7, weights='distance')
    knn.fit(train_data, train_label)
    predictions = knn.predict(test_data)
    accuracy = accuracy_score(test_label, predictions)
    error_ratio = (1 - accuracy) * 100
    return accuracy, error_ratio

# Klasifikasi dengan k-NN untuk Hold-out Method (70%-30%)
accuracy_holdout, error_ratio_holdout = k_nn_classification(train_data_normalized, train_label, test_data_normalized, test_label, k=3)

# Klasifikasi dengan k-NN untuk K-Fold Cross Validation (k=10)
accuracies_kfold = []
error_ratios_kfold = []
for train_index, test_index in kf.split(train_data_normalized):
    train_fold_data, test_fold_data = train_data_normalized[train_index], train_data_normalized[test_index]
    train_fold_label, test_fold_label = train_label.iloc[train_index], train_label.iloc[test_index]
    accuracy_fold, error_ratio_fold = k_nn_classification(train_fold_data, train_fold_label, test_fold_data, test_fold_label, k=3)
    accuracies_kfold.append(accuracy_fold)
    error_ratios_kfold.append(error_ratio_fold)
mean_accuracy_kfold = np.mean(accuracies_kfold)
mean_error_ratio_kfold = np.mean(error_ratios_kfold)

# Klasifikasi dengan k-NN untuk Leave-One-Out (LOO)
accuracies_loo = []
error_ratios_loo = []
for train_index, test_index in loo.split(train_data_normalized):
    train_loo_data, test_loo_data = train_data_normalized[train_index], train_data_normalized[test_index]
    train_loo_label, test_loo_label = train_label.iloc[train_index], train_label.iloc[test_index]
    accuracy_loo, error_ratio_loo = k_nn_classification(train_loo_data, train_loo_label, test_loo_data, test_loo_label, k=3)
    accuracies_loo.append(accuracy_loo)
    error_ratios_loo.append(error_ratio_loo)
mean_accuracy_loo = np.mean(accuracies_loo)
mean_error_ratio_loo = np.mean(error_ratios_loo)

# Langkah 6: Menampilkan hasil
print("Akurasi menggunakan Hold-out Method (70%-30%):", accuracy_holdout)
print("Error ratio menggunakan Hold-out Method (70%-30%):", error_ratio_holdout, "\n")

print("Rata-rata akurasi menggunakan K-Fold Cross Validation (k=10):", mean_accuracy_kfold)
print("Rata-rata error ratio menggunakan K-Fold Cross Validation (k=10):", mean_error_ratio_kfold, "\n")

print("Rata-rata akurasi menggunakan Leave-One-Out (LOO):", mean_accuracy_loo)
print("Rata-rata error ratio menggunakan Leave-One-Out (LOO):", mean_error_ratio_loo, "\n")

Akurasi menggunakan Hold-out Method (70%-30%): 0.9937106918238994
Error ratio menggunakan Hold-out Method (70%-30%): 0.6289308176100628 

Rata-rata akurasi menggunakan K-Fold Cross Validation (k=10): 0.9986486486486486
Rata-rata error ratio menggunakan K-Fold Cross Validation (k=10): 0.13513513513513487 

Rata-rata akurasi menggunakan Leave-One-Out (LOO): 0.9986504723346828
Rata-rata error ratio menggunakan Leave-One-Out (LOO): 0.1349527665317139 

