In [10]:
import pandas as pd
import numpy as np


# Load the semicolon-separated iris dataset.
df = pd.read_csv("../data iris.csv", sep=";")

# Preview the first five rows.
df.head(5)



Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


## Data Cleaning

In [11]:
# Compute Q1, Q3 and the IQR for every column except the last one
# (the last column is excluded from the outlier check).
checked_columns = df.iloc[:, :-1]
Q1 = checked_columns.quantile(0.25)
Q3 = checked_columns.quantile(0.75)
IQR = Q3 - Q1

# Lower and upper limits: 1.5 * IQR below Q1 and above Q3.
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

print(f"Jumlah data sebelum menghapus outlier: {df.shape[0]}")

# A row is dropped when at least one checked column lies outside its limits.
is_outlier = checked_columns.lt(lower_bound) | checked_columns.gt(upper_bound)
df = df[~is_outlier.any(axis=1)]

print(f"Jumlah data setelah menghapus outlier: {df.shape[0]}")


Jumlah data sebelum menghapus outlier: 150
Jumlah data setelah menghapus outlier: 146


In [12]:
# Remove fully duplicated rows. Rebinding (instead of inplace=True) avoids
# mutating a frame that was produced by a boolean filter and keeps the
# operation chainable.
df = df.drop_duplicates()
df.shape

(143, 5)

In [13]:
# Count the missing values in each column.
missing_per_column = df.isna().sum()
missing_per_column

sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
class           0
dtype: int64

In [14]:

# Draw 10 rows at random; the fixed seed makes the draw repeatable.
sample_data = df.sample(10, random_state=42)

print(sample_data)

     sepal_length  sepal_width  petal_length  petal_width            class
123           6.3          2.7           4.9          1.8   Iris-virginica
20            5.4          3.4           1.7          0.2      Iris-setosa
88            5.6          3.0           4.1          1.3  Iris-versicolor
103           6.3          2.9           5.6          1.8   Iris-virginica
62            6.0          2.2           4.0          1.0  Iris-versicolor
12            4.8          3.0           1.4          0.1      Iris-setosa
137           6.4          3.1           5.5          1.8   Iris-virginica
71            6.1          2.8           4.0          1.3  Iris-versicolor
72            6.3          2.5           4.9          1.5  Iris-versicolor
19            5.1          3.8           1.5          0.3      Iris-setosa


In [15]:
# The target is the "class" column; the features are every other column.
y = df["class"]
X = df.drop("class", axis=1)

y

0         Iris-setosa
1         Iris-setosa
2         Iris-setosa
3         Iris-setosa
4         Iris-setosa
            ...      
145    Iris-virginica
146    Iris-virginica
147    Iris-virginica
148    Iris-virginica
149    Iris-virginica
Name: class, Length: 143, dtype: object

In [16]:
def train_test_split_manual(X, y, test_size=0.2, random_state=None, shuffle=True):
    """
    Split a dataset into a training set and a test set.

    Parameters:
    X : numpy array or pandas DataFrame
        Features of the dataset, one row per sample.
    y : numpy array or pandas Series
        Labels of the dataset; must have the same length as X.
    test_size : float
        Proportion of the samples placed in the test set (default 0.2 / 20%).
        The test set holds int(len(X) * test_size) samples, i.e. the product
        is truncated towards zero.
    random_state : int, optional
        Seed handed to np.random.seed so the split is repeatable. Any integer
        is honoured, including 0. Note that this reseeds NumPy's global random
        generator. When None, the current global generator state is used.
    shuffle : bool, optional
        If True (default), the sample order is shuffled before splitting.
        If False, the original order is kept and the first samples form the
        test set.

    Returns:
    X_train, X_test, y_train, y_test : numpy arrays
        The data split into training and test sets.

    Raises:
    ValueError
        If X and y do not contain the same number of samples.
    """
    # Work on NumPy arrays so positional (integer) indexing behaves the same
    # for DataFrame / Series inputs regardless of their index labels.
    X = np.asarray(X)
    y = np.asarray(y)

    if len(X) != len(y):
        raise ValueError(
            f"X and y must have the same number of samples (got {len(X)} and {len(y)})"
        )

    # `is not None` rather than truthiness: 0 is a valid seed.
    if random_state is not None:
        np.random.seed(random_state)

    indices = np.arange(len(X))
    if shuffle:
        np.random.shuffle(indices)

    # The first n_test positions become the test set, the remainder the training set.
    n_test = int(len(X) * test_size)
    test_indices, train_indices = indices[:n_test], indices[n_test:]

    return X[train_indices], X[test_indices], y[train_indices], y[test_indices]



# # Example usage
# data = np.array([[1, 2], [2, 3], [3, 1], [6, 5], [7, 7], [8, 6], [5, 5], [4, 4]])
# labels = np.array([0, 0, 0, 1, 1, 1, 1, 0])

# X_train, X_test, y_train, y_test = train_test_split_manual(data, labels, test_size=0.2, random_state=42)

# print("X_train:", X_train)
# print("X_test:", X_test)
# print("y_train:", y_train)
# print("y_test:", y_test)


In [17]:
class KNN:
    """k-nearest-neighbours classifier (Euclidean distance, majority vote).

    Labels of any sortable type (strings, integers, floats, negative values)
    are accepted: `fit` encodes them as consecutive integers internally and
    `predict` decodes the result back to the original label values.

    Attributes set by `fit`:
        X_train: training features as a NumPy array.
        y_train: integer-encoded training labels.
        label_map: dict mapping original label -> integer code.
        reverse_label_map: dict mapping integer code -> original label.
    """

    def __init__(self, k=3):
        """Create an unfitted classifier.

        Parameters:
            k: number of neighbours that vote on each prediction (default 3).

        Raises:
            ValueError: if k is smaller than 1.
        """
        if k < 1:
            raise ValueError(f"k must be at least 1 (got {k})")
        self.k = k
        self.label_map = None
        self.reverse_label_map = None

    def fit(self, X_train, y_train):
        """Store the training data and build the label encoding.

        Parameters:
            X_train: array-like of training samples, one row per sample.
            y_train: array-like of labels, one per row of X_train.

        Raises:
            ValueError: if the training set is empty or X_train and y_train
                differ in length.
        """
        # Convert first so pandas inputs are indexed by position, not by label.
        X_train = np.asarray(X_train)
        y_train = np.asarray(y_train).ravel()

        if len(X_train) == 0:
            raise ValueError("cannot fit on an empty training set")
        if len(X_train) != len(y_train):
            raise ValueError(
                f"X_train and y_train must have the same length "
                f"(got {len(X_train)} and {len(y_train)})"
            )

        self.X_train = X_train

        # np.unique returns the sorted distinct labels plus, for every sample,
        # the position of its label in that sorted array. Those positions are
        # non-negative integers, which is what np.bincount needs in _predict.
        unique_labels, encoded = np.unique(y_train, return_inverse=True)
        self.label_map = {label: idx for idx, label in enumerate(unique_labels)}
        self.reverse_label_map = {idx: label for label, idx in self.label_map.items()}
        self.y_train = encoded.reshape(-1)

    def predict(self, X_test):
        """Predict a label for every row of X_test.

        Parameters:
            X_test: array-like of samples, iterated along its first axis.

        Returns:
            NumPy array holding one predicted label (in the original label
            values passed to `fit`) per sample.
        """
        # Without the conversion, iterating a DataFrame would yield column names.
        X_test = np.asarray(X_test)
        encoded_predictions = [self._predict(x) for x in X_test]
        return np.array([self.reverse_label_map[code] for code in encoded_predictions])

    def _predict(self, x):
        """Return the integer-encoded majority label among the k nearest training samples."""
        distances = np.linalg.norm(self.X_train - x, axis=1)
        k_indices = np.argsort(distances)[:self.k]
        k_nearest_labels = self.y_train[k_indices]
        # argmax returns the first maximum, so a tied vote goes to the label
        # with the smallest code, i.e. the one that sorts first.
        most_common = np.bincount(k_nearest_labels).argmax()
        return most_common

    def accuracy(self, X_test, y_test):
        """Return the fraction of samples in X_test whose prediction equals y_test.

        Labels in y_test that never occurred during training are simply
        counted as misclassified.

        Raises:
            ValueError: if X_test and y_test contain a different number of samples.
        """
        y_pred = self.predict(X_test)
        y_true = np.asarray(y_test).ravel()

        if y_pred.shape != y_true.shape:
            raise ValueError(
                f"X_test and y_test must have the same length "
                f"(got {len(y_pred)} and {len(y_true)})"
            )

        # Predictions are already in the original label values, so they can be
        # compared directly without going through label_map.
        return np.mean(y_pred == y_true)


In [18]:
# Split the dataset: 20% of the samples go to the test set, fixed seed.
X_train, X_test, y_train, y_test = train_test_split_manual(X, y, test_size=0.2, random_state=42)

# Try every odd k from 1 to 29 and remember the best-scoring one.
# A strict ">" keeps the smallest k when several reach the same accuracy.
best_k, best_acc = None, -1.0
for i in range(1, 30, 2):
    candidate = KNN(k=i)
    candidate.fit(X_train, y_train)
    # Accuracy on the held-out split
    acc = candidate.accuracy(X_test, y_test)
    print(f"K = {i} Akurasi: {acc:.2f}")
    if acc > best_acc:
        best_k, best_acc = i, acc

# Bind `knn` to a model trained with the best k so that later cells do not
# silently use whichever k happened to be tried last.
# NOTE: k is chosen on the test split, so best_acc is an optimistic estimate.
knn = KNN(k=best_k)
knn.fit(X_train, y_train)
print(f"K terbaik = {best_k} Akurasi: {best_acc:.2f}")
    
# # Prediksi
# predictions = knn.predict(X_test)
# print("Prediksi:", predictions)
    
# # Akurasi
# acc = knn.accuracy(X_test, y_test)
# print("Akurasi:", acc)

K = 1 Akurasi: 0.86
K = 3 Akurasi: 0.86
K = 5 Akurasi: 0.89
K = 7 Akurasi: 0.93
K = 9 Akurasi: 0.96
K = 11 Akurasi: 0.96
K = 13 Akurasi: 0.93
K = 15 Akurasi: 0.96
K = 17 Akurasi: 0.96
K = 19 Akurasi: 0.93
K = 21 Akurasi: 0.89
K = 23 Akurasi: 0.89
K = 25 Akurasi: 0.89
K = 27 Akurasi: 0.89
K = 29 Akurasi: 0.86


In [21]:
# Interactive prediction from manually entered feature values
def _read_float(prompt):
    """Prompt repeatedly until the entered text can be parsed by float()."""
    while True:
        raw = input(prompt)
        try:
            return float(raw)
        except ValueError:
            # A typo should not abort the whole prompt sequence; ask again.
            print("Input tidak valid, masukkan angka.")


def predict_manual(knn_model):
    """Ask for four feature values on stdin and print the model's prediction.

    Parameters:
        knn_model: object with a `predict` method that accepts a 2-D array
            holding one row of four values and returns an indexable result.

    Returns:
        None. The first element of the prediction is printed.
    """
    print("\nMasukkan nilai fitur untuk prediksi:")
    sepal_length = _read_float("Sepal Length: ")
    sepal_width = _read_float("Sepal Width: ")
    petal_length = _read_float("Petal Length: ")
    petal_width = _read_float("Petal Width: ")

    # predict expects a batch, so wrap the single sample in an outer list.
    input_data = np.array([[sepal_length, sepal_width, petal_length, petal_width]])
    prediction = knn_model.predict(input_data)

    print("\n=== Hasil Prediksi ===")
    print(f"Jenis Iris: {prediction[0]}")

# Run the interactive prediction with the model currently bound to `knn`
# (this cell blocks on input() until all four values are entered).
predict_manual(knn)



Masukkan nilai fitur untuk prediksi:

=== Hasil Prediksi ===
Jenis Iris: Iris-virginica
