In [None]:
# from google.colab import drive
import numpy as np

# makes printing more human-friendly
np.set_printoptions(precision=3,suppress=True)

In [None]:
# Load the data
colab=True
if colab:
    from google.colab import drive
    drive.mount('/content/drive')
    with open('/content/drive/MyDrive/Colab Notebooks/iris.csv', 'r') as f:
      data = np.genfromtxt(f,delimiter=',')
else:
    with open('iris.csv', 'r') as f:
        data = np.genfromtxt(f,delimiter=',')

X = data[:,:-1]
y = data[:,-1]
print('num_samples, num_features', X.shape)
print('labels', np.unique(y))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
num_samples, num_features (150, 4)
labels [1. 2. 3.]


In [1]:
# 1a) Perform stratified data partition at a 70/30 ratio to yield Xtrain, ytrain (label), Xtest, and test.
from sklearn.model_selection import train_test_split

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp)
print('train: ', X_train.shape)
print('val: ', X_val.shape)
print('test: ', X_test.shape)

NameError: ignored

In [None]:
# 1b) # of neighbors vary 1-9, knn model for each
from sklearn.neighbors import KNeighborsClassifier

models = {}
for i in [1,3,5,10,20,30,40,50,75,100]:
  knn = KNeighborsClassifier(n_neighbors=i)
  knn.fit(X_train, y_train)
  models[i] = knn

In [None]:
# 1b) accuracy
from sklearn.metrics import accuracy_score

best_neighbor = None
highest_acc = 0
for i in models:
    y_predict = models[i].predict(X_test)
    acc = accuracy_score(y_test, y_predict)
    print(f'Classification Rate using {i} neighbors: {acc*100:.2f}%')

    if acc >= highest_acc:
      best_neighbor = i
      highest_acc = acc

Classification Rate using 1 neighbors: 86.96%
Classification Rate using 3 neighbors: 91.30%
Classification Rate using 5 neighbors: 91.30%
Classification Rate using 10 neighbors: 91.30%
Classification Rate using 20 neighbors: 91.30%
Classification Rate using 30 neighbors: 95.65%
Classification Rate using 40 neighbors: 91.30%
Classification Rate using 50 neighbors: 95.65%
Classification Rate using 75 neighbors: 65.22%
Classification Rate using 100 neighbors: 65.22%


In [None]:
# 1c) Best number of neighbors
nneigs = best_neighbor

nneigs

50

In [None]:
# 1d) Train and evaluate final model on X_train and X_val

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# Compose trainval dataset
X_trainval, y_trainval = np.concatenate((X_train, X_val), axis=0), np.concatenate((y_train, y_val), axis=0)

# Train kNN model on X_trainval, y_trainval with nneigs neighbors
model = KNeighborsClassifier(n_neighbors=nneigs)
model.fit(X_trainval, y_trainval)

# Predict classes of X_test
y_predict = model.predict(X_test)

# Evaluate accuracy of predictions
acc = accuracy_score(y_test, y_predict)

# Compute confusion matrix of predictions
cm = confusion_matrix(y_test, y_predict)

print(f'\nClassification Rate of {nneigs} neighbors: {acc*100:.2f}%')
print(f'Confusion Matrix of {nneigs} neighbors:')
print(cm)


Classification Rate of 50 neighbors: 95.65%
Confusion Matrix of 50 neighbors:
[[7 0 0]
 [0 7 1]
 [0 0 8]]


In [None]:
import numpy as np

class CustomKNNClassifier:
    def __init__(self, n_neighbors):
        self.n_neighbors = n_neighbors

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        y_pred = [self._predict(x) for x in X]
        return np.array(y_pred)

    def _predict(self, x):
        # Compute distances between x and all examples in the training set
        distances = [np.linalg.norm(x - x_train) for x_train in self.X_train]

        # Sort by distance and return indices of the first k neighbors
        k_indices = np.argsort(distances)[:self.n_neighbors]

        # Extract the labels of the k nearest neighbor training samples
        k_nearest_labels = [self.y_train[i] for i in k_indices]

        # Return the most common class label
        most_common = np.bincount(k_nearest_labels).argmax()
        return most_common

# Example usage similar to (1d) using the custom kNN classifier
# Assuming X_trainval, y_trainval are already defined as in (1d)
nneigs = 5  # Replace with the chosen number of neighbors
custom_knn = CustomKNNClassifier(n_neighbors=nneigs)
custom_knn.fit(X_trainval, y_trainval)

# Predict classes of X_test
y_custom_predict = custom_knn.predict(X_test)

# Evaluate accuracy of predictions using scikit-learn's accuracy_score
from sklearn.metrics import accuracy_score
accuracy_custom_knn = accuracy_score(y_test, y_custom_predict)

print(f'Classification Rate of {nneigs} neighbors (Custom kNN): {accuracy_custom_knn*100:.2f}%')

Classification Rate of 5 neighbors (Custom kNN): 91.30%
