<a href="https://colab.research.google.com/github/asrafulasf72/ml_lab/blob/main/221002172_CSE412_222D3_LabReport02_knnFromScratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
from collections import Counter
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

In [2]:
def accuracy_score_custom(y_true, y_pred):
    correct = np.sum(y_true == y_pred)
    return correct / len(y_true)

def confusion_matrix_custom(y_true, y_pred):
    unique_classes = np.unique(np.concatenate((y_true, y_pred)))
    matrix = np.zeros((len(unique_classes), len(unique_classes)), dtype=int)
    for i, actual in enumerate(unique_classes):
        for j, predicted in enumerate(unique_classes):
            matrix[i, j] = np.sum((y_true == actual) & (y_pred == predicted))
    return matrix, unique_classes

def precision_score_custom(y_true, y_pred):
    cm, _ = confusion_matrix_custom(y_true, y_pred)
    precision_per_class = []
    for i in range(len(cm)):
        tp = cm[i, i]
        fp = np.sum(cm[:, i]) - tp
        precision_per_class.append(tp / (tp + fp) if (tp + fp) > 0 else 0)
    return np.mean(precision_per_class)

def recall_score_custom(y_true, y_pred):
    cm, _ = confusion_matrix_custom(y_true, y_pred)
    recall_per_class = []
    for i in range(len(cm)):
        tp = cm[i, i]
        fn = np.sum(cm[i, :]) - tp
        recall_per_class.append(tp / (tp + fn) if (tp + fn) > 0 else 0)
    return np.mean(recall_per_class)

def f1_score_custom(y_true, y_pred):
    prec = precision_score_custom(y_true, y_pred)
    rec = recall_score_custom(y_true, y_pred)
    return 2 * (prec * rec) / (prec + rec) if (prec + rec) > 0 else 0

In [3]:
class KNN_Custom:
    def __init__(self, k=3):
        self.k = k

    def fit(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train

    def predict(self, X_test):
        predictions = [self._predict_one(x) for x in X_test]
        return np.array(predictions)

    def _predict_one(self, x):
        distances = [np.sqrt(np.sum((x - x_train)**2)) for x_train in self.X_train]
        k_indices = np.argsort(distances)[:self.k]
        k_labels = [self.y_train[i] for i in k_indices]
        most_common = Counter(k_labels).most_common(1)[0][0]
        return most_common

In [4]:
iris = datasets.load_iris()
X_iris = iris.data
y_iris = iris.target

In [5]:
news_data = np.array([
    "Sports game tonight",
    "Government election results",
    "New smartphone released",
    "Team wins championship",
    "Prime minister speech",
    "Tech conference 2025",
    "Player scores record",
    "Parliament passes law",
    "AI breakthrough announced"
])
news_labels = np.array([
    "Sports", "Politics", "Technology",
    "Sports", "Politics", "Technology",
    "Sports", "Politics", "Technology"
])


In [6]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
X_news = vectorizer.fit_transform(news_data).toarray()

In [10]:
def evaluate_knn(dataset_name, X, y, k_values, split_ratios):
    print(f"\n=== Dataset: {dataset_name} ===")
    best_score = 0
    best_k = None
    best_split = None

    for split in split_ratios:
        for k in k_values:
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1-split, random_state=42)

            # Custom KNN
            knn_custom = KNN_Custom(k=k)
            knn_custom.fit(X_train, y_train)
            y_pred_custom = knn_custom.predict(X_test)

            acc = accuracy_score_custom(y_test, y_pred_custom)

            if acc > best_score:
                best_score = acc
                best_k = k
                best_split = split

    print(f"Best k: {best_k}, Best Split: {best_split}, Best Accuracy: {best_score:.4f}")
    return X, y, best_k, best_split

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1-best_split, random_state=42)
knn_custom = KNN_Custom(k=best_k)
knn_custom.fit(X_train, y_train)
y_pred_custom = knn_custom.predict(X_test)

print("\nCustom KNN Results:")
print("Accuracy:", accuracy_score_custom(y_test, y_pred_custom))
print("Precision:", precision_score_custom(y_test, y_pred_custom))
print("Recall:", recall_score_custom(y_test, y_pred_custom))
print("F1 Score:", f1_score_custom(y_test, y_pred_custom))
cm, classes = confusion_matrix_custom(y_test, y_pred_custom)
print("Confusion Matrix:\n", cm)


Custom KNN Results:
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0
Confusion Matrix:
 [[19  0  0]
 [ 0 14  0]
 [ 0  0 13]]


In [15]:
knn_sklearn = KNeighborsClassifier(n_neighbors=best_k)
knn_sklearn.fit(X_train, y_train)
y_pred_sklearn = knn_sklearn.predict(X_test)

print("\nScikit-learn KNN Results:")
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
print("Accuracy:", accuracy_score(y_test, y_pred_sklearn))
print("Precision:", precision_score(y_test, y_pred_sklearn, average='macro'))
print("Recall:", recall_score(y_test, y_pred_sklearn, average='macro'))
print("F1 Score:", f1_score(y_test, y_pred_sklearn, average='macro'))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_sklearn))


Scikit-learn KNN Results:
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0
Confusion Matrix:
 [[19  0  0]
 [ 0 14  0]
 [ 0  0 13]]


In [16]:
k_values = range(1, 11)
split_ratios = [0.6, 0.7, 0.8]

evaluate_knn("Iris", X_iris, y_iris, k_values, split_ratios)
evaluate_knn("News", X_news, news_labels, k_values, split_ratios)


=== Dataset: Iris ===
Best k: 1, Best Split: 0.7, Best Accuracy: 1.0000

=== Dataset: News ===
Best k: 3, Best Split: 0.7, Best Accuracy: 0.3333


(array([[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         1, 0, 0, 1, 0],
        [0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
         0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
         0, 0, 0, 0, 0],
        [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 1, 0, 0, 1],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
         0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 1, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0,
         0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0],
        [0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0]]),
 array(['Sports', 'Politics', 'Technology', 'Sports', 'Politics',
        'Technology', 'Sports', 