<a href="https://colab.research.google.com/github/anmolaman20/DataScience_DailyLearning/blob/main/knn_from_scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [150]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_breast_cancer

import warnings


def warn(*args, **kwargs):
    """Accept any arguments and do nothing; used as a silent stand-in for ``warnings.warn``."""
    return None


# Replace the library-wide warning emitter with the no-op above so that
# code calling ``warnings.warn`` through the module stays quiet in this notebook.
warnings.warn = warn

In [151]:
# Load the breast-cancer dataset. Despite the `_df` suffix, the cells below do
# not treat this object as a DataFrame: they read its .target_names,
# .feature_names, .data and .target attributes directly.
breast_cancer_df = load_breast_cancer()
# Display the dataset's description text as the cell output.
breast_cancer_df.DESCR



In [152]:
# Class names; the recorded output lists 'malignant' first and 'benign' second.
# NOTE(review): presumably position i names target value i — confirm against the dataset description.
breast_cancer_df.target_names

array(['malignant', 'benign'], dtype='<U9')

In [153]:
# Names of the feature columns (30 entries in the recorded output, matching X's second dimension).
breast_cancer_df.feature_names

array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error',
       'fractal dimension error', 'worst radius', 'worst texture',
       'worst perimeter', 'worst area', 'worst smoothness',
       'worst compactness', 'worst concavity', 'worst concave points',
       'worst symmetry', 'worst fractal dimension'], dtype='<U23')

In [154]:
# Feature matrix and label vector used by every later cell.
X = breast_cancer_df.data
y = breast_cancer_df.target
# Disabled: attaching column names only makes sense if X is a DataFrame.
# NOTE(review): X appears to be a plain array here (see the shape/array outputs below) — confirm before re-enabling.
# X.columns = breast_cancer_df.feature_names

In [155]:
# Sanity check: the number of rows in X must equal the number of labels in y.
X.shape, y.shape

((569, 30), (569,))

In [156]:
# Display the raw label vector. This dumps every label; a slice such as y[:20]
# would give a lighter preview if the full listing is not needed.
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0,

In [157]:
# Distinct label values present in y (the recorded output shows a binary 0/1 target).
np.unique(y)

array([0, 1])

In [158]:
# Hold out 20% of the rows for testing; random_state is fixed so the split is
# the same on every run.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state = 42)


In [159]:
# Standardise the features. The scaler is fitted on the training split only and
# then re-used (transform, not fit_transform) on the test split, so no
# statistics from the test rows enter the preprocessing.
scaler = StandardScaler()

X_train_Scaled = scaler.fit_transform(X_train)
X_test_Scaled = scaler.transform(X_test)

In [160]:
# Baseline: sweep k = 1..19 with scikit-learn's KNeighborsClassifier, keep the
# accuracy for every k in `score_store`, and remember the best one.
# NOTE(review): every score is computed on the test split, so `max_score` is
# an optimistic estimate; a validation split or cross-validation would be needed
# for an unbiased choice of k.
score_store = []
max_score = 0
for i in range(1, 20):
  # fit() returns the estimator itself, so construction and fitting can be chained.
  knn = KNeighborsClassifier(n_neighbors=i).fit(X_train_Scaled, y_train)
  y_predicted = knn.predict(X_test_Scaled)

  score = accuracy_score(y_test, y_predicted)
  score_store.append(score)

  # Keeps the earlier value on ties, exactly like a strict ">" comparison.
  max_score = max(max_score, score)

print(max_score)

0.9473684210526315


In [161]:
# plt.plot(range(1,20),score_store)

In [163]:
# Code From Scratch
from collections import Counter

class Knn:
  """Minimal k-nearest-neighbours classifier (Euclidean distance, majority vote).

  Usage: ``Knn(n_neighbors=k)``, then ``train(X, y)``, then ``predict(X_new)``.
  All state lives on the instance; nothing is read from module-level variables.
  """

  def __init__(self,n_neighbors=5):
    """Create an untrained classifier.

    Args:
      n_neighbors: how many of the closest training rows vote on each prediction.
    """
    self.n_neighbors = n_neighbors
    self.X_train = None
    self.y_train = None

  def train(self,X_train,y_train):
    """Store the training data; no model is fitted, all work happens in ``predict``.

    Args:
      X_train: 2-D array-like, one row per training sample.
      y_train: 1-D array-like of labels, one per row of ``X_train``.

    Raises:
      ValueError: if the number of rows and the number of labels differ.
    """
    # Coerce to arrays so that neighbour indices are always positional
    # (a pandas Series with a custom index would otherwise be looked up by label).
    X_train = np.asarray(X_train)
    y_train = np.asarray(y_train)
    if len(X_train) != len(y_train):
      raise ValueError(
        f"X_train has {len(X_train)} rows but y_train has {len(y_train)} labels"
      )
    self.X_train = X_train
    self.y_train = y_train

  def predict(self,X_test):
    """Predict a label for every row of ``X_test``.

    Args:
      X_test: 2-D array-like, one row per sample to classify.

    Returns:
      A list with one predicted label per input row, in input order.

    Raises:
      ValueError: if called before ``train``.
    """
    if self.X_train is None or self.y_train is None:
      raise ValueError("Knn.predict() called before train(): no training data stored")

    y_predicted = []
    for sample in np.asarray(X_test):
      # Distances go through cal_distance so the metric stays a single override point.
      distances = [self.cal_distance(sample, row) for row in self.X_train]
      # sorted() is stable, so equally distant rows keep training-set order.
      nearest = sorted(enumerate(distances), key=lambda pair: pair[1])[:self.n_neighbors]
      y_predicted.append(self.majority_element(nearest))
    return y_predicted


  def cal_distance(self,ptA,ptB):
    """Return the Euclidean (L2) distance between two points."""
    return np.linalg.norm(ptA-ptB)

  def majority_element(self,n_neighbors_values):
    """Return the most frequent label among the given neighbours.

    Args:
      n_neighbors_values: iterable of ``(training_row_index, distance)`` pairs,
        closest first.

    Returns:
      The winning label. On a tie, the label that appears first in the list
      (i.e. belonging to the closer neighbour) wins.
    """
    # Look labels up on the instance, not in a module-level `y_train`, so the
    # vote always uses the labels that were passed to train().
    labels = [self.y_train[index] for index, _ in n_neighbors_values]
    return Counter(labels).most_common(1)[0][0]


# Run the from-scratch classifier with its default n_neighbors=5. Note that
# this rebinds `knn`, which previously held the scikit-learn estimator.
knn = Knn()


knn.train(X_train_Scaled,y_train)
y_predicted = knn.predict(X_test_Scaled)

# Accuracy on the held-out split; the recorded output equals the best
# scikit-learn score printed by the k sweep above (0.9473684210526315).
print(accuracy_score(y_test,y_predicted))



0.9473684210526315
