# Implementing a From-Scratch K-Nearest Neighbors Algorithm for Binary Classification


### The task of the project is to predict whether a customer will subscribe to a term deposit or not
### The data is taken from the following source: https://archive.ics.uci.edu/dataset/222/bank+marketing
### The dataset that we will use contains 20 input variables plus the target column `y`, and 4119 samples

In [7]:
import pandas as pd

In [8]:
# Load the semicolon-separated bank marketing file into a DataFrame.
df = pd.read_csv('bank-additional.csv', sep=';')

### Check for missing values

In [9]:
# For each column, report whether it contains at least one missing value.
df.isnull().any()

age               False
job               False
marital           False
education         False
default           False
housing           False
loan              False
contact           False
month             False
day_of_week       False
duration          False
campaign          False
pdays             False
previous          False
poutcome          False
emp.var.rate      False
cons.price.idx    False
cons.conf.idx     False
euribor3m         False
nr.employed       False
y                 False
dtype: bool

### Split dataset into input and target data

In [10]:
# Inputs: the first 15 columns by position; target: the last column (`y`).
# NOTE(review): the column listing printed above shows 20 feature columns
# before `y`, so this slice leaves out columns 15-19
# (emp.var.rate, cons.price.idx, cons.conf.idx, euribor3m, nr.employed).
# Confirm that excluding them is intentional.
X = df.iloc[: , :15]
y = df.iloc[: , -1]

### One-hot Encoding for Categorical Variables

In [11]:
# Replace the categorical columns of X with one indicator column per category.
X = pd.get_dummies(X)

### Train Test Split

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

## Implement K-Nearest Neighbors From Scratch

In [44]:
from sklearn.base import BaseEstimator, ClassifierMixin
from collections import Counter
import numpy as np

class knn_fs(BaseEstimator, ClassifierMixin):
    """Brute-force k-nearest-neighbours classifier with a scikit-learn style API.

    Every prediction ranks all stored training samples by distance to the
    query and returns the most common label among the closest ones.

    Parameters
    ----------
    n_neighbors : int, default=1
        Number of nearest training samples that vote on each prediction.
    metric : {'euclidean', 'manhattan'}, default='euclidean'
        Distance used to rank the training samples.

    Attributes
    ----------
    X_train : numpy.ndarray
        Training samples as floats, set by ``fit``.
    y_train : numpy.ndarray
        Training labels, positionally aligned with ``X_train``.
    classes_ : numpy.ndarray
        Sorted unique labels seen during ``fit``.
    """

    def __init__(self, n_neighbors=1, metric='euclidean'):
        # Stored verbatim and validated later: BaseEstimator's
        # get_params/set_params/clone expect untouched constructor arguments.
        self.n_neighbors = n_neighbors
        self.metric = metric

    def fit(self, X, y):
        """Memorise the training data.

        Parameters
        ----------
        X : array-like of numbers
            Training samples, one per row. DataFrames, lists and arrays are
            accepted; values are converted to float.
        y : array-like
            One label per training sample.

        Returns
        -------
        self : knn_fs
            The fitted estimator, so calls can be chained.

        Raises
        ------
        ValueError
            If ``n_neighbors`` is smaller than 1, if ``X`` is empty, or if
            ``X`` and ``y`` have different lengths.
        """
        if self.n_neighbors < 1:
            raise ValueError('n_neighbors must be at least 1')

        # Plain arrays give positional indexing. A pandas object would be
        # looked up by label, which is wrong after a shuffled split.
        # Casting to float also lets boolean indicator columns be subtracted.
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)

        if len(X) == 0:
            raise ValueError('Cannot fit on an empty training set')
        if len(X) != len(y):
            raise ValueError('X and y must contain the same number of samples')

        self.X_train = X
        self.y_train = y
        self.classes_ = np.unique(y)
        return self

    def get_dist(self, a, b):
        """Return the distance between two samples under ``self.metric``.

        Raises
        ------
        ValueError
            If ``self.metric`` is neither 'euclidean' nor 'manhattan'.
        """
        if self.metric not in ('euclidean', 'manhattan'):
            raise ValueError('Please input a valid metric type')

        diff = np.asarray(a, dtype=float) - np.asarray(b, dtype=float)
        if self.metric == 'euclidean':
            return np.linalg.norm(diff)
        return np.sum(np.abs(diff))

    def get_n_nearest(self, example):
        """Return the ``n_neighbors`` training samples closest to ``example``.

        Returns
        -------
        list of (int, float)
            ``(training_index, distance)`` pairs ordered from nearest to
            farthest. Equal distances keep training-set order. If
            ``n_neighbors`` exceeds the training-set size, every training
            sample is returned.
        """
        distances = [
            (index, self.get_dist(example, row))
            for index, row in enumerate(self.X_train)
        ]
        # list.sort is stable, so ties are resolved by training-set order.
        distances.sort(key=lambda pair: pair[1])
        # Slice from 0: the closest sample is itself a neighbour, and exactly
        # n_neighbors entries must be returned.
        return distances[:self.n_neighbors]

    def predict(self, X_test):
        """Predict a label for every row of ``X_test`` by majority vote.

        Parameters
        ----------
        X_test : array-like of numbers
            Query samples, one per row.

        Returns
        -------
        list
            One predicted label per query sample. When several labels share
            the highest vote count, the one met first when walking the
            neighbours from nearest to farthest is returned.
        """
        queries = np.asarray(X_test, dtype=float)
        predictions = []
        for example in queries:
            neighbors = self.get_n_nearest(example)
            votes = Counter(self.y_train[index] for index, _ in neighbors)
            predictions.append(votes.most_common(1)[0][0])
        return predictions

### Import Built-in k-NN from Scikit-Learn for Comparison

In [17]:
from sklearn.neighbors import KNeighborsClassifier

In [18]:
# Reference estimator created with scikit-learn's default settings.
knn = KNeighborsClassifier()

### Find Best Parameters

In [19]:
from sklearn.model_selection import GridSearchCV

In [20]:
# Search space: k from 1 to 19, tried under both supported distance metrics.
param_grid = {
    'n_neighbors': list(range(1, 20)),
    'metric': ['euclidean', 'manhattan'],
}

In [21]:
# Exhaustive search over param_grid, scored by accuracy with 5-fold cross-validation.
grid = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
)

In [22]:
# Run the hyperparameter search on the training split only.
grid.fit (X_train, y_train)

In [23]:
# Parameter combination that achieved the best mean cross-validated score.
grid.best_params_

{'metric': 'manhattan', 'n_neighbors': 16}

### Cross Validate Best Model

In [25]:
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score

In [26]:
# Estimator exposed by the search for the winning parameter combination.
best_model = grid.best_estimator_

In [35]:
# 4-fold cross-validation of the tuned model, scored with micro-averaged F1.
# NOTE(review): this cross-validates on the held-out test split
# (X_test, y_test), so each fold refits a copy of the model on test data
# rather than scoring the model fitted on X_train. Confirm that this
# evaluation protocol is intended.
scores = cross_val_score (best_model, X_test, y_test, cv=4, scoring='f1_micro')

In [36]:
# Average the per-fold scores into a single figure.
scores.mean()

0.912621359223301

### Cross Validate "From Scratch" Model Using Above Parameters

In [49]:
# From-scratch classifier configured with the parameters the grid search selected.
knn_2 = knn_fs(n_neighbors=16, metric='manhattan')


In [50]:
# Same 4-fold, micro-F1 protocol as for the scikit-learn model; `.values`
# hands the underlying NumPy arrays to the from-scratch estimator.
scores_fs = cross_val_score(
    knn_2,
    X_test.values,
    y_test.values,
    cv=4,
    scoring='f1_micro',
)
scores_fs.mean()

0.912621359223301

## Conclusion: Absolutely No Difference in Micro-Averaged F1 Scores