In [361]:
import pandas as pd
import numpy as np
from sys import maxsize
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [362]:
# Helper function
def minkowski_distance(x, y, p):
    '''
        Minkowski distance of order p between x and y.

        x, y: scalars or array-likes that NumPy can subtract from one another
        p: order of the distance (1 -> Manhattan, 2 -> Euclidean)

        Returns (sum(|x - y| ** p)) ** (1 / p), summed over every element.
        Raises ValueError when p is smaller than 1.
    '''
    if p < 1:
        raise ValueError("p must be at least 1")
    gaps = np.abs(np.subtract(x, y))
    powered_total = np.sum(np.power(gaps, p))
    return np.power(powered_total, (1 / p))

In [363]:
class CustomKNeighborsClassifier:
    '''
        Brute-force k-nearest-neighbours classifier exposing fit / predict /
        kneighbors.

        Training only stores the data; every prediction computes the Minkowski
        distance from the query to all stored rows, keeps the n_neighbors
        closest ones and lets them vote (optionally weighted by 1 / distance).
    '''

    def __init__(self, n_neighbors=2, weights='uniform', metric='minkowski', p=2):
        '''
            n_neighbors: number of nearest training points that vote (> 0)
            weights: ['uniform', 'distance']
            metric: ['manhattan', 'euclidean', 'minkowski']
            p: order of the Minkowski distance. Only used when metric is
               'minkowski'; 'manhattan' forces p = 1 and 'euclidean' forces p = 2.

            Raises ValueError when any argument is outside the supported values.
        '''
        if not n_neighbors > 0:
            raise ValueError('Atleast 1 nearest neighbor must be considered')
        if metric not in ('manhattan', 'euclidean', 'minkowski'):
            raise ValueError('distance_metric can have value of Manhattan, Euclidean or Minkowski only')
        if weights not in ('uniform', 'distance'):
            raise ValueError('weights can have value of uniform or distance only')
        self.n_neighbors = n_neighbors
        self.metric = metric
        self.weights = weights
        if metric == 'manhattan':
            self.p = 1
        elif metric == 'euclidean':
            self.p = 2
        else:
            if p < 1:
                raise ValueError('p value cannot be less than 1 for Minkowski distance')
            self.p = p

    def check_input(self, X, y=None):
        '''
            Validate the container types of X and (when given) y.
            Returns True when both are a Pandas DataFrame, Pandas Series or
            Numpy array; raises TypeError otherwise.
        '''
        if not isinstance(X, (pd.Series, pd.DataFrame, np.ndarray)):
            error = """ X required as a Pandas DataFrame, 
                        Pandas Series or a Numpy array. Found type: {datatype_x}
                    """.format(datatype_x=type(X))
            raise TypeError(error)
        if y is not None and not isinstance(y, (pd.Series, pd.DataFrame, np.ndarray)):
            error = """ y required as a Pandas DataFrame, 
                        Pandas Series or a Numpy array. Found type: {datatype_y}
                    """.format(datatype_y=type(y))
            raise TypeError(error)
        return True

    @staticmethod
    def _as_2d_array(data):
        ''' Strip the pandas wrapper and turn 1-D data into a single column. '''
        if isinstance(data, (pd.DataFrame, pd.Series)):
            data = data.values
        if data.ndim == 1:
            data = data[:, None]
        return data

    def transform_input(self, X, y=None):
        '''
            Convert X (and y, when given) into numpy arrays.
            1-D inputs - a Pandas Series or a 1-D Numpy array - become a
            single column, so rows can always be indexed as data[i][0].
            Returns X alone when y is None, otherwise the tuple (X, y).
            Raises TypeError for unsupported container types.
        '''
        self.check_input(X, y)
        X = self._as_2d_array(X)
        if y is None:
            return X
        return X, self._as_2d_array(y)

    def _get_weights(self, dist, ind):
        ''' dist is an array of distances.
            ind is an array of indexes.
            Returns a dict mapping each index to its weight,
            where weight = 1 / distance.
            In case distance = 0, a very large positive
            value is used instead (sys.maxsize).
        '''
        return {
            index: (1 / distance if distance != 0 else maxsize)
            for distance, index in zip(dist, ind)
        }

    def _predict_instance(self, instance):
        '''
            Predict the class of a single sample: the label with the largest
            (weighted) vote among its nearest neighbours. On a tie the label
            met first when walking the neighbours from nearest to farthest wins.
        '''
        k_distances, k_indices = self.kneighbors(instance)
        if self.weights == 'distance':
            weights = self._get_weights(k_distances, k_indices)
        else:
            weights = {index: 1 for index in k_indices}
        target_votes = {}
        for index in k_indices:
            response = self.y_train[index][0]
            target_votes[response] = target_votes.get(response, 0) + weights[index]
        # Votes are compared as-is: dividing every vote by the same total
        # weight could not change which class comes out on top.
        predicted_class = None
        best_vote = None
        for target, vote in target_votes.items():
            if best_vote is None or vote > best_vote:
                best_vote = vote
                predicted_class = target
        return predicted_class

    def kneighbors(self, instance, n_neighbors=None):
        '''
            returns (dist, ind)
            for a given unknown instance, where,
            dist: list with the distances to the k nearest points in training set
            ind: list with the indices of the k nearest points in training set
            Both lists are ordered from nearest to farthest; equal distances
            are ordered by ascending training index.
            n_neighbors defaults to the value given to the constructor.
        '''
        if n_neighbors is None:
            n_neighbors = self.n_neighbors
        X_train = np.asarray(self.X_train)
        if X_train.ndim == 1:
            X_train = X_train[:, None]
        # One vectorised pass over all training rows instead of a Python loop.
        deltas = np.abs(np.subtract(X_train, instance))
        feature_axes = tuple(range(1, deltas.ndim))
        totals = np.sum(np.power(deltas, self.p), axis=feature_axes)
        dist = np.power(totals, 1 / self.p).astype(float)
        # A stable sort keeps the lower training index first among equal distances.
        nearest = np.argsort(dist, kind='stable')[:n_neighbors]
        return (dist[nearest].tolist(), nearest.tolist())

    def fit(self, X, y):
        '''
            Store the training data; KNN is lazy, so no model is built here.
            Returns self so calls can be chained.
            Raises ValueError when y is missing, the training set is empty or
            X and y do not have the same number of rows, and TypeError for
            unsupported container types.
        '''
        if y is None:
            raise ValueError('y is required to fit the classifier')
        X, y = self.transform_input(X, y)
        if len(X) == 0:
            raise ValueError('Cannot fit on an empty training set')
        if len(X) != len(y):
            raise ValueError('X and y must contain the same number of samples')
        self.X_train, self.y_train = X, y
        return self

    def predict(self, X):
        ''' Return a numpy array with the predicted class of every row of X. '''
        X = self.transform_input(X)
        return np.array([self._predict_instance(instance) for instance in X])

In [364]:
# Read the raw data from the CSV file and preview its first five rows
dataset = pd.read_csv(filepath_or_buffer='dataset.csv')
dataset.head(n=5)

Unnamed: 0,X,Y,Result
0,2,4,Negative
1,4,2,Negative
2,4,4,Positive
3,4,6,Negative
4,6,2,Positive


In [365]:
# Inspect the column dtypes and null counts, then show up to 20 rows of the frame
dataset.info()
dataset.head(n=20)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 3 columns):
X         6 non-null int64
Y         6 non-null int64
Result    6 non-null object
dtypes: int64(2), object(1)
memory usage: 224.0+ bytes


Unnamed: 0,X,Y,Result
0,2,4,Negative
1,4,2,Negative
2,4,4,Positive
3,4,6,Negative
4,6,2,Positive
5,6,4,Negative


In [366]:
# Encode the Result column
RESULT_CODES = {'Negative': 0, 'Positive': 1}
# Labels without an entry in RESULT_CODES pass through unchanged, so re-running
# this cell is a no-op. A plain dict .map() would turn the already-encoded 0/1
# values into NaN on a second run.
dataset['Result'] = dataset['Result'].map(lambda label: RESULT_CODES.get(label, label))
dataset.head(20)

Unnamed: 0,X,Y,Result
0,2,4,0
1,4,2,0
2,4,4,1
3,4,6,0
4,6,2,1
5,6,4,0


In [367]:
# Separate the target from the feature columns, then hold out 30% of the rows for testing
y = dataset['Result']
X = dataset.drop(columns='Result')
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=100, test_size=0.3)

In [368]:
# Classification using Custom KNN

custom_classifier = CustomKNeighborsClassifier(n_neighbors=3)
y_pred = custom_classifier.fit(X_train, y_train).predict(X_test)
# accuracy_score expects (y_true, y_pred): the ground truth goes first
score = accuracy_score(y_test, y_pred)
print("Running custom KNeighborsClassifer class...")
print("Predicted values: {y_pred}".format(y_pred=y_pred))
print("Testing accuracy_score: {score}".format(score=score))

Running custom KNeighborsClassifer class...
Predicted values: [0 0]
Testing accuracy_score: 0.5


In [369]:
# Classification using Sklearn KNN

sklearn_classifier = KNeighborsClassifier(n_neighbors=3)
y_pred = sklearn_classifier.fit(X_train, y_train).predict(X_test)
# accuracy_score expects (y_true, y_pred): the ground truth goes first
score = accuracy_score(y_test, y_pred)
print("Running sklearn KNeighborsClassifer class...")
print("Predicted values: {y_pred}".format(y_pred=y_pred))
print("Testing accuracy_score: {score}".format(score=score))

Running sklearn KNeighborsClassifer class...
Predicted values: [0 0]
Testing accuracy_score: 0.5


In [370]:
# Classify the point (6, 6) for k = 3 using uniform weights
# (the default: every one of the k neighbours gets an equal vote)

# The query gets its own name so the X_test produced by train_test_split is
# not overwritten and the evaluation cells can still be re-run against y_test.
query_point = np.array([[6, 6]])

# Custom KNN class prediction
custom_classifier = CustomKNeighborsClassifier(n_neighbors=3)
y_pred = custom_classifier.fit(X_train, y_train).predict(query_point)
print("Custom KNN class predicted value: {y_pred}".format(y_pred=y_pred))

# Sklearn KNN class prediction
sklearn_classifier = KNeighborsClassifier(n_neighbors=3)
y_pred = sklearn_classifier.fit(X_train, y_train).predict(query_point)
print("Sklearn KNN class predicted value: {y_pred}".format(y_pred=y_pred))

Custom KNN class predicted value: [0]
Sklearn KNN class predicted value: [0]


In [371]:
# Classify the point (6, 6) for k = 3 using distance weighted method

# The query gets its own name so the X_test produced by train_test_split is
# not overwritten and the evaluation cells can still be re-run against y_test.
query_point = np.array([[6, 6]])

# Custom KNN class prediction
custom_classifier = CustomKNeighborsClassifier(n_neighbors=3, weights='distance')
y_pred = custom_classifier.fit(X_train, y_train).predict(query_point)
print("Custom KNN class predicted value: {y_pred}".format(y_pred=y_pred))

# Sklearn KNN class prediction
sklearn_classifier = KNeighborsClassifier(n_neighbors=3, weights='distance')
y_pred = sklearn_classifier.fit(X_train, y_train).predict(query_point)
print("Sklearn KNN class predicted value: {y_pred}".format(y_pred=y_pred))

Custom KNN class predicted value: [0]
Sklearn KNN class predicted value: [0]
