In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
%matplotlib inline

import numpy as np
import scipy.stats as sp
import sklearn.preprocessing as skp

import math
from collections import namedtuple
from functools import partial

In [None]:
# Read the header-less CSV; the first two columns become the feature matrix X
# and the third column becomes the label vector Y.
data = pd.read_csv("chips.txt", header=None)

X, Y = data.values[:, :2], data.values[:, 2]

# Seed the global NumPy RNG so the three shuffles used for cross-validation
# are the same on every run.
np.random.seed(0)
random_permutations = [np.random.permutation(len(X)) for _ in range(3)]

In [None]:
# Two pale colours used as the background of the decision-region plot.
cmap_light = ListedColormap(['#FFAAAA', '#AAAAFF'])

# One hyper-parameter combination:
#   k      - neighbour count used to pick the kernel bandwidth
#   norm   - callable computing distances, called as norm(diff, axis=-1)
#   trans  - feature transform applied to the raw samples
#   kernel - scalar kernel function applied to the scaled distances
Params = namedtuple("Params", ["k", "norm", "trans", "kernel"])

In [None]:
# Grid spacing for the decision-region mesh; the same value pads the
# per-column minimum and maximum of X to give the plot bounds.
step = 0.01
x_min, y_min = X.min(axis=0) - step
x_max, y_max = X.max(axis=0) + step

In [None]:
def estimate(X, Y, point, params):
    """Classify ``point`` by a kernel-weighted vote of its nearest neighbours.

    The kernel bandwidth is adaptive: it is the distance from ``point`` to its
    (k+1)-th nearest training sample. For kernels that vanish at |u| >= 1
    (as the kernels defined in this notebook do) this means that, barring
    distance ties, exactly the ``k`` nearest samples take part in the vote.

    Parameters
    ----------
    X : ndarray
        Training samples, one row per sample.
    Y : ndarray
        Class labels aligned with the rows of ``X``.
    point : ndarray
        The sample to classify, in the same feature space as ``X``.
    params : object
        Must expose ``k`` (int), ``norm`` (called as ``norm(diff, axis=-1)``)
        and ``kernel`` (scalar function of the scaled distance).

    Returns
    -------
    The label from ``np.unique(Y)`` with the largest summed kernel weight.
    On equal scores the smallest label wins (first maximum of ``np.argmax``).
    """
    # otypes pins the output dtype instead of inferring it from the first call.
    vkernel = np.vectorize(params.kernel, otypes=[float])

    distances = params.norm(X - point, axis=-1)
    order = np.argsort(distances)

    # Sorted positions are zero-based, so the (k+1)-th nearest neighbour lives
    # at index k (index k + 1 would let k + 1 samples vote). Clamp to the last
    # sample so that k >= len(X) does not raise IndexError.
    bandwidth_index = min(params.k, len(order) - 1)
    bandwidth = distances[order[bandwidth_index]]

    if bandwidth > 0:
        scaled = distances / bandwidth
    else:
        # Zero bandwidth: at least k + 1 samples coincide with the point.
        # Avoid 0/0 and let only those coincident samples vote.
        scaled = np.where(distances == 0, 0.0, np.inf)

    # Kernel weights do not depend on the class, so compute them once.
    weights = vkernel(scaled)

    ys = np.unique(Y)
    scores = [np.sum(weights * (Y == y)) for y in ys]

    return ys[np.argmax(scores)]

In [None]:
def cross_validate(X, Y, estimate, block_n, permutations):
    """Mean accuracy of ``estimate`` under repeated k-fold cross-validation.

    For every permutation the data is shuffled, split into ``block_n`` folds
    with ``np.array_split`` (folds may differ in size by one), and each fold
    is used once as the test set while the remaining folds form the training
    set.

    Parameters
    ----------
    X : ndarray
        Samples, one row per sample.
    Y : ndarray
        Labels aligned with the rows of ``X``.
    estimate : callable
        Called as ``estimate(x_train, y_train, x)``; returns a predicted label.
    block_n : int
        Number of folds.
    permutations : sequence of index arrays
        Each one defines a shuffle of the data; one full k-fold run is made
        per permutation.

    Returns
    -------
    float
        Per-fold accuracy averaged over all folds and all permutations.
    """
    accuracy = 0
    for permutation in permutations:
        x_parts = np.array_split(X[permutation], block_n)
        y_parts = np.array_split(Y[permutation], block_n)
        for i in range(block_n):
            # Drop fold i with plain list slicing. np.delete would first turn
            # the list of folds into an ndarray, which fails (ragged array)
            # whenever the folds do not all have the same length.
            x_train = np.concatenate(x_parts[:i] + x_parts[i + 1:])
            y_train = np.concatenate(y_parts[:i] + y_parts[i + 1:])
            x_test, y_test = x_parts[i], y_parts[i]
            hits = sum(
                int(estimate(x_train, y_train, x) == y)
                for x, y in zip(x_test, y_test)
            )
            accuracy += hits / len(x_test)
    return accuracy / block_n / len(permutations)

In [None]:
def identity(x):
    """No-op feature transform: hand the argument back unchanged."""
    return x

def polar(x):
    """Map rows of a 2-D array to (radius, angle) pairs.

    Column 0 and column 1 of ``x`` are read as the two Cartesian coordinates.
    The result has one row per input row: the Euclidean radius followed by
    ``np.arctan2(column 1, column 0)``.
    """
    first, second = x[:, 0], x[:, 1]
    radius = np.sqrt(first ** 2 + second ** 2)
    angle = np.arctan2(second, first)
    return np.column_stack((radius, angle))

def withPolar(x):
    """Return ``x`` with the columns produced by ``polar(x)`` appended on the right."""
    polar_columns = polar(x)
    return np.concatenate((x, polar_columns), axis=1)

def kernelConst(x):
    """Rectangular kernel: 0.5 when |x| < 1, otherwise 0.0."""
    if abs(x) < 1.0:
        return 0.5
    return 0.0

def kernelTriangle(x):
    """Triangular kernel: 1 - |x| when |x| < 1, otherwise 0.0."""
    magnitude = abs(x)
    if magnitude < 1.0:
        return 1.0 - magnitude
    return 0.0

def kernelEpanechnikov(x):
    """Epanechnikov kernel: 0.75 * (1 - x^2) when |x| < 1, otherwise 0.0."""
    if abs(x) < 1.0:
        return 0.75 * (1.0 - x ** 2)
    return 0.0

# Full grid of hyper-parameter combinations to evaluate. The nesting order
# (k outermost, kernel innermost) fixes the order of the resulting list.
# Disabled alternatives: norm orders -np.inf, 1 and 1.5; transforms polar and
# withPolar; kernel kernelTriangle.
params_set = [
    Params(
        k=neighbours,
        norm=partial(np.linalg.norm, ord=order),
        trans=transform,
        kernel=kernel,
    )
    for neighbours in (1, 2, 3, 4, 5, 6, 8, 10, 12, 15, 18, 21)
    for order in (2, np.inf)
    for transform in (
        identity,
        skp.PolynomialFeatures(2, include_bias=False).fit_transform,
    )
    for kernel in (kernelConst, kernelEpanechnikov)
]

In [None]:
# Cross-validated accuracy (5 folds, averaged over random_permutations) for
# every candidate, in the same order as params_set.
params_accuracies = [
    cross_validate(
        candidate.trans(X),
        Y,
        partial(estimate, params=candidate),
        5,
        random_permutations,
    )
    for candidate in params_set
]

In [None]:
# Select the candidate with the highest cross-validated accuracy
# (np.argmax keeps the first one on ties).
best_index = int(np.argmax(params_accuracies))
best_params = params_set[best_index]

# The score for this candidate is already in params_accuracies; reuse it
# instead of running the whole cross-validation a second time.
print(params_accuracies[best_index])
print(best_params)

In [None]:
# Regular grid over the padded bounding box of the data.
xx, yy = np.meshgrid(
    np.arange(x_min, x_max, step),
    np.arange(y_min, y_max, step),
)

# Apply the selected feature transform to the training samples and to every
# grid node, then classify each node to obtain the decision regions.
transX = best_params.trans(X)
transPoints = best_params.trans(np.column_stack((xx.ravel(), yy.ravel())))
zz = np.array(
    [estimate(transX, Y, grid_point, best_params) for grid_point in transPoints]
).reshape(xx.shape)

In [None]:
# Decision regions as a coloured background, training samples on top.
fig, ax = plt.subplots(figsize=(15, 15))
ax.set_xlim(x_min, x_max)
ax.set_ylim(y_min, y_max)

# Split the raw samples by label so each class gets its own marker.
x0, y0 = X[Y == 0].T
x1, y1 = X[Y == 1].T

ax.pcolormesh(xx, yy, zz, cmap=cmap_light)
ax.scatter(x0, y0, marker='x', color='red', s=70)
ax.scatter(x1, y1, marker='o', color='blue', s=50)