In [91]:
from sklearn import datasets
import numpy as np
import matplotlib.pyplot as plt

data_set = datasets.load_wine()
data = np.array(data_set["data"])

# Reshaping array so we can add targets as a column to the data-array
# https://stackoverflow.com/questions/21640610/adding-a-new-column-array-to-a-numpy-array
targets = np.array(data_set["target"]).reshape(-1, 1)

final_dataset = np.append(data, targets, axis=1)
# np.random.shuffle(final_dataset)

"""
points is an array containing tuples of type (distance, class) which correspond to the nearest neighbours of some test
datapoint. This function will find the dominant class in the array and return it. In case there are two dominant classes, 
the function will return the first one that occurs within the array.

"""
def find_class(points, classes):
  counts = [0 for x in range(classes)]
  
# point[1] corresponds to the class in the tuple
  for point in points:
    counts[int(point[1])] += 1

  return counts.index(max(counts))


"""
Data is an array containing atleast 2 columns: actual datapoint class and predicted datapoint class. The function
computes the accuracy of the data using the values of these two columns and returns it as a percentage.

"""
def measure_accuracy(data):
  total_correct = 0

  for point in data:
    if (point[-1] == point[-2]):
      total_correct += 1
    
  return format(total_correct / float(len(data)), '.4f')


def normalize_dataset(dataset):
  normalized_dataset = np.copy(dataset)
  
# Calculate min and max of each column
  col_min = normalized_dataset.min(axis = 0)
  col_max = normalized_dataset.max(axis = 0)

  for point in normalized_dataset:
#   Exclude last column as it contains the class of the datapoint
    feature_values = point[:len(point) - 1]

    for index, feature in enumerate(feature_values):
      
      min_val = col_min[index]
      max_val = col_max[index]
      
      new_val = (feature - min_val) / float(max_val - min_val)
      point[index] = new_val

  return normalized_dataset
  
  
def stdev(arr):
  col_means = np.mean(arr, axis=0)
  std_devs = []
  
  for col_idx in range(0, arr.shape[1]):
    col = arr[:, col_idx]
    col_new = [(x - col_means[col_idx])**2 for x in col]
  
    std_devs.append(np.sqrt(np.sum(col_new) / len(col)))

  return std_devs
  

def standardize_dataset(dataset):
  final_dataset = np.copy(dataset)
  
  col_means = np.mean(final_dataset, axis=0)
  col_stds = stdev(final_dataset)
  
  for point in final_dataset:
    feature_values = point[:len(point) - 1]
    
    for index, feature in enumerate(feature_values):
      
      mean = col_means[index]
      std = col_stds[index]
      
      new_val = (feature - mean) / std
      point[index] = new_val
      
  return final_dataset
  
  
"""
data_set: numpy array where columns correspond to feature values, with the last column containing
          the class of the datapoint. Rows correspond to the datapoints.
          
k: How many neighbours to set

classes: Number of classes within the given data_set

training_set_size: if -1, 80% of data_set used for training. Otherwise, the given value is used as the
                   number of traininig datapoints.
                   
"""
  
def knn(data_set, k, training_set_size = -1):
    
  if (training_set_size == -1):
    training_set_size = round(0.8 * len(data_set))
    
  training_data = data_set[:training_set_size]
  test_data =  data_set[training_set_size:]
  
  # Find the total number of classes of the given data_set
  classes =  int(np.max(data_set[:, -1])) + 1

# Add extra column to test data to contain predicted class of that point
  test_data = np.c_[test_data, np.zeros(test_data.shape[0])]  

  for test_point in test_data:
    # Create a list of size equal to k
    neighbours = [-1 for x in range(k)]

    for i, train_point in enumerate(training_data):
      
      dist = np.linalg.norm(test_point[:len(test_point) - 2] - train_point[:len(train_point) - 1])
      point = (dist, train_point[-1])
    
      # If neighbours list is not fully filled, add the computed tuple 'point' to the list
      if (-1 in neighbours):
        for idx, p in enumerate(neighbours): 
          if (p == -1): 
            neighbours[idx] = point
            break
      
      # If list is full, replace any tuple with distance larger than the computed tuple, 'point'
      else:
        for idx, p in enumerate(neighbours):
          if (dist < p[0]):
            neighbours[idx] = point
            break
            
    # Find dominant class of the test point and record it
    test_point[-1] = find_class(neighbours, classes)
    
  return test_data



acc_1 = []
for i in range(1, 11):
      acc_1.append(measure_accuracy(knn(final_dataset, k=i)))
  
print(acc_1)
print()

acc_2 = []
for i in range(1, 11):
      acc_2.append(measure_accuracy(knn(standardize_dataset(final_dataset), k=i)))
  
print (acc_2)
print()

acc_3 = []
for i in range(1, 11):
    acc_3.append(measure_accuracy(knn(normalize_dataset(final_dataset), k=i)))

print (acc_3)

['0.2222', '0.0278', '0.1944', '0.1111', '0.2500', '0.1944', '0.3056', '0.3056', '0.4722', '0.5000']

['0.9722', '0.9444', '0.9722', '0.9722', '0.9722', '0.9722', '0.9722', '0.9167', '0.8889', '0.6111']

['0.9722', '0.9444', '1.0000', '1.0000', '1.0000', '1.0000', '1.0000', '0.9444', '0.9167', '0.6389']
