# K-Nearest Neighbor Lab





In [3]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
from scipy.io import arff
from scipy.spatial import distance
import math
from scipy.spatial import distance


## 1. (40%) Correctly implement the k-nearest neighbor (KNN) algorithm and the KNN regression algorithm

### Code requirements
- Use Euclidean distance to decide closest neighbors. 
- Include optional distance weighting for both algorithms


In [41]:
class KNNClassifier(BaseEstimator,ClassifierMixin):
    """k-nearest-neighbour classifier / regressor using Euclidean distance.

    When ``columntype == 'real'`` the model performs regression (the
    prediction is a mean of the neighbours' targets); for any other value it
    performs classification by (optionally distance-weighted) voting.
    """

    def __init__(self, columntype='categoritcal', weight_type='inverse_distance', k_val=3): ## add parameters here
        """
        Args:
            columntype: 'real' selects regression; any other value (including
                the default) selects classification.
            weight_type: 'inverse_distance' weights each neighbour by
                1/distance**2; any other value (e.g. 'no_weight') gives every
                neighbour the same weight.
            k_val: number of nearest neighbours to consult.
        """
        self.columntype = columntype
        self.weight_type = weight_type
        self.k_val = k_val

    def fit(self, data, labels):
        """ Store the training set (KNN is lazy, so nothing else happens here).

        Args:
            data (array-like): training inputs, one row per instance
            labels (array-like): training targets, one entry per row of data
        Returns:
            self: this allows this to be chained, e.g. model.fit(X,y).predict(X_test)
        Raises:
            ValueError: if data and labels have different lengths.
        """
        if len(data) != len(labels):
            raise ValueError("data and labels must have the same length")
        self.train_data = data
        self.train_labels = labels
        return self

    def predict(self, data):
        """ Predict a class (or a real value when regressing) for every row.

        Args:
            data (array-like): inputs to predict, one row per instance
        Returns:
            list with one prediction per row of data.
        Raises:
            ValueError: if k_val is smaller than 1, or no usable training
                neighbour exists for a row.
        """
        if self.k_val < 1:
            raise ValueError("k_val must be at least 1")

        preds = []
        for query in data:
            candidates = []
            for vals, label in zip(self.train_data, self.train_labels):
                dist = distance.euclidean(vals, query)
                # Skip infinite / NaN distances; they can never be "nearest".
                if dist < math.inf:
                    candidates.append((dist, label))
            # list.sort is stable, so equally distant points keep their
            # training-set order, i.e. the earlier training row wins.
            candidates.sort(key=lambda pair: pair[0])
            preds.append(self.predict_output(candidates[:self.k_val]))
        return preds

    def predict_output(self, k_near):
        """ Combine the nearest neighbours into a single prediction.

        Args:
            k_near: sequence of (distance, label) pairs. Entries whose
                distance is not finite are treated as empty slots and ignored.
        Returns:
            The winning label (classification) or the (weighted) mean of the
            neighbours' targets (regression).
        Raises:
            ValueError: if no entry with a finite distance is supplied.
        """
        neighbours = [(dist, label) for dist, label in k_near if dist < math.inf]
        if not neighbours:
            raise ValueError("no neighbours available to predict from")

        weighted = self.weight_type == 'inverse_distance'
        if self.columntype == 'real':
            return self._regress(neighbours, weighted)
        return self._vote(neighbours, weighted)

    @staticmethod
    def _regress(neighbours, weighted):
        """Return the plain or 1/d**2-weighted mean of the neighbours' targets."""
        if not weighted:
            return sum(label for _, label in neighbours) / len(neighbours)

        squared = [dist ** 2 for dist, _ in neighbours]
        # An exact match would get infinite weight; answer with the mean of
        # the exact matches instead of dividing by zero.
        exact = [label for sq, (_, label) in zip(squared, neighbours) if sq == 0]
        if exact:
            return sum(exact) / len(exact)

        weights = [1.0 / sq for sq in squared]
        total = sum(w * label for w, (_, label) in zip(weights, neighbours))
        return total / sum(weights)

    @staticmethod
    def _vote(neighbours, weighted):
        """Return the label with the largest vote; ties go to the smallest label."""
        votes = {}
        for dist, label in neighbours:
            if weighted:
                sq = dist ** 2
                # An exact match dominates every other neighbour.
                weight = math.inf if sq == 0 else 1.0 / sq
            else:
                weight = 1
            votes[label] = votes.get(label, 0) + weight

        best = max(votes.values())
        return min(label for label, total in votes.items() if total == best)

    #Returns the Mean score given input data and labels
    def score(self, X, y):
        """ Return the fraction of predictions on X that equal the targets y.

        Args:
            X (array-like): inputs, one row per instance
            y (array-like): targets, one entry per row of X
        Returns:
            score : float
                Number of exact matches between self.predict(X) and y,
                divided by len(X). For regression this is still an
                exact-match fraction, not an error measure.
        """
        predictions = self.predict(X)
        correct = sum(1 for pred, actual in zip(predictions, y) if pred == actual)
        return correct/len(X)
    

## 1.1 Debug and Evaluation

Debug and Evaluate your model using the parameters below:

- Use distance weighting
- KNN = 3 (three nearest neighbors)
- Don’t normalize the data
- Use Euclidean Distance

---

### 1.1.1 Debug

- Use this [training set](https://raw.githubusercontent.com/cs472ta/CS472/master/datasets/glass_train.arff) and this [test set](https://raw.githubusercontent.com/cs472ta/CS472/master/datasets/glass_test.arff)
- Use distance weighting
- KNN = 3 (three nearest neighbors)
- Don’t normalize the data
- Use Euclidean Distance

Expected Results:
- Not using inverse weighted distancing = roughly [68.29%]
- Link to [debug solution](https://github.com/cs472ta/CS472/blob/master/debug_solutions/glass_no_inv_predictions.txt)

- Using inverse weighted distancing = roughly [74.39%]
- Link to [debug solution](https://github.com/cs472ta/CS472/blob/master/debug_solutions/glass_inv_predictions.txt)


In [13]:
def convertBytestoString(df):
    """Decode every bytes-valued column of df to str (UTF-8), in place.

    A column is treated as bytes-valued when its first element is a
    ``bytes`` object. The first element is read by position, so the frame's
    index labels do not matter, and an empty frame is returned untouched.

    Args:
        df: pandas DataFrame; modified in place.
    Returns:
        The same DataFrame object, for convenient chaining.
    """
    if len(df) == 0:
        return df
    for col in df:
        # .iloc[0] is positional; df[col][0] would be a label lookup that
        # fails whenever the index has no label 0.
        if isinstance(df[col].iloc[0], bytes):
            df[col] = df[col].str.decode("utf8")
    return df

In [34]:
# Load glass data
# Downloads the glass training set to debug.arff (IPython shell escape).
!curl -s https://raw.githubusercontent.com/cs472ta/CS472/master/datasets/glass_train.arff --output debug.arff
# Train on training set
data = arff.loadarff('debug.arff')
debug_df = convertBytestoString(pd.DataFrame(data[0]))
debug_np = np.array(debug_df)
# First run: every neighbour gets an equal vote (k_val keeps its default of 3).
clf = KNNClassifier(weight_type='no_weight')
# All columns except the last are inputs; the last column is the target.
train = np.array(debug_np[:,0:-1])
targets = np.array(debug_np[:,-1])
res = clf.fit(train, targets)


# Predict on test set
!curl -s https://raw.githubusercontent.com/cs472ta/CS472/master/datasets/glass_test.arff --output debug_test.arff
debug_test = arff.loadarff('debug_test.arff')
debug_test_df = convertBytestoString(pd.DataFrame(debug_test[0]))
debug_test_np = np.array(debug_test_df)
# NOTE: despite its name, test_labels holds the test *inputs* (every column
# but the last); test_targets holds the last column.
test_labels = debug_test_np[:,0:-1]
test_targets = debug_test_np[:,-1]
acc = res.score(test_labels, test_targets)
print('acc not using weighted distancing: ', acc)

# using inverse weighted distancing
# Second run: same train/test split, neighbours weighted by inverse distance.
clf = KNNClassifier(weight_type='inverse_distance')
iw_res = clf.fit(train, targets)
iw_acc = iw_res.score(test_labels, test_targets)
print('acc using weighted distancing: ', iw_acc)



acc not using weighted distancing:  0.6829268292682927
acc using weighted distancing:  0.7439024390243902


### 1.1.2 Evaluate

We will evaluate your model based on its performance on the [diabetes](https://archive.ics.uci.edu/ml/datasets/Diabetes) problem.
- Use this [training set](https://raw.githubusercontent.com/cs472ta/CS472/master/datasets/diabetes_train.arff) and this [test set](https://raw.githubusercontent.com/cs472ta/CS472/master/datasets/diabetes_test.arff)

In [None]:
# Load diabetes data

# Train on training set

# Predict on test set


In [37]:
def normalize_vals(inputs, xmin=None, xmax=None):
    """Min-max scale each column: (x - xmin) / (xmax - xmin).

    Args:
        inputs: array of values to scale, one column per feature.
        xmin: per-column minimum to scale with. Defaults to the column
            minimum of inputs.
        xmax: per-column maximum to scale with. Defaults to the column
            maximum of inputs.
    Returns:
        The scaled values. A column whose range is zero is mapped to 0
        instead of NaN.
    """
    if xmin is None:
        xmin = inputs.min(axis=0)
    if xmax is None:
        xmax = inputs.max(axis=0)
    span = np.asarray(xmax) - np.asarray(xmin)
    # A constant column has zero range; divide by 1 so it becomes 0, not 0/0.
    span = np.where(span == 0, 1, span)
    return (inputs - xmin) / span

## 2. (10%) Use the k-nearest neighbor algorithm (without distance weighting) for the [magic telescope](http://archive.ics.uci.edu/ml/datasets/MAGIC+Gamma+Telescope) problem

- Use this [training set](https://raw.githubusercontent.com/cs472ta/CS472/master/datasets/magic_telescope_train.arff) and this [test set](https://raw.githubusercontent.com/cs472ta/CS472/master/datasets/magic_telescope_test.arff) 

### 2.1
- Try it with k=3 and without normalization


In [None]:
# Load magic telescope data

# Train/Predict without normalization


### 2.2
- Try it with k=3 and with normalization (input features normalized between 0 and 1). Use the normalization formula (x-xmin)/(xmax-xmin)

In [None]:
# Train/Predict with normalization


*Discuss the accuracy results of using normalized data vs. unnormalized data*

### 2.3

- Using your normalized data, create one graph with classification accuracy on the test set over k values. 
    - Use odd values of k from 1 to 15.
- As a rough sanity check, typical knn accuracies for the magic telescope data set are 75-85%

In [None]:
# Train/Predict with normalization using k=1,3,...,15

# Graph classification accuracy over k


# For the rest of the experiments use only normalized data

## 3. (10%) Use the regression variation of your algorithm (without distance weighting) for the [housing price prediction](https://www.cs.toronto.edu/~delve/data/boston/bostonDetail.html) problem.

- Use this [training set](https://raw.githubusercontent.com/cs472ta/CS472/master/datasets/housing_train.arff) and this [test set](https://raw.githubusercontent.com/cs472ta/CS472/master/datasets/housing_test.arff).
- Use Mean Square Error (MSE) on the test set as your accuracy metric for this case.
    - Do not normalize regression output values
- Graph MSE on the test set with odd values of k from 1 to 15


In [47]:
# Load housing price prediction data

# Train/Predict using k=1,3,...,15

# Graph MSE over k

!curl -s https://raw.githubusercontent.com/cs472ta/CS472/master/datasets/housing_train.arff --output housing.arff
housing = arff.loadarff('housing.arff')
housing_df = convertBytestoString(pd.DataFrame(housing[0]))
housing_np = np.array(housing_df).astype(float)
clf = KNNClassifier(columntype='real', weight_type='no_weight')
norm_inputs = normalize_vals(housing_np[:,0:-1])
train = np.array(norm_inputs)
targets = np.array(housing_np[:,-1])
res = clf.fit(train, targets)


!curl -s https://raw.githubusercontent.com/cs472ta/CS472/master/datasets/housing_test.arff --output housing_test.arff
housing_test = arff.loadarff('housing_test.arff')
housing_test_df = convertBytestoString(pd.DataFrame(housing_test[0]))
housing_test_np = np.array(housing_test_df).astype(float)
norm_test_inputs = normalize_vals(housing_test_np[:,0:-1])
test_train = np.array(norm_test_inputs)
test_targets = np.array(housing_test_np[:,-1])
acc = res.score(test_labels, test_targets)
print('acc not using weighted distancing: ', acc)


ValueError: operands could not be broadcast together with shapes (13,) (9,) 

## 4. (15%) Repeat your experiments for magic telescope and housing using distance-weighted (inverse of distance squared) voting and discuss your results.


## 4.1 Magic Telescope Dataset

In [None]:
# Train/Predict magic telescope using distance-weighted voting

## 4.2 Housing Dataset

In [None]:
# Train/Predict housing using distance-weighted voting

*Discuss your results*

## 5. (10%) Use the k-nearest neighbor algorithm to solve the [credit-approval](https://archive.ics.uci.edu/ml/datasets/Credit+Approval) (credit-a) problem.

- Use this [dataset](https://raw.githubusercontent.com/cs472ta/CS472/master/datasets/credit_approval.arff)
    - Use a 70/30 split of the data for the training/test set
- Note that this set has both continuous and nominal attributes, together with don’t know values. 
- Implement and justify a distance metric which supports continuous, nominal, and don’t know attribute values
    - You need to handle don't knows with the distance metric, not by imputing a value.
    - More information on distance metrics can be found [here](https://www.jair.org/index.php/jair/article/view/10182/24168).
- Use your own choice for k.
- As a rough sanity check, typical knn accuracies for the credit data set are 70-80%.


In [None]:
# Load dataset and split into train/test sets

# Train/Predict credit-approval


*Explain and justify your distance metric*

## 6. (15%) Use scikit-learn's KNN Classifier on magic telescope and KNN Regressor on housing and compare your results.

- Try out different hyperparameters to see how well you can do. 


In [None]:
# Train/Predict magic telescope using scikit's KNN

# Train/Predict housing using scikit's KNN


*Report your comparison*

## 7. (optional 5% extra credit): For the best value of k for any one of the datasets, implement a reduction algorithm that removes data points in some rational way such that performance does not drop too drastically on the test set given the reduced training set.

- Compare your performance on the test set for the reduced and non-reduced versions and give the number (and percentage) of training examples removed from the original training set. How well does your reduction algorithm work?
    - Note that performance for magic telescope is classification accuracy and for housing it is mean squared error.
    - Magic Telescope has about 12,000 instances, and if you use a leave-one-out style of testing for your data set reduction, your algorithm will run slowly, since that is O(n^2) at each step.
    - If you wish, you may use a random subset of 2,000 of the magic telescope instances.
    - More information on reduction techniques can be found [here](http://axon.cs.byu.edu/~martinez/classes/478/slides/IBL.pdf).
