# **IDS575: Machine Learning and Statistical Methods**
## [Quiz #01 - k-Nearest Neighborhood (PA)]



## Import Libraries
* See various conventions and acronyms.

In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer

# Load the data
*   Verify the Python type for the dataset.

In [2]:
CancerDataset = load_breast_cancer()
print(type(CancerDataset))
print(CancerDataset.keys())

<class 'sklearn.utils.Bunch'>
dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])


## Verify basic data statistics
* Count the number of features. (i.e., attributes)
* Count the number of examples. (i.e., instances and labels)
* Print out the description of each feature.

In [3]:
def printBasicStats(dataset):
  print(dataset['feature_names'], dataset['target_names'])
  print(len(dataset['feature_names']), type(dataset['feature_names']))  
  print(dataset['data'].shape, dataset['target'].shape)
  print(dataset['DESCR'])

printBasicStats(CancerDataset)

['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension'] ['malignant' 'benign']
30 <class 'numpy.ndarray'>
(569, 30) (569,)
.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture

## Convert the dataset to a DataFrame
*   Not necessarily useful. (scikit-learn works well with default libraries such as list, numpy array, and scipy's sparse matrix)
*   But using pandas provides more intuitive excel or R-like views.

In [4]:
def getDataFrame(dataset):
  numData = dataset['target'].shape[0]
  newDataset = np.concatenate((dataset['data'], dataset['target'].reshape(numData, -1)), axis=1)
  newNames = np.append(dataset['feature_names'], ['target'])
  return pd.DataFrame(newDataset, columns=newNames)

DataFrame = getDataFrame(CancerDataset)
print(DataFrame)
  

     mean radius  mean texture  ...  worst fractal dimension  target
0          17.99         10.38  ...                  0.11890     0.0
1          20.57         17.77  ...                  0.08902     0.0
2          19.69         21.25  ...                  0.08758     0.0
3          11.42         20.38  ...                  0.17300     0.0
4          20.29         14.34  ...                  0.07678     0.0
..           ...           ...  ...                      ...     ...
564        21.56         22.39  ...                  0.07115     0.0
565        20.13         28.25  ...                  0.06637     0.0
566        16.60         28.08  ...                  0.07820     0.0
567        20.60         29.33  ...                  0.12400     0.0
568         7.76         24.54  ...                  0.07039     1.0

[569 rows x 31 columns]


## Inspect label distribution
*   Check the target label distribution/imbalance.


In [5]:
def printLabelDist(df, dataset):
  counts = df.target.value_counts(ascending=True)
  print(counts)
  counts.index = dataset['target_names']
  print(counts)  

printLabelDist(DataFrame, CancerDataset)

0.0    212
1.0    357
Name: target, dtype: int64
malignant    212
benign       357
Name: target, dtype: int64


## Data split
* Split the data into training and test sets.
* No validation for now.



In [6]:
from sklearn.model_selection import train_test_split
def splitData(df, size):
  X, y = df[df.columns[:-1]], df.target
  X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=size, test_size=X.shape[0] - size, random_state=0)
  return (X_train, y_train), (X_test, y_test)

(X_train, y_train), (X_test, y_test) = splitData(DataFrame, 400)
assert X_train.shape == (400, 30)
assert y_train.shape == (400, )

# Training
*   Train a k-NN model on the training data.
*   Get the training accuracy.



In [7]:
from sklearn.neighbors import KNeighborsClassifier
def trainKnn(X, y, k=1):
  model = KNeighborsClassifier(n_neighbors=k)
  model.fit(X, y)
  pred = model.predict(X)
  accuracy = sum(pred == y) / len(X)    
  return model, accuracy

Model, Acc_train = trainKnn(X_train, y_train, 1)
print(Acc_train)
Model3, Acc_train3 = trainKnn(X_train, y_train, 3)
print(Acc_train3)

1.0
0.9525


# Test
*   Test the model on the test data.
*   Print out the accuracy for different k's.



In [8]:
def testKnn(model, X, y):
  pred = model.predict(X)
  accuracy = sum(pred == y) / len(X)
  return accuracy 
  
testKnn(Model, X_test, y_test)
for k in range(1, 20):
  Model_k, Acc_train = trainKnn(X_train, y_train, k)
  Acc_test = testKnn(Model_k, X_test, y_test)
  print('%d-NN --> training accuracy = %.4f  /  test accuracy = %.4f' % (k, Acc_train, Acc_test))

1-NN --> training accuracy = 1.0000  /  test accuracy = 0.9172
2-NN --> training accuracy = 0.9625  /  test accuracy = 0.8994
3-NN --> training accuracy = 0.9525  /  test accuracy = 0.9172
4-NN --> training accuracy = 0.9475  /  test accuracy = 0.9290
5-NN --> training accuracy = 0.9400  /  test accuracy = 0.9467
6-NN --> training accuracy = 0.9325  /  test accuracy = 0.9349
7-NN --> training accuracy = 0.9375  /  test accuracy = 0.9527
8-NN --> training accuracy = 0.9375  /  test accuracy = 0.9527
9-NN --> training accuracy = 0.9325  /  test accuracy = 0.9586
10-NN --> training accuracy = 0.9325  /  test accuracy = 0.9527
11-NN --> training accuracy = 0.9350  /  test accuracy = 0.9645
12-NN --> training accuracy = 0.9350  /  test accuracy = 0.9645
13-NN --> training accuracy = 0.9300  /  test accuracy = 0.9645
14-NN --> training accuracy = 0.9300  /  test accuracy = 0.9645
15-NN --> training accuracy = 0.9325  /  test accuracy = 0.9645
16-NN --> training accuracy = 0.9300  /  test acc

# Programming Assignment (PA)
*   Implement distance().
*   Implement predict_one().
*   Verify whether myTrainKnn gives the same training accuracies with before.
*   Verify whether myTestKnn gives the same test accuracies with before over increasing k. **(Note that when k is a even number, the resulting accuracy could be different due to different tie-braeaking. It is enough to see matching results for odd k's)**



In [9]:
from collections import Counter
import math
class MyKNeighborsClassifier:
  X_train = None
  y_train = None

  def __init__(self, n_neighbors):
    self.k = n_neighbors

  @staticmethod
  def distance(src, dst):
    ######################################################
    # TO-DO: Return the Euclidean distance.
    distt=0
    for ii in range(len(dst)):
      distt=distt+(src[ii] -dst[ii])**2
    return math.sqrt(distt)
    ######################################################

  def fit(self, X, y):
    # Convert training data to numpy array.
    # There is nothing to do more for kNN as it avoids explicit generalization.
    self.X_train = np.array(X)
    self.y_train = np.array(y)    
    
  ## Predict the label for just one example.
  def predict_one(self, x):
    # Measure the distance to each of training data.
    # Then sort by increasing order of distances.
    distances = []
    for (i, x_train) in enumerate(self.X_train):      
      distances.append([i, self.distance(x, x_train)])      
    distances.sort(key=lambda element: element[1])
    #print(distances[:4])
    ########################################################################
    # TO-DO: Extract the indexes of the examples in the k-Nearest Neighbors.    
    kNN=[]
    if self.k ==1:
      kNN.append(distances[1][0])
    elif self.k>=2:
      for iii in range(self.k):
        kNN.append(distances[iii][0])
    ########################################################################
    # Extract k target values corresponding to the example indexes in kNN.    
    targets = [self.y_train[i] for i in kNN]
    # Return the majority-voted target value.
    return Counter(targets).most_common(1)[0][0]
  
  ## Predict the labels for every example.
  def predict(self, X):    
    predictions = []
    for (i, x) in enumerate(np.array(X)):
      predictions.append(self.predict_one(x))
    return np.asarray(predictions)



In [10]:
def myTrainKnn(X, y, k=1):
  model = MyKNeighborsClassifier(n_neighbors=k)
  model.fit(X, y)
  pred = model.predict(X)
  accuracy = sum(pred == y) / len(X)    
  return model, accuracy

Model, Acc_train = myTrainKnn(X_train, y_train, 1)
print(Acc_train)
Model3, Acc_train3 = myTrainKnn(X_train, y_train, 3)
print(Acc_train3)

0.9075
0.9525


In [11]:
def myTestKnn(model, X, y):
  pred = model.predict(X)
  accuracy = sum(pred == y) / len(X)
  return accuracy 
  
myTestKnn(Model, X_test, y_test)
for k in range(1, 20):
  Model_k, Acc_train = myTrainKnn(X_train, y_train, k)
  Acc_test = myTestKnn(Model_k, X_test, y_test)
  print('%d-NN --> training accuracy = %.4f  /  test accuracy = %.4f' % (k, Acc_train, Acc_test))

1-NN --> training accuracy = 0.9075  /  test accuracy = 0.9112
2-NN --> training accuracy = 1.0000  /  test accuracy = 0.9172
3-NN --> training accuracy = 0.9525  /  test accuracy = 0.9172
4-NN --> training accuracy = 0.9675  /  test accuracy = 0.9290
5-NN --> training accuracy = 0.9400  /  test accuracy = 0.9467
6-NN --> training accuracy = 0.9500  /  test accuracy = 0.9408
7-NN --> training accuracy = 0.9375  /  test accuracy = 0.9527
8-NN --> training accuracy = 0.9425  /  test accuracy = 0.9527
9-NN --> training accuracy = 0.9325  /  test accuracy = 0.9586
10-NN --> training accuracy = 0.9450  /  test accuracy = 0.9527
11-NN --> training accuracy = 0.9350  /  test accuracy = 0.9645
12-NN --> training accuracy = 0.9475  /  test accuracy = 0.9645
13-NN --> training accuracy = 0.9300  /  test accuracy = 0.9645
14-NN --> training accuracy = 0.9375  /  test accuracy = 0.9645
15-NN --> training accuracy = 0.9325  /  test accuracy = 0.9645
16-NN --> training accuracy = 0.9375  /  test acc