# [Python Reference Link](http://www.data8.org/sp20/python-reference.html)
*Run the cell below to set up our modules.*

In [None]:
import numpy as np
from datascience import *
import math as m

# These lines do some fancy plotting magic.
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
import warnings
warnings.simplefilter('ignore', FutureWarning)

In [None]:
# Read the raw CSV, then drop the 'id' and 'Unnamed: 32' columns before use.
raw_tumor_data = Table.read_table("data.csv")
tumor_data = raw_tumor_data.drop('id', 'Unnamed: 32')
tumor_data

# Let's take a look at scatter plots

In [None]:
tumor_labels = list(tumor_data.labels)
# Skip the first label (presumably the 'diagnosis' class column - confirm)
# so that only the attribute columns remain.
tumor_attribute_list = tumor_labels[1:]
# Keep an independent copy: a plain assignment would only alias
# tumor_attribute_list, so any in-place change to that list would silently
# change the feature labels used for classification as well.
tumor_attribute_array = list(tumor_attribute_list)

In [None]:
# Caution, this will crash the kernel due to the volume of outputs. 

# Draw one scatter plot for every unordered pair of attributes.
# combinations() yields each pair exactly once and leaves tumor_attribute_list
# untouched; removing items from a list while looping over it skips elements
# and destroys the list for later cells.
from itertools import combinations

for x, y in combinations(tumor_attribute_list, 2):
    tumor_data.scatter(x, y, group='diagnosis')

In [None]:
# This reduces the list of attributes to examine the means only, to avoid crashing the kernel. 
tumor_labels = list(tumor_data.labels)
# Labels 1 through 10: skips the first label and keeps the next ten.
tumor_attribute_list_means_only = tumor_labels[1:11]
# Independent copy rather than an alias, so in-place changes to the list
# above cannot alter this one.
tumor_attribute_array_means_only = list(tumor_attribute_list_means_only)

In [None]:
# Draw one scatter plot for every unordered pair of the means-only attributes.
# combinations() yields each pair exactly once and does not modify
# tumor_attribute_list_means_only; removing items from a list while looping
# over it skips elements and leaves the list half-empty afterwards.
from itertools import combinations

for x, y in combinations(tumor_attribute_list_means_only, 2):
    tumor_data.scatter(x, y, group='diagnosis')

# Steps covered in the 5/09 lecture, now defined as functions

In [None]:
# The function below takes a table and a number of rows as inputs.
# It randomly shuffles the rows of the table and takes the first `num_for_train` shuffled rows as the training set.
# The remaining rows are stored as the test set.
# The function returns both the training set and the test set.

def train_test_separation(tbl, num_for_train):
    """Randomly split `tbl` into a training table and a test table.

    tbl: the table to split; it must provide `num_rows`, `sample` and `take`.
    num_for_train: how many rows go into the training set.

    Prints the size of each set and returns `(train_tbl, test_tbl)`.
    Raises ValueError if `num_for_train` is not between 0 and `tbl.num_rows`.
    """
    total_rows = tbl.num_rows

    # A count outside this range would ask `take` for negative or nonexistent
    # row indices, so reject it up front with a clear message.
    if not 0 <= num_for_train <= total_rows:
        raise ValueError(
            "num_for_train must be between 0 and the number of rows "
            f"({total_rows}), got {num_for_train}"
        )

    # Sampling every row without replacement is a random shuffle.
    shuffled_tbl = tbl.sample(with_replacement=False)

    train_tbl = shuffled_tbl.take(np.arange(num_for_train))
    test_tbl = shuffled_tbl.take(np.arange(num_for_train, total_rows))

    print("Training set:\t",   train_tbl.num_rows, "examples")
    print("Test set:\t",       test_tbl.num_rows, "examples")

    return train_tbl, test_tbl

# The function below takes two equal-length sequences of numbers as inputs.
# It returns the Euclidean distance between them.

def distance(array_one, array_two):
    """Return the Euclidean distance between two 1-D numeric sequences.

    array_one, array_two: arrays (or lists/tuples) of numbers of equal length.
    """
    # Convert to float arrays first so that plain lists are accepted and
    # unsigned-integer inputs cannot wrap around when subtracted.
    difference = np.asarray(array_one, dtype=float) - np.asarray(array_two, dtype=float)
    return np.sqrt(np.sum(difference ** 2))

#### The function below is defined for you in homework 12

In [None]:
# row (input): a single row taken from the table
# features (input): an array of column labels. These labels are the attributes used to classify individuals.
# Note: the attributes must be numerical so that the result can be passed to the distance function defined above.

def row_to_array(row, features):
    """Return the values of `row` at the given `features` as one array."""
    # Look up every requested feature, then append them all to an empty array in one step.
    feature_values = [row.item(label) for label in features]
    return np.append(make_array(), feature_values)

# Let's classify these tumor cells

## Finding the `k` Nearest Neighbors

### Some pre-formatting:

In [None]:
# Split the tumor table: 450 shuffled rows for training, the remainder for testing.
train, test = train_test_separation(tbl=tumor_data, num_for_train=450)

In [None]:
# Use the first row of the test set as the example to classify.
row_to_test = test.rows[0]
row_to_test

In [None]:
# Turn the chosen test row into an array of its attribute values.
test_features_array = row_to_array(row=row_to_test, features=tumor_attribute_array)
test_features_array

### Find the distance between the example (i.e. test row) and each example in the training set

In [None]:
# We will store the distance between the test row and every row in the training set.
distances = make_array()

# We will iterate through the training set row by row.
for train_row in train.rows:
    # convert the train_row into an array as well
    ...
    # compute the distance between the test row array and the train row array
    ...
    # save the distance between these two arrays (test row and train row) into the distances array
    ...
    

distances

### Augment the training data table with a column containing all the distances

In [None]:
# TODO: replace `...` with the training table extended by a column holding `distances`.
train_with_distances = ...

### Sort the augmented table in increasing order of the distances

In [None]:
# TODO: replace `...` with the augmented table sorted by distance, smallest first.
sorted_training = ...

### Take the top `k` rows of the sorted table

In [None]:
# TODO: replace `...` with the first `k` rows of the sorted table.
top_k_training = ...

## The Classifier

### Take a majority vote of the `k` nearest neighbors to see which of the two classes appears most often (visually)

In [None]:
# TODO: replace the two `...` placeholders with the column labels to plot; points are grouped by 'diagnosis'.
top_k_training.scatter(...,...,group = 'diagnosis')

### Take a majority vote of the `k` nearest neighbors to see which of the two classes appears most often (algorithm)

# Evaluating your classifier (Accuracy)