# [Python Reference Link](http://www.data8.org/sp20/python-reference.html)
*Run the cell below so that we can set our modules up*

In [None]:
import numpy as np
from datascience import *
import math as m

# These lines do some fancy plotting magic.
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
import warnings
warnings.simplefilter('ignore', FutureWarning)

# Classification - Examination through ScatterPlots

In [None]:
iris =  Table.read_table("IRIS.csv")
iris

In [None]:
# Let's pick two of the column labels above and observe the scatter plot.
# Fill in the two `...` with the chosen column labels.
# Note that each of the three species will be a different color on the plot
# to help us differentiate the iris flower species.

iris.scatter(... , ... , group = 'species')

# Identifying key variables for classification

In [None]:
# Now let's compare ALL of the labels to each other in pairwise comparisons.
# Run the code below.

# Every column label except the last one (assumed to be 'species', the
# column the plots are grouped by).
labels_without_species = list(iris.labels)[:-1]
plotted_pairs = []

# Pair each label only with the labels that come after it, so every
# unordered pair is plotted exactly once and no label is paired with itself.
for i, x in enumerate(labels_without_species):
    for y in labels_without_species[i + 1:]:
        iris.scatter(x, y, group='species')
        # Stored as a set because the pair is unordered: {x, y} == {y, x}.
        plotted_pairs.append({x, y})

print(plotted_pairs)

# Creating Training and Testing Data Sets

In [None]:
# Report how many rows (flowers) the iris table contains.
print('The iris data set has',iris.num_rows,'flowers represented in the sample')

Let's separate 100 rows of data to use as the training set, and use the remaining 50 rows as the test set. Since the table is organized by species (which is what we are classifying), we first need to mix these rows up. We can do this by using the `sample` method to shuffle, then using `take` to select the first 100 rows for training and the last 50 rows for testing.

In [None]:
# Fill in: shuffle the rows of `iris` with the `sample` method
# (sample without replacement so that no row is repeated or dropped).
shuffled_iris = ...

# Fill in: `take` the first 100 shuffled rows for training and the
# last 50 shuffled rows for testing.
train_iris = ...
test_iris = ...

print("Training set:\t",   train_iris.num_rows, "examples")
print("Test set:\t",       test_iris.num_rows, "examples")

# Preview the first 5 rows of each of the two tables built in this cell.
train_iris.show(5)
test_iris.show(5)

Let's convert the above process into a single function: 

In [None]:
def train_test_separation(tbl,num_for_train):
    """Split `tbl` into a training table and a test table.

    The three `...` placeholders are left to be filled in. Intended
    behavior, following the iris walkthrough: shuffle the rows of `tbl`,
    use the first `num_for_train` shuffled rows for training and the
    remaining rows for testing.

    Prints the number of rows in each resulting table and returns the
    pair `(train_tbl, test_tbl)`.
    """
    # Fill in: a shuffled version of `tbl`.
    shuffled_tbl = ...
    
    # Fill in: the first `num_for_train` shuffled rows, and all the rest.
    train_tbl = ...
    test_tbl = ...
    
    print("Training set:\t",   train_tbl.num_rows, "examples")
    print("Test set:\t",       test_tbl.num_rows, "examples")
    
    return train_tbl, test_tbl

In [None]:
train_test_separation(iris,100)

# The Distance Formula

In [None]:
((3 - 0)**2 + (4 - 0)**2)**(0.5)

In [None]:
# The two points from the previous cell, (0, 0) and (3, 4), as arrays.
first = make_array(0,0)
second = make_array(3,4)

# Fill in: the same distance as in the previous cell, computed from the
# `first` and `second` arrays.
...

#### The function below is defined for you in Homework 12

In [None]:
# row (input): a row from the table 
# features (input): an array of column labels. These labels are the attributes that will help us classify individuals. 
# Note: the attributes must be numerical to help us pass them through the distance function defined above. 

def row_to_array(row, features):
    """Convert a row to an array of its feature values.

    Parameters:
        row: a table row whose `item(label)` method returns the value
            stored under a column label.
        features: the column labels to extract, in the desired order.

    Returns:
        A one-dimensional NumPy array holding one value per entry of
        `features`, in the same order. Numeric values are returned as
        floats; an empty `features` gives an empty float array.
    """
    values = [row.item(feature) for feature in features]
    # Append all values to an empty float array in one step. Appending
    # inside the loop would copy the growing array once per feature;
    # starting from a float array keeps the result a float array.
    return np.append(np.array([]), values)

In [None]:
# Labels of every column except 'species'; these are the features.
array_of_iris_features = iris.drop('species').labels

# Feature arrays for the first two rows and the last row of the table.
# NOTE: the variable names assume those rows are setosa, setosa and
# virginica respectively -- verify against IRIS.csv.
first_setosa = row_to_array(iris.row(0),array_of_iris_features)
second_setosa = row_to_array(iris.row(1),array_of_iris_features)
last_virginica = row_to_array(iris.row(-1),array_of_iris_features)

# Distance between the first two rows: the square root of the sum of the
# squared element-wise differences.
(sum((first_setosa - second_setosa)**2))**(0.5)

Let's convert the above process into a single function:

In [None]:
def distance(array_one,array_two):
    """Return the distance between two arrays of feature values.

    The body is a placeholder to be filled in. It is intended to wrap the
    computation demonstrated just before this function: the square root
    of the sum of the squared element-wise differences of the two arrays.
    """
    return ...

In [None]:
distance(first_setosa,second_setosa)

In [None]:
distance(first_setosa,last_virginica)

In [None]:
distance(second_setosa,last_virginica)

# Tumor: Let's take a look at ScatterPlots

In [None]:
# Load the tumor data and drop the 'id' and 'Unnamed: 32' columns.
# NOTE: presumably a row identifier and an empty trailing CSV column --
# verify against data.csv.
tumor_data = Table.read_table("data.csv").drop('id','Unnamed: 32')
tumor_data

In [None]:
tumor_labels = list(tumor_data.labels)
# Everything after the first label is treated as a feature (assumes the
# first column is the class column, 'diagnosis').
tumor_attribute_list = tumor_labels[1:]
# Keep an independent copy rather than a second name for the same list,
# so that changes to tumor_attribute_list can never alter the feature
# labels used later for classification.
tumor_attribute_array = list(tumor_attribute_list)

In [None]:
# Caution, this will crash the kernel due to the volume of outputs. 

# Plot every unordered pair of attributes exactly once. Each attribute is
# paired with the attributes after it via slicing, which leaves
# tumor_attribute_list unchanged; removing items from a list while
# iterating over it skips elements and destroys the list.
for i, x in enumerate(tumor_attribute_list):
    for y in tumor_attribute_list[i + 1:]:
        tumor_data.scatter(x, y, group='diagnosis')

In [None]:
# This reduces the list of attributes to examine the means only, to avoid crashing the kernel. 
tumor_labels = list(tumor_data.labels)
# Labels at positions 1 through 10 (assumed to be the "mean" attributes).
tumor_attribute_list_means_only = tumor_labels[1:11]
# Independent copy, so that changes to the list above do not affect it.
tumor_attribute_array_means_only = list(tumor_attribute_list_means_only)

In [None]:
# Plot every unordered pair of the means-only attributes exactly once.
# Each attribute is paired with the attributes after it via slicing, which
# leaves tumor_attribute_list_means_only unchanged; removing items from a
# list while iterating over it skips elements and destroys the list.
for i, x in enumerate(tumor_attribute_list_means_only):
    for y in tumor_attribute_list_means_only[i + 1:]:
        tumor_data.scatter(x, y, group='diagnosis')

# Let's classify these tumor cells

## Finding the `k` Nearest Neighbors

### Some pre-formatting:

In [None]:
train, test = train_test_separation(tumor_data,450)

In [None]:
row_to_test = test.row(0)
row_to_test

In [None]:
test_features_array = row_to_array(row_to_test,tumor_attribute_array)
test_features_array

### Find the distance between the example (i.e. test row) and each example in the training set

In [None]:
# We will store the distance between the test row and every row in the training set.
distances = make_array()

# We will iterate through the training set row by row.
for train_row in train.rows:
    # convert the train_row into an array also
    ...
    
    # compute the distance between the test row array and the train row array
    ...
    
    # save the distance between these two arrays (test row and train row) into the distances array
    ...
    

distances

### Augment the training data table with a column containing all the distances

In [None]:
train_with_distances = ...

In [None]:
train_with_distances

### Sort the augmented table in increasing order of the distances

In [None]:
sorted_training = ...

### Take the top `k` rows of the sorted table

In [None]:
# Fill in: the first `k` rows of the sorted table.
top_k_training = ...

## The Classifier

### Take a majority vote of the `k` nearest neighbors to see which of the two classes appear most often (visually)

In [None]:
test.row(0)

In [None]:
top_k_training.scatter(...,...,group = 'diagnosis')

### Take a majority vote of the `k` nearest neighbors to see which of the two classes appear most often (algorithm)

In [None]:
decision = ...

# Evaluating your classifier (Accuracy)
What we did for the first row of the `test` table, we must repeat for the rest of the table. 

In [None]:
test

As daunting as that sounds, a for-loop can sufficiently handle this. We repeat the above steps for each row of the test table: 

In [None]:
# One predicted diagnosis per row of the test table, appended in row order.
diagnoses = make_array()

for test_row in test.rows:
    # Fill in: the feature array for this test row.
    test_row_array = ...
    # Start a fresh distances array for every test row.
    distances = make_array()
#######
    for train_row in train.rows:
        # Fill in: append the distance from this test row to `train_row`
        # onto `distances`.
        ...
        
#######
    train_with_distances = train.with_column('distances',distances)
    # Fill in: sort by distance, keep the top `k` rows, take the majority vote.
    sorted_training = ...
    top_k_training = ...
    diagnosis = ...
    diagnoses = np.append(diagnoses,diagnosis)
    
# Sanity check: there should be exactly one prediction per test row.
len(diagnoses) == test.num_rows 

In [None]:
diagnoses

In [None]:
np.count_nonzero(test.column('diagnosis') == diagnoses)/len(diagnoses)

In [None]:
test.select('diagnosis').with_column('predicted diagnosis',diagnoses)