# [Python Reference Link](http://www.data8.org/sp20/python-reference.html)
*Run the cell below so that we can set our modules up*

In [None]:
import numpy as np
from datascience import *
import math as m

# These lines do some fancy plotting magic.
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
import warnings
warnings.simplefilter('ignore', FutureWarning)

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

# Looking at Distance with Spotify Data from Kaggle 
(link to Kaggle: [https://www.kaggle.com/datasets/multispiros/34740-hit-and-nonhit-songs-spotify-features/data?select=complete.csv](https://www.kaggle.com/datasets/multispiros/34740-hit-and-nonhit-songs-spotify-features/data?select=complete.csv))

In [None]:
# Load the Spotify songs dataset from a CSV file; the path is relative, so the
# file is expected in the notebook's working directory.
spotify =  Table.read_table("spotify_complete.csv")
# Show the table as the cell's output.
spotify

In [None]:
# Let's pick two of the column labels above and look at their scatter plot.
# Note that the points are color-coded by whether the song is considered a hit song or not.
# 'On_chart = 1' means the song is considered a hit song.

spotify.scatter('danceability' , 'tempo' , group = 'On_chart')

# Identifying key variables for classification

In [None]:
spotify.show(3)

In [None]:
# Now let's compare ALL of the labels to each other in pairwise comparisons.
# Run the code below.

labels_for_spotify_songs = [
    #'track_title',
    # 'artist_name',
    # 'track_id',
    # 'duration_ms',
    # 'energy',
    # 'key',
    # 'mode',
    # 'time_signature',
     'acousticness',
     'danceability',
     'instrumentalness',
     'liveness',
     'loudness',
     'speechiness',
     'valence',
     'tempo']
# 'On_chart' is purposely left out: it is used to color the points rather than
# being graphed on an axis.

# Each label is paired only with the labels that come after it in the list, so
# every pair is plotted exactly once: no label is plotted against itself and no
# mirrored duplicate such as (y, x) is drawn after (x, y).
# Each pair is recorded as an (x-axis label, y-axis label) tuple so the printed
# summary lists the pairs in the order they were plotted.
plotted_pairs = []

for position, x in enumerate(labels_for_spotify_songs):
    for y in labels_for_spotify_songs[position + 1:]:
        spotify.scatter(x, y, group='On_chart')
        plotted_pairs.append((x, y))

print(plotted_pairs)

# The Distance Formula

In [None]:
((3 - 0)**2 + (4 - 0)**2)**(0.5)

In [None]:
# Two points with four coordinates each: the origin and (3, 4, 12, 25).
first = make_array(0,0,0,0)
second = make_array(3,4,12,25)

# Distance formula: square each coordinate difference, add the squares,
# then take the square root of the total.
(sum((first - second)**2))**(1/2)

#### The following function is defined for you in Homework 12

In [None]:
def row_to_array(row, features):
    """Convert a table row into an array of the values of the chosen features.

    Parameters
    ----------
    row : a row from the table
        Must provide ``row.item(label)``, which is called once per feature.
    features : sequence of column labels
        The attributes that will help us classify individuals. The attributes
        must be numerical so that distances can be computed from the result.

    Returns
    -------
    numpy.ndarray
        A one-dimensional array with one entry per label, in the same order as
        ``features``. Numerical values come back as floats; an empty
        ``features`` gives an empty float array.
    """
    values = [row.item(feature) for feature in features]
    # Appending everything at once to an empty float array gives the same
    # result as appending one value at a time (numbers are promoted to float),
    # without re-copying the whole array for every feature.
    return np.append(np.array([], dtype=float), values)

In [None]:
spotify.take(make_array(0,1,-1))

In [None]:
# Reuse the list of labels defined for the pairwise plots as the features for
# measuring distance. This is the same list object, not a copy.
array_of_spotify_features = labels_for_spotify_songs

# Feature arrays for the first, second, and last songs in the table.
first_song = row_to_array(spotify.row(0),array_of_spotify_features)
second_song = row_to_array(spotify.row(1),array_of_spotify_features)
last_song = row_to_array(spotify.row(-1),array_of_spotify_features)

# Distance between the first and last songs, using the distance formula.
(sum((first_song - last_song)**2))**(0.5)

Let's convert the above process into a single function:

In [None]:
def distance(array_one,array_two):
    """Return the Euclidean (straight-line) distance between two points.

    Parameters
    ----------
    array_one, array_two : array-like of numbers
        The coordinates of the two points. NumPy arrays, lists, and tuples are
        all accepted; both should have the same length.

    Returns
    -------
    numpy.float64
        The square root of the sum of the squared coordinate differences.
    """
    differences = np.asarray(array_one) - np.asarray(array_two)
    # np.sum adds every entry inside NumPy and always yields a single number,
    # unlike the builtin sum, which loops in Python over the first axis only.
    return np.sqrt(np.sum(differences ** 2))

In [None]:
distance(first_song,second_song)

In [None]:
distance(first_song,last_song)

In [None]:
distance(second_song,last_song)

# Activity!
## Building a Song/Playlist Recommendation based on Distance

#### Use this cell to find a song that you like, and make sure that the output is a row object. Feel free to use methods like `group` and `where` to help identify your song.

In [None]:
...
spotify#.row(0)

#### Use the cell below to help define the features that you find most relevant to find songs of "closest distance"

In [None]:
# Start from the full list of labels below and delete the ones you do not want
# to use. Keep only numerical attributes: row_to_array's values are fed into the
# distance calculation, so labels such as 'track_title', 'artist_name', and
# 'track_id' (presumably text columns -- check the table) need to be removed.
# NOTE: this rebinds the name `labels_for_spotify_songs` that was defined
# earlier in the notebook for the pairwise plots.
labels_for_spotify_songs = [
    'track_title',
     'artist_name',
     'track_id',
     'duration_ms',
     'energy',
     'key',
     'mode',
     'time_signature',
     'acousticness',
     'danceability',
     'instrumentalness',
     'liveness',
     'loudness',
     'speechiness',
     'valence',
     'tempo',
     'On_chart']


# Replace the ... with an expression that evaluates to a single row object.
fav_song = ... #copy the last line from the above code cell

# Convert your favorite song's row into an array of the chosen feature values.
fav_song_array = row_to_array(fav_song,labels_for_spotify_songs)
fav_song_array

In [None]:
# free cell (I was thinking you could use this cell to remove any song/artist you don't want recommended)
...

#### Use the cell below to write a for-loop that calculates the distance between your favorite song and every other song in the dataset.

In [None]:
# Collects one distance per song, in the same order as the rows of `spotify`.
distances_to_fav_song = make_array()

for each_song_row in spotify.rows:
    # Replace the ... with this song's row converted to an array of features.
    song_row_array = ...
    # Replace the ... with the distance between your favorite song and this song.
    song_row_distance = ...  
    distances_to_fav_song = np.append(distances_to_fav_song,song_row_distance)
    
# Attach the distances to the table as a new 'Distance' column.
spotify_with_distance = spotify.with_column('Distance',distances_to_fav_song)
spotify_with_distance

In [None]:
#extra cells for further examination


## Question: 
### What was the song you chose, and what recommendation do you get based upon that song? Do you agree with this recommendation? Why or why not?

*__Type your answer here replacing this text__*

# Tumor: Let's take a look at Scatter Plots

In [None]:
# Load the tumor dataset from a CSV file in the working directory and drop the
# 'id' and 'Unnamed: 32' columns.
tumor_data = Table.read_table("data_tumor.csv").drop('id','Unnamed: 32')
# Show the table as the cell's output.
tumor_data

In [None]:
# All column labels of the tumor table, as a plain Python list.
tumor_labels = list(tumor_data.labels)
# Every label except the first one (presumably 'diagnosis', the class we want
# to predict -- check the table above).
tumor_attribute_list = tumor_labels[1:]
# An independent copy of the attribute labels, used as the features for the
# classifier. Because it is a copy and not a second name for the same list,
# changing `tumor_attribute_list` later cannot alter the classifier's features.
tumor_attribute_array = list(tumor_attribute_list)

In [None]:
# Caution: plotting every pair of attributes produces so many figures that it
# can crash the kernel, so the plotting is switched off by default.
# Set this to True only if you really want every scatter plot.
PLOT_ALL_TUMOR_PAIRS = False

# Each attribute is paired only with the attributes after it in the list, so
# every unique pair is visited exactly once and the list itself is never
# modified (removing items from a list while looping over it skips items).
for position, x in enumerate(tumor_attribute_list):
    for y in tumor_attribute_list[position + 1:]:
        if PLOT_ALL_TUMOR_PAIRS:
            tumor_data.scatter(x,y,group='diagnosis')

In [None]:
# This reduces the list of attributes to the "means" only, to avoid crashing the kernel.
tumor_labels = list(tumor_data.labels)
# Labels at positions 1 through 10: the first label is skipped and the next
# ten are kept.
tumor_attribute_list_means_only = tumor_labels[1:11]
# An independent copy, so that changing one list can never alter the other.
tumor_attribute_array_means_only = list(tumor_attribute_list_means_only)

In [None]:
# Plot every unique pair of the "means only" attributes exactly once, with the
# points colored by diagnosis.
# Each attribute is paired only with the attributes after it in the list. The
# list is left untouched: removing items from a list while looping over it
# skips every other item (so some pairs would never be plotted) and would leave
# the list too short for this cell to be run a second time.
for position, x in enumerate(tumor_attribute_list_means_only):
    for y in tumor_attribute_list_means_only[position + 1:]:
        tumor_data.scatter(x,y,group='diagnosis')

# Let's classify these tumor cells

## Finding the `k` Nearest Neighbors

### Some pre-formatting:

In [None]:
def train_test_separation(tbl,num_for_train):
    """Randomly split a table into a training table and a test table.

    The rows are shuffled first, so a different split is produced each time
    this function is called.

    Parameters
    ----------
    tbl : table
        The table to split. It must provide ``num_rows``,
        ``sample(with_replacement=False)`` and ``take(indices)``.
    num_for_train : int
        How many rows go into the training table; all remaining rows go into
        the test table.

    Returns
    -------
    (train_tbl, test_tbl)
        The training table and the test table, in that order. The size of each
        is also printed.

    Raises
    ------
    ValueError
        If ``num_for_train`` is negative or larger than the number of rows.
    """
    total_rows = tbl.num_rows
    # A negative count would otherwise produce negative row indices for the
    # test set, which wrap around and silently duplicate rows.
    if not 0 <= num_for_train <= total_rows:
        raise ValueError(
            "num_for_train must be between 0 and the number of rows "
            "({}), got {}".format(total_rows, num_for_train)
        )

    # Sampling every row without replacement is a random shuffle of the table.
    shuffled_tbl = tbl.sample(with_replacement = False)

    train_tbl = shuffled_tbl.take(np.arange(num_for_train))
    test_tbl = shuffled_tbl.take(np.arange(num_for_train, total_rows))

    print("Training set:\t",   train_tbl.num_rows, "examples")
    print("Test set:\t",       test_tbl.num_rows, "examples")

    return train_tbl, test_tbl

In [None]:
# Shuffle the tumor table and put 450 rows in the training set; the remaining
# rows form the test set.
train, test = train_test_separation(tumor_data,450)

In [None]:
row_to_test = test.row(0)
row_to_test

In [None]:
test_features_array = row_to_array(row_to_test,tumor_attribute_array)
test_features_array

### Find the distance between the example (i.e. test row) and each example in the training set

In [None]:
# We will collect the distance between the test row and every row in the
# training set, in the same order as the rows of `train`.
distance_values = []

# Iterate through the training set row by row.
for train_row in train.rows:
    # Convert the training row into an array of the same features.
    train_row_array = row_to_array(train_row,tumor_attribute_array)

    # Compute the distance between the test row array and this training row array.
    train_row_array_distance = distance(test_features_array,train_row_array)

    # Save the distance between these two arrays (test row and training row).
    distance_values.append(train_row_array_distance)

# Build the array once at the end; growing an array with np.append inside the
# loop would re-copy every earlier distance on each step.
distances = np.array(distance_values, dtype=float)

distances

### Augment the training data table with a column containing all the distances

In [None]:
train_with_distances = train.with_column('distances',distances)

In [None]:
train_with_distances

### Sort the augmented table in increasing order of the distances

In [None]:
sorted_training = train_with_distances.sort('distances')

In [None]:
sorted_training.select('diagnosis','distances')

### Take the top `k` rows of the sorted table

In [None]:
# Here k = 47: keep the first 47 rows of the table sorted by distance, i.e. the
# 47 training rows closest to the test row. 47 is odd, so a vote between two
# classes cannot end in a tie.
top_k_training = sorted_training.take(np.arange(47))

In [None]:
top_k_training

## The Classifier

### Take a majority vote of the `k` nearest neighbors to see which of the two classes appears most often (visually)

In [None]:
row_to_test

In [None]:
# The two attributes shown on the axes of the plot.
x_axis_label = "radius_mean"
y_axis_label = "texture_mean"

# Plot the k nearest training rows, colored by diagnosis.
top_k_training.scatter(x_axis_label, y_axis_label, group = "diagnosis")
# Draw the test row on top of the same plot as a large 'X' marker.
plt.scatter(row_to_test.item(x_axis_label), row_to_test.item(y_axis_label), marker = 'X', s = 100)

### Take a majority vote of the `k` nearest neighbors to see which of the two classes appears most often (algorithm)

In [None]:
top_k_training.group('diagnosis')

In [None]:
# Majority vote: count the neighbors in each diagnosis group, sort the groups so
# the largest count comes first, then take the first entry of the first column
# (the diagnosis label of the largest group).
decision = top_k_training.group('diagnosis').sort('count',descending = True).column(0).item(0)
decision

# Evaluating your classifier (Accuracy)
What we did for the first row of the `test` table, we must repeat for the rest of the table. 

In [None]:
test

As daunting as that sounds, a for-loop can sufficiently handle this. We repeat the above steps for each row of the test table: 

In [None]:
# Number of nearest neighbors that vote on each prediction. This is the same
# value of k that was used for the single test row earlier in the notebook.
k = 47

# One predicted diagnosis per test row, in the same order as the rows of `test`.
predicted_diagnoses = []

for test_row in test.rows:
    test_row_array = row_to_array(test_row,tumor_attribute_array)

    # Distance from this test row to every row of the training set.
    distance_values = []
    for train_row in train.rows:
        # Convert the training row into an array of the same features.
        train_row_array = row_to_array(train_row,tumor_attribute_array)

        # Compute the distance between the test row array and this training row array.
        distance_values.append(distance(test_row_array,train_row_array))

    # Build the array once per test row instead of growing it with np.append,
    # which would re-copy every earlier distance on each step.
    distances = np.array(distance_values, dtype=float)

    # Sort the training rows by distance, keep the k closest, and let them vote.
    train_with_distances = train.with_column('distances',distances)
    sorted_training = train_with_distances.sort('distances')
    top_k_training = sorted_training.take(np.arange(k))
    diagnosis = top_k_training.group('diagnosis').sort('count',descending = True).column(0).item(0)
    predicted_diagnoses.append(diagnosis)

diagnoses = np.array(predicted_diagnoses)

# Sanity check: there should be exactly one prediction per test row.
len(diagnoses) == test.num_rows

In [None]:
diagnoses

In [None]:
# Accuracy: the number of test rows whose predicted diagnosis matches the actual
# diagnosis, divided by the number of predictions.
np.count_nonzero(test.column('diagnosis') == diagnoses)/len(diagnoses)

In [None]:
test.select('diagnosis').with_column('predicted diagnosis',diagnoses)