# [Python Reference Link](http://www.data8.org/sp20/python-reference.html)
*Run the cell below so that we can set our modules up*

# Importing our modules

In [None]:
import numpy as np
from datascience import *
import math as m

# These lines do some fancy plotting magic.
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
import warnings
warnings.simplefilter('ignore', FutureWarning)

In [None]:
birds = Table.read_table('snowy_plover.csv')
birds

# The Residual & The Root Mean Square Error

In [None]:
# Let's run this cell 

def x_y_estimate_residual(table, x_col, y_col):
    """Fit the least-squares line of y_col on x_col and tabulate the fit.

    The slope and intercept are derived from the correlation coefficient,
    which is computed as the mean of the product of the two columns in
    standard units.

    table: the table holding both columns
    x_col: label of the predictor column
    y_col: label of the response column

    Prints the slope and intercept, then returns a table containing x_col,
    y_col, an 'Estimate' column (the fitted values) and a 'Residual' column
    (actual minus fitted).
    """
    xs, ys = table.column(x_col), table.column(y_col)

    mean_x, sd_x = np.mean(xs), np.std(xs)
    mean_y, sd_y = np.mean(ys), np.std(ys)

    # Correlation: average product of the columns once both are in standard units.
    correlation = np.mean(((xs - mean_x)/sd_x) * ((ys - mean_y)/sd_y))

    slope = correlation*(sd_y/sd_x)
    intercept = mean_y - slope * mean_x

    fitted = slope*xs + intercept

    result = table.select(x_col, y_col).with_columns(
        'Estimate', fitted,
        'Residual', ys - fitted,
    )
    print(slope, intercept)

    return result

def lin_reg_slope_intercept(table,x_col,y_col):
    """Return the least-squares slope and intercept for y_col predicted from x_col.

    table: the table holding both columns
    x_col: label of the predictor column
    y_col: label of the response column

    Returns a two-element array: item 0 is the slope, item 1 is the intercept.
    """
    # Read the columns from the table that was passed in, not from a
    # notebook-level global, so the function works for any table.
    x_array = table.column(x_col)
    y_array = table.column(y_col)

    x_mean = np.mean(x_array)
    x_std = np.std(x_array)

    y_mean = np.mean(y_array)
    y_std = np.std(y_array)

    # Both columns in standard units.
    x_su = (x_array - x_mean)/x_std
    y_su = (y_array - y_mean)/y_std

    # Correlation is the mean of the product of the standard units.
    r_value = np.mean(x_su*y_su)
    slope = r_value*(y_std/x_std)
    intercept = y_mean - slope * x_mean

    return np.array([slope, intercept])

x_y_estimate_residual(birds,'Egg Weight','Bird Weight')

In [None]:
residual = x_y_estimate_residual(birds,'Egg Weight','Bird Weight').column('Residual')
residual

In [None]:
birds_with_residual = birds.with_columns('Residual',residual)
birds_with_residual

## The Root Mean Square Error Formula
<img src="root_mean_square_error.png" alt="drawing" width="1200"/>

In [None]:
error = residual

#### Square the Error

In [None]:
squared_error = error...

#### Take the **mean** of the squared errors

In [None]:
mean_squared_error = squared_error...

#### Take the square root 

In [None]:
root_mean_squared_error = mean_squared_error...

#### Let's put it all together in a one-line calculation

In [None]:
#let's put it all together in a one-line calculation
root_mean_square_error = ...
root_mean_square_error

### Let's define the function `rmse` to calculate the root mean square error

Given any proposed slope and intercept, it calculates the root mean square error of the estimated 'Bird Weight' values that correspond to the 'Egg Weight' column.

In [None]:
def rmse(slope,intercept):
    """Root mean square error of the line y = slope*x + intercept.

    The line is scored against the notebook's `birds` table, predicting
    'Bird Weight' from 'Egg Weight'. The function takes only the two line
    parameters because it is passed directly to `minimize` later on.
    """
    egg_weights = birds.column('Egg Weight')
    bird_weights = birds.column('Bird Weight')

    # Residual = actual bird weight minus the weight the line predicts.
    residuals = bird_weights - (slope*egg_weights + intercept)

    return np.sqrt(np.mean(residuals**2))

In [None]:
# Regression slope and intercept from the formula-based helper,
# unpacked into plain scalars for the cells that follow.
m_b_array = lin_reg_slope_intercept(birds, 'Egg Weight', 'Bird Weight')
m_slope, b_intercept = m_b_array.item(0), m_b_array.item(1)
print(m_slope, b_intercept)

In [None]:
rmse(m_slope,b_intercept)

In [None]:
# Candidate line y = test_m * x + test_b. Change these two values to try
# a different line.
test_m = 2
test_b = 1

# Score the same slope and intercept that are drawn below, so the printed
# RMSE always matches the plotted line.
print(rmse(test_m, test_b))

# Extend the line slightly past the smallest and largest egg weights.
egg_weights = birds.column('Egg Weight')
graph_x_min = min(egg_weights)-.2
graph_x_max = max(egg_weights)+.2

# Heights of the candidate line at the two ends of the plotted range.
graph_y_min = test_m*graph_x_min +test_b
graph_y_max = test_m*graph_x_max +test_b

birds.scatter('Egg Weight','Bird Weight')
plt.plot(make_array(graph_x_min,graph_x_max),make_array(graph_y_min,graph_y_max))

In [None]:
rmse(.075,0.5)

In [None]:
rmse(.025,1.5)

In [None]:
rmse(5,10)

In [None]:
rmse(2,1)

In [None]:
# Coarse grid search: try every (slope, intercept) pair on a 0.1-spaced
# grid and record the RMSE of each candidate line.
# Results are collected in plain lists and converted to arrays once at the
# end; calling np.append inside the loop would copy the whole array on
# every iteration.
slopes = []
intercepts = []
these_rmse = []

# The loop variables are deliberately not named `m` and `b`: `m` is the
# alias the setup cell gives to the `math` module, and reusing it here
# would overwrite that alias for the rest of the notebook.
for slope_guess in np.arange(0,2,0.1):
    for intercept_guess in np.arange(-1,1,0.1):

        # store the slope-intercept pair in their respective lists
        slopes.append(slope_guess)
        intercepts.append(intercept_guess)

        # calculate the root mean square error of the slope-intercept pair
        # above and store it in its own list
        these_rmse.append(rmse(slope_guess, intercept_guess))

slopes = np.array(slopes)
intercepts = np.array(intercepts)
these_rmse = np.array(these_rmse)

# combine the three arrays above into a table
slope_intercept_residual_table = Table().with_columns('Slopes (m)',slopes,
                                    'Intercepts (b)', intercepts,
                                    'RMSE',these_rmse)
slope_intercept_residual_table.show()
slope_intercept_residual_table.num_rows

In [None]:
# Fine grid search: the same sweep as the coarse grid, but with a step of
# 0.01 in both directions (roughly 200 x 200 candidate lines).
# With this many points, growing the arrays with np.append inside the loop
# is quadratic, so results are collected in lists and converted once.
slopes = []
intercepts = []
these_rmse = []

# The loop variables are deliberately not named `m` and `b`: `m` is the
# alias the setup cell gives to the `math` module, and reusing it here
# would overwrite that alias for the rest of the notebook.
for slope_guess in np.arange(0,2,0.01):
    for intercept_guess in np.arange(-1,1,0.01):
        slopes.append(slope_guess)
        intercepts.append(intercept_guess)
        these_rmse.append(rmse(slope_guess, intercept_guess))

slopes = np.array(slopes)
intercepts = np.array(intercepts)
these_rmse = np.array(these_rmse)

slope_intercept_residual_table = Table().with_columns('Slopes (m)',slopes,
                                    'Intercepts (b)', intercepts,
                                    'RMSE',these_rmse)
# NOTE(review): `.show()` with no row limit presumably renders every row of
# this large table -- confirm, and consider passing a small row count.
slope_intercept_residual_table.show()
slope_intercept_residual_table.num_rows

In [None]:
slope_intercept_residual_table.sort('RMSE')

In [None]:
len(slope_intercept_residual_table.column('RMSE'))

In [None]:
# Heat map of the grid search: one point per (slope, intercept) pair,
# colored by the RMSE of that candidate line.
# NOTE(review): sorting presumably puts the lowest RMSE first, so the
# highest-RMSE points are drawn last -- confirm the default sort order.
sorted_slope_intercept_residual_table = slope_intercept_residual_table.sort('RMSE')

# Pull the three columns out as arrays for matplotlib.
slopes_x = sorted_slope_intercept_residual_table.column('Slopes (m)')
intercepts_y = sorted_slope_intercept_residual_table.column('Intercepts (b)')
# Despite the name, these are the RMSE values used to color each point.
residual_colors = sorted_slope_intercept_residual_table.column('RMSE')


plt.figure(figsize=(6,6))
# x = slope, y = intercept, color = RMSE on the reversed 'Blues' colormap.
plt.scatter(slopes_x, intercepts_y, c = residual_colors, cmap="Blues_r")
# The color bar shows which RMSE value each shade stands for.
plt.colorbar()
plt.xlabel('Slope')
plt.ylabel('Intercept') 

# The `minimize` function and Optimization

In [None]:
minimize(rmse)

In [None]:
#re-printing what we originally calculated using the formulas above
print(m_slope,b_intercept)

# The Distance Formula

In [None]:
iris =  Table.read_table("IRIS.csv")
iris

In [None]:
#let's pick two of the column labels above and observe the scatter plot. 
# Note that each of the three species will be a different color on the plot to help us differentiate 
# the iris flower species

iris.scatter('sepal_length' , 'sepal_width' , group = 'species')

In [None]:
#Now let's look at comparing ALL of the labels to each other in pairwise comparisons
#run the code below 

from itertools import combinations

# Every feature column, found by dropping the 'species' label column by
# name so the result does not depend on where that column sits in the table.
labels_without_species = list(iris.drop('species').labels)
plotted_pairs = []

# combinations(..., 2) yields each unordered pair of distinct labels exactly
# once, so no "already plotted" bookkeeping is needed.
for x, y in combinations(labels_without_species, 2):
    iris.scatter(x,y,group='species')
    plotted_pairs.append((x, y))

# Tuples (rather than sets) keep the printed x/y order the same on every run.
print(plotted_pairs)

In [None]:
((3 - 0)**2 + (4 - 0)**2)**(0.5)

In [None]:
first = make_array(0,0,0,0)
second = make_array(3,4,12,25)

(sum((first - second)**2))**(1/2)

#### The function below is defined for you in Homework 12

In [None]:
# row (input): a row from the table 
# features (input): an array of column labels. These labels are the attributes that will help us classify individuals. 
# Note: the attributes must be numerical to help us pass them through the distance function defined above. 

def row_to_array(row, features):
    """Return an array holding the row's value for each label in features.

    Values appear in the same order as the labels in `features`.
    """
    values = [row.item(label) for label in features]
    # Appending onto an empty array keeps the result an array even when
    # `features` is empty.
    return np.append(make_array(), values)

In [None]:
iris

In [None]:
iris.take(make_array(0,1,-1))

In [None]:
# Labels of every column except 'species'; these are the features that
# get compared between flowers.
array_of_iris_features = iris.drop('species').labels

print(array_of_iris_features)

# Feature arrays for the first two rows and the last row of the table.
# NOTE(review): the names assume rows 0 and 1 are setosa and the last row
# is virginica -- confirm against IRIS.csv.
first_setosa = row_to_array(iris.row(0),array_of_iris_features)
second_setosa = row_to_array(iris.row(1),array_of_iris_features)
last_virginica = row_to_array(iris.row(-1),array_of_iris_features)

# Distance formula: square root of the sum of squared feature differences
# between the first and the last row.
(sum((first_setosa - last_virginica)**2))**(0.5)

#### Let's convert the process into a single function

In [None]:
def distance(array_one,array_two):
    """Return the Euclidean distance between two equal-length points.

    array_one, array_two: the coordinates of each point, as arrays or any
    array-like (lists and tuples are converted).

    Returns the square root of the sum of squared coordinate differences.
    """
    # Convert up front so plain lists/tuples can be subtracted element-wise.
    difference = np.asarray(array_one, dtype=float) - np.asarray(array_two, dtype=float)
    # np.sum adds every element, so the result is always a single number.
    return np.sqrt(np.sum(difference**2))

In [None]:
distance(first_setosa,second_setosa)

In [None]:
distance(first_setosa,last_virginica)

In [None]:
distance(second_setosa,last_virginica)