In [39]:
from datascience import *
import numpy as np
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plots
from mpl_toolkits.mplot3d import Axes3D
plots.style.use('fivethirtyeight')

## Lecture 34

In [40]:
def standard_units(arr):
    """Convert an array of numbers to standard units.

    Each value is re-expressed as its signed distance from the array's
    average, measured in standard deviations (np.std with its default
    settings).
    """
    deviations = arr - np.average(arr)
    spread = np.std(arr)
    return deviations / spread

def correlation(t, x, y):
    """Return the correlation coefficient between columns x and y of table t.

    Both columns are converted to standard units; the result is the
    average of the element-wise products of the standardized columns.
    """
    products = standard_units(t.column(x)) * standard_units(t.column(y))
    return np.average(products)

def slope(t, x, y):
    """Return the slope of the regression line for predicting y from x.

    Computed as r * SD(y) / SD(x), where r is the correlation between the
    two columns of table t and the SDs come from np.std.
    """
    return (
        correlation(t, x, y)
        * np.std(t.column(y))
        / np.std(t.column(x))
    )

def intercept(t, x, y):
    """Return the intercept of the regression line for predicting y from x.

    Computed as mean(y) - slope * mean(x), using the columns of table t.
    """
    mean_x, mean_y = (np.mean(t.column(label)) for label in (x, y))
    return mean_y - slope(t, x, y) * mean_x

def get_fitted_values(t, x, y):
    """Return an array of regression estimates, one for each value in column x.

    The estimates lie on the regression line: slope * x + intercept, with
    the slope and intercept computed from columns x and y of table t.
    """
    line_slope, line_intercept = slope(t, x, y), intercept(t, x, y)
    x_values = t.column(x)
    return line_slope * x_values + line_intercept

def get_residuals(t, x, y):
    """Return the residuals of the regression of y on x.

    A residual is the observed y value minus the fitted value that
    get_fitted_values produces for the same row.
    """
    fitted = get_fitted_values(t, x, y)
    observed = t.column(y)
    return observed - fitted

## Regression Model 

Let's examine the relationship between:

- True regression line that captures the linear relationship between two variables (green line)
- A random sample of n points that come from the underlying linear relationship plus random noise off the regression line
- A line fit to the sample of points that approximates the true regression line (i.e., the "line of best fit" shown in blue)

To do this we will use the function `draw_and_compare` defined below that takes three arguments:

1. The true slope of a linear relationship between our variables
2. The true y-intercept of a linear relationship between our variables
3. A sample size (n) of random points that will be used to calculate the "line of best fit"



In [72]:
def draw_and_compare(true_slope, true_int, sample_size,
                     x_mean=50, x_sd=5, noise_sd=6):
    """Simulate points around a known line and compare it to the fitted line.

    A random sample is generated as y = true_slope * x + true_int + error,
    where x and error are drawn from normal distributions. Four scatter
    plots of that same sample are then drawn:

    1. the points with the true line (green),
    2. the points alone,
    3. the points with the fitted regression line,
    4. the points with both the fitted line and the true line.

    Parameters
    ----------
    true_slope : slope of the true linear relationship.
    true_int : y-intercept of the true linear relationship.
    sample_size : number of random points to generate.
    x_mean, x_sd : mean and SD of the normal distribution the x values
        are drawn from (defaults 50 and 5, the values previously hard-coded).
    noise_sd : SD of the mean-zero normal errors added to the true line
        (default 6, the value previously hard-coded).

    Returns nothing; the function only produces plots. It uses NumPy's
    global random state, so each call gives a different sample unless the
    caller seeds it first.
    """
    # Draw x first, then the errors, so the sequence of random draws is the
    # same as it has always been for a given global random state.
    x = np.random.normal(x_mean, x_sd, sample_size)
    xlims = np.array([np.min(x), np.max(x)])
    errors = np.random.normal(0, noise_sd, sample_size)
    y = (true_slope * x + true_int) + errors
    sample = Table().with_columns('x', x, 'y', y)

    def plot_true_line():
        # Overlay the true line across the observed range of x on the current plot.
        plots.plot(xlims, true_slope*xlims + true_int, lw=2, color='green')

    sample.scatter('x', 'y')
    plot_true_line()
    plots.title('True Line, and Points Created')

    sample.scatter('x', 'y')
    plots.title('What We Get to See')

    sample.scatter('x', 'y', fit_line=True)
    plots.title('Regression Line: Estimate of True Line')

    sample.scatter('x', 'y', fit_line=True)
    plot_true_line()
    plots.title("Regression Line and True Line")

In [73]:
# have a true slope of 2, a true intercept of -5 and draw 10 random points


In [74]:
# have a true slope of 2, a true intercept of -5 and draw 100 random points


## Bootstrap slopes, intercepts and regression lines

In [None]:
# Load the fruit basket data from 'fruit_baskets.csv' (a path relative to the
# notebook's working directory) and preview the first 3 rows.
fruit = Table.read_table('fruit_baskets.csv')
fruit.show(3)

In [75]:
# take a random sample (with replacement) from our original fruit sample and fit a regression line



In [76]:
# create a bootstrap distribution for the slope and intercept







In [77]:
# visualize all the bootstrap lines







In [78]:
# create a 95% confidence interval for the regression slope



In [79]:
# visualize the bootstrap distribution





# Question:
#  Is a slope of 0 plausible?  
#  i.e., no linear association between the number of Clementines and Weight?


### Question: could you run a hypothesis test assessing whether the regression slope is 0? 

In [80]:
# create a null distribution 





In [81]:
# visualize the null distribution and compare it to slope calculated on the real data




## Classification

In [None]:
# Can you tell if a bank note is counterfeit or legitimate?
# Variables are based on photographs of many banknotes (a few numbers calculated for each image)

# Read the banknote data; the bare name on the last line displays the table as the cell's output.
banknotes = Table.read_table('banknote.csv')
banknotes

In [None]:
# Visualize 'WaveletVar' and 'WaveletCurt'


In [None]:
# Visualize 'WaveletSkew', 'Entropy'


In [None]:
# Two attributes have some overlap of classes...what happens with three attributes?
# Create the 3D axes through the figure so they are attached to it: constructing
# Axes3D(fig) directly no longer adds the axes to the figure in recent Matplotlib
# releases, which leaves the plot blank. The Axes3D import at the top of the
# notebook registers the '3d' projection on older Matplotlib versions.
fig = plots.figure(figsize=(8,8))
ax = fig.add_subplot(111, projection='3d')

