In [None]:
from datascience import *
import numpy as np
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

## Lecture 33

In [None]:
def standard_units(arr):
    """Convert an array to standard units.

    Each value is expressed as the number of (population) standard
    deviations it lies above or below the array's average.
    """
    center = np.average(arr)
    spread = np.std(arr)
    return (arr - center) / spread

def correlation(t, x, y):
    """Return the correlation coefficient r between columns x and y of table t.

    r is the average of the products of the two columns after each has
    been converted to standard units.
    """
    x_su = standard_units(t.column(x))
    y_su = standard_units(t.column(y))
    products = x_su * y_su
    return np.average(products)

def slope(t, x, y):
    """Return the slope of the regression line for predicting y from x in table t.

    The slope is r * (SD of y) / (SD of x), where r is the correlation
    between the two columns.
    """
    sd_of_y = np.std(t.column(y))
    sd_of_x = np.std(t.column(x))
    return correlation(t, x, y) * sd_of_y / sd_of_x

def intercept(t, x, y):
    """Return the intercept of the regression line for predicting y from x in table t.

    Computed as (mean of y) - slope * (mean of x).
    """
    mean_of_x, mean_of_y = np.mean(t.column(x)), np.mean(t.column(y))
    fitted_slope = slope(t, x, y)
    return mean_of_y - fitted_slope*mean_of_x

def get_fitted_values(t, x, y):
    """Return an array of the regression line's estimates of y, one per x value in t."""
    x_values = t.column(x)
    return slope(t, x, y)*x_values + intercept(t, x, y)

def get_residuals(t, x, y):
    """Return an array of residuals: observed y minus the regression estimate of y."""
    observed = t.column(y)
    return observed - get_fitted_values(t, x, y)

## Root Mean Square Error

In [None]:
# Load the district demographic data, keeping only the two columns used below
demographics = (
    Table.read_table('district_demographics2016.csv')
    .select('College%', 'Median Income')
)
demographics.show(5)

In [None]:
# Calculate the root mean squared error (RMSE) of the regression line:
# square the residuals, average them, then take the square root.
RMSE = np.mean(get_residuals(demographics, "College%", "Median Income") ** 2) ** 0.5
RMSE

In [None]:
# Function to calculate the RMSE for any slope and intercept on the demographic data
def demographics_rmse(any_slope, any_intercept):
    """Return the RMSE of the line y = any_slope*x + any_intercept.

    x is the 'College%' column and y is the 'Median Income' column of the
    module-level `demographics` table.
    """
    college_pct = demographics.column('College%')
    median_income = demographics.column('Median Income')
    errors = median_income - (any_slope*college_pct + any_intercept)
    return (np.mean(errors ** 2)) ** 0.5

In [None]:
# Compute the RMSE on the demographic data for an arbitrary slope and intercept.
# (No plot is drawn here; the trailing comments list other values to try.)

example_slope = 1500          #  alternatives to try: -1000, 500
example_intercept = 2000      #  alternatives to try: 75000, 20000

demographics_rmse(example_slope, example_intercept)

## Numerical Optimization

In [None]:
# An arbitrary function: a parabola whose lowest point is at x = 2, where f(x) = 3
def f(x):
    """Return (x - 2)^2 + 3 for a number or a numpy array x."""
    offset = x - 2
    return (offset**2) + 3

# Plot the function on a grid of x values starting at 1 and moving toward 3 in steps of 0.1
x = np.arange(1, 3, 0.1)
y = f(x)
Table().with_columns('x', x, 'y', y).plot('x')

In [None]:
# Find the value of x at which the function is smallest
# (minimize returns the minimizing argument, not the minimum value of f)
minimize(f)

In [None]:
# RMSE for arbitrary slope and intercept
demographics_rmse(1500, 20000)

In [None]:
# Minimize the demographic RMSE: search for the slope and intercept
# (the two arguments of demographics_rmse) that give the smallest RMSE
minimize(demographics_rmse)

What is the regression equation for predicting a district's median income based on the percentage of the population that went to college? 

Answer:

$$
\hat{y} ~ = ~ 1270.7 \cdot x ~ + ~ 20802.6
$$


## Regression diagnostic plots

In [None]:
galton = Table.read_table('galton.csv')

heights = Table().with_columns(
    'MidParent', galton.column('midparentHeight'),
    'Child', galton.column('childHeight')
    )

In [None]:
# A function that creates two plots:
# 1) A scatter plot of the data and the fitted values
# 2) A plot of the residuals as a function of x (the predictor variable)
def plot_residuals(t, x, y):
    """Draw regression diagnostic plots for predicting column y from column x of table t.

    The first figure overlays the observed y values and the regression
    estimates against x. The second figure plots the residuals against x.
    """
    tbl = t.with_columns(
        'Predicted', get_fitted_values(t, x, y),
        'Residual', get_residuals(t, x, y)
    )
    # scatter(0): the first selected column (x) goes on the horizontal axis,
    # and the remaining columns (y and 'Predicted') are plotted against it
    tbl.select(x, y, 'Predicted').scatter(0)
    tbl.scatter(x, 'Residual')

In [None]:
# Let's apply the function to the Galton data
plot_residuals(heights, 'MidParent', 'Child')

### US women heights and average weights

In [None]:
# Height and average weight of US women
us_women = Table.read_table('us_women.csv')
us_women.show(5)

In [None]:
# correlation of height and weight
correlation(us_women, 'height', 'ave weight')

In [None]:
# create the residual plots
plot_residuals(us_women, 'height', 'ave weight')

## Polynomial regression

**Quadratic Function**

$$
f(x) ~=~ ax^2 + bx + c
$$
for constants $a$, $b$, and $c$.

In [None]:
# A function that returns the RMSE for a quadratic fit to the US women data
def us_women_quadratic_rmse(a, b, c):
    """Return the RMSE of the curve a*x^2 + b*x + c.

    x is the 'height' column and y is the 'ave weight' column of the
    module-level `us_women` table.
    """
    height = us_women.column('height')
    weight = us_women.column('ave weight')
    errors = weight - (a*(height**2) + b*height + c)
    return np.mean(errors ** 2) ** 0.5

In [None]:
# Get the quadratic fit coefficients: the values of a, b, and c (in that order)
# that minimize us_women_quadratic_rmse
best_quad = minimize(us_women_quadratic_rmse)
best_quad

Can you fill in the coefficients in the prediction equation? 

$$
f(x) ~=~ ax^2 + bx + c
$$

In [None]:
# What is the predicted average weight for someone who is 65 inches?
best_quad.item(0) * 65**2   +   best_quad.item(1) * 65   +    best_quad.item(2)


In [None]:
# Create the fitted values for the quadratic fit
# NOTE(review): this rebinds `heights`, which earlier in the notebook named the
# Galton MidParent/Child table. After this cell runs, re-running the earlier
# plot_residuals(heights, 'MidParent', 'Child') cell will fail until the
# Galton cell is run again.
heights = us_women.column('height')
fitted_values = best_quad.item(0)*(heights**2) + best_quad.item(1)*heights + best_quad.item(2)
fitted_values

In [None]:
# plot the fitted values for the quadratic predictions
us_women.with_columns("predicted weight", fitted_values).scatter("height")

In [None]:
# calculate the residuals
residuals = us_women.column('ave weight') - fitted_values
residuals

In [None]:
# plot the residuals
Table().with_columns("height", us_women.column('height'),
                     "residuals", residuals
).scatter("height")

In [None]:
# Are higher order terms needed???
# Try a degree 5 polynomial at home!

def us_women_5_rmse(a, b, c, d, e, f):
    """Return the RMSE of the degree-5 polynomial a*x^5 + b*x^4 + c*x^3 + d*x^2 + e*x + f.

    x is the 'height' column and y is the 'ave weight' column of the
    module-level `us_women` table. The parameter named `f` is only the
    constant term; it is local to this function.
    """
    x = us_women.column('height')
    y = us_women.column('ave weight')
    estimate = a*(x**5) + b*(x)**4  + c*(x)**3 + d*(x)**2  + e*x + f
    return np.mean((y - estimate) ** 2) ** 0.5

# Coefficients (a, b, c, d, e, f) that minimize the degree-5 RMSE
best_order_5 = minimize(us_women_5_rmse)

# Read the heights straight from the table so this cell does not rely on a
# `heights` variable assigned in a different cell.
women_heights = us_women.column('height')

fitted_values5 = (
    best_order_5.item(0)*(women_heights**5)
    + best_order_5.item(1)*(women_heights**4)
    + best_order_5.item(2)*(women_heights**3)
    + best_order_5.item(3)*(women_heights**2)
    + best_order_5.item(4)*women_heights
    + best_order_5.item(5)
)

us_women.with_columns("predicted weight", fitted_values5).scatter("height")

In [None]:
# Residuals of the degree-5 fit. Stored under their own name (matching
# fitted_values5) so the quadratic fit's `residuals` are not overwritten.
residuals5 = us_women.column('ave weight') - fitted_values5
residuals5

# Plot the degree-5 residuals against height
Table().with_columns("height", us_women.column('height'),
                     "residuals", residuals5
).scatter("height")

## Regression and correlation relationships

No matter what the shape of the scatter plot, the SD of the fitted values is a fraction of the SD of the observed values of $y$. When the least squares regression line is used, the fraction is |r|.

$$
\frac{\mbox{SD of fitted values}}{\mbox{SD of }y} ~=~ |r| ~~~~~~~~~~ \mbox{That is,} ~~ \mbox{SD of fitted values} = |r|\cdot \mbox{SD of }y
$$

In [None]:
# load data on fruits
fruit = Table.read_table('fruit_baskets.csv')
fruit.show(3)

In [None]:
# create a scatter plot visualizing the weight of fruits as a function of the number of clementines
fruit.scatter('Clementines', 'Weight')

Let's show that:

$$
\frac{\mbox{SD of fitted values}}{\mbox{SD of }y} ~=~ |r|
$$

In [None]:
# Get the standard deviation of our y variable (Weight)
sd_y = np.std(fruit.column("Weight"))
sd_y

In [None]:
# Get the standard deviation of the fitted values (from predicting weight from the number of clementines)
sd_fitted = np.std(get_fitted_values(fruit, 'Clementines', 'Weight'))
sd_fitted

In [None]:
# calculate the standard deviation of the fitted values over the standard deviation of the weights (y)
sd_fitted/sd_y

In [None]:
# Note that the ratio above matches the absolute value of the correlation, |r|
corr_val = correlation(fruit, 'Clementines', 'Weight')
corr_val

Let's show that:

$$
(\text{SD y})^2 ~ = ~ (\text{SD residuals})^2 ~ + ~ (\text{SD fitted values})^2
$$

In [None]:
# Get the standard deviation of the residuals (from predicting weight from the number of clementines)
sd_residuals = np.std(get_residuals(fruit, 'Clementines', 'Weight'))
sd_residuals

In [None]:
# Let's calculate the variance of the fitted values plus the variance of the residuals
sd_fitted**2 + sd_residuals**2

In [None]:
# Let's calculate the variance of y
sd_y**2

## Regression Model 

Let's examine the relationship between:

- True regression line that captures the linear relationship between two variables (green line)
- A random sample of n points that come from the underlying linear relationship plus random noise off the regression line
- A line fit to the sample of points that approximates the true regression line (i.e., the "line of best fit" shown in blue)

To do this we will use the function `draw_and_compare` defined below that takes three arguments:

1. The true slope of a linear relationship between our variables
2. The true y-intercept of a linear relationship between our variables
3. A sample size (n) of random points that will be used to calculate the "line of best fit"



In [None]:
def draw_and_compare(true_slope, true_int, sample_size):
    """Simulate points around a known line and compare the fitted line with it.

    Draws `sample_size` x values from a normal distribution (mean 50, SD 5),
    sets y = true_slope * x + true_int plus normal noise (mean 0, SD 6), and
    produces four scatter plots: the points with the true line, the points
    alone, the points with the fitted regression line, and the points with
    both lines. Results vary from call to call because no random seed is set.
    """
    # Draw x first and the noise second, so the random draws happen in a fixed order
    xs = np.random.normal(50, 5, sample_size)
    noise = np.random.normal(0, 6, sample_size)
    ys = (true_slope * xs + true_int) + noise

    # The true line is drawn between the smallest and largest sampled x
    x_range = np.array([np.min(xs), np.max(xs)])
    points = Table().with_columns('x', xs, 'y', ys)

    def overlay_true_line():
        # Draw the true line (green) on the current figure
        plots.plot(x_range, true_slope*x_range + true_int, lw=2, color='green')

    points.scatter('x', 'y')
    overlay_true_line()
    plots.title('True Line, and Points Created')

    points.scatter('x', 'y')
    plots.title('What We Get to See')

    points.scatter('x', 'y', fit_line=True)
    plots.title('Regression Line: Estimate of True Line')

    points.scatter('x', 'y', fit_line=True)
    overlay_true_line()
    plots.title("Regression Line and True Line")

In [None]:
# use a true slope of 2 and a true intercept of -5, and draw 10 random points
draw_and_compare(2, -5, 10)

In [None]:
# use a true slope of 2 and a true intercept of -5, and draw 100 random points
draw_and_compare(2, -5, 100)

## Bootstrap slopes, intercepts and regression lines

In [None]:
# take a random sample (with replacement) from our original fruit sample and fit a regression line
fruit_sample = fruit.sample()
fruit_sample.scatter('Clementines', 'Weight', fit_line = True)

In [None]:
# create a bootstrap distribution for the slope and intercept
slope_draws = []
intercept_draws = []

for rep in range(1000):
    # resample the fruit table, then refit the regression line on the resample
    fruit_sample = fruit.sample()
    slope_draws.append(slope(fruit_sample, 'Clementines', 'Weight'))
    intercept_draws.append(intercept(fruit_sample, 'Clementines', 'Weight'))

# collect the 1000 resampled slopes and intercepts into arrays
bootstrap_slopes = np.array(slope_draws)
bootstrap_intercepts = np.array(intercept_draws)


In [None]:
# visualize all the bootstrap lines
# x range over which each line is drawn (number of clementines)
xlims = make_array(14, 28)

# draw one line per bootstrap resample, pairing each slope with its intercept
for boot_slope, boot_intercept in zip(bootstrap_slopes, bootstrap_intercepts):
    plots.plot(xlims, boot_slope * xlims + boot_intercept, lw=1, color = "blue", alpha = 1)

# Label the axes once, after all lines are drawn. The vertical axis is Weight;
# it must be set with ylabel so that it does not overwrite the x-axis label.
plots.xlabel("Clementines")
plots.ylabel("Weight");

In [None]:
# create a 95% confidence interval for the regression slope
bootstrap_CI = make_array(percentile(2.5, bootstrap_slopes), percentile(97.5, bootstrap_slopes))
bootstrap_CI

In [None]:
# visualize the bootstrap distribution
Table().with_column("Bootstrap slopes", bootstrap_slopes).hist("Bootstrap slopes")
plots.plot(bootstrap_CI, [0, 0], color='gold', lw=18);

# Question:
#  Is a slope of 0 plausible?  
#  i.e, no linear association between the number of Clementines and Weight?


### Question: could you run a hypothesis test assessing whether the regression slope is 0? 

In [None]:
# create a null distribution of slopes by shuffling the Weight column

shuffled_slopes = []

for rep in range(1000):
    # sampling all rows without replacement permutes the weights, which breaks
    # any association between Clementines and Weight
    shuffled_weights = fruit.sample(with_replacement = False).column("Weight")
    fruit_shuff = fruit.with_column('Weight', shuffled_weights)
    shuffled_slopes.append(slope(fruit_shuff, 'Clementines', 'Weight'))

null_fruit_slopes = np.array(shuffled_slopes)



In [None]:
# Plot the null distribution of slopes, then compute the slope observed in the
# original (unshuffled) fruit data so the two can be compared
Table().with_column("Null slopes", null_fruit_slopes).hist("Null slopes", bins = np.arange(-.1, .1, .01))
real_slope = slope(fruit, 'Clementines', 'Weight')
real_slope