In [None]:
from datascience import *
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

In [None]:
# Some functions for plotting. You don't have to understand how any
# of the functions in this cell work, since they use things we 
# haven't learned about in Data 8.

def resize_window(lim=3.5):
    """Set both the x-axis and the y-axis of the current plot to [-lim, lim]."""
    bounds = (-lim, lim)
    plots.xlim(*bounds)
    plots.ylim(*bounds)
    
def draw_line(slope=0, intercept=0, x=make_array(-4, 4), color='r'):
    """Plot the line y = slope*x + intercept at the given x values.

    By default the line is drawn between x = -4 and x = 4, in red.
    """
    plots.plot(x, x*slope + intercept, color=color)
    
def draw_vertical_line(x_position, color='black'):
    """Plot a vertical segment at x = x_position, running from y = -4 to y = 4."""
    plots.plot(
        make_array(x_position, x_position),
        make_array(-4, 4),
        color=color,
    )
    
def make_correlated_data(r):
    """Draw 1000 random (x, y) pairs built so that y is correlated with x.

    x and an independent noise term are each sampled from a normal
    distribution with mean 0 and SD 1; y is the mix
    r*x + sqrt(1 - r**2)*noise.  Returns the two arrays as a tuple (x, y).
    """
    n_points = 1000
    x = np.random.normal(0, 1, n_points)
    # Drawn after x, so the sequence of random numbers consumed is unchanged.
    noise = np.random.normal(0, 1, n_points)
    noise_weight = np.sqrt(1-r**2)
    return x, r*x + noise_weight*noise

def r_table(r):
    """
    Build a table with columns 'x' and 'y' holding 1000 data points
    in standard units whose correlation is approximately equal to r.

    NumPy's global random seed is reset to 8 on every call, so the same r
    always produces the same table.
    """
    # Fixed seed: repeatable data, but note this also resets the global RNG state.
    np.random.seed(8)
    columns = make_correlated_data(r)
    return Table().with_columns('x', columns[0], 'y', columns[1])

In [None]:
def demographics_errors(slope, intercept):
    """Scatter College% against Median Income, overlay the line
    y = slope*x + intercept, and draw red vertical segments from four
    sample points to that line.

    Reads the notebook-level table `demographics`.
    """
    def line_height(x):
        # Height of the specified line at x (works for numbers and arrays)
        return slope * x + intercept

    demographics.scatter('College%', 'Median Income', alpha=0.5)
    # The line with the requested slope and intercept, drawn from x = 5 to x = 75
    x_ends = make_array(5, 75)
    plots.plot(x_ends, line_height(x_ends), lw=4)
    # Four convenient (College%, Median Income) points from the original data;
    # each gets a red segment joining the point to the line
    sample_points = [(14.7, 33995), (19.1, 61454), (50.7, 71183), (59.5, 105918)]
    for college_pct, income in sample_points:
        plots.plot([college_pct, college_pct],
                   [income, line_height(college_pct)],
                   color='r', lw=4)

In [None]:
def show_demographics_rmse(slope, intercept):
    """Draw the error plot for the line y = slope*x + intercept and print
    that line's root mean squared error over all rows of `demographics`,
    rounded to 2 decimal places.
    """
    demographics_errors(slope, intercept)
    college_pct = demographics.column('College%')
    income = demographics.column('Median Income')
    # Error = actual Median Income minus the line's prediction
    residuals = income - (slope * college_pct + intercept)
    rmse = np.mean(residuals ** 2) ** 0.5
    print("Root mean squared error:", round(rmse, 2))

In [None]:
def fitted_values(t, x, y):
    """Return an array of the regression estimates of y at all the x values.

    t is a table; x and y are column labels. The estimates use the
    regression line given by slope(t, x, y) and intercept(t, x, y).
    """
    line_slope = slope(t, x, y)
    line_intercept = intercept(t, x, y)
    x_values = t.column(x)
    return line_slope*x_values + line_intercept

## Slope & Intercept

In [None]:
def standard_units(x):
    """Convert the array x to standard units: deviations from the mean, in SDs."""
    deviations = x - np.mean(x)
    return deviations / np.std(x)

def correlation(t, x, y):
    """Return r, the correlation of columns x and y of table t.

    r is the mean of the products of the two columns' values
    after each column is converted to standard units.
    """
    standardized = [standard_units(t.column(label)) for label in (x, y)]
    return np.mean(standardized[0] * standardized[1])


In [None]:
# Calculate the slope and intercept of the regression line
# in the original units
def slope(t, x, y):
    '''
    t is a table; x and y are column labels (strings) for numerical data in t
    Returns the slope of the regression line for predicting y values from x values.

    In original units the slope is r * (SD of y) / (SD of x),
    where r is the correlation between the two columns.
    '''
    r = correlation(t, x, y)
    return r * np.std(t.column(y)) / np.std(t.column(x))

def intercept(t, x, y):
    '''
    t is a table; x and y are column labels (strings) for numerical data in t
    Returns the intercept of the regression line for predicting y values from x values.

    The regression line passes through the point of averages, so the
    intercept is (mean of y) - slope * (mean of x).
    '''
    return np.mean(t.column(y)) - slope(t, x, y) * np.mean(t.column(x))

In [None]:
# Sanity-check slope() and intercept() by calling them on synthetic data.
# r_table(0.5) returns x- and y-columns in standard units with correlation
# of about 0.5, so slope should be about .5 and intercept should be about 0
example = r_table(0.5)
m = slope(example, 'x', 'y')
b = intercept(example, 'x', 'y')
m, b

## Heights data

In [None]:
# Recall the heights data (read from heights.csv), showing parent heights
# and the adult heights achieved by their children
families = Table.read_table('heights.csv')
families

In [None]:
# Midparent height: the average of the columns at index 1 and 2 of `families`
# (presumably the two parents' heights -- verify against heights.csv)
midparent = (families.column(1) + families.column(2)) / 2
# Keep only the two variables used for prediction
heights = Table().with_columns(
    'MidParent', midparent,
    'Child', families.column('child'))
heights

In [None]:
def nn_prediction_heights(h):
    """
    Nearest Neighbors regression:
    Predict the height of a child whose parents have a midparent height of h.

    The prediction is the average 'Child' height over the rows of the
    notebook-level table `heights` whose 'MidParent' value is in the
    range h plus or minus 0.5 inches.
    """
    lower, upper = h - 0.5, h + 0.5
    nearby = heights.where('MidParent', are.between(lower, upper))
    return np.mean(nearby.column('Child'))

In [None]:
# Apply the nearest-neighbors predictor to every row's midparent height
# and store the results in a new 'NN prediction' column
heights_with_predictions = heights.with_column(
    'NN prediction', 
    heights.apply(nn_prediction_heights, 'MidParent'))
heights_with_predictions

In [None]:
# Slope and intercept of the regression line for predicting
# a child's height from the midparent height
heights_slope = slope(heights, 'MidParent', 'Child')
heights_intercept = intercept(heights, 'MidParent', 'Child')
heights_slope, heights_intercept

In [None]:
# Notice that the linear prediction uses the form "mx + b":
# slope times the midparent height, plus the intercept
heights_with_predictions = heights_with_predictions.with_column(
    'Linear Prediction', 
    heights_slope * heights.column('MidParent') + heights_intercept
)
heights_with_predictions

In [None]:
# Scatter plot with 'MidParent' on the horizontal axis; the table's other
# columns (actual child heights and both sets of predictions) are overlaid
heights_with_predictions.scatter('MidParent')

## Discussion Question 1

In [None]:
# Using the linear regression equation
r = .75
SD_Y = 12
SD_X = 10
m = r * SD_Y / SD_X  # slope in original units
mean_Y = 50
mean_X = 70
b = mean_Y - m * mean_X  # intercept: the line passes through (mean_X, mean_Y)

x = 90
# What's the prediction?
# (placeholder below is left for the discussion)
...

### Error in Estimation

In [None]:
# We have a new dataset concerning the 2016 Presidential Election (H. Clinton vs. D. Trump),
# read from district_demographics2016.csv
demographics_5_columns = Table.read_table('district_demographics2016.csv')
print("Number of congressional districts in the U.S. is", demographics_5_columns.num_rows)
# Display the first 10 rows
demographics_5_columns.show(10)

In [None]:
# Notice: There is one row per congressional district
# The rows do not represent individual people

# Use two numerical columns for a regression example -- the median household income
# in the district, and the percent of adults who have graduated from college.
# select(4, 2) picks them by column index; later cells refer to them by the
# labels 'College%' and 'Median Income'.
demographics = demographics_5_columns.select(4, 2)
demographics.show(5)

In [None]:
# Before even thinking about regression, draw a scatter plot to see visually
# whether there is an association between the variables
# (College% on the horizontal axis, Median Income on the vertical axis)
demographics.scatter('College%', 'Median Income')

What do we learn from this scatterplot?
  - form
  - pattern
  - strength
  - overall takeaway in the context of the data

In [None]:
# Guess the correlation before running this cell
# (computes r between College% and Median Income)
correlation(demographics, 'College%', 'Median Income')

In [None]:
# Slope and intercept of the regression line for predicting
# Median Income from College%
regression_slope = slope(demographics, 'College%', 'Median Income')
regression_intercept = intercept(demographics, 'College%', 'Median Income')
regression_slope, regression_intercept

In [None]:
# Recall: fitted_values takes a table and two column labels and
# returns the array of regression estimates (linear regression)
predicted = fitted_values(demographics, 'College%', 'Median Income')
# Show only the first 10 estimates
predicted[:10]

In [None]:
# Add the predictions to our table and make a scatter plot
# Note: this rebinds `demographics` to a table that also has a
# 'Linear Prediction' column
demographics = demographics.with_column(
    'Linear Prediction', predicted)
demographics.scatter('College%')

In [None]:
# The errors in our predictions are actual values minus predicted values
# (positive error: the prediction was too low; negative error: too high)
actual = demographics.column('Median Income')
errors = actual - predicted

In [None]:
# Show the table with the errors added as an 'Error' column.
# The result is only displayed, not assigned, so the name `demographics`
# is not rebound by this cell.
demographics.with_column('Error', errors)

In [None]:
# What's the error, on average? Why?
# (this is the mean of the signed errors: positive and negative errors can cancel)
np.mean(errors)

In [None]:
# Here is the root mean squared error (RMSE), computed right to left:
# square the errors, take their mean, then take the square root.
# This is the usual way to quantify how well the regression line fits the data
# A LOWER R.M.S.E. (root mean square error) is BETTER
np.mean(errors ** 2) ** 0.5

In [None]:
# Visualize some errors: the regression line, with red segments
# showing the errors at four sample points
demographics_errors(regression_slope, regression_intercept)

In [None]:
# demographics_errors takes any slope and any intercept,
# not only those of the regression line

demographics_errors(slope=1500, intercept=20000)

In [None]:
# Another candidate line, this time with a negative slope
demographics_errors(slope=-1000, intercept=75000)

### Root Mean Square Error ###

In [None]:
# RMSE of the negative-slope candidate line
show_demographics_rmse(slope=-1000, intercept=75000)

In [None]:
# RMSE of the positive-slope candidate line
show_demographics_rmse(slope=1500, intercept=20000)

In [None]:
# RMSE of the regression line itself, for comparison with the candidate lines
show_demographics_rmse(regression_slope, regression_intercept)

### How to find the best-fitting line?

To find the best-fitting line, we need to choose the slope and intercept that minimize the RMSE. It can be proven with calculus that the linear regression formulas we've been using for the slope and intercept do exactly this.

The "least squares" regression line (the one that minimizes RMSE) is exactly the "linear regression" line we've already learned to calculate.