In [None]:
from datascience import *
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

In [None]:
# Some functions for plotting. You don't have to understand how any
# of the functions in this cell work, since they use things we 
# haven't learned about in STOR 120

def draw_line(slope=0, intercept=0, x=make_array(-4, 4), color='r'):
    """Plot the straight line y = x*slope + intercept.

    slope, intercept: coefficients of the line (default 0 and 0).
    x: array of x-values at which the line is evaluated; the default
       holds the two values -4 and 4.
    color: matplotlib color for the line (default red).
    """
    plots.plot(x, x*slope + intercept, color=color)
    
def make_correlated_data(r):
    """Draw 1000 (x, y) pairs whose correlation is approximately r.

    x and an independent noise array are both standard normal draws;
    y mixes them as r*x + sqrt(1 - r**2)*noise.
    Returns the pair of arrays (x, y).
    """
    n_points = 1000
    # Draw x first and the noise second so seeded results stay the same.
    x_values = np.random.normal(0, 1, n_points)
    noise = np.random.normal(0, 1, n_points)
    noise_weight = np.sqrt(1-r**2)
    return x_values, r*x_values + noise_weight*noise
    
def r_table(r):
    """
    Return a two-column table ('x' and 'y') of 1000 points whose
    correlation is approximately r.
    """
    # Re-seeding NumPy's global generator on every call means the same r
    # always yields the same table.
    np.random.seed(8)
    columns = make_correlated_data(r)
    return Table().with_columns('x', columns[0], 'y', columns[1])

## Linear regression: defining the line

In [None]:
def standard_units(x):
    """Return the array x in standard units: (value - mean) / SD."""
    center = np.mean(x)
    spread = np.std(x)
    deviations = x - center
    return deviations / spread

In [None]:
def correlation(t, x, y):
    """Return the correlation between columns x and y of table t.

    t is a table; x and y are column labels. The result is the mean of
    the products of the two columns after each is converted to
    standard units.
    """
    standardized = [standard_units(t.column(label)) for label in (x, y)]
    return np.mean(standardized[0] * standardized[1])

In [None]:
def slope(t, x, y):
    """Return the slope of the regression line for predicting y from x.

    t is a table; x and y are column labels.
    Computed as r * SD(y) / SD(x), where r is correlation(t, x, y).
    """
    sd_x = np.std(t.column(x))
    sd_y = np.std(t.column(y))
    return correlation(t, x, y) * sd_y / sd_x

In [None]:
def intercept(t, x, y):
    """Return the intercept of the regression line for predicting y from x.

    t is a table; x and y are column labels.
    Computed as mean(y) - slope * mean(x), so the line passes through the
    point of averages.
    """
    mean_x = np.mean(t.column(x))
    mean_y = np.mean(t.column(y))
    # Take each mean on its own column. The earlier version wrapped the whole
    # expression in a single average, which matched this value only because
    # the subtracted term was a constant.
    return mean_y - slope(t, x, y) * mean_x

In [None]:
# Build a sample table with correlation of about -0.7, then compute the
# slope and intercept of its regression line with the functions defined above.
another_example = r_table(-0.7)

ae_slope = slope(another_example, 'x', 'y')
ae_intercept = intercept(another_example, 'x', 'y')

In [None]:
# Scatter the sample, then overlay in red the line given by ae_slope and
# ae_intercept, evaluated at x = -3.5 and x = 3.5.
another_example.scatter('x', 'y')
draw_line(
    slope = ae_slope, 
    intercept = ae_intercept, 
    x = make_array(-3.5, 3.5), 
    color='r'
)

In [None]:
# Same scatter, but with the plotting library's own fit line, for comparison
# with the line drawn from ae_slope and ae_intercept.
another_example.scatter('x', 'y', fit_line=True)

## Prediction

In [None]:
# Read the district demographics CSV (path is relative to the notebook's
# working directory) and preview 5 rows.
demographics = Table.read_table('district_demographics2016.csv')
demographics.show(5)

In [None]:
# Remove three columns that the visible cells below never reference by name,
# rebinding `demographics` to the narrower table, then preview it again.
demographics = demographics.drop(
    'State', 'District', 'Percent voting for Clinton')
demographics.show(5)

In [None]:
# Plot Median Income against College% to see the relationship before fitting.
demographics.scatter('College%', 'Median Income')

In [None]:
# Correlation between College% and Median Income.
correlation(demographics, 'College%', 'Median Income')

In [None]:
# Slope and intercept of the regression line for predicting Median Income
# from College%; the last line displays both values.
regression_slope = slope(demographics, 'College%', 'Median Income')
regression_intercept = intercept(demographics, 'College%', 'Median Income')
regression_slope, regression_intercept

In [None]:
# Regression estimate of Median Income at College% = 50:
# intercept + slope * 50.

regression_intercept + regression_slope*50

In [None]:
def fitted_values(t, x, y):
    """Return an array of the regression estimates at every x value in t.

    t is a table; x and y are column labels. Each estimate is
    slope * x_value + intercept for the regression of y on x.
    """
    line_slope = slope(t, x, y)
    line_intercept = intercept(t, x, y)
    return line_slope*t.column(x) + line_intercept

In [None]:
# Regression estimate of Median Income for every row of demographics.
predicted = fitted_values(demographics, 'College%', 'Median Income')

In [None]:
# Attach the estimates as a 'Linear Prediction' column (rebinding
# `demographics`). Only the x column is named in the scatter call, so the
# remaining columns, including the new one, are plotted against College%.
demographics = demographics.with_column(
    'Linear Prediction', predicted)
demographics.scatter('College%')

## Errors

In [None]:
# Error of each estimate: actual Median Income minus the regression estimate.
actual = demographics.column('Median Income')
errors = actual - predicted

In [None]:
# Display the table with an added 'Error' column. The result is not
# assigned, so `demographics` is not rebound by this cell.
demographics.with_column('Error', errors)

In [None]:
# Average error. For the regression line this should be essentially 0
# (up to floating-point rounding): positive and negative errors cancel.
np.mean(errors)

In [None]:
# Root mean squared error: square the errors, average, take the square root.
np.mean(errors ** 2) ** 0.5

In [None]:
def demographics_errors(slope, intercept):
    """Plot the line with the given slope and intercept and some of its errors.

    Draws the College% / Median Income scatter of `demographics`, the line
    slope * x + intercept over x from 5 to 75, and a red vertical segment
    from each of four sample points to that line.
    """
    def line_height(at_x):
        # Height of the specified line at at_x (a number or an array).
        return slope * at_x + intercept

    demographics.scatter('College%', 'Median Income', alpha=0.5)

    # The line you specified, drawn between its two end x-values.
    x_range = make_array(5, 75)
    plots.plot(x_range, line_height(x_range), lw=4)

    # Four convenient points from the original data, each joined to the
    # line by a red vertical segment showing its error.
    sample_points = [[14.7, 33995], [19.1, 61454], [50.7, 71183], [59.5, 105918]]
    for college_pct, income in sample_points:
        plots.plot(
            [college_pct, college_pct],
            [income, line_height(college_pct)],
            color='r',
            lw=4,
        )

In [None]:
# Errors of the four sample points relative to the regression line.
demographics_errors(regression_slope, regression_intercept)

In [None]:
# demographics_errors accepts any slope and intercept, not just the
# regression line's; here a line with slope 1500 and intercept 20000.

demographics_errors(1500, 20000)

In [None]:
# Another arbitrary line, this one sloping downward (slope -1000,
# intercept 75000).
demographics_errors(-1000, 75000)

## Numerical Optimization

In [None]:
# Plot y = (x - 2)^2 + 3 for x from 1 up to (not including) 3 in steps of 0.1.
x = np.arange(1, 3, 0.1)
y = (x-2)**2 + 3
Table().with_columns('x', x, 'y', y).plot('x')

In [None]:
def f(x):
    """Return (x - 2)^2 + 3, the parabola plotted above."""
    offset = x - 2
    return offset**2 + 3

In [None]:
# Numerically search for the x at which f is smallest; the formula for f
# puts the true minimum at x = 2.
minimize(f)

In [None]:
# Plot 2*sin(pi*x) + x^3 + x^4 for x from -1.5 up to (not including) 1.5
# in steps of 0.05. This rebinds the global x used for the parabola above.
x = np.arange(-1.5, 1.5, 0.05)
y2 = 2 * np.sin(x*np.pi) + x ** 3 + x ** 4 
Table().with_columns('x', x, 'y', y2).plot('x')

In [None]:
def complicated_function(x):
    """Return 2*sin(pi*x) + x^3 + x^4, the curve plotted above."""
    wave = 2 * np.sin(x*np.pi)
    cubic = x ** 3
    quartic = x ** 4
    return wave + cubic + quartic

In [None]:
# Numerically search for a minimizing x of complicated_function.
# NOTE(review): which dip of the curve a numerical search settles in can
# depend on where it starts; compare the result with the plot above.
minimize(complicated_function)

### Minimizing RMSE ###

In [None]:
def demographics_rmse(any_slope, any_intercept):
    """Return the root mean squared error of a line on the demographics data.

    The line any_slope * College% + any_intercept is used to estimate
    Median Income for every row of `demographics`; the result is the
    square root of the mean squared difference between actual and estimate.
    """
    college = demographics.column('College%')
    income = demographics.column('Median Income')
    residuals = income - (any_slope*college + any_intercept)
    return np.mean(residuals ** 2) ** 0.5

In [None]:
# Draw the line with slope 1500 and intercept 20000, then display its RMSE
# (the last expression is the cell's output value).
demographics_errors(1500, 20000)
demographics_rmse(1500, 20000)

In [None]:
# Same for the downward-sloping line: draw it, then display its RMSE.
demographics_errors(-1000, 75000)
demographics_rmse(-1000, 75000)

In [None]:
# Numerically search for the (slope, intercept) pair with the smallest RMSE.
minimize(demographics_rmse)

In [None]:
# Slope and intercept from the regression formulas, shown for comparison
# with the numerical minimizer's result in the previous cell.
make_array(regression_slope, regression_intercept)