In [None]:
from datascience import *
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

# Lecture 31

## Review: Predicting a child's height

Let's revisit Galton's predictions of children's heights based on their parents' heights...

In [None]:
# Load Galton's height data; 'galton.csv' is a relative path, so it must be
# readable from the notebook's working directory.
galton = Table.read_table('galton.csv')

# Build a two-column table from the galton columns used in this lecture,
# relabeled as 'MidParent' and 'Child'.
heights = Table().with_columns(
    'MidParent', galton.column('midparentHeight'),
    'Child', galton.column('childHeight')
    )
# Bare last expression so the notebook displays the table.
heights

In [None]:
def predict_child(h):
    """Return a prediction of the height of a child
    whose parents have a midparent height of h.

    The prediction is the average height of the children
    whose midparent height is in the range h plus or minus 0.5 inches.
    Reads the notebook-level `heights` table ('MidParent' and 'Child' columns).
    """
    # are.between(a, b) keeps rows with a <= value < b, so the window is
    # half-open: it includes h - 0.5 but excludes h + 0.5.
    close_points = heights.where('MidParent', are.between(h - 0.5, h + 0.5))
    return close_points.column('Child').mean()

In [None]:
# predict the height for each child in the data set
# (predict_child is called once per row with that row's MidParent value, so each
# child's own height is included in the neighborhood average that predicts it)
heights_with_predictions = heights.with_column(
    'Prediction', heights.apply(predict_child, 'MidParent')
)

In [None]:
# visualize the predicted heights
# MidParent goes on the x-axis; the table's other columns (Child and Prediction)
# are expected to be drawn against it as separate series.
heights_with_predictions.scatter('MidParent')

### Review: The correlation coefficient

In [None]:
# Convert data to standard units
def standard_units(x):
    """Express each value of x as its distance from the mean of x,
    measured in standard deviations of x (np.std, i.e. the population SD)."""
    centered = x - np.average(x)
    spread = np.std(x)
    return centered / spread

In [None]:
# we can create a function to calculate the correlation coefficient
def correlation(t, label_x, label_y):
    """Return the correlation coefficient r between two columns of table t.

    t: a table providing .column(label)
    label_x, label_y: labels of the two numeric columns to compare

    r is the average of the products of the two columns after each has been
    converted to standard units.
    """
    x_in_standard_units = standard_units(t.column(label_x))
    y_in_standard_units = standard_units(t.column(label_y))
    # standard_units divides by np.std, which is the population SD (ddof=0),
    # so the products must be averaged over all n rows. Dividing the sum by
    # n - 1 instead would inflate r by n / (n - 1) and allow |r| > 1.
    return np.mean(x_in_standard_units * y_in_standard_units)

In [None]:
# correlation of Galton heights
correlation(heights, 'MidParent', 'Child')

## Linear regression

In [None]:
# original scatter plot of the data and the correlation for the Galton data


In [None]:
# predictions made by taking average of children's heights in a neighborhood


In [None]:
# predictions made by the regression line


In [None]:
# comparing prediction of the regression line and average in a neighborhood


## Regression in standardized units

In [None]:
# Let's look at the relationship in standardized units (z-score transformed units)
# Each column is converted separately, using its own mean and SD.
heights_standardized = Table().with_columns(
    "MidParent", standard_units(heights.column("MidParent")),
    "Child", standard_units(heights.column("Child"))
)

heights_standardized

In [None]:
# plot a regression line on the standardized data


In [None]:
# Correlation between children's and parents' heights


In [None]:
# predictions are less than the identity line -> regression to the mean



In [None]:
# function to calculate the slope 
def slope(t, x, y):
    """Placeholder for the regression-line slope calculation.

    Not implemented yet: the body is only `...`, so calling this returns None.
    NOTE(review): presumably t is a table and x, y are column labels, mirroring
    correlation(t, label_x, label_y) -- confirm when filling this in.
    """
    ...

    
    


In [None]:
# slope for predicting child's height

# Q: for every additional inch of midparent height, how much taller is the predicted child?

In [None]:
# function to calculate the intercept
def intercept(t, x, y):
    """Placeholder for the regression-line intercept calculation.

    Not implemented yet: the body is only `...`, so calling this returns None.
    NOTE(review): presumably t is a table and x, y are column labels, mirroring
    correlation(t, label_x, label_y) -- confirm when filling this in.
    """
    ...


In [None]:
# intercept for predicting child's height


# Q: For parents that are 0" tall, what is the predicted height of their child?

### Regression equation for Galton data





In [None]:
# How tall would we predict a child to be if their parents were 70 inches?



## Regression Line 

In [None]:
# Helper function: Draw a line for a given slope, intercept, and possibly input location x
def draw_line(slope=0, intercept=0, x=None, color='r'):
    """Plot the line y = slope * x + intercept on the current axes.

    slope, intercept: coefficients of the line
    x: x-coordinates at which to evaluate the line; when None, the left and
       right x-limits of the current axes are used
    color: matplotlib color for the line
    """
    if x is None:
        # axis() returns (xmin, xmax, ymin, ymax); only the x-limits are needed.
        x_min, x_max, _, _ = plots.gca().axis()
        x = make_array(x_min, x_max)
    else:
        # Use the caller's x values. Previously the axis-limit lookup ran only
        # for x=None while x was always rebuilt from it, so passing x raised
        # NameError. asarray lets a plain list be multiplied by slope.
        x = np.asarray(x)
    y = x*slope + intercept
    plots.plot(x, y, color=color)

In [None]:
# Helper function: Produces a plot we will use for a dataset in this demo to visualize errors
def demographics_errors(slope, intercept):
    """Scatter 'College%' against 'Median Income' from the notebook-level
    `demographics` table, overlay the line slope * x + intercept for x from
    5 to 75, and draw a red vertical segment from each of four hard-coded
    (x, y) points to that line.
    """
    def line_height(college_pct):
        # Height of the candidate line at the given x value(s).
        return slope * college_pct + intercept

    demographics.scatter('College%', 'Median Income', alpha=0.5)

    x_range = make_array(5, 75)
    plots.plot(x_range, line_height(x_range), lw=4)

    # Hard-coded points whose vertical distance to the line is highlighted.
    highlighted = (
        (14.7, 33995),
        (19.1, 61454),
        (50.7, 71183),
        (59.5, 105918),
    )
    for college_pct, income in highlighted:
        plots.plot(
            [college_pct, college_pct],
            [income, line_height(college_pct)],
            color='r',
            lw=4,
        )

In [None]:
# load demographic data
# ('district_demographics2016.csv' is a relative path, so it must be readable
# from the notebook's working directory)
demographics = Table.read_table('district_demographics2016.csv')
# display only the first 5 rows
demographics.show(5)

In [None]:
# select only the 'College%', 'Median Income' columns



In [None]:
# Calculate the correlation


In [None]:
# Calculate the slope and intercept




In [None]:
# create a function that estimates all the predictions (fitted values)
def fitted_values(t, x, y):
    """Return an array of the regression estimates at all the x values.

    Placeholder: the body is only `...`, so calling this currently returns None.
    NOTE(review): presumably t is a table and x, y are column labels -- confirm
    when filling this in.
    """
    ...
    
    

In [None]:
# Predicted median incomes (y-hat values)


In [None]:
# Plot the linear predictions




In [None]:
# What are the errors between the actual observations and the predicted values?




In [None]:
# Root mean squared error (RMSE)


In [None]:
# Visualize errors for estimated line


In [None]:
# Try any slope, any intercept


In [None]:
# Try any slope, any intercept


### Root Mean Square Error ###

In [None]:
# function to calculate the RMSE for the demographic data
def demographics_rmse(any_slope, any_intercept):
    """Return the root mean squared error of the line
    any_slope * x + any_intercept when it is used to estimate
    'Median Income' from 'College%' in the notebook-level `demographics` table.
    """
    college_pct = demographics.column('College%')
    income = demographics.column('Median Income')
    # Signed vertical distance from each observation to the candidate line.
    residuals = income - (any_slope*college_pct + any_intercept)
    mean_squared_error = np.mean(residuals ** 2)
    return mean_squared_error ** 0.5

In [None]:
# Helper function to visualize the demographic data and print the RMSE
def show_demographics_rmse(slope, intercept):
    """Draw the demographics error plot for the given line and print its RMSE.

    slope, intercept: coefficients of the candidate line, passed unchanged to
    demographics_errors (plot) and demographics_rmse (error value).
    """
    demographics_errors(slope, intercept)
    rmse = demographics_rmse(slope, intercept)
    # demographics_rmse already returns the square root of the mean squared
    # error, so print it as is; taking the root again would misreport it.
    print("Root mean squared error:", rmse)

In [None]:
# show the demographic plot and the RMSE for an arbitrary slope and intercept


In [None]:
# show the demographic plot and the RMSE for another arbitrary slope and intercept


In [None]:
# show the demographic plot and the RMSE for an arbitrary slope and intercept


In [None]:
# show the demographic plot and the RMSE for the slope and intercept we found


## Numerical Optimization

In [None]:
# An arbitrary function 
def f(x):
    """Return (x - 2) squared, plus 3."""
    offset = x - 2
    return offset ** 2 + 3

# plot the function
# np.arange(1, 3, 0.1) samples x from 1 up to (but not including) 3 in steps of 0.1
x = np.arange(1, 3, 0.1)
# f uses only arithmetic operators, so it is applied elementwise to the array
y = f(x)
Table().with_columns('x', x, 'y', y).plot('x')

In [None]:
# find the minimum value of the function 


### Minimizing RMSE ###

In [None]:
# RMSE for arbitrary slope and intercept


In [None]:
# RMSE for another arbitrary slope and intercept


In [None]:
# Minimize the demographic RMSE


In [None]:
# Our regression slope and intercept estimates from earlier


In [None]:
# The minimum RMSE value
