In [None]:
# HIDDEN
from datascience import *
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

## Lecture 10: regression demo

In [None]:
def draw_line(slope=0, intercept=0, x=None, color='r'):
    """Draw the line y = slope*x + intercept on the current matplotlib plot.

    Parameters:
        slope: slope of the line (default 0).
        intercept: y-intercept of the line (default 0).
        x: optional sequence of x-coordinates at which to evaluate the line.
            When omitted, the first two values returned by
            plots.gca().axis() are used as the endpoints.
        color: color passed through to plots.plot (default 'r').
    """
    if x is None:
        # No x-values supplied: take the endpoints from the current axes.
        x_min, x_max, _, _ = plots.gca().axis()
        x = make_array(x_min, x_max)
    else:
        # Use the caller's x-values; coerce so that lists and tuples support
        # the elementwise arithmetic below.
        x = np.asarray(x)
    y = x*slope + intercept
    plots.plot(x, y, color=color)

### Regression Line ###

In [None]:
def standard_units(arr):
    """Convert an array of numbers to standard units.

    Each value is expressed as its distance from the array's average,
    divided by the array's standard deviation (np.std).
    """
    center = np.average(arr)
    spread = np.std(arr)
    return (arr - center) / spread

def correlation(t, x, y):
    """Return the correlation coefficient between columns x and y of table t.

    Both columns are converted to standard units; the result is the average
    of their elementwise products.
    """
    standardized = [standard_units(t.column(label)) for label in (x, y)]
    return np.average(standardized[0] * standardized[1])

def slope(t, x, y):
    """Return the slope of the regression line for predicting y from x in t.

    Computed as r * SD(y) / SD(x), where r is the correlation of the two
    columns and SD is np.std.
    """
    r_times_sd_y = correlation(t, x, y) * np.std(t.column(y))
    return r_times_sd_y / np.std(t.column(x))

def intercept(t, x, y):
    """Return the intercept of the regression line for predicting y from x in t.

    Computed as mean(y) - slope * mean(x).
    """
    mean_x, mean_y = (np.mean(t.column(label)) for label in (x, y))
    return mean_y - slope(t, x, y) * mean_x

In [None]:
def fitted_values(t, x, y):
    """Return an array of the regression estimates of y at every x value in t."""
    line_slope, line_intercept = slope(t, x, y), intercept(t, x, y)
    return line_slope * t.column(x) + line_intercept

def predict_y(x_val, half_width=0.25):
    """Predict y at x_val by averaging the y values of nearby points.

    Reads the module-level table `data` (columns 'x' and 'y') and averages
    'y' over the rows selected by
    are.between(x_val - half_width, x_val + half_width) on 'x'.

    Parameters:
        x_val: the x value at which to predict.
        half_width: half the width of the window around x_val. Defaults to
            0.25, the value that was previously hard-coded.

    Returns:
        The mean of the selected y values, or NaN when no rows are selected.
    """
    nearby_y = data.where(
        'x', are.between(x_val - half_width, x_val + half_width)
    ).column('y')
    if len(nearby_y) == 0:
        # np.mean of an empty array is NaN plus a RuntimeWarning; return the
        # same value without the warning.
        return np.nan
    return np.mean(nearby_y)

## Look at Data

In [None]:
# Simulate a dataset scattered around a known line: y = 4x + 10 plus noise
num_points = 100
slope_true = 4
intercept_true = 10

np.random.seed(123)  # fixed seed so every run draws the same sample
x = np.random.normal(loc=0, scale=1, size=num_points)
error = np.random.normal(loc=0, scale=4, size=num_points)
y = slope_true*x + intercept_true + error

data = Table().with_columns(
    'x', x,
    'y', y,
)

In [None]:
# Scatter plot of the 'y' column against the 'x' column of the dataset
data.scatter("x", "y")

In [None]:
# Correlation coefficient between the 'x' and 'y' columns
correlation(data,"x", "y")

In [None]:
# Apply predict_y to every value in 'x' and store the results in a
# 'Predicted y' column
data = data.with_column('Predicted y', data.apply(predict_y, 'x'))

In [None]:
# Scatter plot with 'x' on the horizontal axis, to compare 'y' with 'Predicted y'
data.scatter('x')

## Fit regression model

In [None]:
# Slope and intercept of the regression line for predicting 'y' from 'x'
regression_slope = slope(data, 'x', 'y')
regression_intercept = intercept(data, 'x', 'y')
(regression_slope, regression_intercept)

In [None]:
# Regression estimates of 'y' at every 'x' in the dataset
predicted = fitted_values(data, 'x', 'y')

In [None]:
# Store the regression estimates in a 'Linear Prediction' column, then
# redraw the scatter plot with 'x' on the horizontal axis
data = data.with_column(
    'Linear Prediction', predicted)
data.scatter('x')

## Calculate errors

In [None]:
# Errors (residuals): observed y minus the regression estimate
actual = data.column('y')
errors = actual - predicted

In [None]:
# Keep the errors alongside the data in an 'Error' column
data = data.with_column('Error', errors)

In [None]:
# Root mean squared error: square the errors, average them, take the square root
np.mean(errors ** 2) ** 0.5

## Root Mean Square Error

In [None]:
def show_data_rmse(slope, intercept):
    """Print the root mean squared error of the line y = slope*x + intercept.

    The error is measured against the 'x' and 'y' columns of the
    module-level table `data`.
    """
    xs, ys = data.column('x'), data.column('y')
    residuals = ys - (slope * xs + intercept)
    rmse = np.mean(residuals ** 2) ** 0.5
    print("Root mean squared error:", rmse)

In [None]:
# RMSE of a candidate line with slope 1 and intercept 0
show_data_rmse(1, 0)

In [None]:
# RMSE of a candidate line with slope 5 and intercept 6
show_data_rmse(5, 6)

In [None]:
# RMSE of a candidate line with slope 4 and intercept 12
show_data_rmse(4, 12)

In [None]:
# RMSE of the regression line computed earlier
show_data_rmse(regression_slope, regression_intercept)

## Numerical Optimization 

In [None]:
# Plot the parabola y = (x - 2)^2 + 3 over a grid of x values.
# Note: this rebinds the module-level names x and y.
x = np.arange(1, 3, 0.1)
y = (x-2)**2 + 3
# with_columns (plural) is the Table method that takes several
# label/values pairs in a single call.
Table().with_columns('x', x, 'y', y).plot('x')

In [None]:
def f(x):
    """Return (x - 2)^2 + 3."""
    shifted = x - 2
    return shifted**2 + 3

In [None]:
# Numerically search for the argument that minimizes f
minimize(f)

### Minimizing RMSE ###

In [None]:
def data_rmse(any_slope, any_intercept):
    """Return the root mean squared error of the line
    y = any_slope*x + any_intercept, measured against the 'x' and 'y'
    columns of the module-level table `data`.
    """
    xs, ys = data.column('x'), data.column('y')
    residuals = ys - (any_slope*xs + any_intercept)
    return np.mean(residuals ** 2) ** 0.5

In [None]:
# RMSE of a candidate line with slope 5 and intercept 10
data_rmse(5, 10)

In [None]:
# RMSE of a candidate line with slope 1 and intercept 0
data_rmse(1, 0)

In [None]:
# Numerically search for the slope and intercept that minimize the RMSE
minimize(data_rmse)

In [None]:
# Slope and intercept from the regression formulas, for comparison with
# the result of minimize(data_rmse)
make_array(regression_slope, regression_intercept)