In [None]:
from datascience import *
import numpy as np
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

In [None]:
def standard_units(arr):
    """Convert an array of numbers to standard units.

    Each value is expressed as its deviation from the array's average,
    divided by the array's standard deviation as computed by ``np.std``.
    """
    deviations = arr - np.average(arr)
    return deviations / np.std(arr)


def correlation(t, x, y):
    """Return the correlation coefficient between columns x and y of table t.

    Both columns are converted to standard units; the result is the average
    of the element-wise products of the two standardized columns.
    """
    products = standard_units(t.column(x)) * standard_units(t.column(y))
    return np.average(products)


def slope(t, x, y):
    """Return the slope of the regression line for predicting y from x.

    Computed as r * SD(y) / SD(x), where r is the correlation between the
    two columns of table t and the SDs come from ``np.std``.
    """
    sd_x, sd_y = np.std(t.column(x)), np.std(t.column(y))
    return correlation(t, x, y) * sd_y / sd_x


def intercept(t, x, y):
    """Return the intercept of the regression line for predicting y from x.

    Computed as mean(y) - slope * mean(x), using the columns of table t.
    """
    line_slope = slope(t, x, y)
    return np.mean(t.column(y)) - line_slope * np.mean(t.column(x))


def fitted_values(t, x, y):
    """Return an array of regression estimates, one for each x value in t.

    Every entry is slope * x + intercept, with slope and intercept taken from
    the regression of column y on column x.
    """
    x_values = t.column(x)
    return slope(t, x, y) * x_values + intercept(t, x, y)


def residuals(t, x, y):
    """Return the regression residuals: observed y minus the fitted value at each x."""
    return t.column(y) - fitted_values(t, x, y)

## Regression Model ##

In [None]:
def draw_and_compare(true_slope, true_int, sample_size):
    """Simulate points around a known line and draw four comparison plots.

    x values are drawn from a normal distribution (mean 50, SD 5); each y is
    the true line's height at x plus a normal error (mean 0, SD 6).

    Parameters:
        true_slope: slope of the true line.
        true_int: intercept of the true line.
        sample_size: number of points to generate.

    Four scatter plots of the same sample are produced, in this order:
    the points with the true line, the points alone, the points with the
    fitted line, and the points with both the fitted and the true line.
    """
    # Draw x before the errors so the random stream is consumed in a fixed order.
    xs = np.random.normal(50, 5, sample_size)
    noise = np.random.normal(0, 6, sample_size)
    ys = (true_slope * xs + true_int) + noise
    points = Table().with_columns("x", xs, "y", ys)

    # Endpoints used to draw the true line across the observed x range.
    x_range = np.array([np.min(xs), np.max(xs)])

    # (title, overlay fitted line?, overlay true line?) for each figure.
    panels = [
        ("True Line, and Points Created", False, True),
        ("What We Get to See", False, False),
        ("Regression Line: Estimate of True Line", True, False),
        ("Regression Line and True Line", True, True),
    ]
    for title, show_fit, show_truth in panels:
        if show_fit:
            points.scatter("x", "y", fit_line=True)
        else:
            points.scatter("x", "y")
        if show_truth:
            plots.plot(x_range, true_slope * x_range + true_int, lw=2, color="green")
        plots.title(title)

## Prediction ##

In [None]:
# Preterm and postterm pregnancy cutoffs according to the CDC
# (37 and 42 weeks), each multiplied by 7 to express it in days.
tuple(weeks * 7 for weeks in (37, 42))

In [None]:
# You don't need to understand the plotting code in this cell,
# but you should understand the figure that comes out.
#
# The figure is a 3-by-2 grid of scatter plots: panel 1 shows the original
# sample and panels 2 through 6 each show a fresh resample of `births`.

plots.figure(figsize=(10, 11))
plots.subplot(3, 2, 1)
# NOTE(review): columns are accessed by position here but by label in the loop
# below — presumably births[1] is "Gestational Days" and births[0] is
# "Birth Weight"; confirm against the column order of `births`.
plots.scatter(births[1], births[0], s=10, color="darkblue")
# Every panel uses the same x-axis limits so the plots are directly comparable.
plots.xlim([225, 325])
plots.title("Original sample")

# i runs from 1 to 5; panel i + 1 holds the i-th resample.
for i in np.arange(1, 6, 1):
    plots.subplot(3, 2, i + 1)
    # Called with no arguments, so the sampling behavior is whatever
    # births.sample() defaults to (presumably all rows, with replacement — confirm).
    resampled = births.sample()
    plots.scatter(
        resampled.column("Gestational Days"),
        resampled.column("Birth Weight"),
        s=10,
        color="tab:green",
    )
    plots.xlim([225, 325])
    plots.title("Bootstrap sample " + str(i))
# Adjust subplot spacing once, after all six panels have been drawn.
plots.tight_layout()

## Confidence Interval for Prediction ##

## Predictions at Different Values of X

## Inference for the Slope ##

## Rain on the Regression Parade

**Null Hypothesis.** Slope of true line = 0.

**Alternative Hypothesis.** Slope of true line is not 0.

## Nonlinear regression

## Multiple regression

What if we try to predict birth weight from gestational days and maternal height?

In [None]:
# This means that our prediction for birth weight (in ounces) is:
# 1.658 * (maternal height in inches) + 0.52 * (number of gestational days) - 131.5

In [None]:
# Remember, RMSE for linear prediction = SD of residuals


In [None]:
# Remember, RMSE for linear prediction = SD of residuals
