In [None]:
from datascience import *
import numpy as np
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

## Functions

In [None]:
def standard_units(any_numbers):
    """Convert any array of numbers to standard units.

    Each value is re-expressed as its distance from the mean of the
    array, divided by the array's standard deviation (np.std).
    """
    center = np.mean(any_numbers)
    spread = np.std(any_numbers)
    return (any_numbers - center) / spread

def correlation(t, x, y):
    """Return the correlation coefficient between columns x and y of table t.

    Computed as the mean of the element-wise products of the two
    columns after each has been converted to standard units.
    """
    x_standard = standard_units(t.column(x))
    y_standard = standard_units(t.column(y))
    return np.mean(x_standard * y_standard)

def slope(t, label_x, label_y):
    """Return the slope of the regression line for label_y on label_x.

    slope = r * SD(label_y) / SD(label_x), where r is the correlation
    between the two columns of table t.
    """
    sd_x = np.std(t.column(label_x))
    sd_y = np.std(t.column(label_y))
    return correlation(t, label_x, label_y) * sd_y / sd_x

def intercept(t, label_x, label_y):
    """Return the intercept of the regression line for label_y on label_x.

    intercept = mean(label_y) - slope * mean(label_x)
    """
    mean_x = np.mean(t.column(label_x))
    mean_y = np.mean(t.column(label_y))
    return mean_y - slope(t, label_x, label_y) * mean_x

def fit(table, x, y):
    """Return the height of the regression line at each x value.

    The line is the regression of column y on column x of the table;
    the result has one fitted value per row.
    """
    line_slope = slope(table, x, y)
    line_intercept = intercept(table, x, y)
    x_values = table.column(x)
    return line_slope * x_values + line_intercept

def residual(table, x, y):
    """Return the residuals: observed y minus the regression fit at each x."""
    observed = table.column(y)
    predicted = fit(table, x, y)
    return observed - predicted

def residual_plot(table, x, y):
    """Draw a scatter plot of the regression residuals against column x.

    A horizontal line at residual = 0 is overlaid across the observed
    range of x for reference.
    """
    x_values = table.column(x)
    residuals_table = Table().with_columns(
        x, x_values,
        'residuals', residual(table, x, y)
    )
    residuals_table.scatter(x, 'residuals', color='r')

    # Reference line at zero, spanning from the smallest to the largest x.
    x_range = make_array(min(x_values), max(x_values))
    zero_line = make_array(0, 0)
    plots.plot(x_range, zero_line, color='darkblue', lw=4)
    plots.title('Residual Plot')
    
def bootstrap_slope(table, x, y, repetitions):
    """Bootstrap the regression slope and display an approximate 95% interval.

    Args:
        table: table holding the sample.
        x: label of the predictor column.
        y: label of the response column.
        repetitions: number of bootstrap resamples to draw.

    Side effects:
        Draws a histogram of the bootstrapped slopes with the interval
        marked in yellow, and prints the observed slope and the interval
        endpoints. Returns None.
    """
    # One slope per bootstrap resample. table.sample() with no arguments is
    # used as the resample (datascience default; see Table.sample docs).
    # Collecting into a list and converting once avoids re-copying the
    # whole array on every iteration, which np.append in a loop would do.
    slopes = np.array([
        slope(table.sample(), x, y)
        for _ in np.arange(repetitions)
    ])

    # Find the endpoints of the 95% confidence interval for the true slope
    left = percentile(2.5, slopes)
    right = percentile(97.5, slopes)

    # Slope of the regression line from the original sample
    observed_slope = slope(table, x, y)

    # Display results
    Table().with_column('Bootstrap Slopes', slopes).hist(bins=20)
    plots.plot(make_array(left, right), make_array(0, 0), color='yellow', lw=8)
    print('Slope of regression line:', observed_slope)
    print('Approximate 95%-confidence interval for the true slope:')
    print(left, right)

def fitted_value(table, x, y, given_x):
    """Return the height of the regression line of y on x at given_x."""
    line_slope = slope(table, x, y)
    line_intercept = intercept(table, x, y)
    return line_slope * given_x + line_intercept

def bootstrap_prediction(table, x, y, new_x, repetitions):
    """Bootstrap the regression prediction at new_x and display a 95% interval.

    Args:
        table: table holding the sample.
        x: label of the predictor column.
        y: label of the response column.
        new_x: x value at which the height of the regression line is predicted.
        repetitions: number of bootstrap resamples to draw.

    Side effects:
        Draws a histogram of the bootstrapped predictions with the interval
        marked in yellow, and prints the prediction from the original sample
        and the interval endpoints. Returns None.
    """
    # One prediction per bootstrap resample. table.sample() with no arguments
    # is used as the resample (datascience default; see Table.sample docs).
    # Collecting into a list and converting once avoids re-copying the
    # whole array on every iteration, which np.append in a loop would do.
    predictions = np.array([
        fitted_value(table.sample(), x, y, new_x)
        for _ in np.arange(repetitions)
    ])

    # Find the ends of the approximate 95% prediction interval
    left = percentile(2.5, predictions)
    right = percentile(97.5, predictions)

    # Prediction based on original sample
    original = fitted_value(table, x, y, new_x)

    # Display results
    Table().with_column('Prediction', predictions).hist(bins=20)
    plots.xlabel('predictions at x='+str(new_x))
    plots.plot(make_array(left, right), make_array(0, 0), color='yellow', lw=8)
    print('Height of regression line at x='+str(new_x)+':', original)
    print('Approximate 95%-confidence interval:')
    print(left, right)

## SleepStudy

In [None]:
# Load the sleep study data. The path is relative, so it is resolved
# against the kernel's working directory.
SleepStudy = Table.read_table('../Datasets/SleepStudy.csv')
# Preview a few rows to check the columns loaded as expected.
SleepStudy.show(3)

## Can DASScore predict Happiness?

In [None]:
# Scatter plot of the DASScore and Happiness columns with a fitted line overlaid.
SleepStudy.scatter('DASScore','Happiness', fit_line=True)

In [None]:
# Correlation coefficient r between DASScore and Happiness.
correlation(SleepStudy, 'DASScore', 'Happiness')

In [None]:
# Slope of the regression line predicting Happiness from DASScore.
slope(SleepStudy, 'DASScore', 'Happiness')

In [None]:
# Intercept of the regression line predicting Happiness from DASScore.
intercept(SleepStudy, 'DASScore', 'Happiness')

## Residual Diagnostics

In [None]:
# Plot residuals against DASScore to check for patterns the line does not capture.
residual_plot(SleepStudy, 'DASScore', 'Happiness')

## What does this say about the population?

**Null Hypothesis.** Slope of true line = 0.

**Alternative Hypothesis.** Slope of true line is not 0.

In [None]:
# Bootstrap the slope with 5000 resamples; if the interval excludes 0, the
# data are inconsistent with the null hypothesis of a zero true slope.
# NOTE(review): no random seed is set in the visible cells, so the interval
# endpoints will vary slightly from run to run.
bootstrap_slope(SleepStudy, 'DASScore', 'Happiness', 5000)

In [None]:
# Average DASScore in the sample; used below as the x value for prediction.
mean_DASScore = np.mean(SleepStudy.column('DASScore'))
mean_DASScore

In [None]:
# Height of the regression line at the average DASScore.
fitted_value(SleepStudy, 'DASScore', 'Happiness', mean_DASScore)

In [None]:
# Bootstrap the prediction at the average DASScore with 5000 resamples.
# NOTE(review): no random seed is set in the visible cells, so the interval
# endpoints will vary slightly from run to run.
bootstrap_prediction(SleepStudy, 'DASScore', 'Happiness', mean_DASScore, 5000)

## Minimizing RMSE (equivalently, mean squared error) and extensions to more complex models

In [None]:
def Sleep_DAS_Hap_mse(any_slope, any_intercept):
    """Return the mean squared error of a candidate line on SleepStudy.

    The line predicts Happiness from DASScore using the given slope and
    intercept; the error is averaged over all rows of SleepStudy.
    """
    das_score = SleepStudy.column('DASScore')
    happiness = SleepStudy.column('Happiness')
    predicted = any_slope*das_score + any_intercept
    squared_errors = (happiness - predicted) ** 2
    return np.mean(squared_errors)

In [None]:
# Numerically search for the slope and intercept that minimize the mean squared error.
# NOTE(review): minimize comes from the datascience star-import; presumably it
# returns the minimizing arguments in the function's argument order — confirm.
minimize(Sleep_DAS_Hap_mse)

In [None]:
# Formula-based slope and intercept, for comparison with the minimizer's output above.
slope(SleepStudy, 'DASScore', 'Happiness'), intercept(SleepStudy, 'DASScore', 'Happiness')

In [None]:
def Sleep_DAS_Hap_mse(a, b, c, d, e, f):
    """Return the mean squared error of a five-predictor linear model.

    Happiness is predicted from DASScore (a), GPA (b), PoorSleepQuality (c),
    Drinks (d) and AverageSleep (e), plus the constant term f; the error is
    averaged over all rows of SleepStudy.

    NOTE(review): this reuses the name of the two-argument
    Sleep_DAS_Hap_mse(any_slope, any_intercept) defined in an earlier cell
    and replaces it once this cell runs; re-running the earlier minimize
    cell afterwards would fit this model instead.
    """
    das_score = SleepStudy.column('DASScore')
    gpa = SleepStudy.column('GPA')
    poor_sleep = SleepStudy.column('PoorSleepQuality')
    drinks = SleepStudy.column('Drinks')
    average_sleep = SleepStudy.column('AverageSleep')
    happiness = SleepStudy.column('Happiness')
    predicted = a*das_score + b*gpa + c*poor_sleep + d*drinks + e*average_sleep + f
    squared_errors = (happiness - predicted) ** 2
    return np.mean(squared_errors)

In [None]:
# Fit the five-predictor model by minimizing its mean squared error.
coefficients = minimize(Sleep_DAS_Hap_mse)
coefficients

In [None]:
# Predicted Happiness for one hypothetical student under the five-predictor fit.
# Values follow the argument order of Sleep_DAS_Hap_mse(a, b, c, d, e, f):
# DASScore=20, GPA=3.2, PoorSleepQuality=19, Drinks=4, AverageSleep=6.5, then the constant f.
# NOTE(review): assumes minimize returns the coefficients in that argument order — confirm.
coefficients.item(0)*20 + coefficients.item(1)*3.2 + coefficients.item(2)*19 + \
coefficients.item(3)*4 + coefficients.item(4)*6.5 + coefficients.item(5)