In [None]:
from datascience import *
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

In [None]:
# Some functions for plotting. You don't have to understand how any
# of the functions in this cell work, since they use things we 
# haven't learned about in STOR 120


def resize_window(lim=3.5):
    """Set both the x-axis and y-axis limits of the current plot to (-lim, lim)."""
    bounds = (-lim, lim)
    plots.xlim(*bounds)
    plots.ylim(*bounds)
    
def draw_line(slope=0, intercept=0, x=make_array(-4, 4), color='r'):
    """Plot the straight line y = slope * x + intercept on the current plot.

    x holds the x-coordinates the line is drawn through (by default the two
    endpoints -4 and 4); color is forwarded to plots.plot.
    """
    line_heights = x * slope + intercept
    plots.plot(x, line_heights, color=color)
    
def make_correlated_data(r, n=1000):
    """Simulate n (x, y) pairs whose correlation is approximately r.

    x and an independent noise array z are drawn from the standard normal
    distribution, and y = r*x + sqrt(1 - r**2)*z. That mix gives y unit
    variance and a population correlation of r with x; any one sample will
    be close to r but not exactly equal to it.

    r: target correlation, must lie in [-1, 1].
    n: number of points to generate (default 1000).

    Returns the tuple (x, y) of arrays of length n.
    Raises ValueError if r is outside [-1, 1] (sqrt(1 - r**2) would
    otherwise silently produce NaN for every y).
    """
    # Written as a chained comparison so that NaN is rejected as well.
    if not -1 <= r <= 1:
        raise ValueError('r must be between -1 and 1, got ' + repr(r))
    # Draw x before z: keeping this order means a seeded run produces the
    # same numbers as before.
    x = np.random.normal(0, 1, n)
    z = np.random.normal(0, 1, n)
    y = r * x + np.sqrt(1 - r**2) * z
    return x, y

def r_scatter(r):
    """Draw a scatter plot of simulated data whose correlation is roughly r."""
    plots.figure(figsize=(5, 5))
    xs, ys = make_correlated_data(r)
    plots.scatter(xs, ys, color='darkblue', s=20)
    # Fixed window so plots for different r values are directly comparable.
    window = (-4, 4)
    plots.xlim(*window)
    plots.ylim(*window)
    
def r_table(r):
    """
    Build a table with columns 'x' and 'y' holding 1000 simulated points
    whose correlation is approximately r.

    Note: this resets NumPy's global random seed to 8 on every call, so the
    same r always yields the same table.
    """
    np.random.seed(8)
    xs, ys = make_correlated_data(r)
    simulated = Table().with_columns('x', xs, 'y', ys)
    return simulated

## Functions from Last Class

In [None]:
def standard_units(x):
    """Convert any array of numbers to standard units.

    Each value has the array's average subtracted and is then divided by
    the array's standard deviation (np.std).
    """
    deviations = x - np.average(x)
    spread = np.std(x)
    return deviations / spread

def correlation(t, x, y):
    """Return the correlation of two columns of a table.

    t is a table; x and y are column labels. The result is the average of
    the products of the two columns after each is converted to standard
    units.
    """
    x_su = standard_units(t.column(x))
    y_su = standard_units(t.column(y))
    products = x_su * y_su
    return np.average(products)

## Correlation ##

In [None]:
# Load the data set from HighPeaks.csv (path is relative to the notebook's
# working directory) and display it. Later cells use its 'Ascent' and
# 'Time' columns.
HighPeaks = Table.read_table('HighPeaks.csv')
HighPeaks

In [None]:
# Scatter plot of Time against Ascent, in the original units.
HighPeaks.scatter('Ascent', 'Time')

In [None]:
# Keep only Ascent and Time, then add a standard-units version of each column.
HighPeaks_AT = HighPeaks.select('Ascent', 'Time').with_columns(
    'Ascent (standard units)', standard_units(HighPeaks.column('Ascent')),
    'Time (standard units)', standard_units(HighPeaks.column('Time')),
)

# Same scatter plot as before, but with both variables in standard units.
HighPeaks_AT.scatter('Ascent (standard units)', 'Time (standard units)')

In [None]:
# Correlation between Ascent and Time.
correlation(HighPeaks, 'Ascent', 'Time')

In [None]:
# Three simulated scatter plots: a weak positive, a moderate negative and a
# strong negative correlation. The trailing semicolon suppresses cell output.
r_scatter(0.3)
r_scatter(-0.6)
r_scatter(-.9);

## Prediction lines

In [None]:
# Simulated table of x and y values with correlation approximately -0.9;
# show only the first 3 rows.
example = r_table(-0.9)
example.show(3)

In [None]:
# Scatter plot of the simulated data with the default axis limits.
example.scatter('x', 'y')

In [None]:
# Same scatter plot, with both axes limited to (-3.5, 3.5), the default of
# resize_window.
example.scatter('x', 'y')
resize_window()

In [None]:
# Illustrate predicting y at x = -1 from nearby points.
example.scatter('x', 'y')
# Red vertical lines at x = -1.25 and x = -0.75 mark the strip of width 0.25
# on either side of x = -1.
plots.plot([-1.25, -1.25], [-3, 3], color='red', lw=2)
plots.plot([-0.75, -0.75], [-3, 3], color='red', lw=2)
# Gold point drawn at (-1, 1).
plots.scatter(-1,1, s=30, color='gold')
resize_window()

In [None]:
def nn_prediction_example(x_val):
    """Predict y at x_val as the mean y of nearby rows of the example table.

    A row is "nearby" when its x value satisfies
    are.between(x_val - .25, x_val + .25). The global table `example` is
    read at call time.
    """
    lower, upper = x_val - .25, x_val + .25
    nearby_y = example.where('x', are.between(lower, upper)).column('y')
    return np.mean(nearby_y)
    

In [None]:
# Predicted y for x = -2.25: the mean y of the rows whose x is near -2.25.
nn_prediction_example(-2.25)

In [None]:
# Apply the neighbor-average prediction to every x value in the table and
# store the results in a 'Predicted y' column.
example = example.with_columns(
    'Predicted y', 
    example.apply(nn_prediction_example, 'x'))

example

In [None]:
# Only the x column is named here, so the remaining columns ('y' and
# 'Predicted y') are both plotted against x.
example.scatter('x')
resize_window()

In [None]:
# Overlay a line through the origin with slope -0.9, the same value of r
# that was used to generate `example`.
example.scatter('x')
draw_line(slope=-.9, color='dodgerblue')
resize_window()

In [None]:
# Ascent vs. Time again, both in standard units.
HighPeaks_AT.scatter('Ascent (standard units)', 'Time (standard units)')

In [None]:
# Standard-units scatter plot with a line through the origin whose slope is
# the correlation between Ascent and Time.
HighPeaks_AT.scatter('Ascent (standard units)', 'Time (standard units)')

draw_line(slope = correlation(HighPeaks, 'Ascent', 'Time'), color='dodgerblue')
resize_window()

## Linear regression: defining the line

In [None]:
def slope(t, x, y):
    """Return the slope of the regression line for predicting y from x.

    t is a table; x and y are column labels. The slope is
    r * SD(y) / SD(x), where r is the correlation of the two columns.
    """
    sd_x = np.std(t.column(x))
    sd_y = np.std(t.column(y))
    return correlation(t, x, y) * sd_y / sd_x

In [None]:
def intercept(t, x, y):
    """Return the intercept of the regression line for predicting y from x.

    t is a table; x and y are column labels. The intercept is
    mean(y) - slope * mean(x), so the line passes through the point of
    averages (mean(x), mean(y)).
    """
    slope_xy = slope(t, x, y)
    mean_x = np.average(t.column(x))
    mean_y = np.average(t.column(y))
    # Average y first, then subtract the slope term. The earlier form put the
    # subtraction inside np.average, which gave the same number only because
    # the subtracted term is a constant.
    intercept_xy = mean_y - slope_xy * mean_x
    return intercept_xy

In [None]:
# Regression slope for predicting Time from Ascent, in original units.
slope(HighPeaks, 'Ascent', 'Time')

In [None]:
# Regression intercept for predicting Time from Ascent, in original units.
intercept(HighPeaks, 'Ascent', 'Time')

In [None]:
# Regression prediction at Ascent = 3000: slope * 3000 + intercept.
Time_predict = slope(HighPeaks, 'Ascent', 'Time') * 3000 + intercept(HighPeaks, 'Ascent', 'Time')

# Report the prediction rounded to 2 decimal places.
print('It will take', np.round(Time_predict, 2), 'hours to climb a mountain with an ascent of 3000 ft')

In [None]:
# Scatter plot in original units with the regression line drawn on top,
# using the slope and intercept functions defined in this notebook.
HighPeaks.scatter('Ascent', 'Time')
draw_line(
    slope = slope(HighPeaks, 'Ascent', 'Time'), 
    intercept = intercept(HighPeaks, 'Ascent', 'Time'), 
    x = make_array(1800, 4500),  # the line is drawn between Ascent = 1800 and Ascent = 4500
    color='r'
)

In [None]:
# Same scatter plot, this time asking scatter to add the fitted line itself
# via fit_line=True.
HighPeaks.scatter('Ascent', 'Time', fit_line=True)