In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Fetch Galton's family height table from the course materials on GitHub
GALTON_CSV_URL = 'https://raw.githubusercontent.com/abby621/cs1070_materials/master/fl2019/demos/galton.csv'
galton = pd.read_csv(GALTON_CSV_URL)
galton

In [None]:
# pull each height column out of the Galton table under its own name
father_heights, mother_heights, midparent_heights, child_heights = (
    galton[column]
    for column in ('father', 'mother', 'midparentHeight', 'childHeight')
)

In [None]:
# father's height against child's height; label the axes so this figure can be
# told apart from the mother and midparent versions that follow
plt.scatter(father_heights, child_heights)
plt.xlabel('father height (inches)')
plt.ylabel('child height (inches)');

In [None]:
# mother's height against child's height; label the axes so this figure can be
# told apart from the father and midparent versions
plt.scatter(mother_heights, child_heights)
plt.xlabel('mother height (inches)')
plt.ylabel('child height (inches)');

In [None]:
# midparent height against child's height; label the axes so this figure can
# be told apart from the father and mother versions
plt.scatter(midparent_heights, child_heights)
plt.xlabel('midparent height (inches)')
plt.ylabel('child height (inches)');

In [None]:
def inds_between(values, low, high):
    """Return the positions of the entries of `values` that lie in [low, high].

    Both bounds are inclusive. `values` is converted to a numpy array first,
    so a plain list works as input. The result is a numpy array of integer
    indices (empty when nothing falls in the range).
    """
    candidates = np.asarray(values)

    # element-wise "low <= value AND value <= high"
    in_range = np.logical_and(candidates >= low, candidates <= high)

    # positions (along the first axis) where the mask is True
    return in_range.nonzero()[0]

# quick check of inds_between on a small list
my_values = [1, 3, 5, 9]
between_inds = inds_between(my_values, low=4, high=12)

print('indices that are between 4 and 12: ', between_inds)

# look each index back up to show the matching values, one per line
for ind in between_inds:
    print(my_values[ind])

In [None]:
def predict_child(midparent):
    """Predict a child's height from the children of similar-height parents.

    Averages `child_heights` over every row whose entry in
    `midparent_heights` lies within 0.5 (inclusive) of `midparent`.

    Parameters
    ----------
    midparent : float
        Midparent height to predict for, in the same units as
        `midparent_heights`.

    Returns
    -------
    float
        Mean child height inside the window, or NaN when no row falls
        inside the window.
    """
    # The helper defined in this notebook is `inds_between`; the previous
    # call to `values_between` raised NameError.
    close_inds = inds_between(midparent_heights, midparent - 0.5, midparent + 0.5)

    if len(close_inds) == 0:
        # The mean of an empty selection is NaN anyway; returning it directly
        # avoids numpy's "mean of empty slice" RuntimeWarning.
        return np.nan

    child_heights_for_close_parents = np.array(child_heights)[close_inds]
    return child_heights_for_close_parents.mean()

In [None]:
# sweep midparent heights from 64 toward 76 in steps of 0.1 and pair each one
# with its windowed-average prediction
predicted_heights = [
    (mp_height, predict_child(mp_height))
    for mp_height in np.arange(64, 76, 0.1)
]

print(predicted_heights)

In [None]:
# make our scatter plot of midparent heights vs. child heights
plt.scatter(midparent_heights, child_heights)

# plot our "child predictions" in yellow, using a single scatter call for all
# of them rather than one call per point
predicted_midparents = [midparent for midparent, _ in predicted_heights]
predicted_children = [child for _, child in predicted_heights]
plt.scatter(predicted_midparents, predicted_children, color='gold')

# show the "window" that we considered for a midparent height of 68
example_midparent = 68
window_half_width = 0.5  # same half-width that predict_child uses
for window_edge in (example_midparent - window_half_width,
                    example_midparent + window_half_width):
    plt.plot([window_edge, window_edge], [50, 85], color='red', lw=2)

# highlight the prediction for that window; it is computed here so the marker
# stays correct if the data or the window changes
plt.scatter(example_midparent, predict_child(example_midparent), color='magenta', s=40)

plt.xlabel('midparent height (inches)')
plt.ylabel('child height (inches)');

In [None]:
# how do we get a line that doesn't wobble up and down so much?
# back to slides about linear regression

In [None]:
# Data measured in very different units can be compared after converting to
# "standard units": subtract the mean so the data is centered at 0, then
# divide by the standard deviation so the spread around the mean is 1.
def standard_units(nums):
    """Return `nums` re-expressed in standard units.

    Each value has the mean of `nums` subtracted and is then divided by the
    standard deviation of `nums` (both computed with numpy).
    """
    centered = nums - np.mean(nums)
    return centered / np.std(nums)

# The "correlation coefficient" (r) measures how correlated x and y are:
#   -1  perfect negative correlation
#    0  uncorrelated
#    1  perfect positive correlation
def correlation_coeff(x, y):
    """Return r: the mean of the products of x and y in standard units."""
    return np.mean(standard_units(x) * standard_units(y))

In [None]:
from ipywidgets import interact

def visualize_corr_coeff(r):
    """Scatter-plot 1000 simulated (x, y) points built from the coefficient `r`.

    x and an independent noise term are each drawn from a normal distribution
    with mean 0 and standard deviation 1; y is the mix
    r*x + sqrt(1 - r**2)*noise. The plot is drawn on a 5x5 figure with both
    axes fixed to [-4, 4] and then shown.
    """
    n_points = 1000
    plt.figure(figsize=(5, 5))

    x = np.random.normal(loc=0, scale=1, size=n_points)
    noise = np.random.normal(loc=0, scale=1, size=n_points)

    # weight on the noise term; it shrinks to 0 as |r| approaches 1
    noise_weight = np.sqrt(1 - r**2)
    y = r * x + noise_weight * noise

    plt.scatter(x, y)
    plt.xlim(-4, 4)
    plt.ylim(-4, 4)
    plt.show()

import ipywidgets as widgets
# Hook visualize_corr_coeff up to an interactive control for r; the tuple is
# (min, max, step), i.e. r from -1 to 1 in steps of 0.1. The return value is
# assigned to `_` so it is not echoed as the cell's output.
_ = interact(visualize_corr_coeff, r=(-1,1,0.1))

In [None]:
# put both variables into standard units so they share one scale
standard_midpartent_heights = standard_units(midparent_heights)
standard_child_heights = standard_units(child_heights)

axis_limits = (-4, 4)
plt.scatter(standard_midpartent_heights, standard_child_heights)
plt.xlim(*axis_limits)
plt.ylim(*axis_limits)

# red: a line with slope = 1
plt.plot(axis_limits, axis_limits, color='r', lw=2)

# green: a vertical line at an x-value we want to predict for
plt.plot([2.5, 2.5], axis_limits, color='g', lw=2)

# does the red line look like a good prediction line for this plot?
# not really! we'd expect a good line to cross the green line at the mean of
# the points there

In [None]:
axis_limits = (-4, 4)
plt.scatter(standard_midpartent_heights, standard_child_heights)
plt.xlim(*axis_limits)
plt.ylim(*axis_limits)
plt.plot(axis_limits, axis_limits, color='r', lw=2)  # the slope-1 line again
plt.plot([2.5, 2.5], axis_limits, color='g', lw=2)   # the x-value to predict for

# draw a line with slope = r!
r = correlation_coeff(midparent_heights, child_heights)
plt.plot(axis_limits, [r * limit for limit in axis_limits], color='b', lw=2)

# that's our linear regression line!

In [None]:
# we can use this to make predictions _in standard units_
# how do we get it back to inches?

# mean and standard deviation of each variable, used to convert between
# inches and standard units
parent_mean, parent_sd = np.mean(midparent_heights), np.std(midparent_heights)
child_mean, child_sd = np.mean(child_heights), np.std(child_heights)


def predict_with_regression(parent):
    """Predict a child's height from a midparent height via the regression line.

    Uses the module-level `r`, `parent_mean`, `parent_sd`, `child_mean` and
    `child_sd`: the parent height is converted to standard units, multiplied
    by `r` (the slope of the regression line in standard units), and the
    result is converted back to the child's original units.
    """
    # parent height in standard units
    z_parent = (parent - parent_mean) / parent_sd

    # regression line in standard units: child = r * parent
    z_child = r * z_parent

    # back to "real" units
    return child_mean + child_sd * z_child

# regression prediction of the child's height for a midparent height of 68
predict_with_regression(68)