In [None]:
import matplotlib
from datascience import *
%matplotlib inline
import matplotlib.pyplot as plots
import numpy as np
plots.style.use('fivethirtyeight')

In [None]:
def r_scatter(r):
    """Generate a scatter plot with a correlation approximately r.

    Draws 1000 random points on a new 5x5 figure with both axes fixed
    to the range [-4, 4].

    r: the target correlation. It should lie in [-1, 1]; outside that range
       1 - r**2 is negative and np.sqrt yields nan, so no points are drawn.
    """
    plots.figure(figsize=(5,5))
    x = np.random.normal(0, 1, 1000)
    z = np.random.normal(0, 1, 1000)
    # Blend x with independent noise z, weighted so that y keeps unit
    # variance while its correlation with x is r.
    y = r*x + (np.sqrt(1-r**2))*z
    plots.scatter(x, y, color='darkblue', s=20)
    plots.xlim(-4, 4)
    plots.ylim(-4, 4)

In [None]:
r_scatter(-.5)

# Lecture 29

## A. Approach to Prediction

In [None]:
# Recall the data on family heights
# Note: Child heights are the **adult** heights of children in a family
families = Table.read_table('family_heights.csv')
families

In [None]:
# Based on parent average, can we predict the child height?
# Average two arrays to get the array of parent averages
parent_avgs = (families.column('father') + families.column('mother'))/2

In [None]:
# Make a 2-column table
heights = Table().with_columns(
    'Parent Average', parent_avgs,
    'Child', families.column('child'),
)
heights

In [None]:
# visualize the data in a scatter plot -- is there an association?
# do the data "move together"?
heights.scatter('Parent Average', 'Child')

Say a new baby is born and their parents' average height is 68 inches. Try to predict the adult height of the baby.

In [None]:
# Suppose a person's parents have average height 68
# Show vertical red lines at x = 67.5 and x = 68.5 to identify
# data points with similar parental average
heights.scatter('Parent Average', 'Child')
plots.plot([67.5, 67.5], [50, 85], color='red', lw=2)
plots.plot([68.5, 68.5], [50, 85], color='red', lw=2);

In [None]:
# Focus on parents of a similar height to our baby's parents.

# To predict a baby's adult height based on their parental average height being 68,
# we average all the child heights for data points in the strip of "nearby" data

# nearby data points have parental average within 0.5 inches of 68
nearby = heights.where('Parent Average', are.between(67.5, 68.5))
nearby_mean = np.average(nearby.column('Child'))
nearby_mean

The average outcome for similar parents is a child of height 67.62 inches. So, that's an estimate. But of course there's a huge amount of variability in the data, so it's not likely to be a good estimate. Still, it's a reasonable method and this is about the best we can do.

In [None]:
# To the previous plot, add a big red dot at x = 68, y = nearby_mean
heights.scatter('Parent Average', 'Child')
plots.plot([67.5, 67.5], [50, 85], color='red', lw=2)
plots.plot([68.5, 68.5], [50, 85], color='red', lw=2)
plots.scatter(68, nearby_mean, color='red', s=50);

In [None]:
# Define a function to make this prediction for any parental average 
def predict_child(h):
    """Estimate the height of a child from the parents' average height h.

    Looks up the rows of `heights` whose 'Parent Average' lies in the window
    reaching half an inch to either side of h, and returns the mean of the
    'Child' values in those rows.
    """
    half_width = 1/2
    in_window = are.between(h - half_width, h + half_width)
    similar_families = heights.where('Parent Average', in_window)
    child_heights = similar_families.column('Child')
    return np.average(child_heights)

# Test by calling the function with h = 68
predict_child(68)

In [None]:
# Compute all the predicted child heights for rows of the heights table
predictions = heights.apply(predict_child, 'Parent Average')

# Make a new table which includes the predictions
heights_with_predictions = heights.with_column('Prediction', predictions)
heights_with_predictions

In [None]:
# Visualize with a scatter plot
heights_with_predictions.scatter('Parent Average')

Cool, we have built a roughly-linear prediction model.

Questions about this old example from Lecture 10?

## B. Association
We have some new data here for hybrid cars. Well, new to us (it dates to 2015).

In [None]:
# Read the data on hybrid cars
hybrid = Table.read_table('hybrid.csv')

In [None]:
hybrid

 - `msrp` is the "suggested retail price" of the vehicle, with all prices normalized to 2013 dollars (to adjust for inflation).
  - `acceleration` is measured in km per hour per second; higher is better if you're trying to pass on a 2-lane highway
  - `mpg` is measuring fuel efficiency in miles per gallon. The whole point of hybrid vehicles is to burn less gas.
  - `class` indicates the type of car (SUV, Minivan, Compact, etc.)

In [None]:
# The data are for 153 hybrid vehicles produced from 1997 through 2013
hybrid.group('year').show()

In [None]:
# Sort to put the most expensive vehicles at the top
hybrid.sort('msrp', descending=True)

In [None]:
# Discuss the trend: The whole point of hybrid vehicles is to burn less gas. Why is 
# it that if you want the best mileage you should buy the cheapest hybrid?

# Is there a linear association here?
hybrid.scatter('mpg', 'msrp')

In [None]:
# Discuss the association (if any) between acceleration and msrp for these cars
hybrid.scatter('acceleration', 'msrp')

In [None]:
# It might be important to compare similar vehicles. Let's look at SUVs only.
# What do you observe?
# Is there an overall trend? A linear association?
suv = hybrid.where('class', 'SUV')
suv.num_rows

In [None]:
suv.scatter('acceleration', 'msrp')

In [None]:
suv.scatter('mpg', 'msrp')

Notice that when discussing association in general, it tends to be independent of the units. We really just care about the shape of the plot.

Therefore, it's reasonable to use **standard units** when trying to quantify association.

In [None]:
def standard_units(x):
    """Rescale the array x so each value is its distance from the mean of x,
    measured in standard deviations of x."""
    center = np.average(x)
    spread = np.std(x)
    deviations = x - center
    return deviations / spread

In [None]:
# For SUVs, make a scatterplot of msrp versus mpg in standard units
# Notice that only the scales have changed; the pattern of the dots is the same
Table().with_columns(
    'mpg (standard units)',  standard_units(suv.column('mpg')), 
    'msrp (standard units)', standard_units(suv.column('msrp'))
).scatter(0, 1)
plots.xlim(-3, 3)
plots.ylim(-3, 3);

Question: Where do we find the most "average" data point for the previous scatterplot?

In [None]:
suv.scatter('acceleration', 'msrp')

In [None]:
Table().with_columns(
    'acceleration (standard units)', standard_units(suv.column('acceleration')), 
    'msrp (standard units)',         standard_units(suv.column('msrp'))
).scatter(0, 1)
plots.xlim(-3, 3)
plots.ylim(-3, 3);

## C. Correlation

In [None]:
# Here's a scatterplot with a specific correlation
r_scatter(.25)

# Re-run for r = -1, r = 0, etc.

We use the **correlation coefficient** (r) to describe a linear association.

In [None]:
# Here are six made-up data points in a table `t`
x = make_array(1, 2, 3, 4, 5, 6)
y = make_array(2, 3, 1, 5, 2, 7)
t = Table().with_columns(
        'x', x,
        'y', y
    )
t

In [None]:
t.scatter('x', 'y', s=30, color='red')

In [None]:
# Convert both columns to standard units
t = t.with_columns(
        'x (standard units)', standard_units(x),
        'y (standard units)', standard_units(y)
    )
t

In [None]:
# Scatterplot in Standard Units
t.scatter(2, 3, s=30, color='red')

Correlation is the **average** of the **product of standard units** for each data point.

In [None]:
# Add a column showing product of standard units
# (with_column returns a new table; the columns are looked up by label so
# the cell does not depend on column positions)
t = t.with_column(
        'product of standard units',
        t.column('x (standard units)') * t.column('y (standard units)')
    )
t

In [None]:
# By definition, r is the average of the products of the standard units
# (computed from the original x and y arrays of the six made-up points)

r = np.average(standard_units(x) * standard_units(y))
r

Remember, r only measures **linear** association.

Why do we multiply?
  - In the lower-left and upper-right quadrant of our plot, the product is positive.
  - In the lower-right and upper-left quadrant of our plot, the product is negative.
  - The size of the product is larger when the point is "farther into the corner"
  - The average product tells us about the linear association.

In [None]:
# For any two-column table of numerical data, find the correlation coefficient (r)
def correlation(t, x, y):
    """Return the correlation coefficient (r) of two columns of a table.

    t is a table; x and y are strings (column labels in t).

    r is the average of the products of the paired values once each column
    has been converted to standard units.
    """
    x_in_standard_units = standard_units(t.column(x))
    y_in_standard_units = standard_units(t.column(y))
    # Element-wise product pairs each x value with the y value in the same row.
    r = np.average(x_in_standard_units * y_in_standard_units)
    return r

In [None]:
# Test the function on our existing two-column table `t`
correlation(t, 'x', 'y')

In [None]:
suv.scatter('mpg', 'msrp')

In [None]:
correlation(suv, 'mpg', 'msrp')

In [None]:
suv.scatter('acceleration', 'msrp')

In [None]:
correlation(suv, 'acceleration', 'msrp')

In [None]:
# Back to slides...






## D. Wrap-up

### Switching Axes

In [None]:
# Does correlation change if we swap the roles of x and y?
correlation(t, 'x', 'y')

In [None]:
t.scatter('x', 'y', s=30, color='red')

In [None]:
t.scatter('y', 'x', s=30, color='red')

In [None]:
correlation(t, 'y', 'x')

### Nonlinearity

In [None]:
# What is the correlation if we have a strong nonlinear pattern, like a parabola?

# Make a table where y is just x squared
new_x = np.arange(-4, 4.1, 0.5)
parabola = Table().with_columns(
        'x', new_x,
        'y', new_x**2
    )
parabola.scatter('x', 'y', s=30, color='r')

In [None]:
correlation(parabola, 'x', 'y')

Note: There IS an *association* between x and y in the scatterplot above. But r = 0 indicates that there is no **linear** association.

### Outliers

In [None]:
# These made-up data lie on the line y = x
# There are no outliers: all the points fit the same pattern
line = Table().with_columns(
        'x', make_array(1, 2, 3, 4),
        'y', make_array(1, 2, 3, 4)
    )
line.scatter('x', 'y', s=30, color='r')

In [None]:
# They have a "perfect positive association"
correlation(line, 'x', 'y')

In [None]:
# What if we introduce an outlier; how will that affect the correlation?

# Add a fifth point which is an extreme outlier: (x, y) = (5, 0)
line_with_outlier = Table().with_columns(
        'x', make_array(1, 2, 3, 4, 5),
        'y', make_array(1, 2, 3, 4, 0)
    )
line_with_outlier.scatter('x', 'y', s=30, color='r')

In [None]:
correlation(line_with_outlier, 'x', 'y')

Takeaways:
  - Correlation is unaffected by which variable is "x" and which is "y"
  - Correlation is unaffected by the units
  - Correlation does not measure a nonlinear association. Always plot the data first and don't use correlation to learn about that data if it has a strongly nonlinear pattern.
  - Correlation is sensitive to extreme outliers. If extreme outliers are present in the data, the correlation may not be an accurate measure of the overall association.