In [None]:
from datascience import *
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

In [None]:
def r_scatter(r):
    """Generate a scatter plot with a correlation approximately r.

    Draws 1000 points (x, y) where x and z are independent standard normal
    samples and y = r*x + sqrt(1 - r**2)*z, then plots them on a new 5x5
    figure with both axes fixed to [-4, 4].

    r must lie in [-1, 1]; otherwise sqrt(1 - r**2) is not a real number and
    every y value would be NaN.

    Raises ValueError if r is outside [-1, 1] (including NaN).
    """
    # Validate before creating a figure so a bad r does not leave an empty plot.
    if not -1 <= r <= 1:
        raise ValueError("r must be between -1 and 1, got {!r}".format(r))
    plots.figure(figsize=(5,5))
    x = np.random.normal(0, 1, 1000)
    z = np.random.normal(0, 1, 1000)
    # Weights r and sqrt(1 - r**2) have squares summing to 1, mixing x with
    # independent noise z.
    y = r*x + (np.sqrt(1-r**2))*z
    plots.scatter(x, y, color='darkblue', s=20)
    plots.xlim(-4, 4)
    plots.ylim(-4, 4)

## Prediction ##

In [None]:
# Read 'HighPeaks.csv' (relative path) into a Table.
HighPeaks = Table.read_table('HighPeaks.csv')
# Bare last expression so the notebook displays the table.
HighPeaks

In [None]:
# Scatter plot with 'Ascent' on the horizontal axis and 'Time' on the vertical axis.
HighPeaks.scatter('Ascent', 'Time')

In [None]:
# Draw the Ascent-vs-Time scatter in this cell so the line below is overlaid
# on the same axes.
HighPeaks.scatter('Ascent', 'Time')

# My guess about a line that may describe the trend

# Straight segment from (1800, 5.5) to (4500, 16.5), chosen by eye rather than
# fitted; the trailing semicolon suppresses the returned object's text repr.
plots.plot([1800,4500], [5.5,16.5], color='red', lw=2);

## Association ##

In [None]:
# Scatter plot with 'Length' on the horizontal axis and 'Time' on the vertical axis.
HighPeaks.scatter('Length', 'Time')

In [None]:
# Scatter plot with 'Length' on the horizontal axis and 'Ascent' on the vertical axis.
HighPeaks.scatter('Length', 'Ascent')

In [None]:
def standard_units(x):
    """Rescale an array of numbers to standard units.

    Each value is replaced by its deviation from the array's average,
    divided by the array's standard deviation (np.std with its defaults).
    """
    center = np.average(x)
    spread = np.std(x)
    deviations = x - center
    return deviations / spread

In [None]:
# Build a two-column table of 'Ascent' and 'Time' converted to standard units
# and plot one against the other.
Table().with_columns(
    'Ascent (standard units)',  standard_units(HighPeaks.column('Ascent')), 
    'Time (standard units)', standard_units(HighPeaks.column('Time'))
).scatter(0, 1)  # columns referenced by index: 0 on the x-axis, 1 on the y-axis
# Use the same [-3, 3] window on both axes.
plots.xlim(-3, 3)
plots.ylim(-3, 3);

In [None]:
# Build a two-column table of 'Length' and 'Time' converted to standard units
# and plot one against the other.
Table().with_columns(
    'Length (standard units)',  standard_units(HighPeaks.column('Length')), 
    'Time (standard units)', standard_units(HighPeaks.column('Time'))
).scatter(0, 1)  # columns referenced by index: 0 on the x-axis, 1 on the y-axis
# Use the same [-3, 3] window on both axes.
plots.xlim(-3, 3)
plots.ylim(-3, 3);

## Correlation ##

In [None]:
# r = 1: the noise weight sqrt(1 - r**2) is 0, so y equals x exactly.
r_scatter(1)

In [None]:
# r = 0.66: y = 0.66*x + ~0.75*z (positive slope plus independent noise).
r_scatter(0.66)

In [None]:
# r = 0.33: y = 0.33*x + ~0.94*z (mostly noise, slight positive slope).
r_scatter(0.33)

In [None]:
# r = 0: y equals the independent noise z, so y does not depend on x.
r_scatter(0)

In [None]:
# r = -0.33: y = -0.33*x + ~0.94*z (mostly noise, slight negative slope).
r_scatter(-0.33)

In [None]:
# r = -0.66: y = -0.66*x + ~0.75*z (negative slope plus independent noise).
r_scatter(-0.66)

In [None]:
# r = -1: the noise weight sqrt(1 - r**2) is 0, so y equals -x exactly.
r_scatter(-1)

## Calculating $r$ ##

In [None]:
# Keep only the two columns used in the step-by-step computation of r.
HighPeaks_AT = HighPeaks.select('Ascent', 'Time')
# Bare last expression so the notebook displays the table.
HighPeaks_AT

In [None]:
# Only the x column is named; the table's other column ('Time', the only one
# left after the select above) is plotted against it.
HighPeaks_AT.scatter('Ascent')

In [None]:
# Step 1: append the standard-unit versions of both columns. The values are
# computed from the columns of HighPeaks, from which HighPeaks_AT was selected.
# Note that this rebinds the name HighPeaks_AT to the widened table.
HighPeaks_AT = HighPeaks_AT.with_columns(
    'Ascent (standard units)',  standard_units(HighPeaks.column('Ascent')), 
    'Time (standard units)', standard_units(HighPeaks.column('Time'))
)

# Bare last expression so the notebook displays the table.
HighPeaks_AT

In [None]:
# Step 2: element-wise product of the two standard-unit columns. Columns are
# addressed by position: index 2 and 3 are the standard-unit columns appended
# to the two-column table in the preceding cell (which must have been run).
HighPeaks_AT = HighPeaks_AT.with_columns(
    'product of standard units', HighPeaks_AT.column(2) * HighPeaks_AT.column(3))
# Bare last expression so the notebook displays the table.
HighPeaks_AT

In [None]:
# r is the average of the products of the standard units

# The product is recomputed here from columns 2 and 3 (the standard-unit
# columns) rather than read from the 'product of standard units' column.
r = np.average(HighPeaks_AT.column(2) * HighPeaks_AT.column(3))
# Bare last expression so the notebook displays the value.
r

## Function for Calculating $r$ ##

In [None]:
def correlation(t, x, y):
    """Return r for two columns of a table.

    t is a table; x and y are column labels. Both columns are converted to
    standard units and the average of their element-wise products is returned.
    """
    su_x, su_y = (standard_units(t.column(label)) for label in (x, y))
    products = su_x * su_y
    return np.average(products)

In [None]:
# r for 'Ascent' and 'Time', now computed with the reusable function.
correlation(HighPeaks, 'Ascent', 'Time')

In [None]:
# r for 'Length' and 'Time'.
correlation(HighPeaks, 'Length', 'Time')

In [None]:
# r for 'Length' and 'Ascent'.
correlation(HighPeaks, 'Length', 'Ascent')

### Switching Axes ###

In [None]:
# Evaluates to a 2-tuple of r with the two column labels given in each order.
# correlation() multiplies the standardized columns element-wise, so swapping
# the labels only swaps the factors of each product.
correlation(HighPeaks, 'Ascent', 'Time'), correlation(HighPeaks, 'Time', 'Ascent')

### Nonlinearity ###

In [None]:
# Read 'Tortoises.csv' (relative path) into a Table.
Tortoises = Table.read_table('Tortoises.csv')
# Bare last expression so the notebook displays the table.
Tortoises

In [None]:
# r for 'CarapaceLength' and 'NumEggs'; compare this single number with the
# shape of the scatter plot of the same two columns.
correlation(Tortoises, 'CarapaceLength', 'NumEggs')

In [None]:
# 'CarapaceLength' on the horizontal axis, 'NumEggs' on the vertical axis;
# s and color are styling options for the markers (size 30, red).
Tortoises.scatter('CarapaceLength', 'NumEggs', s=30, color='r')

### Outliers ###

In [None]:
# Read 'PalmBeach.csv' (relative path) into a Table.
PalmBeach = Table.read_table('PalmBeach.csv')
# Bare last expression so the notebook displays the table.
PalmBeach

In [None]:
# r for 'Buchanan' and 'Bush' using every row of the table.
correlation(PalmBeach, 'Buchanan', 'Bush')

In [None]:
# 'Bush' on the horizontal axis, 'Buchanan' on the vertical axis, all rows included.
PalmBeach.scatter('Bush', 'Buchanan')

In [None]:
# Keep only the rows whose 'County' value does not contain the text
# 'PALM BEACH'; the original PalmBeach table is left unchanged.
PalmBeach_removed = PalmBeach.where('County', are.not_containing('PALM BEACH'))

In [None]:
# Same axes as the full-table plot, drawn from the filtered table.
PalmBeach_removed.scatter('Bush', 'Buchanan')

In [None]:
# r for 'Buchanan' and 'Bush' recomputed on the filtered table, for comparison
# with the value obtained from all rows.
correlation(PalmBeach_removed, 'Buchanan', 'Bush')