In [None]:
import matplotlib
from datascience import *
%matplotlib inline
import matplotlib.pyplot as plots
import numpy as np
# Apply matplotlib's 'fivethirtyeight' style sheet to the plots drawn below
plots.style.use('fivethirtyeight')

# Lecture 25 #

## Average (Mean) ##
There are several equivalent methods for calculating the mean of some numbers, i.e., the sum of the values divided by the number of values.

In [None]:
# A small example data set; later cells compute its mean and SD step by step
values = make_array(2, 3, 3, 9)
values

In [None]:
# Method 1: add up the values and divide by how many there are
sum(values)/len(values)

In [None]:
# Method 2: NumPy's average function
np.average(values)

In [None]:
# Method 3: NumPy's mean function
np.mean(values)

In [None]:
# Method 4: write out the sum by hand and divide by the count
(2 + 3 + 3 + 9)/4

1 of the 4 values is 2, 2 of the 4 values are 3, and 1 of the 4 values is 9:

In [None]:
# Weight each distinct value by the fraction of the entries equal to it
2*(1/4) + 3*(2/4) + 9*(1/4)

25% of the values are 2, 50% of the values are 3, and 25% of the values are 9:

In [None]:
# The same weighted sum, with the proportions written as decimals
2*0.25 + 3*0.5 + 9*0.25

Let's visualize our distribution.

In [None]:
# Put the array in a one-column table so it can be drawn with Table.hist
values_table = Table().with_columns('value', values)
values_table

In [None]:
# Unit-width bin edges 0.5, 1.5, ..., 10.5: each integer from 1 to 10
# sits in the middle of its own bin. Reused by the histograms below.
bins_for_display = np.arange(0.5, 10.6, 1)  # bins centered at 1, 2, 3, ...
values_table.hist('value', bins = bins_for_display)

In [None]:
## Consider an array of 10 2s, 20 3s, and 10 9s

# np.repeat copies each distinct value the matching number of times,
# in order: ten 2s, then twenty 3s, then ten 9s.
new_vals = np.repeat(make_array(2, 3, 9), [10, 20, 10])

**Question**: How will the distribution for this new array differ from the previous array when we show it in a histogram (y-axis scaled as percent per unit)?

In [None]:
# Histogram of the larger array, using the same bins as the first histogram
Table().with_column('value', new_vals).hist(bins = bins_for_display)

In [None]:
# We also get the same average!
# One print call, one average per line: first `values`, then `new_vals`.
print(np.average(values), np.average(new_vals), sep='\n')

**Back to slides...**

## Average as Center of Mass

In [None]:
# Redraw the histogram, then mark the average underneath it as the
# balance point ("center of mass") of the distribution.
Table().with_column('value', new_vals).hist(bins = bins_for_display)
plots.ylim(-0.04, 0.5)  # leave room below y = 0 for the marker
plots.plot([0, 10], [0, 0], color='grey', lw=2)  # grey baseline along y = 0
# Place the marker at the computed average (4.25 for new_vals) instead of a
# hard-coded number, so it stays correct if new_vals is ever changed.
plots.scatter(np.average(new_vals), -0.015, marker='^', color='red', s=100)
plots.title('Average as Center of Mass');

**Back to slides...**

## Standard Deviation ##
We need a way to quantify the variability of a distribution.

In [None]:
# We have some values (the array defined at the start of the notebook)
values

In [None]:
# We put them in a table; later cells add one column per step of the SD calculation
sd_table = Table().with_columns('Value', values)
sd_table

In [None]:
# We find their average (the reference point for the deviations)
average_value = np.mean(values)
average_value

In [None]:
# We find the deviations from the mean:
# each value minus the average of all the values
deviations = values - average_value
sd_table = sd_table.with_column('Deviation', deviations)
sd_table

In [None]:
# An interesting property: the sum of the deviations is always...
# (positive and negative deviations cancel out)
print("Sum of deviations is:", sum(deviations))

**Since the average deviation is always zero, "average deviation" is not a good way to measure variability!**

Maybe we could square each deviation. None of the squared deviations are negative, so they will only add up to zero if literally every value is the same as the average.

In [None]:
# Squaring makes every deviation non-negative, so they can no longer cancel
sd_table = sd_table.with_column('Squared Deviation', deviations**2)
sd_table.show()
print("Sum of squared deviations is:", sum(deviations**2))

In [None]:
# DEFINITION: Variance of the data is defined to be the
# mean squared deviation from average
# (units are the square of the data's units)
variance = np.mean(deviations ** 2)
variance

In [None]:
# DEFINITION: Standard Deviation (SD) is defined to be the square root of the variance,
# that is,
# SD = root mean squared deviation from average
# Taking the square root brings the measure back to the units of the data.
sd = np.sqrt(variance)
sd

In [None]:
# NumPy provides a function for this, np.std();
# compare its result with the SD computed step by step above
np.std(values)

**Back to Slides...**

## Chebyshev's Bounds ##
Let's check this out in the context of some real data.

In [None]:
# Consider the births data without the Maternal Smoker column
# (read from baby.csv, resolved relative to the notebook's working directory)
births = Table.read_table('baby.csv').drop('Maternal Smoker')

In [None]:
# Column labels remaining in the births table
births.labels

In [None]:
# Generate a sequence of histograms for these numerical variables
# (overlay = False: separate plots rather than one shared set of axes)
births.hist(overlay = False)

In [None]:
# Maternal Pregnancy Weight is clearly skewed to the right
# Will that defeat Chebyshev?
mpw = births.column('Maternal Pregnancy Weight')
# NOTE: `sd` was previously the SD of the toy `values` array; it is rebound here.
# `mean` and `sd` are read by the next cell.
mean = np.mean(mpw)
sd = np.std(mpw)
mean, sd

Chebyshev's inequality says that **at least** 1 - 1/3² = 8/9 ≈ 88.89% of the data will fall within 3 SDs of the mean. Is it true?

In [None]:
# Keep only the rows whose Maternal Pregnancy Weight is within 3 SDs of the mean
# (uses `mean` and `sd` computed in the previous cell)
within_3_SDs = births.where(
    'Maternal Pregnancy Weight', are.between(mean - 3*sd, mean + 3*sd))

In [None]:
# Proportion within 3 SDs of the mean:
# rows kept by the filter divided by all rows
within_3_SDs.num_rows / births.num_rows

In [None]:
# Chebyshev's bound:
# This proportion should be at least 1 - 1/z**2, here with z = 3

1 - 1/3**2

In [None]:
# The columns the next cell loops over
births.labels

In [None]:
# See if Chebyshev's bounds work for distributions with various shapes.
# For every column of `births`, print the percent of rows that lie within
# 2, 3, 4 and 5 SDs of that column's average.
#
# The per-column quantities use `feature_`-prefixed names so that this cell
# does not overwrite the notebook-level `values`, `mean` and `sd` from earlier
# cells; re-running an earlier cell afterwards then still sees the toy data.

for feature in births.labels:
    feature_values = births.column(feature)
    feature_mean = np.mean(feature_values)
    feature_sd = np.std(feature_values)
    print()
    print(feature)
    for z in make_array(2, 3, 4, 5):
        # Rows whose value for this column is within z SDs of its average
        chosen = births.where(
            feature,
            are.between(feature_mean - z*feature_sd, feature_mean + z*feature_sd))
        proportion = chosen.num_rows / births.num_rows
        percent = round(proportion * 100, 2)
        print('Average plus or minus', z, 'SDs:', percent, '%')

In [None]:
# Recall Chebyshev's bounds: at least 1 - 1/z**2 of the data
# lies within z SDs of the average, whatever the distribution's shape.
print("Chebychev's Lower Bounds:")
for z in range(2, 6):
    bound = 1 - 1 / z**2
    bound_percent = round(bound*100, 2)
    print("Average plus or minus", z, "SDs:", bound_percent, "%")

**Back to slides...**

## Standard Units ##

In [None]:
def standard_units(x):
    """Convert array x to standard units.

    Each entry is re-expressed as the number of standard deviations it
    lies above (positive) or below (negative) the average of x.

    x: array of numbers.
    Returns an array with the same number of entries as x, computed as
    (x - average of x) / (SD of x), with the SD taken from np.std.
    """
    center = np.mean(x)
    spread = np.std(x)
    deviations_from_center = x - center
    return deviations_from_center / spread

In [None]:
# Pull the 'Maternal Age' column out of the births table as an array
ages = births.column('Maternal Age')
ages

In [None]:
# The same ages, re-expressed in standard units
ages_standard_units = standard_units(ages)
ages_standard_units

In [None]:
# Check: data in standard units should have mean about 0 and SD about 1
# (up to floating-point rounding)
np.mean(ages_standard_units), np.std(ages_standard_units)


**Back to Slides...**

## Discussion Question 

In [None]:
# Put the ages in original units and in standard units side by side, one row per age
both = Table().with_columns(
    'Age in Years', ages,
    'Age in Standard Units', ages_standard_units
)
both

In [None]:
# Mean and SD of the ages in years, for converting between the two scales
np.mean(ages), np.std(ages)

In [None]:
# Histogram of ages in years; bin edges 15, 17, ..., 45 (2-year-wide bins)
both.hist('Age in Years', bins = np.arange(15, 46, 2))

In [None]:
# Histogram of the same ages in standard units, with bins 0.35 wide.
# NOTE(review): 0.35 is presumably about 2 years divided by the SD of the ages,
# so these bins line up with the 2-year bins of the previous histogram - confirm.
both.hist('Age in Standard Units', bins = np.arange(-2.2, 3.4, 0.35))
plots.xlim(-2, 3.1);  # trailing ';' suppresses the text repr of the return value