In [None]:
# Notebook setup: plotting, table and numeric libraries plus display options.
import matplotlib
# Wildcard import; presumably the source of Table, make_array and are used in later cells.
from datascience import *
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import matplotlib.pyplot as plots
import numpy as np
# Apply matplotlib's 'fivethirtyeight' style sheet to every plot drawn below.
plots.style.use('fivethirtyeight')
import warnings
# Ignore FutureWarning messages so they do not clutter cell output.
warnings.simplefilter('ignore', FutureWarning)

## Average (Mean) ##

In [None]:
# Small example data set (four numbers) used by the averaging cells that follow.
values = make_array(2, 3, 3, 9)

In [None]:
# Mean from its definition: the total of the entries divided by how many there are.
sum(values)/len(values)

In [None]:
# The same array passed to np.average (no weights given), for comparison.
np.average(values)

In [None]:
# The same array passed to np.mean, for comparison.
np.mean(values)

In [None]:
# The four entries averaged with plain arithmetic: add them, divide by 4.
(2 + 3 + 3 + 9)/4

In [None]:
# Weighted form of the same average: each distinct value times the proportion
# of entries equal to it (3 appears in 2 of the 4 entries).
2*(1/4) + 3*(2/4) + 9*(1/4)

## Mean vs Median

In [None]:
# Two nine-element arrays that are identical except for the final entry
# (5 in bottom_values, 10 in top_values).
bottom_values = make_array(1, 2, 2, 3, 3, 3, 4, 4, 5)
top_values = make_array(1, 2, 2, 3, 3, 3, 4, 4, 10)

# Both arrays side by side as columns of one table.
values_table = Table().with_columns('top_values', top_values, 'bottom_values', bottom_values)

In [None]:
# Bin edges 0.5, 1.5, ..., 10.5.
bins_for_display = np.arange(start=0.5, stop=10.6, step=1)
# Histogram of each column using those bins, with overlay turned off.
values_table.hist(
    'top_values',
    'bottom_values',
    bins=bins_for_display,
    overlay=False,
)

In [None]:
# Mean and median of top_values, displayed together as a pair.
np.mean(top_values), np.median(top_values)

In [None]:
# Mean and median of bottom_values, displayed together as a pair.
np.mean(bottom_values), np.median(bottom_values)

## Defining Variability

In [None]:
# Mean of bottom_values, computed once and reused for two of the columns below.
bottom_mean = np.mean(bottom_values)

# One row per entry: the value, the (constant) mean, and value minus mean.
bottom_values_table = Table().with_columns(
    'bottom values', bottom_values,
    'mean of bottom values', np.repeat(bottom_mean, len(bottom_values)),
    'deviation from mean', bottom_values - bottom_mean
)

bottom_values_table

In [None]:
# Average of the deviations: their total divided by the number of rows.
sum(bottom_values_table.column('deviation from mean')) / bottom_values_table.num_rows

In [None]:
# Average of the squared deviations: square each deviation, total them,
# and divide by the number of rows.
sum((bottom_values_table.column('deviation from mean')) ** 2 ) / bottom_values_table.num_rows

In [None]:
# Square root (** 0.5) of the average squared deviation.
(sum((bottom_values_table.column('deviation from mean')) ** 2 ) / bottom_values_table.num_rows) ** 0.5

In [None]:
# np.std on bottom_values, for comparison with the hand computation of the
# root mean squared deviation (np.std divides by the number of entries by default).
np.std(bottom_values)

In [None]:
# np.std on top_values, for comparison with the bottom_values result.
np.std(top_values)

## Chebyshev's Bound

In [None]:
# Read the CSV, then discard the 'Maternal Smoker' column; the remaining
# columns are what the rest of the notebook works with.
births_raw = Table.read_table('baby.csv')
births = births_raw.drop('Maternal Smoker')

# Preview the first three rows.
births.show(3)

In [None]:
# Histograms of the columns of births, with overlay turned off.
births.hist(overlay = False)

In [None]:
# Mean and SD of the 'Maternal Pregnancy Weight' column; the names mean and sd
# are read by the within-3-SDs cell that follows.
mpw = births.column('Maternal Pregnancy Weight')
mean, sd = np.mean(mpw), np.std(mpw)
(mean, sd)

In [None]:
# Rows whose 'Maternal Pregnancy Weight' satisfies are.between(mean - 3*sd, mean + 3*sd).
# Reads the notebook-level mean and sd, so they must currently hold the values
# computed for 'Maternal Pregnancy Weight'.
within_3_SDs = births.where('Maternal Pregnancy Weight', are.between(mean - 3*sd, mean + 3*sd))

In [None]:
# Proportion within 3 SDs of the mean:
# rows kept in within_3_SDs divided by all rows in births.

within_3_SDs.num_rows / births.num_rows

In [None]:
# Chebyshev's bound for z = 3:
# the proportion within 3 SDs of the mean should be at least 1 - 1/z**2.

1 - 1/(3**2)

In [None]:
# Column labels of births.
births.labels

In [None]:
# See if Chebyshev's bounds work for distributions with various shapes
#
# For every column of births, print the percent of rows that fall within
# z SDs of that column's mean, for z = 2, 3, 4, 5.
#
# The per-column quantities use loop-specific names (column_values, column_mean,
# column_sd) so that this cell does not overwrite the notebook-level `values`,
# `mean` and `sd`; re-running an earlier cell that reads `mean` and `sd` after
# this loop therefore still sees the Maternal Pregnancy Weight figures.

for variable in births.labels:
    column_values = births.column(variable)
    column_mean = np.mean(column_values)
    column_sd = np.std(column_values)
    print()
    print(variable)
    for z in make_array(2, 3, 4, 5):
        # Keep the rows of this column accepted by are.between over mean -/+ z SDs.
        chosen = births.where(
            variable,
            are.between(column_mean - z*column_sd, column_mean + z*column_sd)
        )
        proportion = chosen.num_rows / births.num_rows
        percent = round(proportion * 100, 2)
        print('Average plus or minus', z, 'SDs:', percent, '% of the data')

## Standard Units ##

In [None]:
def standard_units(x):
    """Return x expressed in standard units.

    Each entry has the mean of x subtracted and is then divided by the
    standard deviation of x (np.std with its default settings).
    """
    center = np.mean(x)
    spread = np.std(x)
    deviations = x - center
    return deviations / spread

In [None]:
# The 'Maternal Age' column of births.
ages = births.column('Maternal Age')

In [None]:
# The same ages converted with standard_units.
ages_standard_units = standard_units(ages)

In [None]:
# Mean and SD of the ages in their original units.
np.mean(ages), np.std(ages)

In [None]:
# Mean and SD after conversion to standard units; expected to be about 0 and 1
# (floating-point rounding may leave a tiny nonzero mean).
np.mean(ages_standard_units), np.std(ages_standard_units)

In [None]:
# Ages in their original units next to the same ages in standard units.
both = Table().with_columns('Age in Years', ages, 'Age in Standard Units', ages_standard_units)
both

In [None]:
# Mean and SD of the ages shown again, next to the histograms of both.
np.mean(ages), np.std(ages)

In [None]:
# Histograms of the columns of both, with overlay turned off.
both.hist(overlay = False)

In [None]:
# Pull the birth weights once, then attach them to both in original units
# and in standard units.
birth_weight_column = births.column('Birth Weight')
both = both.with_columns(
    'Birth Weight', birth_weight_column,
    'Birth Weight in Standard Units', standard_units(birth_weight_column)
)

both

In [None]:
# Rows of both ordered by 'Birth Weight'; the result is only displayed,
# both itself is not reassigned.
both.sort('Birth Weight')

## The SD and Bell-Shaped Curves

In [None]:
# Histogram of 'Birth Weight' with bin edges 65, 70, ..., 185 (np.arange stops before 190).
# NOTE(review): any birth weights below 65 or above 185 would presumably be left
# out of the plot — confirm against the range of the data.
births.hist('Birth Weight', bins = np.arange(65, 190, 5))

In [None]:
# Mean and SD of the 'Birth Weight' column, displayed together as a pair.
birth_weights = births.column('Birth Weight')
birth_weight_mean = np.mean(birth_weights)
birth_weight_sd = np.std(birth_weights)
(birth_weight_mean, birth_weight_sd)