In [None]:
import matplotlib
from datascience import *
%matplotlib inline
import matplotlib.pyplot as plots
import numpy as np
from scipy.stats import norm
plots.style.use('fivethirtyeight')
import warnings
warnings.simplefilter(action = "ignore", category = FutureWarning)

# Lecture 26

## Average (Mean) 

In [None]:
# small example array reused by the mean and median cells that follow
values = make_array(2, 3, 3, 10)

In [None]:
# use np.average() to compute the mean of the example array
np.average(values)

In [None]:
# use np.mean() to compute the same mean of the example array
np.mean(values)

In [None]:
# calculate manually: add up the four entries of `values` (2, 3, 3, 10)
# and divide by how many entries there are
(2 + 3 + 3 + 10)/4

In [None]:
# calculate manually 2: the mean as a weighted sum, where each of the four
# entries of `values` (2, 3, 3, 10) contributes with weight 1/4
2*(1/4) + 3*(1/4) + 3*(1/4) + 10*(1/4)

In [None]:
# use np.median() to calculate the median of the example array
np.median(values)

In [None]:
# use percentile() to get the 50th percentile of the example array,
# which is used here as the median
percentile(50, values)

In [None]:
# load NBA player data from 'nba2013.csv' into a Table and display it
nba = Table.read_table('nba2013.csv')
nba

In [None]:
# visualize the heights of NBA players
# bin edges are 65.5, 66.5, ..., 89.5, i.e. unit-width bins
nba.hist('Height', bins=np.arange(65.5, 90.5))

In [None]:
# calculate the median two ways and show both results together
# (a cell only displays its last expression, so the two calls are combined
# into one tuple; on separate lines the percentile() result would be lost)
heights = nba.column('Height')
percentile(50, heights), np.median(heights)

In [None]:
# calculate the mean of the player heights
np.average(heights)

## Chebyshev's Bounds ##

In [None]:
# load the baby births data from 'baby.csv' into a Table and display it
births = Table.read_table('baby.csv')
births

In [None]:
# mean and SD of the 'Maternal Pregnancy Weight' column
mpw = births.column('Maternal Pregnancy Weight')
mean, sd = np.mean(mpw), np.std(mpw)
(mean, sd)

In [None]:
# keep the rows whose Maternal Pregnancy Weight lies within 3 SDs of the mean
# (uses the `mean` and `sd` computed from that same column)
within_3_SDs = births.where('Maternal Pregnancy Weight', are.between(mean - 3*sd, mean + 3*sd))

In [None]:
# proportion of all rows that fall within +/- 3 SDs of the mean
within_3_SDs.num_rows/births.num_rows

In [None]:
# Chebyshev's bound for z = 3: at least 1 - 1/z**2 of the data
# lies within z SDs of the mean
1 - 1/3**2 

In [None]:
# See if Chebyshev's bounds work for different distributions:
# for every column, print the observed percent of rows within z SDs of the
# mean (z = 2, 3, 4, 5) next to Chebyshev's lower bound for that z.

for k in births.labels:
    # Loop-specific names keep the notebook-level `values`, `mean` and `sd`
    # from being overwritten when this cell runs.
    column_values = births.column(k)
    column_mean = np.mean(column_values)
    column_sd = np.std(column_values)
    print()
    print(k)
    for z in np.arange(2, 6):
        chosen = births.where(k, are.between(column_mean - z*column_sd, column_mean + z*column_sd))
        proportion = chosen.num_rows/births.num_rows
        percent = round(proportion * 100, 2)
        # Chebyshev: at least 1 - 1/z**2 of the data is within z SDs of the
        # mean; expressed as a percent rounded to 2 decimal places.
        bound = np.round(100 * (1 - 1/z**2), 2)
        print('Average plus or minus', z, 'SDs:', percent, "%   Chebyshev's bound:", bound)
        

## Standard Units ##

In [None]:
# Load LeBron James' statistics and the league statistics from 'LeBron.csv'
# into a Table and display it
bb_stats = Table.read_table('LeBron.csv')
bb_stats

In [None]:
# z-score of each LeBron statistic: (value - league mean) / league SD,
# shown as an extra "Z-scores" column next to the original table
lebron = bb_stats.column("LeBron")
league_mean = bb_stats.column("League mean")
league_sd = bb_stats.column("League SD")
zscores = (lebron - league_mean) / league_sd
bb_stats.with_column("Z-scores", zscores)

In [None]:
# define a function that calculates z-scores for all values in an array
def standard_units(x):
    """Return array x converted to standard units.

    Each value has the mean of x subtracted and is then divided by the
    SD of x (as computed by np.std).
    """
    center = np.mean(x)
    spread = np.std(x)
    return (x - center) / spread

In [None]:
# get the mothers' ages as an array from the 'Maternal Age' column
ages = births.column('Maternal Age')

In [None]:
# convert the mothers' ages to standard units (z-scores) and display them
ages_standard_units = standard_units(ages)
ages_standard_units

In [None]:
# sanity check: values in standard units should have mean ~0 and SD ~1
np.mean(ages_standard_units), np.std(ages_standard_units)

In [None]:
# display z-scores for each age
both = Table().with_columns(
    'Age in Years', ages,
    'Age in Standard Units', ages_standard_units
)
both

In [None]:
# mean and SD of the ages in their original units (years)
np.mean(ages), np.std(ages)

In [None]:
# original data: histogram of ages in years, bin edges 15, 17, ..., 45
both.hist('Age in Years', bins = np.arange(15, 46, 2))

In [None]:
# z-score only changes the units, not the shape of the distribution
# (bins of width 0.35 starting at -2.2; x-axis limited to -2 .. 3.1)
both.hist('Age in Standard Units', bins = np.arange(-2.2, 3.4, 0.35))
plots.xlim(-2, 3.1);

## The normal curve ##

In [None]:
# Plot the standard normal density (mean 0, SD 1) on a fine grid
# from -4 up to (but not including) 4
x = np.arange(-4, 4, 0.001)
y = norm.pdf(x, loc=0, scale=1)

plots.plot(x, y)
plots.title("The Standard Normal Curve")
# integer tick marks at -4, -3, ..., 4
plots.xticks(np.arange(-4, 5));

### Data is often normal

In [None]:
# histogram of maternal heights, presented in this lecture as roughly normal
# bin edges are 56.5, 57.5, ..., 72.5
births.hist('Maternal Height', bins = np.arange(56.5, 72.6, 1), density=True)

In [None]:
# histogram of birth weights, also presented as roughly normal (default bins)
births.hist('Birth Weight')

In [None]:
# mean and SD of the 'Birth Weight' column
weights = births.column('Birth Weight')
mean_weight, sd_weight = np.mean(weights), np.std(weights)
(mean_weight, sd_weight)

In [None]:
# interval from 2 SDs below the mean birth weight to 2 SDs above it
(mean_weight - 2 * sd_weight, mean_weight + 2 * sd_weight)

In [None]:
# calculate the 2.5th and 97.5th percentiles of the birth weights, which
# bracket the middle 95% of the data
# (`weights` is the 'Birth Weight' array; the name `bw` was never defined)
percentile(2.5, weights), percentile(97.5, weights)

## Central Limit Theorem and Simulating Sample Mean ##

In [None]:
# load the flight delay data from 'united_summer2015.csv' into a Table
# and display it
united = Table.read_table('united_summer2015.csv')
united

In [None]:
# visualize the delays (which are not normal!)
# bin edges are -20, -10, ..., 290, i.e. bins of width 10
united.hist('Delay', bins = np.arange(-20, 300, 10))

In [None]:
# mean and SD of the 'Delay' column
delays = united.column('Delay')
mean_delay, sd_delay = np.mean(delays), np.std(delays)

(mean_delay, sd_delay)

In [None]:
# Take a random sample of size sample_size from the flights table and record
# the mean delay of that sample.
# Repeat num_repetitions times to get the empirical distribution of the
# sample average.
sample_size = 400
num_repetitions = 10000

# Collect the means in a list and convert once at the end; calling np.append
# inside the loop would copy the whole array on every iteration.
sample_means = []
for _ in range(num_repetitions):
    sampled_flights = united.sample(sample_size)
    sample_means.append(np.mean(sampled_flights.column('Delay')))

means = np.array(sample_means)

In [None]:
# visualize the distribution of the simulated sample means (20 bins),
# with the sample size shown in the title
Table().with_columns('Sample Mean', means).hist(bins = 20)
plots.title('Sample Means: Sample Size ' + str(sample_size))
plots.xlabel('Random Sample Mean');

In [None]:
# mean and SD of the simulated sample means
np.mean(means), np.std(means)

In [None]:
# mean and SD of the original delay data, for comparison
mean_delay, sd_delay

In [None]:
# original SD divided by the square root of the sample size,
# for comparison with the SD of the simulated sample means
sd_delay/np.sqrt(sample_size)