# Week 5 (Aaron Kohn)

http://thinkstats2.com

Copyright 2016 Allen B. Downey

MIT License: https://opensource.org/licenses/MIT

## Exercise 5-1

In [1]:
import scipy.stats
import hinc
import nsfg
import math
import thinkstats2
import numpy as np

In [2]:
# Male height is modelled here with a normal distribution (scipy.stats.norm),
# with its parameters converted from centimetres to inches.
logmean = 178 / 2.54  # mean of the height model, in inches
logsd = 7.7 / 2.54    # standard deviation of the height model, in inches
BMlow, BMhigh = 70, 73  # lower and upper limits of the range of interest, in inches

# Freeze the distribution once, then evaluate the CDF at both limits.
height_dist = scipy.stats.norm(loc=logmean, scale=logsd)
BMrange = height_dist.cdf([BMlow, BMhigh])

# The share of the population inside the range is the difference of the two CDF values.
cdf_low, cdf_high = BMrange
print('The percentage of the population that is within the range is %',(cdf_high - cdf_low)*100)
BMrange

The percentage of the population that is within the range is % 34.27468376314744


array([0.48963903, 0.83238587])

## Exercise 5-2

In [3]:
# Pareto model of height: shape parameter b = 1.7, scale (minimum value) = 1 meter.
parheight = scipy.stats.pareto(1.7, scale=1)

# Mean height under this model, in meters.
parheight.mean()


2.428571428571429

In [4]:
# Fraction of the population that is shorter than the mean height.
mean_height = parheight.mean()
parheight.cdf(mean_height)

0.778739697565288

In [5]:
# Expected number of people taller than 1 km (1000 m) in a population of 7 billion.
# sf(x) is the survival function, i.e. 1 - cdf(x); calling it directly avoids the
# cancellation error of subtracting a CDF value very close to 1 from 1.
parheight.sf(1000) * 7000000000

55602.976430479954

In [6]:
# Height in meters that only one person in 7 billion is expected to exceed,
# i.e. the height of the tallest individual under this model.
# isf(q) inverts the survival function, so the tiny tail probability q is used
# as-is instead of being rounded when forming 1 - q for ppf.
parheight.isf(1 / 7000000000)

618349.6106759505

## Exercise 6-1

In [7]:
# Copied from the chapter 6 exercise; iterrows was replaced with itertuples.
def InterpolateSample(df, log_upper=6.0):
    """Makes a sample of log10 household income.

    Assumes that log10 income is uniform in each range.

    Side effect: adds (or overwrites) the columns log_upper and log_lower
    on df.

    df: DataFrame with columns income and freq; each row is one income
        range and the rows are assumed to be ordered from the lowest
        range to the highest
    log_upper: log10 of the assumed upper bound for the highest range

    returns: NumPy array of log10 household income
    """
    # compute the log10 of the upper bound for each range
    df['log_upper'] = np.log10(df.income)

    # get the lower bounds by shifting the upper bound down one row
    df['log_lower'] = df.log_upper.shift(1)

    # nothing to interpolate for an empty table
    if len(df) == 0:
        return np.array([], dtype=float)

    # fill in the lower bound of the first range; the first and last rows
    # are addressed by position so the function works for any number of
    # rows and any index labels (not only a 42-row frame labelled 0..41)
    df.iloc[0, df.columns.get_loc('log_lower')] = 3.0

    # plug in a value for the unknown upper bound of the highest range
    df.iloc[-1, df.columns.get_loc('log_upper')] = log_upper

    # use the freq column to generate the right number of evenly spaced
    # values in each range; linspace needs an integer count
    arrays = [
        np.linspace(row.log_lower, row.log_upper, int(row.freq))
        for row in df.itertuples()
    ]

    # collect the arrays into a single sample
    return np.concatenate(arrays)

In [8]:
# Read the income table via hinc.ReadData (presumably the household income
# data used in the chapter -- confirm against hinc), interpolate a log10
# sample with the highest range capped at 10**6, then convert back from log10.
income_df = hinc.ReadData()
log_sample = InterpolateSample(df=income_df, log_upper=6.0)
sample = 10 ** log_sample

In [9]:
def RawMoment(xs, k):
    """Returns the kth raw moment of xs: the average of x**k over the sample.

    xs: sized iterable of numbers
    k: exponent applied to each value
    """
    powers = (value**k for value in xs)
    return sum(powers) / len(xs)
def Mean(xs):
    """Returns the mean of xs, computed as the first raw moment."""
    first_moment = RawMoment(xs, k=1)
    return first_moment
def CentralMoment(xs, k):
    """Returns the kth central moment of xs.

    This is the average of (x - mean)**k over the sample; k=2 gives the
    (population) variance.

    xs: sized iterable of numbers
    k: exponent applied to each deviation from the mean
    """
    centre = Mean(xs)
    deviations = ((value - centre)**k for value in xs)
    return sum(deviations) / len(xs)
def Median(xs):
    """Returns the median of xs.

    Builds a thinkstats2.Cdf from the sample and asks it for the value at
    cumulative probability 0.5.
    """
    return thinkstats2.Cdf(xs).Value(.5)
def StandardizedMoment(xs, k):
    """Returns the kth standardized moment of xs.

    This is the kth central moment divided by the standard deviation
    raised to the kth power.

    xs: sized iterable of numbers
    k: order of the moment
    """
    sigma = math.sqrt(CentralMoment(xs, 2))
    kth_central = CentralMoment(xs, k)
    return kth_central / sigma**k
def Skewness(xs):
    """Returns the sample skewness of xs, i.e. the third standardized moment."""
    third_moment = StandardizedMoment(xs, k=3)
    return third_moment
def PearsonSkew(xs):
    """Returns Pearson's median skewness of xs.

    Computed as 3 * (mean - median) / standard deviation.
    """
    centre = Mean(xs)
    middle = Median(xs)
    sigma = math.sqrt(CentralMoment(xs, 2))
    return 3 * (centre - middle) / sigma
def Output(xs):
    """Prints summary statistics for the sample xs.

    Reports the mean, median, skewness and Pearson's median skewness, then
    the value of the sample's thinkstats2.Cdf at the mean (the fraction of
    the sample below the mean).
    """
    # Each statistic is computed and printed in turn, in this order.
    report = (
        ('Mean = ', Mean),
        ('Median = ', Median),
        ('Skew = ', Skewness),
        ("Pearson's median skewness = ", PearsonSkew),
    )
    for label, statistic in report:
        print(label, statistic(xs))

    sample_cdf = thinkstats2.Cdf(xs)
    print('Amount below mean =', sample_cdf[Mean(xs)])

In [10]:
# Summary statistics for the sample built with log_upper=6.0,
# i.e. with the highest income range capped at 10**6.
Output(sample)

Mean =  74278.70753118733
Median =  51226.45447894046
Skew =  4.949920244429583
Pearson's median skewness =  0.7361258019141782
Amount below mean = 0.660005879566872


All values are calculated with the assumption that the upper bound of the highest income range is 1 million dollars. Assuming a higher upper bound will move the mean and the skewness up accordingly. The median will remain the same, since the number of respondents doesn't change and only the top earners are being adjusted. Below are the calculations for an upper bound of 10 million dollars.

In [11]:
# Same analysis with log_upper=7.0, i.e. the highest income range capped at 10**7.
log_sample7 = InterpolateSample(df=income_df, log_upper=7.0)
sample7 = 10 ** log_sample7
Output(sample7)

Mean =  124267.39722164685
Median =  51226.45447894046
Skew =  11.603690267537793
Pearson's median skewness =  0.39156450927742087
Amount below mean = 0.8565630665207663


The mean has gone up while the median has remained the same. The skewness has increased. However, Pearson's median skewness has gone down, because the standard deviation in its denominator grew faster than the gap between the mean and the median. It is important to note that raising the assumed upper bound only spreads out the incomes of the highest earners, while the lower income ranges remain unchanged.