# Chapter 1

Examples and Exercises from Think Stats, 2nd Edition

http://thinkstats2.com

Copyright 2016 Allen B. Downey

MIT License: https://opensource.org/licenses/MIT


In [None]:
from os.path import basename, exists


def download(url):
    """Fetch `url` into the current directory unless the file is already there.

    The local file name is the last path component of `url`.  The transfer
    is written to a temporary ``<name>.part`` file and renamed to the final
    name only after it has finished, so an interrupted download cannot leave
    behind a truncated file that later runs would mistake for a complete one.

    url: string URL of the file to fetch

    Prints "Downloaded <name>" after a successful transfer.  When the file
    already exists, nothing is printed and no request is made.
    """
    filename = basename(url)
    if exists(filename):
        return

    import os
    from urllib.request import urlretrieve

    partial = filename + ".part"
    try:
        urlretrieve(url, partial)
        # The final name only ever appears once the transfer is complete.
        os.replace(partial, filename)
    finally:
        # Only relevant on failure; after a successful rename nothing is left.
        if exists(partial):
            os.remove(partial)
    print("Downloaded " + filename)


# Fetch thinkstats2.py and thinkplot.py from the ThinkStats2 repository.
for module_url in (
    "https://github.com/AllenDowney/ThinkStats2/raw/master/code/thinkstats2.py",
    "https://github.com/AllenDowney/ThinkStats2/raw/master/code/thinkplot.py",
):
    download(module_url)

In [None]:
# Fetch nsfg.py plus the two 2002FemPreg files, in the same order as before.
for file_url in (
    "https://github.com/AllenDowney/ThinkStats2/raw/master/code/nsfg.py",
    "https://github.com/AllenDowney/ThinkStats2/raw/master/code/2002FemPreg.dct",
    "https://github.com/AllenDowney/ThinkStats2/raw/master/code/2002FemPreg.dat.gz",
):
    download(file_url)

## Examples from Chapter 1

Read NSFG data into a Pandas DataFrame.

In [4]:
from os.path import basename, exists

def download(url):
    """Fetch `url` into the current directory unless the file is already there.

    NOTE(review): this function is also defined in the notebook's first code
    cell; whichever definition runs last is the one in effect.

    The local file name is the last path component of `url`.  The transfer
    is written to a temporary ``<name>.part`` file and renamed to the final
    name only after it has finished, so an interrupted download cannot leave
    behind a truncated file that later runs would mistake for a complete one.

    url: string URL of the file to fetch

    Prints "Downloaded <name>" after a successful transfer.  When the file
    already exists, nothing is printed and no request is made.
    """
    filename = basename(url)
    if exists(filename):
        return

    import os
    from urllib.request import urlretrieve

    partial = filename + ".part"
    try:
        urlretrieve(url, partial)
        # The final name only ever appears once the transfer is complete.
        os.replace(partial, filename)
    finally:
        # Only relevant on failure; after a successful rename nothing is left.
        if exists(partial):
            os.remove(partial)
    print("Downloaded " + filename)

# Fetch the three Python modules this notebook relies on.
for module_url in (
    "https://github.com/AllenDowney/ThinkStats2/raw/master/code/thinkstats2.py",
    "https://github.com/AllenDowney/ThinkStats2/raw/master/code/thinkplot.py",
    "https://github.com/AllenDowney/ThinkStats2/raw/master/code/nsfg.py",
):
    download(module_url)

import nsfg

Downloaded thinkplot.py


In [6]:
# Fetch the 2002FemPreg files (presumably what nsfg.ReadFemPreg loads),
# then build the DataFrame and preview it.
for data_url in (
    "https://github.com/AllenDowney/ThinkStats2/raw/master/code/2002FemPreg.dct",
    "https://github.com/AllenDowney/ThinkStats2/raw/master/code/2002FemPreg.dat.gz",
):
    download(data_url)

preg = nsfg.ReadFemPreg()
preg.head()

Downloaded 2002FemPreg.dct
Downloaded 2002FemPreg.dat.gz


Unnamed: 0,caseid,pregordr,howpreg_n,howpreg_p,moscurrp,nowprgdk,pregend1,pregend2,nbrnaliv,multbrth,...,laborfor_i,religion_i,metro_i,basewgt,adj_mod_basewgt,finalwgt,secu_p,sest,cmintvw,totalwgt_lb
0,1,1,,,,,6.0,,1.0,,...,0,0,0,3410.389399,3869.349602,6448.271112,2,9,,8.8125
1,1,2,,,,,6.0,,1.0,,...,0,0,0,3410.389399,3869.349602,6448.271112,2,9,,7.875
2,2,1,,,,,5.0,,3.0,5.0,...,0,0,0,7226.30174,8567.54911,12999.542264,2,12,,9.125
3,2,2,,,,,6.0,,1.0,,...,0,0,0,7226.30174,8567.54911,12999.542264,2,12,,7.0
4,2,3,,,,,6.0,,1.0,,...,0,0,0,7226.30174,8567.54911,12999.542264,2,12,,6.1875


Print the column names.

In [7]:
# The last expression in a cell is displayed: here, the Index of column labels.
preg.columns

Index(['caseid', 'pregordr', 'howpreg_n', 'howpreg_p', 'moscurrp', 'nowprgdk',
       'pregend1', 'pregend2', 'nbrnaliv', 'multbrth',
       ...
       'laborfor_i', 'religion_i', 'metro_i', 'basewgt', 'adj_mod_basewgt',
       'finalwgt', 'secu_p', 'sest', 'cmintvw', 'totalwgt_lb'],
      dtype='object', length=244)

Select a single column name.

In [8]:
# The column Index can be indexed by position; position 1 is the second label.
preg.columns[1]

'pregordr'

Select a column and check what type it is.

In [9]:
# Bracket (dictionary-style) access with the column name selects that column.
pregordr = preg['pregordr']
type(pregordr)

Print a column.

In [10]:
# Displaying the column shows its index labels alongside the values.
pregordr

Unnamed: 0,pregordr
0,1
1,2
2,1
3,2
4,3
...,...
13588,1
13589,2
13590,3
13591,4


Select a single element from a column.

In [11]:
# NOTE(review): with an integer index the key 0 is looked up as a label;
# it coincides with the first row here because the index starts at 0.
pregordr[0]

np.int64(1)

Select a slice from a column.

In [12]:
# The end of the slice is exclusive: this yields the entries at 2, 3 and 4.
pregordr[2:5]

Unnamed: 0,pregordr
2,1
3,2
4,3


Select a column using dot notation.

In [13]:
# Attribute access is an alternative to preg['pregordr'] for reading a column.
pregordr = preg.pregordr

Count the number of times each value occurs.

In [14]:
# Tally each distinct outcome code, ordered by code rather than by frequency.
preg["outcome"].value_counts().sort_index()

Unnamed: 0_level_0,count
outcome,Unnamed: 1_level_1
1,9148
2,1862
3,120
4,1921
5,190
6,352


Check the values of another variable.

In [15]:
# Tally each distinct birthwgt_lb value, ordered by value rather than by frequency.
preg["birthwgt_lb"].value_counts().sort_index()

Unnamed: 0_level_0,count
birthwgt_lb,Unnamed: 1_level_1
0.0,8
1.0,40
2.0,53
3.0,98
4.0,229
5.0,697
6.0,2223
7.0,3049
8.0,1889
9.0,623


Make a dictionary that maps from each respondent's `caseid` to a list of indices into the pregnancy `DataFrame`.  Use it to select the pregnancy outcomes for a single respondent.

In [16]:
# Look up the indices recorded for one respondent, then pull the outcome
# values stored at those indices.
caseid = 10229
preg_map = nsfg.MakePregMap(preg)
indices = preg_map[caseid]
preg.loc[indices, "outcome"].values

array([4, 4, 4, 4, 4, 4, 1])

## Exercises

Select the `birthord` column, print the value counts, and compare to results published in the [codebook](https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Dataset_Documentation/NSFG/Cycle6Codebook-Pregnancy.pdf)

We can also use `isnull` to count the number of NaN values.

In [17]:
# isnull() marks missing entries as True; summing the booleans counts them.
preg["birthord"].isnull().sum()

np.int64(4445)

Select the `prglngth` column, print the value counts, and compare to results published in the [codebook](https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Dataset_Documentation/NSFG/Cycle6Codebook-Pregnancy.pdf)

To compute the mean of a column, you can invoke the `mean` method on a Series.  For example, here is the mean birthweight in pounds:

In [18]:
# Mean of the totalwgt_lb column.
preg["totalwgt_lb"].mean()

np.float64(7.265628457623368)

Create a new column named `totalwgt_kg` that contains birth weight in kilograms.  Compute its mean.  Remember that when you create a new column, you have to use dictionary syntax, not dot notation.

`nsfg.py` also provides `ReadFemResp`, which reads the female respondents file and returns a `DataFrame`:

In [None]:
# Fetch the two 2002FemResp files.
for resp_url in (
    "https://github.com/AllenDowney/ThinkStats2/raw/master/code/2002FemResp.dct",
    "https://github.com/AllenDowney/ThinkStats2/raw/master/code/2002FemResp.dat.gz",
):
    download(resp_url)

In [None]:
# Build the respondent DataFrame via nsfg.ReadFemResp.
resp = nsfg.ReadFemResp()

`DataFrame` provides a method `head` that displays the first five rows:

In [None]:
# Preview the first rows of resp.
resp.head()

Select the `age_r` column from `resp` and print the value counts.  How old are the youngest and oldest respondents?

We can use the `caseid` to match up rows from `resp` and `preg`.  For example, we can select the row from `resp` for `caseid` 2298 like this:

In [None]:
# Boolean mask: keep only the rows of resp whose caseid equals 2298.
resp.loc[resp["caseid"] == 2298]

And we can get the corresponding rows from `preg` like this:

In [None]:
# Boolean mask: keep only the rows of preg whose caseid equals 2298.
preg.loc[preg["caseid"] == 2298]

How old is the respondent with `caseid` 1?

In [20]:
# Number of rows whose birthord is 3, read off the value-count table.
preg["birthord"].value_counts().loc[3]

np.int64(1234)

What are the pregnancy lengths for the respondent with `caseid` 2298?

In [26]:
# Locate the row holding the largest birthord, then report that row's
# caseid and the birthord value itself.
max_birthord_idx = preg["birthord"].idxmax()
caseid_max_birthord = preg.at[max_birthord_idx, "caseid"]
max_val = preg.at[max_birthord_idx, "birthord"]

print(
    f"Case ID with the highest birth order: {caseid_max_birthord}",
    f"Highest birth order value: {max_val}",
    sep="\n",
)

Case ID with the highest birth order: 1169
Highest birth order value: 10.0


In [25]:
# Average totalwgt_lb within each birthord group; idxmin then names the
# group with the smallest average.
mean_wgt_by_ord = preg.groupby("birthord").totalwgt_lb.mean()
lowest_ord = mean_wgt_by_ord.idxmin()

print(f"Birth order with lowest average weight: {int(lowest_ord)}")
# The five smallest group averages, ascending.
print(mean_wgt_by_ord.sort_values().head())

Birth order with lowest average weight: 8
birthord
8.0    6.151786
7.0    6.598684
5.0    7.031754
1.0    7.201094
4.0    7.236531
Name: totalwgt_lb, dtype: float64


In [24]:
# Mean birth weight, in kilograms, over the rows with birthord == 1.
# The conversion from `totalwgt_lb` is done here so the cell runs on a fresh
# kernel: the `totalwgt_kg` column is only created by a cell further down the
# notebook, so reading it at this point fails under Restart & Run All.
firstborn_wgt_kg = preg.loc[preg.birthord == 1, "totalwgt_lb"] / 2.205  # same lb -> kg factor used for the totalwgt_kg column
mean_weight_firstborns = firstborn_wgt_kg.mean()
print(round(mean_weight_firstborns, 2))

3.27


In [23]:
# Fetch the 2002FemResp files and build the respondent DataFrame.
for resp_url in (
    "https://github.com/AllenDowney/ThinkStats2/raw/master/code/2002FemResp.dct",
    "https://github.com/AllenDowney/ThinkStats2/raw/master/code/2002FemResp.dat.gz",
):
    download(resp_url)

resp = nsfg.ReadFemResp()

# Keep respondents whose caseid is at most 100.
# NOTE(review): this filters on the caseid *value*; it equals "the first 100
# cases" only if every id from 1 to 100 is present - confirm.
resp_subset = resp.loc[resp.caseid <= 100]

# Mean of age_r over that subset.
mean_age_first_100 = resp_subset["age_r"].mean()

print(f"Average Age (First 100 cases): {round(mean_age_first_100, 1)}")

Downloaded 2002FemResp.dct
Downloaded 2002FemResp.dat.gz
Average Age (First 100 cases): 30.2


In [22]:
# Derive a kilogram column from totalwgt_lb (dividing by 2.205), then compare
# its mean with its median.
preg["totalwgt_kg"] = preg["totalwgt_lb"] / 2.205
mean_kg = preg.totalwgt_kg.mean()
median_kg = preg.totalwgt_kg.median()
difference = abs(median_kg - mean_kg)

print(
    f"Mean (kg): {mean_kg}",
    f"Median (kg): {median_kg}",
    f"Absolute Difference: {round(difference, 2)}",
    sep="\n",
)

Mean (kg): 3.2950695952940445
Median (kg): 3.3446712018140587
Absolute Difference: 0.05


In [21]:
# Mean of the prglngth column, shown to one decimal place.
mean_prglngth = preg["prglngth"].mean()
print(round(mean_prglngth, 1))

29.5


What was the birthweight of the first baby born to the respondent with `caseid` 5013?