# Examples and Exercises from Think Stats, 2nd Edition

http://thinkstats2.com

Copyright 2016 Allen B. Downey

MIT License: https://opensource.org/licenses/MIT


In [1]:
from __future__ import print_function, division

import nsfg

## Examples from Chapter 1

Read NSFG data into a Pandas DataFrame.

In [2]:
preg = nsfg.ReadFemPreg()
preg.head()

Unnamed: 0,caseid,pregordr,howpreg_n,howpreg_p,moscurrp,nowprgdk,pregend1,pregend2,nbrnaliv,multbrth,...,laborfor_i,religion_i,metro_i,basewgt,adj_mod_basewgt,finalwgt,secu_p,sest,cmintvw,totalwgt_lb
0,1,1,,,,,6.0,,1.0,,...,0,0,0,3410.389399,3869.349602,6448.271112,2,9,,8.8125
1,1,2,,,,,6.0,,1.0,,...,0,0,0,3410.389399,3869.349602,6448.271112,2,9,,7.875
2,2,1,,,,,5.0,,3.0,5.0,...,0,0,0,7226.30174,8567.54911,12999.542264,2,12,,9.125
3,2,2,,,,,6.0,,1.0,,...,0,0,0,7226.30174,8567.54911,12999.542264,2,12,,7.0
4,2,3,,,,,6.0,,1.0,,...,0,0,0,7226.30174,8567.54911,12999.542264,2,12,,6.1875


Print the column names.

In [3]:
preg.columns

Index(['caseid', 'pregordr', 'howpreg_n', 'howpreg_p', 'moscurrp', 'nowprgdk',
       'pregend1', 'pregend2', 'nbrnaliv', 'multbrth',
       ...
       'laborfor_i', 'religion_i', 'metro_i', 'basewgt', 'adj_mod_basewgt',
       'finalwgt', 'secu_p', 'sest', 'cmintvw', 'totalwgt_lb'],
      dtype='object', length=244)

Select a single column name.

In [4]:
preg.columns[1]

'pregordr'

Select a column and check what type it is.

In [5]:
pregordr = preg['pregordr']
type(pregordr)

pandas.core.series.Series

Print a column.

In [6]:
pregordr

0        1
1        2
2        1
3        2
4        3
5        1
6        2
7        3
8        1
9        2
10       1
11       1
12       2
13       3
14       1
15       2
16       3
17       1
18       2
19       1
20       2
21       1
22       2
23       1
24       2
25       3
26       1
27       1
28       2
29       3
        ..
13563    2
13564    3
13565    1
13566    1
13567    1
13568    2
13569    1
13570    2
13571    3
13572    4
13573    1
13574    2
13575    1
13576    1
13577    2
13578    1
13579    2
13580    1
13581    2
13582    3
13583    1
13584    2
13585    1
13586    2
13587    3
13588    1
13589    2
13590    3
13591    4
13592    5
Name: pregordr, Length: 13593, dtype: int64

Select a single element from a column.

In [7]:
pregordr[0]

1

Select a slice from a column.

In [8]:
pregordr[2:5]

2    1
3    2
4    3
Name: pregordr, dtype: int64

Select a column using dot notation.

In [9]:
pregordr = preg.pregordr

Count the number of times each value occurs.

In [10]:
preg.outcome.value_counts().sort_index()

1    9148
2    1862
3     120
4    1921
5     190
6     352
Name: outcome, dtype: int64

Check the values of another variable.

In [11]:
preg.birthwgt_lb.value_counts().sort_index()

0.0        8
1.0       40
2.0       53
3.0       98
4.0      229
5.0      697
6.0     2223
7.0     3049
8.0     1889
9.0      623
10.0     132
11.0      26
12.0      10
13.0       3
14.0       3
15.0       1
Name: birthwgt_lb, dtype: int64

Make a dictionary that maps from each respondent's `caseid` to a list of indices into the pregnancy `DataFrame`.  Use it to select the pregnancy outcomes for a single respondent.

In [12]:
caseid = 10229
preg_map = nsfg.MakePregMap(preg)
indices = preg_map[caseid]
preg.outcome[indices].values

array([4, 4, 4, 4, 4, 4, 1], dtype=int64)

## Exercises

Select the `birthord` column, print the value counts, and compare to results published in the [codebook](http://www.icpsr.umich.edu/nsfg6/Controller?displayPage=labelDetails&fileCode=PREG&section=A&subSec=8016&srtLabel=611933)

In [13]:
# Solution goes here
birthord= preg['birthord']

In [14]:
birthord

0        1.0
1        2.0
2        1.0
3        2.0
4        3.0
5        1.0
6        2.0
7        3.0
8        1.0
9        2.0
10       1.0
11       1.0
12       2.0
13       NaN
14       NaN
15       1.0
16       2.0
17       1.0
18       NaN
19       1.0
20       2.0
21       1.0
22       NaN
23       1.0
24       2.0
25       3.0
26       1.0
27       1.0
28       2.0
29       3.0
        ... 
13563    2.0
13564    3.0
13565    1.0
13566    1.0
13567    NaN
13568    NaN
13569    1.0
13570    2.0
13571    3.0
13572    4.0
13573    1.0
13574    2.0
13575    NaN
13576    1.0
13577    NaN
13578    1.0
13579    2.0
13580    NaN
13581    1.0
13582    NaN
13583    NaN
13584    1.0
13585    NaN
13586    NaN
13587    NaN
13588    1.0
13589    NaN
13590    NaN
13591    2.0
13592    3.0
Name: birthord, Length: 13593, dtype: float64

In [15]:
birthord.value_counts(), birthord.value_counts().sum()

(1.0     4413
 2.0     2874
 3.0     1234
 4.0      421
 5.0      126
 6.0       50
 7.0       20
 8.0        7
 9.0        2
 10.0       1
 Name: birthord, dtype: int64, 9148)

We can also use `isnull` to count the number of nans.

In [16]:
preg.birthord.isnull().sum()

4445

In [17]:
preg.birthord.isnull().sum() + birthord.value_counts().sum()

13593

Select the `prglngth` column, print the value counts, and compare to results published in the [codebook](http://www.icpsr.umich.edu/nsfg6/Controller?displayPage=labelDetails&fileCode=PREG&section=A&subSec=8016&srtLabel=611931)

In [18]:
# Solution goes here
prgCounts = preg['prglngth'].value_counts()
prgCounts

39    4744
40    1120
38     609
9      594
41     591
6      543
37     457
13     446
4      412
8      409
35     357
36     329
42     328
17     253
11     202
30     198
5      181
7      175
12     170
3      151
43     148
22     147
10     137
32     122
26     117
2       78
34      60
33      50
44      46
16      44
15      39
28      38
21      37
19      34
24      31
31      29
14      29
29      23
20      18
18      17
0       15
25      15
23      12
45      10
1        9
27       8
48       7
50       2
46       1
47       1
Name: prglngth, dtype: int64

In [51]:
men13 = (preg['prglngth']<=13).value_counts()[True]
men18 = (preg['prglngth']<=26).value_counts()[True]
alles = prgCounts.sum()
men13, men18- men13, alles- men18

(3522, 793, 9278)

To compute the mean of a column, you can invoke the `mean` method on a Series.  For example, here is the mean birthweight in pounds:

In [20]:
preg.totalwgt_lb.mean()

7.265628457623368

Create a new column named <tt>totalwgt_kg</tt> that contains birth weight in kilograms.  Compute its mean.  Remember that when you create a new column, you have to use dictionary syntax, not dot notation.

In [21]:
# Solution goes here
preg['totalwgt_kg'] = preg.totalwgt_lb*0.456
preg['totalwgt_kg'], preg['totalwgt_kg'].mean()

(0        4.0185
 1        3.5910
 2        4.1610
 3        3.1920
 4        2.8215
 5        3.9045
 6        4.3605
 7        3.8190
 8        3.4485
 9        3.0210
 10       3.5625
 11       3.1920
 12       1.8240
 13          NaN
 14          NaN
 15       3.5055
 16       3.4200
 17       2.8785
 18          NaN
 19       3.9900
 20       3.7335
 21       2.5365
 22          NaN
 23       3.0780
 24       3.3630
 25       3.1065
 26       3.7050
 27       3.2490
 28       2.7645
 29       3.3915
           ...  
 13563    3.5055
 13564    3.4770
 13565    3.7050
 13566    3.4200
 13567       NaN
 13568       NaN
 13569    2.6505
 13570    3.0495
 13571    2.7360
 13572    2.6505
 13573    2.9925
 13574    2.7930
 13575       NaN
 13576    2.9355
 13577       NaN
 13578    2.7360
 13579    3.1920
 13580       NaN
 13581    2.9070
 13582       NaN
 13583       NaN
 13584    2.9070
 13585       NaN
 13586       NaN
 13587       NaN
 13588    2.8215
 13589       NaN
 13590       N

`nsfg.py` also provides `ReadFemResp`, which reads the female respondents file and returns a `DataFrame`:

In [22]:
resp = nsfg.ReadFemResp()
resp

Unnamed: 0,caseid,rscrinf,rdormres,rostscrn,rscreenhisp,rscreenrace,age_a,age_r,cmbirth,agescrn,...,pubassis_i,basewgt,adj_mod_basewgt,finalwgt,secu_r,sest,cmintvw,cmlstyr,screentime,intvlngth
0,2298,1,5,5,1,5.0,27,27,902,27,...,0,3247.916977,5123.759559,5556.717241,2,18,1234,1222,18:26:36,110.492667
1,5012,1,5,1,5,5.0,42,42,718,42,...,0,2335.279149,2846.799490,4744.191350,2,18,1233,1221,16:30:59,64.294000
2,11586,1,5,1,5,5.0,43,43,708,43,...,0,2335.279149,2846.799490,4744.191350,2,18,1234,1222,18:19:09,75.149167
3,6794,5,5,4,1,5.0,15,15,1042,15,...,0,3783.152221,5071.464231,5923.977368,2,18,1234,1222,15:54:43,28.642833
4,616,1,5,4,1,5.0,20,20,991,20,...,0,5341.329968,6437.335772,7229.128072,2,18,1233,1221,14:19:44,69.502667
5,845,1,5,4,1,5.0,42,42,727,42,...,0,2335.279149,3725.796795,4705.681352,2,18,1234,1222,17:10:13,95.488000
6,10333,5,5,3,1,5.0,17,17,1029,17,...,0,2335.279149,2687.399758,3139.151658,2,18,1236,1224,14:14:38,61.204333
7,855,5,5,4,5,5.0,22,22,965,22,...,0,4670.558298,7122.614751,10019.382170,2,18,1235,1223,14:42:52,59.756333
8,8656,5,5,4,1,5.0,38,38,780,38,...,0,5198.652195,6027.568848,6520.021223,2,18,1237,1225,15:32:34,56.978833
9,3566,5,5,4,5,5.0,21,21,974,21,...,0,2764.142038,3240.986558,4559.095792,2,18,1231,1219,16:22:25,104.744667


`DataFrame` provides a method `head` that displays the first five rows:

In [23]:
resp.head()

Unnamed: 0,caseid,rscrinf,rdormres,rostscrn,rscreenhisp,rscreenrace,age_a,age_r,cmbirth,agescrn,...,pubassis_i,basewgt,adj_mod_basewgt,finalwgt,secu_r,sest,cmintvw,cmlstyr,screentime,intvlngth
0,2298,1,5,5,1,5.0,27,27,902,27,...,0,3247.916977,5123.759559,5556.717241,2,18,1234,1222,18:26:36,110.492667
1,5012,1,5,1,5,5.0,42,42,718,42,...,0,2335.279149,2846.79949,4744.19135,2,18,1233,1221,16:30:59,64.294
2,11586,1,5,1,5,5.0,43,43,708,43,...,0,2335.279149,2846.79949,4744.19135,2,18,1234,1222,18:19:09,75.149167
3,6794,5,5,4,1,5.0,15,15,1042,15,...,0,3783.152221,5071.464231,5923.977368,2,18,1234,1222,15:54:43,28.642833
4,616,1,5,4,1,5.0,20,20,991,20,...,0,5341.329968,6437.335772,7229.128072,2,18,1233,1221,14:19:44,69.502667


Select the `age_r` column from `resp` and print the value counts.  How old are the youngest and oldest respondents?

In [36]:
# Solution goes here
# age_r = resp['age_r'], age_r.min(), age_r.max()
ageR = resp['age_r']
ageR, ageR.min(), ageR.max() 

(0       27
 1       42
 2       43
 3       15
 4       20
 5       42
 6       17
 7       22
 8       38
 9       21
 10      43
 11      26
 12      23
 13      34
 14      28
 15      28
 16      23
 17      33
 18      16
 19      24
 20      22
 21      32
 22      41
 23      37
 24      38
 25      29
 26      21
 27      37
 28      39
 29      26
         ..
 7613    18
 7614    24
 7615    15
 7616    30
 7617    24
 7618    34
 7619    34
 7620    26
 7621    22
 7622    19
 7623    19
 7624    37
 7625    20
 7626    23
 7627    23
 7628    17
 7629    36
 7630    44
 7631    32
 7632    40
 7633    35
 7634    35
 7635    30
 7636    41
 7637    35
 7638    34
 7639    17
 7640    29
 7641    16
 7642    28
 Name: age_r, Length: 7643, dtype: int64, 15, 44)

We can use the `caseid` to match up rows from `resp` and `preg`.  For example, we can select the row from `resp` for `caseid` 2298 like this:

In [25]:
resp[resp.caseid==2298]

Unnamed: 0,caseid,rscrinf,rdormres,rostscrn,rscreenhisp,rscreenrace,age_a,age_r,cmbirth,agescrn,...,pubassis_i,basewgt,adj_mod_basewgt,finalwgt,secu_r,sest,cmintvw,cmlstyr,screentime,intvlngth
0,2298,1,5,5,1,5.0,27,27,902,27,...,0,3247.916977,5123.759559,5556.717241,2,18,1234,1222,18:26:36,110.492667


And we can get the corresponding rows from `preg` like this:

In [26]:
preg[preg.caseid==2298]

Unnamed: 0,caseid,pregordr,howpreg_n,howpreg_p,moscurrp,nowprgdk,pregend1,pregend2,nbrnaliv,multbrth,...,religion_i,metro_i,basewgt,adj_mod_basewgt,finalwgt,secu_p,sest,cmintvw,totalwgt_lb,totalwgt_kg
2610,2298,1,,,,,6.0,,1.0,,...,0,0,3247.916977,5123.759559,5556.717241,2,18,,6.875,3.135
2611,2298,2,,,,,6.0,,1.0,,...,0,0,3247.916977,5123.759559,5556.717241,2,18,,5.5,2.508
2612,2298,3,,,,,6.0,,1.0,,...,0,0,3247.916977,5123.759559,5556.717241,2,18,,4.1875,1.9095
2613,2298,4,,,,,6.0,,1.0,,...,0,0,3247.916977,5123.759559,5556.717241,2,18,,6.875,3.135


How old is the respondent with `caseid` 1?

In [38]:
# Solution goes here
resp[resp.caseid==2298]['age_r']

0    27
Name: age_r, dtype: int64

What are the pregnancy lengths for the respondent with `caseid` 2298?

In [41]:
# Solution goes here
preg[preg.caseid==2298]['prglngth']

2610    40
2611    36
2612    30
2613    40
Name: prglngth, dtype: int64

What was the birthweight of the first baby born to the respondent with `caseid` 5012?

In [50]:
# Solution goes here
preg[preg.caseid==5012]['birthwgt_lb']

5515    6.0
Name: birthwgt_lb, dtype: float64