# Examples and Exercises from Think Stats, 2nd Edition

http://thinkstats2.com

Copyright 2016 Allen B. Downey

MIT License: https://opensource.org/licenses/MIT


In [1]:
import pandas as pd
from statadict import parse_stata_dict
import time
import numpy as np
import pickle
from collections import defaultdict

C:\Users\Arjun Janamatti\Anaconda3\envs\tf_nptel\lib\site-packages\numpy\.libs\libopenblas.PYQHXLVVQ7VESDPUVUADXEVJOBGHJPAY.gfortran-win_amd64.dll
C:\Users\Arjun Janamatti\Anaconda3\envs\tf_nptel\lib\site-packages\numpy\.libs\libopenblas.TXA6YQSD3GCQQC22GEQ54J2UDCXDXHWN.gfortran-win_amd64.dll
  stacklevel=1)


## Examples from Chapter 1

Read NSFG data into a Pandas DataFrame.

In [2]:
def GetDataframe():
    """Load the NSFG pregnancy records as a DataFrame, using a pickle cache.

    The cache file ``preg_df.pickle`` (in the working directory) is tried
    first. If it is missing or cannot be unpickled, the frame is rebuilt from
    ``2002FemPreg.dct`` / ``2002FemPreg.dat`` and the cache is rewritten.

    Returns:
        pandas.DataFrame with the columns caseid, prglength, outcome,
        pregordr, birthord, birthwgt_lb, birthwgt_oz, agepreg and finalwgt.

    Raises:
        Whatever ``parse_stata_dict`` / ``pd.read_fwf`` raise when the raw
        data files are unavailable and no usable cache exists.
    """
    cache_path = 'preg_df.pickle'
    try:
        # NOTE(security): pickle.load can execute arbitrary code; only load a
        # cache file that this notebook wrote itself.
        with open(cache_path, 'rb') as read_file:
            return pickle.load(read_file)
    except (OSError, EOFError, pickle.UnpicklingError, AttributeError, ImportError):
        # Cache missing, truncated, or written by an incompatible library
        # version: fall through and rebuild. Anything else (including
        # KeyboardInterrupt) is deliberately allowed to propagate.
        pass

    stata_dict = parse_stata_dict(file="2002FemPreg.dct")
    data = pd.read_fwf("2002FemPreg.dat", names=stata_dict.names, colspecs=stata_dict.colspecs)

    # Use wksgest where it is non-zero, otherwise mosgest * 4.33 (presumably
    # weeks per month -- confirm). A NaN wksgest stays NaN, exactly as the
    # previous row-wise truthiness test behaved.
    data['prglength'] = data['wksgest'].where(data['wksgest'] != 0, data['mosgest'] * 4.33)

    # .copy() so later column assignments on the result do not act on a view
    # of ``data`` (avoids SettingWithCopyWarning on the first, uncached run).
    preg_df = data[['caseid', 'prglength', 'outcome', 'pregordr',
                    'birthord', 'birthwgt_lb', 'birthwgt_oz',
                    'agepreg', 'finalwgt']].copy()

    with open(cache_path, 'wb') as file:
        pickle.dump(obj=preg_df, file=file)

    return preg_df


In [3]:
# Load the pregnancy records (from the pickle cache when one exists) and preview them.
preg_df = GetDataframe()
preg_df.head(n=5)

Unnamed: 0,caseid,prglength,outcome,pregordr,birthord,birthwgt_lb,birthwgt_oz,agepreg,finalwgt
0,1,39.0,1,1,1.0,8.0,13.0,3316.0,6448.271112
1,1,39.0,1,2,2.0,7.0,14.0,3925.0,6448.271112
2,2,39.0,1,1,1.0,9.0,2.0,1433.0,12999.542264
3,2,39.0,1,2,2.0,7.0,0.0,1783.0,12999.542264
4,2,39.0,1,3,3.0,6.0,3.0,1833.0,12999.542264


In [4]:
def data_cleaning(df=None):
    """Clean the pregnancy DataFrame in place and add total birth-weight columns.

    Steps performed:
      * ``agepreg`` is divided by 100.
      * ``birthwgt_lb`` values above 20 are set to NaN.
      * The codes 97, 98 and 99 in ``birthwgt_lb`` / ``birthwgt_oz`` are set
        to NaN.
      * ``totalwgt_lb`` = birthwgt_lb + birthwgt_oz / 16.
      * ``totalwgt_kg`` = totalwgt_lb * 0.453592.

    Args:
        df: DataFrame to clean. Defaults to the notebook-level ``preg_df`` so
            existing ``data_cleaning()`` calls keep working.

    Returns:
        The same DataFrame object that was cleaned (it is modified in place).

    Note:
        Not idempotent: every call divides ``agepreg`` by 100 again, so run
        it once per freshly loaded frame.
    """
    if df is None:
        df = preg_df

    na_vals = [97, 98, 99]

    df['agepreg'] = df['agepreg'] / 100

    df.loc[df['birthwgt_lb'] > 20, 'birthwgt_lb'] = np.nan
    # mask(isin(...)) blanks the special codes without a per-element lambda.
    df['birthwgt_lb'] = df['birthwgt_lb'].mask(df['birthwgt_lb'].isin(na_vals))
    df['birthwgt_oz'] = df['birthwgt_oz'].mask(df['birthwgt_oz'].isin(na_vals))

    # NaN in either component propagates to the totals.
    df['totalwgt_lb'] = df['birthwgt_lb'] + df['birthwgt_oz'] / 16.0
    df['totalwgt_kg'] = df['totalwgt_lb'] * 0.453592
    return df

In [5]:
# Apply the cleaning step and preview the first rows of the result.
preg_df = data_cleaning()
preg_df.head(n=5)

Unnamed: 0,caseid,prglength,outcome,pregordr,birthord,birthwgt_lb,birthwgt_oz,agepreg,finalwgt,totalwgt_lb,totalwgt_kg
0,1,39.0,1,1,1.0,8.0,13.0,33.16,6448.271112,8.8125,3.997279
1,1,39.0,1,2,2.0,7.0,14.0,39.25,6448.271112,7.875,3.572037
2,2,39.0,1,1,1.0,9.0,2.0,14.33,12999.542264,9.125,4.139027
3,2,39.0,1,2,2.0,7.0,0.0,17.83,12999.542264,7.0,3.175144
4,2,39.0,1,3,3.0,6.0,3.0,18.33,12999.542264,6.1875,2.806601


Count the number of times each value occurs.

In [6]:
# Frequency of each outcome code, ordered by code.
preg_df['outcome'].value_counts().sort_index()

1    9148
2    1862
3     120
4    1921
5     190
6     352
Name: outcome, dtype: int64

Check the values of another variable.

In [7]:
# Frequency of each whole-pound birth weight, ordered by weight.
preg_df['birthwgt_lb'].value_counts().sort_index()

0.0        8
1.0       40
2.0       53
3.0       98
4.0      229
5.0      697
6.0     2223
7.0     3049
8.0     1889
9.0      623
10.0     132
11.0      26
12.0      10
13.0       3
14.0       3
15.0       1
Name: birthwgt_lb, dtype: int64

Make a dictionary that maps from each respondent's `caseid` to a list of indices into the pregnancy `DataFrame`.  Use it to select the pregnancy outcomes for a single respondent.

In [8]:
def get_caseid_dict(df=None):
    """Map each caseid to the index labels of that respondent's pregnancy rows.

    Args:
        df: DataFrame with a ``caseid`` column. Defaults to the notebook-level
            ``preg_df`` so existing ``get_caseid_dict()`` calls keep working.

    Returns:
        collections.defaultdict(list) mapping caseid -> list of index labels,
        in row order. Looking up an unknown caseid yields an empty list.
    """
    if df is None:
        df = preg_df

    caseid_to_indices = defaultdict(list)
    # Series.items() replaces iteritems(), which was removed in pandas 2.0.
    # A plain loop is used because the appends are side effects; the previous
    # set comprehension only built a throwaway set of None.
    for index, caseid in df['caseid'].items():
        caseid_to_indices[caseid].append(index)
    return caseid_to_indices

In [9]:
# Look up one respondent's row labels, then pull that respondent's outcome codes.
caseid = 10229
preg_map = get_caseid_dict()
indices = preg_map[caseid]
preg_df.loc[indices, 'outcome'].values

array([4, 4, 4, 4, 4, 4, 1], dtype=int64)

## Exercises

Select the `birthord` column, print the value counts, and compare to results published in the [codebook](http://www.icpsr.umich.edu/nsfg6/Controller?displayPage=labelDetails&fileCode=PREG&section=A&subSec=8016&srtLabel=611933)

In [10]:
# Solution: frequency of each birth order (most common first).
preg_df['birthord'].value_counts()

1.0     4413
2.0     2874
3.0     1234
4.0      421
5.0      126
6.0       50
7.0       20
8.0        7
9.0        2
10.0       1
Name: birthord, dtype: int64

We can also use `isnull` to count the number of NaNs.

In [11]:
# Number of pregnancies with no birth order recorded.
preg_df['birthord'].isna().sum()

4445

Select the `prglength` column (this notebook's pregnancy-length column, used in place of the book's `prglngth`), print the value counts, and compare to results published in the [codebook](http://www.icpsr.umich.edu/nsfg6/Controller?displayPage=labelDetails&fileCode=PREG&section=A&subSec=8016&srtLabel=611931)

In [12]:
# Solution: frequency of each pregnancy length, longest first.
preg_df['prglength'].value_counts().sort_index(ascending=False)

97.0       2
50.0       2
48.0       7
47.0       1
46.0       2
45.0      10
44.0      46
43.0     149
42.0     330
41.0     593
40.0    1122
39.0    4755
38.0     574
37.0     457
36.0     328
35.0     288
34.0      58
33.0      48
32.0     122
31.0      27
30.0     152
29.0      23
28.0      37
27.0       8
26.0      86
25.0       6
24.0      31
23.0      12
22.0     100
21.0      17
20.0      18
19.0      34
18.0      17
17.0     207
16.0      45
15.0      39
14.0      29
13.0     406
12.0     159
11.0     202
10.0     137
9.0      539
8.0      319
7.0      177
6.0      548
5.0      175
4.0      370
3.0      150
2.0       73
1.0        8
0.0        6
Name: prglength, dtype: int64

To compute the mean of a column, you can invoke the `mean` method on a Series.  For example, here is the mean birthweight in pounds:

In [13]:
# Mean total birth weight in pounds, rounded to 3 decimals (NaN rows are skipped by mean()).
mean_totalwgt_lb = round(preg_df['totalwgt_lb'].mean(), 3)
print(f"Mean weight of baby in lb: {mean_totalwgt_lb}")

Mean weight of baby in lb: 7.266


Create a new column named `totalwgt_kg` that contains birth weight in kilograms.  Compute its mean.  Remember that when you create a new column, you have to use dictionary syntax, not dot notation.

In [14]:
# Solution: totalwgt_kg was added during cleaning; report its mean rounded to 3 decimals.
mean_totalwgt_kg = round(preg_df['totalwgt_kg'].mean(), 3)
print(f"Mean weight of baby in kg: {mean_totalwgt_kg}")

Mean weight of baby in kg: 3.296


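`nsfg.py` also provides `ReadFemResp`, which reads the female respondents file and returns a `DataFrame`. In this notebook the `FemRespDataframe` function below plays that role, reading `2002FemResp.dat` and caching the result as a pickle: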

In [15]:
def FemRespDataframe():
    """Load the NSFG female-respondent records as a DataFrame, using a pickle cache.

    The cache file ``fem_resp_df.pickle`` (in the working directory) is tried
    first. If it is missing or cannot be unpickled, the frame is rebuilt from
    ``2002FemResp.dct`` / ``2002FemResp.dat`` and the cache is rewritten.

    Returns:
        pandas.DataFrame with one column per variable named in the Stata
        dictionary.

    Raises:
        Whatever ``parse_stata_dict`` / ``pd.read_fwf`` raise when the raw
        data files are unavailable and no usable cache exists.
    """
    cache_path = 'fem_resp_df.pickle'
    try:
        # NOTE(security): pickle.load can execute arbitrary code; only load a
        # cache file that this notebook wrote itself.
        with open(cache_path, 'rb') as read_file:
            return pickle.load(read_file)
    except (OSError, EOFError, pickle.UnpicklingError, AttributeError, ImportError):
        # Cache missing, truncated, or written by an incompatible library
        # version: fall through and rebuild. Anything else (including
        # KeyboardInterrupt) is deliberately allowed to propagate.
        pass

    stata_dict = parse_stata_dict(file="2002FemResp.dct")
    fem_resp_df = pd.read_fwf("2002FemResp.dat", names=stata_dict.names, colspecs=stata_dict.colspecs)

    with open(cache_path, 'wb') as file:
        pickle.dump(obj=fem_resp_df, file=file)

    return fem_resp_df

In [16]:
# Load the female-respondent records (served from the pickle cache when one exists).
resp = FemRespDataframe()

`DataFrame` provides a method `head` that displays the first five rows:

In [17]:
# Preview the first five respondent rows.
resp.head(n=5)

Unnamed: 0,caseid,rscrinf,rdormres,rostscrn,rscreenhisp,rscreenrace,age_a,age_r,cmbirth,agescrn,...,pubassis_i,basewgt,adj_mod_basewgt,finalwgt,secu_r,sest,cmintvw,cmlstyr,screentime,intvlngth
0,2298,1,5,5,1,5.0,27,27,902,27,...,0,3247.916977,5123.759559,5556.717241,2,18,1234,1222,18:26:36,110.492667
1,5012,1,5,1,5,5.0,42,42,718,42,...,0,2335.279149,2846.79949,4744.19135,2,18,1233,1221,16:30:59,64.294
2,11586,1,5,1,5,5.0,43,43,708,43,...,0,2335.279149,2846.79949,4744.19135,2,18,1234,1222,18:19:09,75.149167
3,6794,5,5,4,1,5.0,15,15,1042,15,...,0,3783.152221,5071.464231,5923.977368,2,18,1234,1222,15:54:43,28.642833
4,616,1,5,4,1,5.0,20,20,991,20,...,0,5341.329968,6437.335772,7229.128072,2,18,1233,1221,14:19:44,69.502667


Select the `age_r` column from `resp` and print the value counts.  How old are the youngest and oldest respondents?

In [18]:
# Solution: respondents per age; the first and last index entries are the youngest and oldest ages.
age_counts = resp['age_r'].value_counts()
age_counts.sort_index()

15    217
16    223
17    234
18    235
19    241
20    258
21    267
22    287
23    282
24    269
25    267
26    260
27    255
28    252
29    262
30    292
31    278
32    273
33    257
34    255
35    262
36    266
37    271
38    256
39    215
40    256
41    250
42    215
43    253
44    235
Name: age_r, dtype: int64

We can use the `caseid` to match up rows from `resp` and `preg_df`.  For example, we can select the row from `resp` for `caseid` 2298 like this:

In [19]:
# Respondent row for caseid 2298.
resp.loc[resp['caseid'] == 2298]

Unnamed: 0,caseid,rscrinf,rdormres,rostscrn,rscreenhisp,rscreenrace,age_a,age_r,cmbirth,agescrn,...,pubassis_i,basewgt,adj_mod_basewgt,finalwgt,secu_r,sest,cmintvw,cmlstyr,screentime,intvlngth
0,2298,1,5,5,1,5.0,27,27,902,27,...,0,3247.916977,5123.759559,5556.717241,2,18,1234,1222,18:26:36,110.492667


And we can get the corresponding rows from `preg_df` like this:

In [20]:
# Pregnancy rows belonging to caseid 2298.
preg_df.loc[preg_df['caseid'] == 2298]

Unnamed: 0,caseid,prglength,outcome,pregordr,birthord,birthwgt_lb,birthwgt_oz,agepreg,finalwgt,totalwgt_lb,totalwgt_kg
2610,2298,40.0,1,1,1.0,6.0,14.0,18.08,5556.717241,6.875,3.118445
2611,2298,36.0,1,2,2.0,5.0,8.0,20.0,5556.717241,5.5,2.494756
2612,2298,30.0,1,3,3.0,4.0,3.0,21.41,5556.717241,4.1875,1.899417
2613,2298,40.0,1,4,4.0,6.0,14.0,24.66,5556.717241,6.875,3.118445


How old is the respondent with `caseid` 1?

In [21]:
# Solution: read age_r from the (first) respondent row matching the caseid.
check_respondent_id = 1
respondent_age = resp.loc[resp['caseid'] == check_respondent_id, 'age_r'].values[0]
print(f"Age of caseid {check_respondent_id} is : {respondent_age}")

Age of caseid 1 is : 44


What are the pregnancy lengths for the respondent with `caseid` 2298?

In [22]:
# Solution: pregnancy lengths for caseid 2298, in ascending order.
np.sort(preg_df.loc[preg_df['caseid'] == 2298, 'prglength'].values)

array([30., 36., 40., 40.])

What was the birthweight of the first baby born to the respondent with `caseid` 5012?

In [23]:
# Solution: birth weight (totalwgt_kg) of the respondent's first-born baby.
check_respondent_id = 5012
# Filter on the variable rather than repeating the literal, and select the
# birthord == 1 row so the answer is the first-born baby, not merely whichever
# pregnancy row comes first for this respondent.
first_baby = preg_df[(preg_df['caseid'] == check_respondent_id) & (preg_df['birthord'] == 1)]
first_baby_weight = first_baby['totalwgt_kg'].values[0]
print(f"birthweight of caseid {check_respondent_id} is : {first_baby_weight}")

birthweight of caseid 5012 is : 2.721552
