In [1]:
"""This file contains code for use with "Think Stats",
by Allen B. Downey, available from greenteapress.com

Copyright 2010 Allen B. Downey
License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
"""

import sys
import numpy as np
import thinkstats2

from collections import defaultdict


def ReadFemResp(dct_file='2002FemResp.dct',
                dat_file='2002FemResp.dat.gz',
                nrows=None):
    """Loads the NSFG respondent file into a DataFrame.

    dct_file: string file name of the dictionary passed to ReadStataDct
    dat_file: string file name of the gzip-compressed data file
    nrows: forwarded unchanged to ReadFixedWidth (default None)

    returns: DataFrame, after CleanFemResp has been applied to it
    """
    respondents = thinkstats2.ReadStataDct(dct_file).ReadFixedWidth(
        dat_file, nrows=nrows, compression='gzip')
    CleanFemResp(respondents)
    return respondents


def CleanFemResp(df):
    """Placeholder for recoding variables in the respondent frame.

    No recoding is performed at present; `df` is left untouched.

    df: DataFrame
    """
    return None


def ReadFemPreg(dct_file='2002FemPreg.dct',
                dat_file='2002FemPreg.dat.gz',
                nrows=None):
    """Reads the NSFG pregnancy data.

    dct_file: string file name
    dat_file: string file name
    nrows: forwarded to ReadFixedWidth, mirroring ReadFemResp; existing
           callers that omit it get the default None
           (presumably "read every row" -- confirm in thinkstats2)

    returns: DataFrame, after CleanFemPreg has been applied to it
    """
    dct = thinkstats2.ReadStataDct(dct_file)
    df = dct.ReadFixedWidth(dat_file, compression='gzip', nrows=nrows)
    # CleanFemPreg recodes the frame in place, so its return value is unused
    CleanFemPreg(df)
    return df


def CleanFemPreg(df):
    """Recodes variables from the pregnancy frame, modifying `df` in place.

    Every change is written back through `df[...]` / `df.loc[...]` rather
    than through an in-place method on a column accessor
    (`df.col.replace(..., inplace=True)`): pandas treats that form as
    chained assignment, which is deprecated and leaves `df` unchanged when
    Copy-on-Write is active.

    NOTE: not idempotent -- each call divides agepreg by 100 again.

    df: DataFrame with columns agepreg, birthwgt_lb, birthwgt_oz, hpagelb,
        babysex, nbrnaliv and cmintvw

    returns: None
    """
    # mother's age is encoded in centiyears; convert to years
    df['agepreg'] = df['agepreg'] / 100.0

    # birthwgt_lb contains at least one bogus value (51 lbs)
    # replace with NaN
    df.loc[df['birthwgt_lb'] > 20, 'birthwgt_lb'] = np.nan

    # replace 'not ascertained', 'refused', 'don't know' with NaN
    na_vals = [97, 98, 99]
    for column in ('birthwgt_lb', 'birthwgt_oz', 'hpagelb'):
        df[column] = df[column].replace(na_vals, np.nan)

    df['babysex'] = df['babysex'].replace([7, 9], np.nan)
    df['nbrnaliv'] = df['nbrnaliv'].replace([9], np.nan)

    # birthweight is stored in two columns, lbs and oz.
    # convert to a single column in lb
    # NOTE: creating a new column requires dictionary syntax,
    # not attribute assignment (like df.totalwgt_lb)
    df['totalwgt_lb'] = df['birthwgt_lb'] + df['birthwgt_oz'] / 16.0

    # due to a bug in ReadStataDct, the last variable gets clipped;
    # so for now set it to NaN
    df['cmintvw'] = np.nan


def ValidatePregnum(resp, preg):
    """Validate pregnum in the respondent file.

    Checks, respondent by respondent, that `pregnum` equals the number of
    rows carrying the same caseid in the pregnancy frame. On the first
    mismatch the caseid, the record count and the pregnum are printed and
    the check stops.

    resp: respondent DataFrame with `caseid` and `pregnum` columns
    preg: pregnancy DataFrame with a `caseid` column

    returns: True if every respondent matches, False on the first mismatch
    """
    # make the map from caseid to list of pregnancy indices
    preg_map = MakePregMap(preg)

    # Walk the two respondent columns in lockstep. This replaces
    # Series.iteritems(), which pandas 2.0 removed, and avoids a label
    # lookup into resp.caseid for every row.
    for caseid, pregnum in zip(resp['caseid'], resp['pregnum']):
        n_records = len(preg_map[caseid])

        # check that pregnum from the respondent file equals
        # the number of records in the pregnancy file
        if n_records != pregnum:
            print(caseid, n_records, pregnum)
            return False

    return True


def MakePregMap(df):
    """Make a map from caseid to list of row index labels.

    df: DataFrame with a `caseid` column

    returns: defaultdict(list) that maps each caseid to the index labels of
             the rows in `df` carrying that caseid, in row order; looking up
             an unknown caseid yields an empty list
    """
    d = defaultdict(list)
    # Series.items() is the supported spelling; iteritems() was removed
    # in pandas 2.0.
    for index, caseid in df['caseid'].items():
        d[caseid].append(index)
    return d


def main():
    """Smoke-tests the readers and validators in this module.

    Loads the respondent and pregnancy files with their default file names,
    asserts a set of known counts, prints the pregnancy frame's shape and,
    if nothing failed, prints 'All tests passed.'. Takes no arguments.
    """
    # respondent file
    respondents = ReadFemResp()

    assert len(respondents) == 7643
    assert respondents.pregnum.value_counts()[1] == 1267

    # pregnancy file
    pregnancies = ReadFemPreg()
    print(pregnancies.shape)

    assert len(pregnancies) == 13593
    assert pregnancies.caseid[13592] == 12571

    # (column, value, number of rows expected to hold that value)
    expected_counts = [
        ('pregordr', 1, 5033),
        ('nbrnaliv', 1, 8981),
        ('babysex', 1, 4641),
        ('birthwgt_lb', 7, 3049),
        ('birthwgt_oz', 0, 1037),
        ('prglngth', 39, 4744),
        ('outcome', 1, 9148),
        ('birthord', 1, 4413),
        ('agepreg', 22.75, 100),
        ('totalwgt_lb', 7.5, 302),
    ]
    for column, value, expected in expected_counts:
        assert pregnancies[column].value_counts()[value] == expected

    # the largest finalwgt value should occur six times
    weight_counts = pregnancies.finalwgt.value_counts()
    assert weight_counts[max(weight_counts.keys())] == 6

    # pregnum in the respondent frame must match the number of
    # rows per caseid in the pregnancy frame
    assert ValidatePregnum(respondents, pregnancies)

    print('All tests passed.')


# Run the self-test when this code executes as the top-level program.
# In this notebook the cell runs as '__main__', so executing it calls main().
if __name__ == '__main__':
    main()


(13593, 244)
All tests passed.


In [2]:
# Displaying the female pregnancy data from 2002-2003
# Each row is one pregnancy record, so a caseid repeats once per pregnancy.

import nsfg
df = nsfg.ReadFemPreg()
df

Unnamed: 0,caseid,pregordr,howpreg_n,howpreg_p,moscurrp,nowprgdk,pregend1,pregend2,nbrnaliv,multbrth,...,laborfor_i,religion_i,metro_i,basewgt,adj_mod_basewgt,finalwgt,secu_p,sest,cmintvw,totalwgt_lb
0,1,1,,,,,6.0,,1.0,,...,0,0,0,3410.389399,3869.349602,6448.271112,2,9,,8.8125
1,1,2,,,,,6.0,,1.0,,...,0,0,0,3410.389399,3869.349602,6448.271112,2,9,,7.8750
2,2,1,,,,,5.0,,3.0,5.0,...,0,0,0,7226.301740,8567.549110,12999.542264,2,12,,9.1250
3,2,2,,,,,6.0,,1.0,,...,0,0,0,7226.301740,8567.549110,12999.542264,2,12,,7.0000
4,2,3,,,,,6.0,,1.0,,...,0,0,0,7226.301740,8567.549110,12999.542264,2,12,,6.1875
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13588,12571,1,,,,,6.0,,1.0,,...,0,0,0,4670.540953,5795.692880,6269.200989,1,78,,6.1875
13589,12571,2,,,,,3.0,,,,...,0,0,0,4670.540953,5795.692880,6269.200989,1,78,,
13590,12571,3,,,,,3.0,,,,...,0,0,0,4670.540953,5795.692880,6269.200989,1,78,,
13591,12571,4,,,,,6.0,,1.0,,...,0,0,0,4670.540953,5795.692880,6269.200989,1,78,,7.5000


In [3]:
# Let's take a look at the columns in this dataframe
# (pandas abbreviates the middle of a long listing with '...')

df.columns

# (length=244) --> 244 variables in this dataset

Index(['caseid', 'pregordr', 'howpreg_n', 'howpreg_p', 'moscurrp', 'nowprgdk',
       'pregend1', 'pregend2', 'nbrnaliv', 'multbrth',
       ...
       'laborfor_i', 'religion_i', 'metro_i', 'basewgt', 'adj_mod_basewgt',
       'finalwgt', 'secu_p', 'sest', 'cmintvw', 'totalwgt_lb'],
      dtype='object', length=244)

In [4]:
# Summary statistics per column. `count` is the number of non-null values,
# so it also shows how sparsely each variable is populated.
df.describe()

Unnamed: 0,caseid,pregordr,howpreg_n,howpreg_p,moscurrp,nowprgdk,pregend1,pregend2,nbrnaliv,multbrth,...,laborfor_i,religion_i,metro_i,basewgt,adj_mod_basewgt,finalwgt,secu_p,sest,cmintvw,totalwgt_lb
count,13593.0,13593.0,352.0,349.0,352.0,3.0,13241.0,18.0,9144.0,163.0,...,13593.0,13593.0,13593.0,13593.0,13593.0,13593.0,13593.0,13593.0,0.0,9038.0
mean,6216.526595,2.34915,15.144886,1.34384,4.647727,3.666667,4.650177,4.055556,1.022419,1.834356,...,0.000809,0.003016,0.0,4216.271164,5383.982581,8196.42228,1.48731,44.083352,,7.265628
std,3645.417341,1.577807,13.922211,0.47567,2.527523,4.618802,1.84979,1.696787,0.190098,1.630208,...,0.028437,0.058727,0.0,3982.680473,5640.499431,9325.918114,0.499857,24.110403,,1.408293
min,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,64.577101,71.201194,118.65679,1.0,1.0,,0.125
25%,3022.0,1.0,5.0,1.0,2.0,1.0,3.0,3.0,1.0,1.0,...,0.0,0.0,0.0,2335.445237,2798.048902,3841.375308,1.0,25.0,,6.5
50%,6161.0,2.0,9.0,1.0,5.0,1.0,6.0,4.0,1.0,1.0,...,0.0,0.0,0.0,3409.648504,4127.220642,6256.592133,1.0,45.0,,7.375
75%,9423.0,3.0,23.0,2.0,7.0,5.0,6.0,6.0,1.0,1.0,...,0.0,0.0,0.0,4869.941451,5795.69288,9432.360931,2.0,65.0,,8.125
max,12571.0,19.0,99.0,2.0,9.0,9.0,9.0,6.0,5.0,5.0,...,1.0,2.0,0.0,99707.832014,157143.686687,261879.953864,2.0,84.0,,15.4375


In [5]:
# Pregnancy records whose pregordr equals 10.
# NOTE(review): pregordr looks like the pregnancy's sequence number for a
# respondent, so these rows are 10th pregnancies -- not women who gave birth
# to 10 children. Confirm against the codebook.

df[df["pregordr"] == 10]

Unnamed: 0,caseid,pregordr,howpreg_n,howpreg_p,moscurrp,nowprgdk,pregend1,pregend2,nbrnaliv,multbrth,...,laborfor_i,religion_i,metro_i,basewgt,adj_mod_basewgt,finalwgt,secu_p,sest,cmintvw,totalwgt_lb
432,413,10,,,,,9.0,,,,...,0,0,0,4169.859401,8184.27106,10640.370805,1,77,,
655,601,10,,,,,8.0,,,,...,0,0,0,3409.593178,4268.065313,5548.911714,1,38,,
1318,1169,10,,,,,6.0,,1.0,,...,0,0,0,1167.622501,1528.814908,1940.219,1,44,,9.6875
1788,1597,10,,,,,3.0,,,,...,0,0,0,1806.307083,1940.127711,2969.365461,1,44,,
2494,2195,10,,,,,3.0,,,,...,0,0,0,1167.677859,1245.288219,1618.999214,1,75,,
2578,2265,10,,,,,1.0,,,,...,0,0,0,3878.375586,4129.398979,4466.770875,2,53,,
3026,2678,10,,,,,1.0,,,,...,0,0,0,5116.203608,9285.537825,11328.356146,1,7,,
3971,3523,10,,,,,3.0,,,,...,0,0,0,1167.505904,1269.103795,1711.132646,1,28,,
4753,4246,10,,,,,3.0,,,,...,0,0,0,1805.998152,2116.891507,2854.204819,1,46,,
5747,5268,10,,,,,6.0,,1.0,,...,0,0,0,2558.033914,2728.627612,3547.488758,1,44,,10.5


In [6]:
# Looking at specific birth data: a narrower view with four columns.
# In the rows displayed, birthord is missing wherever outcome is not 1.

df[["caseid", "prglngth", "outcome", "birthord"]]

Unnamed: 0,caseid,prglngth,outcome,birthord
0,1,39,1,1.0
1,1,39,1,2.0
2,2,39,1,1.0
3,2,39,1,2.0
4,2,39,1,3.0
...,...,...,...,...
13588,12571,39,1,1.0
13589,12571,6,2,
13590,12571,5,2,
13591,12571,39,1,2.0


In [7]:
# caseid 9786 had 19 pregnancies (one row per pregnancy). Intriguing

df.loc[df["caseid"] == 9786, ["caseid", "agepreg", "prglngth", "outcome", "birthord"]]

# She had 19 pregnancies and 4 babies.

Unnamed: 0,caseid,agepreg,prglngth,outcome,birthord
10602,9786,15.5,12,2,
10603,9786,16.41,41,1,1.0
10604,9786,16.83,5,4,
10605,9786,17.33,5,4,
10606,9786,19.41,39,1,2.0
10607,9786,19.58,5,4,
10608,9786,20.83,5,4,
10609,9786,21.08,17,4,
10610,9786,22.58,2,4,
10611,9786,24.83,42,1,3.0


## Data Cleaning

In [16]:
# Let's take a look at agepreg values that are null.
# CleanFemPreg only rescales agepreg (divides it by 100); it does not recode
# 97/98/99 for this column, so these values were already missing before cleaning.
# NOTE(review): the rows shown have howpreg_n/moscurrp filled in and no pregend1,
# so they look like pregnancies still in progress -- confirm against the codebook.

df.loc[df['agepreg'].isnull()]

# Special values encoded as numbers are dangerous. They can produce bogus results.
# Therefore, we should REPLACE them with NaN values
# (CleanFemPreg does this for birthwgt_lb, birthwgt_oz and hpagelb).

Unnamed: 0,caseid,pregordr,howpreg_n,howpreg_p,moscurrp,nowprgdk,pregend1,pregend2,nbrnaliv,multbrth,...,laborfor_i,religion_i,metro_i,basewgt,adj_mod_basewgt,finalwgt,secu_p,sest,cmintvw,totalwgt_lb
202,225,1,4.0,1.0,1.0,,,,,,...,0,0,0,2148.748487,2689.603580,3580.669246,1,44,,
283,285,2,9.0,1.0,2.0,,,,,,...,0,0,0,6562.704646,8648.584060,14546.053530,2,65,,
295,292,4,8.0,1.0,2.0,,,,,,...,0,0,0,2817.008202,3357.492213,4365.075626,1,54,,
307,302,2,22.0,1.0,5.0,,,,,,...,0,0,0,4870.737797,5152.472057,8939.539019,1,74,,
453,427,2,29.0,1.0,7.0,,,,,,...,0,0,0,4596.926139,5285.561481,7435.199336,1,30,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13425,12407,2,27.0,1.0,6.0,,,,,,...,0,0,0,1833.295397,2078.242296,2766.763968,2,2,,
13440,12415,3,8.0,1.0,2.0,,,,,,...,0,0,0,2580.625342,3002.831758,5050.462734,2,82,,
13441,12416,1,22.0,1.0,5.0,,,,,,...,0,0,0,3454.927434,3723.198051,6459.748618,2,78,,
13530,12517,1,25.0,1.0,6.0,,,,,,...,0,0,0,2734.661252,3244.480863,5629.174298,1,69,,


In [22]:
# df[['birthwgt_lb', 'birthwgt_oz', 'totalwgt_lb']]

# Division binds tighter than addition, so ounces are converted to pounds
# before being added; the parentheses only make that order explicit.
df.birthwgt_lb + (df.birthwgt_oz / 16.0) # The division operation is done first

0        8.8125
1        7.8750
2        9.1250
3        7.0000
4        6.1875
          ...  
13588    6.1875
13589       NaN
13590       NaN
13591    7.5000
13592    7.5000
Length: 13593, dtype: float64

In [47]:
import pandas as pd

# Toy data for experimenting with defaultdict.
tdict = {
 'brand': ['Honda', 'Toyota', 'Mazda'],
 'model': ['Pleasure', 'Olympus', 'Donut'],
 'year': [2006, 2005, 2004]
 }

cars = pd.DataFrame.from_dict(tdict)

def brandchecker(cars):
    """Map each brand to the list of row index labels where it appears.

    cars: DataFrame with a `brand` column

    returns: a new defaultdict(list) on every call, so re-running the cell
             or calling the function twice never duplicates indices
    """
    brand_rows = defaultdict(list)
    # Series.items() is the supported spelling; iteritems() was removed
    # in pandas 2.0.
    for index, brand in cars['brand'].items():
        # The first time a brand is seen, defaultdict calls list() to create
        # its empty list, so no "if brand not in ..." check is needed.
        brand_rows[brand].append(index)
    return brand_rows

ab = brandchecker(cars)
ab

# So what is the purpose of creating a defaultdict?
# defaultdict(list) creates the value (an empty list) the first time a given
# key is used, so we can append to it without initializing each key ourselves.



defaultdict(list, {'Honda': [0], 'Toyota': [1], 'Mazda': [2]})

Dictionaries are a convenient way to store data for later retrieval by name (key). Keys must be unique, immutable objects, and are typically strings. The values in a dictionary can be anything. For many applications the values are simple types such as integers and strings.

It gets more interesting when the values in a dictionary are collections (lists, dicts, etc.). In this case, the value (an empty list or dict) must be initialized the first time a given key is used. While this is relatively easy to do manually, the defaultdict type automates and simplifies these kinds of operations.

A defaultdict works exactly like a normal dict, but it is initialized with a function (“default factory”) that takes no arguments and provides the default value for a nonexistent key.

------------------

A bit off on the "duck typing" definition -- dict.keys() returns an iterable view object, not a list-like object. It will work anywhere an iterable will work -- but not everywhere a list will. A list is also an iterable, but an iterable is NOT necessarily a list (or a sequence...).

In real use-cases, the most common thing to do with the keys in a dict is to iterate through them, so this makes sense. And if you do need them as a list you can call list().

In [48]:
caseid = 10229
preg_map = nsfg.MakePregMap(df) # preg_map maps each caseid to the list of df index labels of that respondent's pregnancy rows
indices = preg_map[caseid]
# Select the outcome values of exactly those rows, in row order.
df.outcome[indices].values

array([4, 4, 4, 4, 4, 4, 1], dtype=int64)