## Pandas

### Dataframe

In [1]:
import pandas as pd
from statadict import parse_stata_dict

# source :https://ftp.cdc.gov/pub/health_statistics/nchs/datasets/NSFG/stata/
# codebook : https://www.cdc.gov/nchs/data/nsfg/2015-2017_NSFG_FemPregFile_Codebook-508.pdf
data_file = "2015_2017_FemPregData.dat"
dict_file = "2015_2017_FemPregSetup.dct"

stata_dict = parse_stata_dict(dict_file)

# print(stata_dict.names)
nsfg = pd.read_fwf(data_file,
                  names=stata_dict.names,
                  colspecs=stata_dict.colspecs)
print(type(nsfg))
nsfg.head()

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,CASEID,PREGORDR,HOWPREG_N,HOWPREG_P,MOSCURRP,NOWPRGDK,PREGEND1,PREGEND2,HOWENDDK,NBRNALIV,...,SECU,SEST,CMINTVW,CMLSTYR,CMJAN3YR,CMJAN4YR,CMJAN5YR,QUARTER,PHASE,INTVWYEAR
0,70627,1,,,,,6.0,,,1.0,...,3,322,1394,1382,1357,1345,1333,18,1,2016
1,70627,2,,,,,1.0,,,,...,3,322,1394,1382,1357,1345,1333,18,1,2016
2,70627,3,,,,,6.0,,,1.0,...,3,322,1394,1382,1357,1345,1333,18,1,2016
3,70628,1,,,,,6.0,,,1.0,...,2,366,1409,1397,1369,1357,1345,23,1,2017
4,70628,2,,,,,6.0,,,1.0,...,2,366,1409,1397,1369,1357,1345,23,1,2017


In [2]:
nsfg.iloc[:5,:9]

Unnamed: 0,CASEID,PREGORDR,HOWPREG_N,HOWPREG_P,MOSCURRP,NOWPRGDK,PREGEND1,PREGEND2,HOWENDDK
0,70627,1,,,,,6.0,,
1,70627,2,,,,,1.0,,
2,70627,3,,,,,6.0,,
3,70628,1,,,,,6.0,,
4,70628,2,,,,,6.0,,


In [3]:
nsfg.shape

(9553, 248)

there are 10215 rows in this dataset,
one for each pregnancy, and 171 columns

In [4]:
nsfg.columns

Index(['CASEID', 'PREGORDR', 'HOWPREG_N', 'HOWPREG_P', 'MOSCURRP', 'NOWPRGDK',
       'PREGEND1', 'PREGEND2', 'HOWENDDK', 'NBRNALIV',
       ...
       'SECU', 'SEST', 'CMINTVW', 'CMLSTYR', 'CMJAN3YR', 'CMJAN4YR',
       'CMJAN5YR', 'QUARTER', 'PHASE', 'INTVWYEAR'],
      dtype='object', length=248)

### Series

In [5]:
# pd.set_option("display.max.columns", None)
# nsfg.head()
pounds = nsfg["BIRTHWGT_LB1"]
print(pounds)
print(type(pounds))
pounds.head()

0       7.0
1       NaN
2       9.0
3       6.0
4       7.0
       ... 
9548    7.0
9549    NaN
9550    8.0
9551    6.0
9552    NaN
Name: BIRTHWGT_LB1, Length: 9553, dtype: float64
<class 'pandas.core.series.Series'>


0    7.0
1    NaN
2    9.0
3    6.0
4    7.0
Name: BIRTHWGT_LB1, dtype: float64

In [6]:
ounces = nsfg["BIRTHWGT_OZ1"]
print(ounces)

0        8.0
1        NaN
2        2.0
3        9.0
4        0.0
        ... 
9548     2.0
9549     NaN
9550    12.0
9551    13.0
9552     NaN
Name: BIRTHWGT_OZ1, Length: 9553, dtype: float64


## Validation

In [7]:
pounds.value_counts().sort_index()

BIRTHWGT_LB1
0.0        2
1.0       28
2.0       46
3.0       76
4.0      179
5.0      570
6.0     1644
7.0     2268
8.0     1287
9.0      396
10.0      82
11.0      17
12.0       2
13.0       1
14.0       1
98.0       2
99.0      89
Name: count, dtype: int64

In [8]:
pounds.describe()

count    6690.000000
mean        8.008819
std        10.771360
min         0.000000
25%         6.000000
50%         7.000000
75%         8.000000
max        99.000000
Name: BIRTHWGT_LB1, dtype: float64

In [9]:
import numpy as np
pounds_clean = pounds.replace([98, 99], np.nan)

In [10]:
pounds_clean.describe()

count    6599.000000
mean        6.754357
std         1.383268
min         0.000000
25%         6.000000
50%         7.000000
75%         8.000000
max        14.000000
Name: BIRTHWGT_LB1, dtype: float64

Exercise: Use describe to summarize ounces.
Then use replace to replace the special values 98 and 99 with NaN, and assign the result to ounces_clean. Run describe again. How much does this cleaning affect the results?

### Series Arithmetic

In [11]:
ounces_clean = pounds_clean * 16 + ounces
print(ounces_clean)

0       120.0
1         NaN
2       146.0
3       105.0
4       112.0
        ...  
9548    114.0
9549      NaN
9550    140.0
9551    109.0
9552      NaN
Length: 9553, dtype: float64


Exercise: Use pounds_clean and ounces_clean to compute the total birth weight expressed in kilograms (there are roughly 2.2 pounds per kilogram). What is the mean birth weight in kilograms?

In [12]:
birth_weight = pounds_clean + ounces_clean / 16
birth_weight.mean()

13.984647295044704

## Boolean Series

In [14]:
preterm = (nsfg["PRGLNGTH"] < 37)
preterm.head()

0    False
1     True
2    False
3    False
4    False
Name: PRGLNGTH, dtype: bool

In [15]:
preterm.sum()

3675

In [17]:
preterm.mean()

0.38469590704490736

In [19]:
live = (nsfg["OUTCOME"] == 1)
live.mean()

0.7006176070344394

In [22]:
live_preterm = (live & preterm)
live_preterm.mean()

0.08929132209777034

## Filtering Data

In [26]:
preterm.mean()
# preterm_weight = birth_weight[preterm]
# preterm_weight.mean()

0.38469590704490736