In [1]:
import sys

import numpy as np
import pandas as pd

## Read data from a fixed-width formatted file

In [2]:
sys.path.insert(0, '..')
import Modules.read_fwf as rfwf

In [3]:
def ReadFemPreg(dct_file, dat_file):
    """
    Load the NSFG pregnancy data into a DataFrame.

    The dictionary file is parsed with ``rfwf.ReadStataDct``; the object it
    returns is then used to read the fixed-width data file.

    :param dct_file: string, name of the dictionary (.dct) file
    :param dat_file: string, name of the fixed-width data (.dat) file
    :return: DataFrame
    """
    return rfwf.ReadStataDct(dct_file).ReadFixedWidth(dat_file)

In [4]:
# Dictionary and data files of the 2017-2019 female pregnancy extract.
fem_preg_dct = '../Data/2017_2019_FemPregSetup.dct'
fem_preg_dat = '../Data/2017_2019_FemPregData.dat'

pregnancy_data = ReadFemPreg(fem_preg_dct, fem_preg_dat)

In [5]:
# Display the loaded pregnancy DataFrame as the cell output.
pregnancy_data

Unnamed: 0,caseid,pregordr,moscurrp,pregend1,pregend2,hpageend,kidage,hpagelb,paybirth1,paybirth2,...,secu,sest,cmintvw,cmlstyr,cmjan3yr,cmjan4yr,cmjan5yr,quarter,phase,intvwyear
0,88819,1,,6.0,,,7.0,4.0,,,...,2,354,1427,1415,1381,1369,1357,29,1,2018
1,88819,2,,5.0,,,6.0,4.0,,,...,2,354,1427,1415,1381,1369,1357,29,1,2018
2,83055,1,,5.0,,,6.0,4.0,,,...,2,354,1425,1413,1381,1369,1357,29,1,2018
3,83055,2,,5.0,,,6.0,5.0,,,...,2,354,1425,1413,1381,1369,1357,29,1,2018
4,92062,1,,6.0,,,7.0,4.0,,,...,2,354,1426,1414,1381,1369,1357,29,1,2018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10210,91420,4,,6.0,,,6.0,6.0,,,...,1,370,1432,1420,1393,1381,1369,31,1,2019
10211,89157,1,,6.0,,,6.0,3.0,,,...,1,370,1433,1421,1393,1381,1369,31,1,2019
10212,89157,2,,6.0,,,6.0,4.0,,,...,1,370,1433,1421,1393,1381,1369,31,1,2019
10213,89157,3,,5.0,,,5.0,4.0,1.0,2.0,...,1,370,1433,1421,1393,1381,1369,31,1,2019


In [6]:
# Allow up to 171 rows to be rendered before pandas truncates the display.
pd.set_option('display.max_rows', 171)

# First five records, transposed so that each variable becomes a row.
pregnancy_data.head().T

Unnamed: 0,0,1,2,3,4
caseid,88819.0,88819.0,83055.0,83055.0,92062.0
pregordr,1.0,2.0,1.0,2.0,1.0
moscurrp,,,,,
pregend1,6.0,5.0,5.0,5.0,6.0
pregend2,,,,,
hpageend,,,,,
kidage,7.0,6.0,6.0,6.0,7.0
hpagelb,4.0,4.0,4.0,5.0,4.0
paybirth1,,,,,
paybirth2,,,,,


## Clean the data (transformation & validation)

In [7]:
# Codes in `hpagelb` that are treated as missing values.
# NOTE(review): presumably the survey's "not ascertained / refused / don't know"
# codes -- confirm against the NSFG codebook.
na_values = [97, 98, 99]

# The replacement value must be passed explicitly: given a list and no `value`,
# legacy pandas forward-fills the matched entries (method='pad') instead of
# turning them into NaN, and recent pandas deprecates/rejects that call form.
# Assigning the result back (rather than `inplace=True` on a column selection)
# guarantees the frame itself is updated and keeps the cell idempotent.
pregnancy_data['hpagelb'] = pregnancy_data['hpagelb'].replace(na_values, np.nan)

# NOTE(review): this blanks the whole `cmintvw` column, discarding the values
# read from the data file. Confirm this is intentional before relying on it.
pregnancy_data['cmintvw'] = np.nan

In [28]:
# Columns of interest, in the order they should appear in the selection.
features = [
    'pregordr', 'hpagelb', 'csecprim', 'priorsmk', 'outcome',
    'agecon', 'learnprg', 'pncarewk', 'lbw1', 'postsmks',
    'npostsmk', 'feelinpg', 'gest_lb', 'gest_othr', 'agepreg',
    'ager', 'educat', 'hieduc', 'race', 'pregnum',
    'parity', 'poverty', 'laborfor', 'religion', 'metro',
]

# Show only the selected columns.
pregnancy_data.loc[:, features]

Unnamed: 0,pregordr,hpagelb,csecprim,priorsmk,outcome,agecon,learnprg,pncarewk,lbw1,postsmks,...,ager,educat,hieduc,race,pregnum,parity,poverty,laborfor,religion,metro
0,1,4.0,,,1,29,,,2.0,,...,43,16,12,2,2,2,641,1,1,2
1,2,4.0,,,1,31,,,2.0,,...,43,16,12,2,2,2,641,1,1,2
2,1,4.0,,,1,29,,,2.0,,...,41,16,12,2,2,2,498,1,3,2
3,2,5.0,,,1,33,,,2.0,,...,41,16,12,2,2,2,498,1,3,2
4,1,4.0,,,1,27,,,2.0,,...,41,16,12,2,3,2,498,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10210,4,6.0,,,1,39,,,2.0,,...,47,13,10,3,4,2,275,7,3,2
10211,1,3.0,,,1,26,,,1.0,,...,37,15,10,3,3,3,90,7,3,2
10212,2,4.0,,,1,29,,,1.0,,...,37,15,10,3,3,3,90,7,3,2
10213,3,4.0,1.0,0.0,1,32,1.0,1.0,1.0,5.0,...,37,15,10,3,3,3,90,7,3,2


## Validation

In [29]:
# Map every selected feature to the frequency table of its values.
val_dict = {
    column: pregnancy_data[column].value_counts()
    for column in features
}

In [30]:
# Value counts computed for the `pregordr` column.
val_dict['pregordr']

1     3709
2     2806
3     1776
4      952
5      481
6      227
7      115
8       68
9       35
10      22
11      10
12       7
13       5
14       2
Name: pregordr, dtype: int64

In [31]:
# Value counts computed for the `outcome` column.
val_dict['outcome']

1    7199
4    1654
2     941
6     193
5     123
3     105
Name: outcome, dtype: int64

In [32]:
# Value counts computed for the `pregnum` column.
val_dict['pregnum']

3     2472
2     2058
4     1884
5     1265
1      903
6      677
7      322
8      271
10     120
9      117
13      39
11      33
14      28
12      24
0        2
Name: pregnum, dtype: int64