In [None]:
!pip install thinkx

In [None]:
import os
import pandas as pd
import numpy as np
from pathlib import Path
import re
import thinkstats2

In [None]:
# Directory (relative to the notebook) where the raw downloads and the
# generated CSV are stored.
dataset_dir = '../datasets/pregnancies'

# exist_ok=True makes this cell safe to re-run, and raises if the path exists
# but is not a directory instead of silently continuing.
os.makedirs(dataset_dir, exist_ok=True)

In [None]:
%%bash
# Stop at the first failing command: without this, a failed `cd` would let
# `rm` and `wget` run in the wrong directory, and a failed download would go
# unnoticed until the files are read.
set -euo pipefail
cd ../datasets/pregnancies
# Remove earlier downloads so wget does not save duplicates under new names.
rm -f 2002FemPreg.dat.gz 2002FemPreg.dct
wget https://raw.githubusercontent.com/AllenDowney/ThinkStats2/master/code/2002FemPreg.dat.gz
wget https://raw.githubusercontent.com/AllenDowney/ThinkStats2/master/code/2002FemPreg.dct

In [None]:
# Paths of the Stata dictionary and the gzipped fixed-width data file.
dct_path = os.path.join(dataset_dir, '2002FemPreg.dct')
dat_path = os.path.join(dataset_dir, '2002FemPreg.dat.gz')

# Parse the dictionary first, then use it to read the data file into a DataFrame.
dct = thinkstats2.ReadStataDct(dct_path, encoding='iso-8859-1')
df = dct.ReadFixedWidth(dat_path, compression='gzip')

In [None]:
# Source: https://github.com/AllenDowney/ThinkStats2/blob/master/code/nsfg2.py#L47
def CleanFemPreg(df):
    """Recodes variables from the pregnancy frame.

    The frame is modified in place and also returned, so calling this twice
    on the same frame divides agepreg by 100 twice.

    df: DataFrame with columns agepreg, birthwgt_lb and birthwgt_oz

    returns: the same DataFrame, with agepreg rescaled, invalid birth
             weights set to NaN, a new totalwgt_lb column, and phase
             set to NaN
    """
    # mother's age is encoded in centiyears; convert to years
    df['agepreg'] = df['agepreg'] / 100.0

    # codes for 'not ascertained', 'refused', 'don't know'
    na_vals = [97, 98, 99]

    # birthwgt_lb contains at least one bogus value (51 lbs); blank out
    # everything above 20 lbs, then the special codes.
    # Columns are assigned back explicitly: calling replace(inplace=True)
    # on a column selection is chained assignment, which does not
    # reliably update the parent frame.
    pounds = df['birthwgt_lb'].mask(df['birthwgt_lb'] > 20)
    df['birthwgt_lb'] = pounds.replace(na_vals, np.nan)
    df['birthwgt_oz'] = df['birthwgt_oz'].replace(na_vals, np.nan)

    # birthweight is stored in two columns, lbs and oz.
    # convert to a single column in lb
    df['totalwgt_lb'] = df['birthwgt_lb'] + df['birthwgt_oz'] / 16.0

    # due to a bug in ReadStataDct, the last variable gets clipped;
    # so for now set it to NaN
    # (dictionary syntax so the column is set even if it is missing)
    df['phase'] = np.nan
    return df

# NOTE: CleanFemPreg modifies `df` in place (e.g. it divides agepreg by 100),
# so re-running this cell without reloading the data rescales it again.
df = CleanFemPreg(df)

In [None]:
# Keep only the rows whose outcome code is 1, 3 or 4.
# NOTE(review): the meaning of these codes comes from the survey codebook,
# which is not visible here -- confirm before relying on it.
kept = df[df['outcome'].isin([1, 3, 4])]

# Recode outcome as a boolean (True where the code was 1). assign() builds a
# new frame, so nothing is written into a filtered slice of the original
# (which would trigger SettingWithCopyWarning).
df = kept.assign(outcome=kept['outcome'] == 1)

# Move the outcome column to the last position.
df = df[[c for c in df.columns.values if c != 'outcome'] + ['outcome']]

In [None]:
# Balance the two outcome classes by down-sampling each to the size of the
# smaller one.
g = df.groupby('outcome')

# Size of the smallest group, computed once instead of once per group.
smallest_class = g.size().min()

# Iterate over the groups rather than using groupby.apply: applying a
# function over the grouping column is deprecated in recent pandas, and once
# that column is excluded, 'outcome' would be lost after the index is dropped.
# Each group is sampled with the same seed as before, so the selected rows and
# their order (by group key) are unchanged.
df = pd.concat(
    [rows.sample(smallest_class, random_state=1) for _, rows in g],
    ignore_index=True,
)

In [None]:
# Order the rows by the cmprgend column (ascending).
df = df.sort_values(by='cmprgend')

In [None]:
# Columns written to the final dataset, with outcome last.
output_columns = ['parity', 'agecon', 'pregordr', 'poverty', 'educat', 'outcome']
df = df[output_columns]

In [None]:
# Subtract one from parity, never going below zero.
# Vectorised equivalent of applying max(x - 1, 0) to each value; assign()
# returns a new frame instead of writing into a column-subset slice.
df = df.assign(parity=(df['parity'] - 1).clip(lower=0))

In [None]:
# Write the prepared frame next to the raw downloads, without the index column.
csv_path = os.path.join(dataset_dir, 'pregnancies.csv')
df.to_csv(csv_path, index=False)