In [42]:
from IPython.display import HTML


In [45]:
# Render the breadcrumb navigation for the analysis steps. The current page
# ("Download Data") is the bold entry without a link target.
display(HTML('''
<div>
<h4> Data analysis steps </h4>
<br/>
<nav class="breadcrumb">
<a class="breadcrumb-item" href="home.html">Home</a>
<a class="breadcrumb-item"><strong>Download Data</strong></a>
<a class="breadcrumb-item" href="cleaning_data.html">Data Processing</a>
<a class="breadcrumb-item" href="#">Exploratory Analysis</a>
</nav>
</div>
'''))

In [44]:

# Emit a script plus a button for showing/hiding the notebook's input cells.
# `code_toggle` is registered as the document-ready handler, and since
# `code_show` starts out true the inputs are hidden on first load; each click
# on the button flips the state. Relies on `$` (jQuery) being defined in the
# page that renders this output.
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')

# Compiling Birth Data Statistics #

The CDC provides a portal to access data from the Division of Vital Statistics. The downloadable data files contain information on: 

* **Birth Data Files/Vital statistics**: A set of aggregated statistics of reported births across the US from 1968 - 2018
* Infant cohort death: A series of period based and cohort based files summarizing infant deaths within respective years and linking deaths to originating birth certificates
* Fetal deaths: A series of aggregated statistics linking birth statistics to fetal/gestational deaths of infants post 20 weeks

For the initial part of the analysis we will focus only on the general *birth data files*, which provide statistics on birth occurrences within the US and on maternal and infant health at the time of birth.


**Source of data can be found here: National Center for Health Statistics (1994-2018)**

https://www.cdc.gov/nchs/data_access/vitalstatsonline.htm

In [9]:
import requests
import urllib.request
import urllib
import os
import time
from bs4 import BeautifulSoup
import ftplib

### Accessing Birth Statistics ###

All birth data files can be downloaded via ftp from the CDC site. The location of each data set is listed in the following website: 

[ftp://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/DVS/natality](ftp://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/DVS/natality)



In [4]:
# Host name of the CDC FTP server; reused later to build download URLs.
url_base = "ftp.cdc.gov"
# Connect and log in with the "anonymous" user name.
ftp = ftplib.FTP(host=url_base, user="anonymous")

In [16]:
# List the natality directory on the FTP server and keep only the entries
# whose names end in "us.zip".
natality_listing = ftp.nlst('pub/Health_Statistics/NCHS/Datasets/DVS/natality')
files_to_dl = [remote for remote in natality_listing if remote.endswith('us.zip')]
print('Found a total of {0} files from ftp site'.format(len(files_to_dl)))

# The year is whatever sits between "Nat" and "us" in each archive's base name.
years_identified = []
for remote in files_to_dl:
    archive_name = os.path.basename(remote)
    years_identified.append(int(archive_name.split('Nat')[1].split('us')[0]))
print('Datasets range from year {0} to year {1}'.format(min(years_identified), max(years_identified)))



Found a total of 24 files from ftp site
Datasets range from year 1994 to year 2018


**We can use the FTP protocol to automatically download the compressed files**

In [None]:
import shutil
import urllib.request as request
from contextlib import closing
import os

# Download every natality archive found on the FTP server into local storage.
for f in files_to_dl:
    print('DLIng', f)
    # Build the URL by hand: os.path.join is meant for filesystem paths (it uses
    # the platform separator and would drop the host part if `f` were absolute).
    remote_url = 'ftp://{0}/{1}'.format(url_base, f.lstrip('/'))
    with closing(request.urlopen(remote_url)) as r:
        # Use a distinct name for the file handle so the loop variable `f`
        # is not shadowed by (and left bound to) a closed file object.
        with open(os.path.join('/mnt/nas/natality', os.path.basename(f)), 'wb') as local_file:
            shutil.copyfileobj(r, local_file)
    print('done')

### Accessing fetal death info ###

While we will not be using the fetal death info in initial analyses, we will preemptively download and store the raw data defining fetal death. 

Similarly to birth statistics, this data is found via ftp from CDC site:


ftp://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/DVS/fetaldeathus





In [20]:
# Open a fresh anonymous FTP session and list everything in the fetal-death folder.
fetaldeath_page = 'pub/Health_Statistics/NCHS/Datasets/DVS/fetaldeathus'
ftp = ftplib.FTP(host=url_base, user="anonymous")
files_to_dl = ftp.nlst(fetaldeath_page)

print('Found a total of {0} files from ftp site'.format(len(files_to_dl)))

# The year is the text between "Fetal" and "US" in each file's base name.
years_identified = []
for remote in files_to_dl:
    year_text = os.path.basename(remote).split('Fetal')[1].split('US')[0]
    years_identified.append(int(year_text))
print('Fetal death datasets range from year {0} to year {1}'.format(min(years_identified), max(years_identified)))



Found a total of 40 files from ftp site
Fetal death datasets range from year 1982 to year 2017


In [None]:
# Download every fetal-death file from the listing into the shared data folder.
for f in files_to_dl:
    print('DLIng', f)
    # Build the URL by hand: os.path.join is meant for filesystem paths (it uses
    # the platform separator and would drop the host part if `f` were absolute).
    remote_url = 'ftp://{0}/{1}'.format(url_base, f.lstrip('/'))
    with closing(request.urlopen(remote_url)) as r:
        # Use a distinct name for the file handle so the loop variable `f`
        # is not shadowed by (and left bound to) a closed file object.
        with open(os.path.join('/mnt/nas/natality', os.path.basename(f)), 'wb') as local_file:
            shutil.copyfileobj(r, local_file)
    print('done')

## Downloading metadata for each dataset ##

The downloaded birth statistics and fetal death files are not in standard CSV/TSV/tabulated file formats, and there is no indication of what each field/column corresponds to. Instead, all characters are dumped to the file, and it needs to be parsed to convert the files into readable/analyzable datasets.

In this effort, we also need to download **.dct** files which provide descriptions of how to parse the data files and where in the character strings each field is located. 

In order to download the .dct files, we need to scrape links from the following site:

http://data.nber.org/natality

This provides a running list of natality data in each year. As we scrape each year link, we can then traverse into data for that year and download the .dct files of interest

In [95]:
url = 'http://data.nber.org/natality'
import re
from urllib.parse import urljoin

# Fetch the index page; the context manager closes the HTTP response.
with urllib.request.urlopen(url) as html_page:
    # Name the parser explicitly so the result does not depend on which
    # optional parsers are installed (and bs4 does not warn about guessing).
    soup = BeautifulSoup(html_page, 'html.parser')

# Collect the per-year sub-pages: anchors whose href contains a year-like
# segment followed by "/" (as matched by the pattern below).
subpages = []
for link in soup.find_all('a', attrs={'href': re.compile("[12][09][0-9][0-9]/")}):
    # urljoin copes with relative and absolute hrefs alike; the trailing "/"
    # makes `url` act as a directory so relative links resolve beneath it.
    subpages.append(urljoin(url + '/', link.get('href')))

In [96]:
from urllib.parse import urljoin

dct_html = []
# Visit each per-year page and save the single non-"err" .dct file it links to.
for s in subpages:
    with urllib.request.urlopen(s) as html_page:
        soup = BeautifulSoup(html_page, 'html.parser')
    count = 0
    # Escape the dot so the pattern means a literal ".dct", not "any char + dct".
    for link in soup.find_all('a', attrs={'href': re.compile(r"\.dct")}):
        href = link.get('href')
        if 'err' not in href:
            # Exactly one dictionary file is expected per year page.
            assert count == 0, s
            count += 1
            # Treat the year page as a directory so relative hrefs resolve under it.
            dct_url = urljoin(s if s.endswith('/') else s + '/', href)
            with urllib.request.urlopen(dct_url) as html2:
                dct_bytes = html2.read()
            # Store the payload byte-for-byte: a .dct file is plain text, and
            # passing it through an HTML parser can escape or wrap its content.
            with open(os.path.join('/mnt/nas/natality', os.path.basename(href)), 'wb') as o:
                o.write(dct_bytes)

The following snippet shows an example of a DCT file and how we can parse the natality files

In [37]:
# Pick a 2018 dictionary file. os.listdir returns entries in arbitrary order,
# so sort first to make the choice reproducible.
example_dct = sorted(f for f in os.listdir('/mnt/nas/natality') if f.endswith('.dct') and '2018' in f)[0]
with open('/mnt/nas/natality/' + example_dct) as r:
    g = r.readlines()
# readlines() keeps each line's trailing newline, so join with '' — joining
# with '\n' would print an empty line between every row.
data = ''.join(g[30:40])
print(data)

_column(119 )   str1 mar_p                               %1s "Paternity Acknowledged Y Yes"          

_column(120 )   byte dmar                                %1f "Marital Status"                        

_column(121 )   byte mar_imp                             %1f "Mother's Marital Status Imputed Blank Marital Status not imputed"

_column(123 )   byte f_mar_p                             %1f "Reporting Flag for Paternity Acknowledged 0 Non-Reporting"

_column(124 )   byte meduc                               %1f "Mother's Education 1 8th grade or less"

_column(126 )   byte f_meduc                             %1f "Reporting Flag for Education of Mother 0 Non-Reporting"

_column(142 )   byte fagerpt_flg                         %1f "Father's Reported Age Used Blank Father's reported age not u"

_column(147 )   byte fagecomb                            %2f "Father's Combined Age (Revised) 09-98 Father's combined age in ye"

_column(149 )   byte fagerec11                           %2f "Fath

## Process downloaded files and group by year ##

After having scraped the web, we have natality data, fetal death data, and *.dct information from years spanning from at least 1994-2018. The files are currently stored in .zip files and segregated. 

In the next steps we simply want to: 

1. Unzip all Nat*.zip files which correspond to natality data
2. Unzip all Fetal*.zip files which correspond to fetal death data
3. Extract years from each of the extracted files and group them together into the same folder by year
4. Move the *.dct file corresponding to a specific year into the same folder


In [None]:
import zipfile
import warnings

# Walk the top level of the data folder and sort each file into a sub-folder
# named after the year embedded in its file name: .dct files are copied,
# .zip archives are extracted member by member.
natality_root = '/mnt/nas/natality'
for f in os.listdir(natality_root):
    source_path = os.path.join(natality_root, f)
    if os.path.isdir(source_path):
        continue

    # Work out the year string from the naming pattern of each file kind.
    is_dct = False
    if f.startswith('Fetal'):
        folder = f.split('Fetal')[1].split('US')[0]
    elif f.endswith('.dct'):
        folder = f.split('natl')[1].split('.dct')[0]
        is_dct = True
    elif f.startswith('Nat'):
        folder = f.split('Nat')[1].split('us')[0]
    else:
        raise Exception(f)
    # int() doubles as validation: it raises if the extracted text is not numeric.
    year = int(folder)

    year_dir = os.path.join(natality_root, folder)
    os.makedirs(year_dir, exist_ok=True)

    if is_dct:
        shutil.copy(source_path, os.path.join(year_dir, f))
    elif f.lower().endswith('.zip'):
        with zipfile.ZipFile(source_path, 'r') as zip_ref:
            for f2 in zip_ref.infolist():
                # Skip members that were already extracted by an earlier run.
                if os.path.isfile(os.path.join(year_dir, f2.filename)):
                    print(f2.filename, 'already exists!')
                    continue
                print('unzipping ', f, f2.filename)
                try:
                    zip_ref.extract(f2, year_dir)
                except NotImplementedError:
                    # Warn and carry on with the next member; manual fallback
                    # suggested by the original author: 7z -o{folder} e f
                    warnings.warn(str(('failed unzipping', f, f2.filename)))
        
        
    

**Final folder structure post download and  extraction**

In [40]:
# Shell out to the `tree` utility to print the directory layout of the data folder.
!tree '/mnt/nas/natality/'

[01;34m/mnt/nas/natality/[00m
├── [01;34m1968[00m
│   └── natl1968.dct
├── [01;34m1969[00m
│   └── natl1969.dct
├── [01;34m1970[00m
│   └── natl1970.dct
├── [01;34m1971[00m
│   └── natl1971.dct
├── [01;34m1972[00m
│   └── natl1972.dct
├── [01;34m1973[00m
│   └── natl1973.dct
├── [01;34m1974[00m
│   └── natl1974.dct
├── [01;34m1975[00m
│   └── natl1975.dct
├── [01;34m1976[00m
│   └── natl1976.dct
├── [01;34m1977[00m
│   └── natl1977.dct
├── [01;34m1978[00m
│   └── natl1978.dct
├── [01;34m1979[00m
│   └── natl1979.dct
├── [01;34m1980[00m
│   └── natl1980.dct
├── [01;34m1981[00m
│   └── natl1981.dct
├── [01;34m1982[00m
│   ├── natl1982.dct
│   └── VS82FETL.DETUSPUB
├── [01;34m1983[00m
│   ├── natl1983.dct
│   └── VS83FETL.DETUSPUB
├── [01;34m1984[00m
│   ├── natl1984.dct
│   └── VS84FETL.DETUSPUB
├── [01;34m1985[00m
│   ├── natl1985.dct
│   └── VS85FETL.DETUSPUB
├── [01;34m1986[00m
│   ├── natl1986.dct
│   └── VS86FETL.DETUSPUB
├── [01;34m1987[00m