In [None]:
#!/usr/bin/env python3

## Copyright (c)
##    2017 by The University of Delaware
##    Contributors: Michael Wyatt
##    Affiliation: Global Computing Laboratory, Michela Taufer PI
##    Url: http://gcl.cis.udel.edu/, https://github.com/TauferLab
##
## All rights reserved.
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions are met:
##
##    1. Redistributions of source code must retain the above copyright notice,
##    this list of conditions and the following disclaimer.
##
##    2. Redistributions in binary form must reproduce the above copyright
##    notice, this list of conditions and the following disclaimer in the
##    documentation and/or other materials provided with the distribution.
##
##    3. If this code is used to create a published work, one or both of the
##    following papers must be cited.
##
##            M. Wyatt, T. Johnston, M. Papas, and M. Taufer.  Development of a
##            Scalable Method for Creating Food Groups Using the NHANES Dataset
##            and MapReduce.  In Proceedings of the ACM Bioinformatics and
##            Computational Biology Conference (BCB), pp. 1 - 10. Seattle, WA,
##            USA. October 2 - 4, 2016.
##
##    4.  Permission of the PI must be obtained before this software is used
##    for commercial purposes.  (Contact: taufer@acm.org)
##
## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
## AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
## IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
## ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
## LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
## CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
## SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
## INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
## CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
## ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
## POSSIBILITY OF SUCH DAMAGE.

In [None]:
import os
import argparse
import urllib.request
import re
import json
import functools
from multiprocessing import Pool
import time


def conditionalMkdir(directory):
    """Create `directory` (and any missing parents) unless the path already exists.

    Args:
        directory: Path of the directory to create.

    The creation is attempted directly instead of testing for existence first,
    so there is no window in which another process can create the path between
    the check and the call. An already-existing path (directory or file) is
    left untouched, exactly as before.
    """
    try:
        os.makedirs(directory)
    except FileExistsError:
        # The path is already there; nothing to do.
        pass

try:
        from BeautifulSoup import BeautifulSoup
except ImportError:
        from bs4 import BeautifulSoup

In [None]:
# import requests

# year = str(2013)
# URL = "https://wwwn.cdc.gov/nchs/nhanes/continuousnhanes/default.aspx?BeginYear=" + year

# #page = requests.get(URL)

In [None]:
# Mount Google Drive at /content/drive (Google Colab only); the data
# directory used by the later cells lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Begin year of the NHANES cycle to download; it is used to build both the
# landing-page URL (BeginYear=...) and the local data directory name.
year = 2017

In [None]:
# Host that the relative .XPT download links are appended to.
file_base_link = 'https://wwwn.cdc.gov'
# Local directory (on the mounted Drive) that receives the downloaded files.
file_dir = "/content/drive/MyDrive/Hypertension_Final_Project/data_" + str(year)

# exist_ok=True keeps this cell re-runnable: os.mkdir raised FileExistsError
# whenever the directory was already there from a previous run.
os.makedirs(file_dir, exist_ok=True)

In [None]:
import requests

# Root that the relative component-page links are resolved against.
base_link = 'https://wwwn.cdc.gov/nchs/nhanes'
# Landing page of the selected NHANES cycle.
URL = 'https://wwwn.cdc.gov/nchs/nhanes/continuousnhanes/default.aspx?BeginYear=' + str(year)

# requests has no default timeout, so an unresponsive server would hang the
# cell forever; 60 seconds is a generous upper bound for one HTML page.
page = requests.get(URL, timeout=60)
# Stop here on an HTTP error rather than parsing an error page downstream.
page.raise_for_status()

In [None]:
# Parse the landing page HTML using the standard-library 'html.parser' backend.
soup = BeautifulSoup(page.content, 'html.parser')

In [None]:
# Collect the <a> tags carrying this class string; the following cell keeps
# only those whose href points at a component data page.
xpt_urls = soup.findAll('a', class_="list-title td-none td-ul-hover")

In [None]:
# Relative links to the per-component data pages: every anchor that has an
# href containing the 'Component=' query parameter.
file_link = [
  anchor['href']
  for anchor in xpt_urls
  if anchor.has_attr('href') and 'Component=' in anchor['href']
]

In [None]:
# Display the collected component-page links; later cells pick them by position.
file_link

['../search/datapage.aspx?Component=Demographics&CycleBeginYear=2017',
 '../search/datapage.aspx?Component=Dietary&CycleBeginYear=2017',
 '../search/datapage.aspx?Component=Examination&CycleBeginYear=2017',
 '../search/datapage.aspx?Component=Laboratory&CycleBeginYear=2017',
 '../search/datapage.aspx?Component=Questionnaire&CycleBeginYear=2017',
 '../search/datapage.aspx?Component=LimitedAccess&CycleBeginYear=2017']

In [None]:
# Fetch the Demographics component page (first entry of file_link) and
# collect every .XPT download link it lists.
# The leading '..' of the relative link is dropped before appending to base_link.
Demographics = requests.get(base_link + file_link[0][2:], timeout=60)
print(Demographics)
# Fail fast on an HTTP error instead of silently producing an empty link list.
Demographics.raise_for_status()

Demographics_content = BeautifulSoup(Demographics.content, 'html.parser')
file_urls = Demographics_content.findAll('td', class_="text-center")

demo_links = []

for url in file_urls:
  for i in url.findAll('a'):
    # .get() tolerates anchors that have no href attribute; the extension
    # match ignores case so links written as '.xpt' are not dropped.
    href = i.get('href', '')
    if '.xpt' in href.lower():
      demo_links.append(href)

<Response [200]>


In [None]:
# Download every Demographics file into file_dir, skipping files that are
# already present locally.
for demo_link in demo_links:
  file_url = file_base_link + demo_link
  # Local file name is the last path component of the link
  file_name = demo_link.split('/')[-1]
  file_loc = os.path.join(file_dir, file_name)

  # Check that file does not already exist
  if not os.path.isfile(file_loc):
    print('Getting file: %s' % file_url)
    # Download under a temporary name and move it into place only on success,
    # so an interrupted transfer never leaves a truncated file that the
    # isfile() check above would skip on the next run.
    part_loc = file_loc + '.part'
    try:
      urllib.request.urlretrieve(file_url, part_loc)
      os.replace(part_loc, file_loc)
    finally:
      # Still present only if the download or the move failed.
      if os.path.exists(part_loc):
        os.remove(part_loc)

Getting file: https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/DEMO_J.XPT


In [None]:
# Fetch the Examination component page (third entry of file_link) and
# collect every .XPT download link it lists.
# The leading '..' of the relative link is dropped before appending to base_link.
Examination = requests.get(base_link + file_link[2][2:], timeout=60)
print(Examination)
# Fail fast on an HTTP error instead of silently producing an empty link list.
Examination.raise_for_status()

Examination_content = BeautifulSoup(Examination.content, 'html.parser')
file_urls = Examination_content.findAll('td', class_="text-center")

Exam_links = []

for url in file_urls:
  for i in url.findAll('a'):
    # .get() tolerates anchors that have no href attribute; the extension
    # match ignores case so links written as '.xpt' are not dropped.
    href = i.get('href', '')
    if '.xpt' in href.lower():
      Exam_links.append(href)

print(Exam_links)

<Response [200]>
['/Nchs/Nhanes/2017-2018/AUX_J.XPT', '/Nchs/Nhanes/2017-2018/AUXAR_J.XPT', '/Nchs/Nhanes/2017-2018/AUXTYM_J.XPT', '/Nchs/Nhanes/2017-2018/AUXWBR_J.XPT', '/Nchs/Nhanes/2017-2018/BPX_J.XPT', '/Nchs/Nhanes/2017-2018/BPXO_J.XPT', '/Nchs/Nhanes/2017-2018/BMX_J.XPT', '/Nchs/Nhanes/2017-2018/DXXAG_J.XPT', '/Nchs/Nhanes/2017-2018/DXXFEM_J.XPT', '/Nchs/Nhanes/2017-2018/DXXSPN_J.XPT', '/Nchs/Nhanes/2017-2018/DXX_J.XPT', '/Nchs/Nhanes/2017-2018/LUX_J.XPT', '/Nchs/Nhanes/2017-2018/OHXDEN_J.XPT', '/Nchs/Nhanes/2017-2018/OHXREF_J.XPT']


In [None]:
# File-name prefixes (the text before the first '_') of the Examination files to keep.
required = ['BPX', 'BMX']

In [None]:
# Keep the Examination links whose file-name prefix (text before the first
# underscore of the last path component) is listed in `required`.
exam_links_filter = [
  link
  for link in Exam_links
  if link.rsplit('/', 1)[-1].partition('_')[0] in required
]

In [None]:
# Display the Examination links that survived the prefix filter.
exam_links_filter

['/Nchs/Nhanes/2017-2018/BPX_J.XPT', '/Nchs/Nhanes/2017-2018/BMX_J.XPT']

In [None]:
# Download the selected Examination files into file_dir, skipping files that
# are already present locally.
for link in exam_links_filter:
  file_url = file_base_link + link
  # Local file name is the last path component of the link
  file_name = link.split('/')[-1]
  file_loc = os.path.join(file_dir, file_name)

  # Check that file does not already exist
  if not os.path.isfile(file_loc):
    print('Getting file: %s' % file_url)
    # Download under a temporary name and move it into place only on success,
    # so an interrupted transfer never leaves a truncated file that the
    # isfile() check above would skip on the next run.
    part_loc = file_loc + '.part'
    try:
      urllib.request.urlretrieve(file_url, part_loc)
      os.replace(part_loc, file_loc)
    finally:
      # Still present only if the download or the move failed.
      if os.path.exists(part_loc):
        os.remove(part_loc)

Getting file: https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/BPX_J.XPT
Getting file: https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/BMX_J.XPT


In [None]:
import pandas as pd

In [None]:
# Sanity check: load the file at file_loc (the path left behind by the last
# iteration of the preceding download loop) and preview its first rows.
test = pd.read_sas(file_loc)
test.head()

Unnamed: 0,SEQN,BMDSTATS,BMXWT,BMIWT,BMXRECUM,BMIRECUM,BMXHEAD,BMIHEAD,BMXHT,BMIHT,...,BMXLEG,BMILEG,BMXARML,BMIARML,BMXARMC,BMIARMC,BMXWAIST,BMIWAIST,BMXHIP,BMIHIP
0,93703.0,1.0,13.7,3.0,89.6,,,,88.6,,...,,,18.0,,16.2,,48.2,,,
1,93704.0,1.0,13.9,,95.0,,,,94.2,,...,,,18.6,,15.2,,50.0,,,
2,93705.0,1.0,79.5,,,,,,158.3,,...,37.0,,36.0,,32.0,,101.8,,110.0,
3,93706.0,1.0,66.3,,,,,,175.7,,...,46.6,,38.8,,27.0,,79.3,,94.4,
4,93707.0,1.0,45.4,,,,,,158.4,,...,38.1,,33.8,,21.5,,64.1,,83.0,


In [None]:
# Fetch the Dietary component page (second entry of file_link) and collect
# every .XPT download link it lists.
# The leading '..' of the relative link is dropped before appending to base_link.
Dietary = requests.get(base_link + file_link[1][2:], timeout=60)
print(Dietary)
# Fail fast on an HTTP error instead of silently producing an empty link list.
Dietary.raise_for_status()

Dietary_content = BeautifulSoup(Dietary.content, 'html.parser')
file_urls = Dietary_content.findAll('td', class_="text-center")

Dietary_links = []

for url in file_urls:
  for i in url.findAll('a'):
    # .get() tolerates anchors that have no href attribute; the extension
    # match ignores case so links written as '.xpt' are not dropped.
    href = i.get('href', '')
    if '.xpt' in href.lower():
      Dietary_links.append(href)

print(Dietary_links)

<Response [200]>
['/Nchs/Nhanes/2017-2018/DR1IFF_J.XPT', '/Nchs/Nhanes/2017-2018/DR2IFF_J.XPT', '/Nchs/Nhanes/2017-2018/DR1TOT_J.XPT', '/Nchs/Nhanes/2017-2018/DR2TOT_J.XPT', '/Nchs/Nhanes/2017-2018/DRXFCD_J.XPT', '/Nchs/Nhanes/1999-2000/DSBI.XPT', '/Nchs/Nhanes/1999-2000/DSII.XPT', '/Nchs/Nhanes/1999-2000/DSPI.XPT', '/Nchs/Nhanes/2017-2018/DS1IDS_J.XPT', '/Nchs/Nhanes/2017-2018/DS2IDS_J.XPT', '/Nchs/Nhanes/2017-2018/DS1TOT_J.XPT', '/Nchs/Nhanes/2017-2018/DS2TOT_J.XPT', '/Nchs/Nhanes/2017-2018/DSQIDS_J.XPT', '/Nchs/Nhanes/2017-2018/DSQTOT_J.XPT']


In [None]:
# File-name prefixes (the text before the first '_') of the Dietary files to keep.
# Note: this rebinds the same `required` name used for the previous component.
required = ['DR1TOT', 'DR2TOT']

In [None]:
# Keep the Dietary links whose file-name prefix (text before the first
# underscore of the last path component) is listed in `required`.
Dietary_links_filter = [
  link
  for link in Dietary_links
  if link.rsplit('/', 1)[-1].partition('_')[0] in required
]

print(Dietary_links_filter)

['/Nchs/Nhanes/2017-2018/DR1TOT_J.XPT', '/Nchs/Nhanes/2017-2018/DR2TOT_J.XPT']


In [None]:
# Download the selected Dietary files into file_dir, skipping files that are
# already present locally.
for link in Dietary_links_filter:
  file_url = file_base_link + link
  # Local file name is the last path component of the link
  file_name = link.split('/')[-1]
  file_loc = os.path.join(file_dir, file_name)

  # Check that file does not already exist
  if not os.path.isfile(file_loc):
    print('Getting file: %s' % file_url)
    # Download under a temporary name and move it into place only on success,
    # so an interrupted transfer never leaves a truncated file that the
    # isfile() check above would skip on the next run.
    part_loc = file_loc + '.part'
    try:
      urllib.request.urlretrieve(file_url, part_loc)
      os.replace(part_loc, file_loc)
    finally:
      # Still present only if the download or the move failed.
      if os.path.exists(part_loc):
        os.remove(part_loc)

Getting file: https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/DR1TOT_J.XPT
Getting file: https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/DR2TOT_J.XPT


In [None]:
# Sanity check: load the file at file_loc (the path left behind by the last
# iteration of the preceding download loop) and preview its first rows.
test = pd.read_sas(file_loc)
test.head()

Unnamed: 0,SEQN,WTDRD1,WTDR2D,DR2DRSTZ,DR2EXMER,DRABF,DRDINT,DR2DBIH,DR2DAY,DR2LANG,...,DR2TP184,DR2TP204,DR2TP205,DR2TP225,DR2TP226,DR2_300,DR2_320Z,DR2_330Z,DR2BWATZ,DR2TWSZ
0,93703.0,5.397605e-79,,5.0,,,,,,,...,,,,,,,,,,
1,93704.0,81714.01,82442.87,1.0,87.0,2.0,2.0,18.0,6.0,1.0,...,0.003,0.041,5.397605e-79,0.002,0.01,2.0,5.397605e-79,5.397605e-79,5.397605e-79,91.0
2,93705.0,7185.561,5640.391,1.0,91.0,2.0,2.0,15.0,4.0,1.0,...,0.001,0.201,0.002,0.011,0.059,2.0,960.0,5.397605e-79,960.0,1.0
3,93706.0,6463.883,5.397605e-79,5.0,,2.0,1.0,,,,...,,,,,,,,,,
4,93707.0,15333.78,22707.07,1.0,78.0,2.0,2.0,22.0,3.0,1.0,...,0.001,0.327,0.004,0.025,0.061,3.0,1020.0,780.0,240.0,99.0


In [None]:
# Fetch the Questionnaire component page (fifth entry of file_link) and
# collect every .XPT download link it lists.
# The leading '..' of the relative link is dropped before appending to base_link.
Questionnaire = requests.get(base_link + file_link[4][2:], timeout=60)
print(Questionnaire)
# Fail fast on an HTTP error instead of silently producing an empty link list.
Questionnaire.raise_for_status()

Questionnaire_content = BeautifulSoup(Questionnaire.content, 'html.parser')
file_urls = Questionnaire_content.findAll('td', class_="text-center")

Questionnaire_links = []

for url in file_urls:
  for i in url.findAll('a'):
    # .get() tolerates anchors that have no href attribute; the extension
    # match ignores case so links written as '.xpt' are not dropped.
    href = i.get('href', '')
    if '.xpt' in href.lower():
      Questionnaire_links.append(href)

print(Questionnaire_links)

<Response [200]>
['/Nchs/Nhanes/2017-2018/ACQ_J.XPT', '/Nchs/Nhanes/2017-2018/ALQ_J.XPT', '/Nchs/Nhanes/2017-2018/AUQ_J.XPT', '/Nchs/Nhanes/2017-2018/BPQ_J.XPT', '/Nchs/Nhanes/2017-2018/CDQ_J.XPT', '/Nchs/Nhanes/2017-2018/CBQ_J.XPT', '/Nchs/Nhanes/2017-2018/CBQPFA_J.XPT', '/Nchs/Nhanes/2017-2018/CBQPFC_J.XPT', '/Nchs/Nhanes/2017-2018/HSQ_J.XPT', '/Nchs/Nhanes/2017-2018/DEQ_J.XPT', '/Nchs/Nhanes/2017-2018/DIQ_J.XPT', '/Nchs/Nhanes/2017-2018/DBQ_J.XPT', '/Nchs/Nhanes/2017-2018/DLQ_J.XPT', '/Nchs/Nhanes/2017-2018/DUQ_J.XPT', '/Nchs/Nhanes/2017-2018/ECQ_J.XPT', '/Nchs/Nhanes/2017-2018/FSQ_J.XPT', '/Nchs/Nhanes/2017-2018/HIQ_J.XPT', '/Nchs/Nhanes/2017-2018/HEQ_J.XPT', '/Nchs/Nhanes/2017-2018/HUQ_J.XPT', '/Nchs/Nhanes/2017-2018/HOQ_J.XPT', '/Nchs/Nhanes/2017-2018/IMQ_J.XPT', '/Nchs/Nhanes/2017-2018/INQ_J.XPT', '/Nchs/Nhanes/2017-2018/KIQ_U_J.XPT', '/Nchs/Nhanes/2017-2018/MCQ_J.XPT', '/Nchs/Nhanes/2017-2018/DPQ_J.XPT', '/Nchs/Nhanes/2017-2018/OCQ_J.XPT', '/Nchs/Nhanes/2017-2018/OHQ_J.XPT', '/

In [None]:
# File-name prefixes (the text before the first '_') of the Questionnaire files to keep.
# Note: this rebinds the same `required` name used for the previous components.
required = ['ALQ', 'BPQ', 'CBQ', 'DBQ', 'DPQ', 'DIQ', 'HIQ', 'HSQ', 'OCQ', 'MCQ', 'PAQ', 'SLQ', 'SMQ', 'SMQFAM']

In [None]:
# Keep the Questionnaire links whose file-name prefix (text before the first
# underscore of the last path component) is listed in `required`.
Questionnaire_links_filter = [
  link
  for link in Questionnaire_links
  if link.rsplit('/', 1)[-1].partition('_')[0] in required
]

print(Questionnaire_links_filter)

['/Nchs/Nhanes/2017-2018/ALQ_J.XPT', '/Nchs/Nhanes/2017-2018/BPQ_J.XPT', '/Nchs/Nhanes/2017-2018/CBQ_J.XPT', '/Nchs/Nhanes/2017-2018/HSQ_J.XPT', '/Nchs/Nhanes/2017-2018/DIQ_J.XPT', '/Nchs/Nhanes/2017-2018/DBQ_J.XPT', '/Nchs/Nhanes/2017-2018/HIQ_J.XPT', '/Nchs/Nhanes/2017-2018/MCQ_J.XPT', '/Nchs/Nhanes/2017-2018/DPQ_J.XPT', '/Nchs/Nhanes/2017-2018/OCQ_J.XPT', '/Nchs/Nhanes/2017-2018/PAQ_J.XPT', '/Nchs/Nhanes/2017-2018/SLQ_J.XPT', '/Nchs/Nhanes/2017-2018/SMQ_J.XPT', '/Nchs/Nhanes/2017-2018/SMQFAM_J.XPT']


In [None]:
# Number of requested Questionnaire prefixes, to compare with the number of matched links.
len(required)

14

In [None]:
# Number of links that matched; equal to len(required) when every prefix was found once.
len(Questionnaire_links_filter)

14

In [None]:
# Download the selected Questionnaire files into file_dir, skipping files
# that are already present locally.
for link in Questionnaire_links_filter:
  file_url = file_base_link + link
  # Local file name is the last path component of the link
  file_name = link.split('/')[-1]
  file_loc = os.path.join(file_dir, file_name)

  # Check that file does not already exist
  if not os.path.isfile(file_loc):
    print('Getting file: %s' % file_url)
    # Download under a temporary name and move it into place only on success,
    # so an interrupted transfer never leaves a truncated file that the
    # isfile() check above would skip on the next run.
    part_loc = file_loc + '.part'
    try:
      urllib.request.urlretrieve(file_url, part_loc)
      os.replace(part_loc, file_loc)
    finally:
      # Still present only if the download or the move failed.
      if os.path.exists(part_loc):
        os.remove(part_loc)
    # Pause between downloads to avoid hammering the server.
    time.sleep(10)

Getting file: https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/ALQ_J.XPT
Getting file: https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/BPQ_J.XPT
Getting file: https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/CBQ_J.XPT
Getting file: https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/HSQ_J.XPT
Getting file: https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/DIQ_J.XPT
Getting file: https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/DBQ_J.XPT
Getting file: https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/HIQ_J.XPT
Getting file: https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/MCQ_J.XPT
Getting file: https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/DPQ_J.XPT
Getting file: https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/OCQ_J.XPT
Getting file: https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/PAQ_J.XPT
Getting file: https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/SLQ_J.XPT
Getting file: https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/SMQ_J.XPT
Getting file: https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/SMQFAM_J.XPT


In [None]:
# Sanity check: load the file at file_loc (the path left behind by the last
# iteration of the preceding download loop) and preview its first rows.
test = pd.read_sas(file_loc)
test.head()

Unnamed: 0,SEQN,SMD460,SMD470,SMD480
0,93703.0,5.397605e-79,,
1,93704.0,5.397605e-79,,
2,93705.0,5.397605e-79,,
3,93706.0,,,
4,93707.0,1.0,1.0,5.0


In [None]:
# Names of the regular files (sub-directories excluded) currently in file_dir.
entries = list(
  filter(
    lambda name: os.path.isfile(os.path.join(file_dir, name)),
    os.listdir(file_dir),
  )
)

In [None]:
# Count of files found in file_dir, as a check on the downloads.
len(entries)

19

In [None]:
# Rebinds file_dir with the 2017 directory hard-coded; this equals the value
# the earlier setup cell derives from `year` only while year == 2017.
# NOTE(review): keep in sync with `year` if another cycle is downloaded.
file_dir = '/content/drive/MyDrive/Hypertension_Final_Project/data_2017'

In [None]:
# Rename files: drop everything from the first underscore of the file name up
# to the extension, e.g. 'BPX_J.XPT' -> 'BPX.XPT'.

for filename in os.listdir(file_dir):
  if '_' in filename:
    # Take the extension from the whole file name so that names containing
    # more than one underscore (e.g. 'KIQ_U_J.XPT') keep their '.XPT'
    # extension instead of inheriting a fragment of the stem.
    stem, ext = os.path.splitext(filename)
    new_name = stem.split('_')[0] + ext
    # Skip names that would collapse to nothing or stay unchanged.
    if new_name and new_name != filename:
      os.rename(os.path.join(file_dir, filename), os.path.join(file_dir, new_name))