In [None]:
import pandas as pd
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, parse_qs
import urllib.request
import time
import re

import warnings
warnings.filterwarnings('ignore')

# Root folder on the mounted Google Drive; later cells write their outputs under "<base_path>/Dataset".
base_path = "/content/drive/MyDrive/Temp Folder"
# NHANES data-page search URL pinned to CycleBeginYear=2017; a "&Component=<name>" suffix is appended per request.
base_url = "https://wwwn.cdc.gov/nchs/nhanes/search/datapage.aspx?CycleBeginYear=2017"
# Component names substituted into the "&Component=" query parameter, one listing page each.
dataset_names = ['Demographics', 'Dietary', 'Examination', 'Laboratory', 'Questionnaire']

In [None]:
# Colab-only: mount Google Drive at /content/drive, the prefix that base_path points into.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Wall-clock start of the whole run; the final cell subtracts this from its own timestamp.
start_overall = time.time()

In [None]:
# Download and parse the NHANES listing page of every component in dataset_names.
# Result: dataset_html, a list of {'name': component name, 'soup': parsed page}.
start = time.time()
loop_num = 0

# Browser-like User-Agent sent with every request; it is the same for all
# components, so it is built once instead of on every iteration.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/5##.## (KHTML, like Gecko) Chrome/1##.#.#.# Safari/5##.##"
}

# Get Dataset HTML
dataset_html = []
for name in dataset_names:
  time.sleep(0.5)  # short pause between consecutive requests
  url = base_url + "&Component=" + name
  # A timeout keeps a stalled connection from hanging the notebook forever.
  response = requests.get(url, headers=headers, timeout=60)
  # Stop on HTTP errors instead of parsing an error page into an empty listing.
  response.raise_for_status()
  soup = BeautifulSoup(response.content, "html.parser")
  dataset_html.append({'name':name, 'soup':soup})
  loop_num += 1

end = time.time()
duration = end - start
print("Loop Num: ", loop_num)
print("Time Taken:", duration, "Second")

Loop Num:  5
Time Taken: 10.709123611450195 Second


In [None]:
# For every component page collect three parallel lists:
#   titles - text of each <td class="text-left"> cell
#   docs   - hrefs containing "/Nchs/Nhanes/" and ".htm" (case-insensitive extension match)
#   xpts   - hrefs containing "/Nchs/Nhanes/" and ".xpt" (case-insensitive extension match)
dataset_information = []
for dataset in dataset_html:
  soup = dataset['soup']
  data_title = [td.text for td in soup.find_all('td', class_="text-left")]

  data_doc = []
  data_xpt = []
  # href=True skips anchors that carry no href attribute; indexing those would raise KeyError.
  for an in soup.find_all('a', href=True):
    href = an['href']
    if "/Nchs/Nhanes/" not in href:
      continue
    href_lower = href.lower()
    if ".htm" in href_lower:
      data_doc.append(href)
    if ".xpt" in href_lower:
      data_xpt.append(href)

  # Later cells pair these lists by position with zip(); a count mismatch would
  # silently attach titles to the wrong links, so make it visible here.
  if not (len(data_title) == len(data_doc) == len(data_xpt)):
    print("WARNING: title/doc/xpt counts differ for", dataset['name'],
          "- positional pairing of titles and links may be wrong")

  dataset_information.append({'name': dataset['name'], 'titles': data_title, 'docs': data_doc, 'xpts': data_xpt})

# Per-component summary of how many entries were found.
for dataset in dataset_information:
  print("Name:", dataset['name'])
  print("Num of Dataset Title: ", str(len(dataset['titles'])))
  print("Num of Dataset Docs: ", str(len(dataset['docs'])))
  print("Num of Dataset XPTs: ", str(len(dataset['xpts'])))
  print("-"*10)
  print()

Name: Demographics
Num of Dataset Title:  1
Num of Dataset Docs:  1
Num of Dataset XPTs:  1
----------

Name: Dietary
Num of Dataset Title:  14
Num of Dataset Docs:  14
Num of Dataset XPTs:  14
----------

Name: Examination
Num of Dataset Title:  14
Num of Dataset Docs:  14
Num of Dataset XPTs:  14
----------

Name: Laboratory
Num of Dataset Title:  53
Num of Dataset Docs:  53
Num of Dataset XPTs:  53
----------

Name: Questionnaire
Num of Dataset Title:  44
Num of Dataset Docs:  44
Num of Dataset XPTs:  44
----------



In [None]:
# Download and parse the documentation page of every dataset found on the listing pages.
# Result: docs_html, a list of {'name', 'doc_url', 'titles', 'soup'} dicts.
start = time.time()
loop_num = 0

# Same User-Agent for every request, so it is built once outside the loops.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36"
}

# Get Docs HTML
docs_html = []
for dataset in dataset_information:
  for title, doc in zip(dataset['titles'], dataset['docs']):
    time.sleep(0.5)  # short pause between consecutive requests
    # urljoin avoids the "//" that plain concatenation produces when the href
    # already starts with "/", and leaves an absolute href untouched.
    url = urllib.parse.urljoin("https://wwwn.cdc.gov/", doc)
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(url, headers=headers, timeout=60)
    # Stop on HTTP errors instead of parsing an error page as documentation.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")
    docs_html.append({'name':dataset['name'], 'doc_url': doc, 'titles':title, 'soup':soup})
    loop_num += 1

end = time.time()
duration = end - start
print("Loop Num: ", loop_num)
print("Time Taken:", duration, "Second")

len(docs_html)

Loop Num:  126
Time Taken: 148.33208775520325 Second


126

In [None]:
# Get Variable List for Each dataset
# Every <h3 class="vartitle"> heading on a documentation page is treated as one variable:
# its id is the variable name and its text (minus the "<id> - " part) the description.
datasets_variable = []
for doc in docs_html:
  variables = []
  for title in doc['soup'].find_all('h3', class_="vartitle"):
    var_name = title['id']
    # Plain string replacement: the id is literal text, not a regular-expression pattern.
    var_desc = title.text.replace(var_name + " - ", "")

    missing_count = None
    total_count = None
    missing_value_row = title.find_next(lambda tag: tag.name == 'td' and 'Missing' in tag.text)
    # find_next scans the whole remainder of the page. Accept the cell only when this
    # heading is still the closest preceding variable heading; otherwise the cell belongs
    # to a later variable (or does not exist) and the counts are left empty.
    if missing_value_row is not None and missing_value_row.find_previous('h3', class_="vartitle") is title:
      # The two <td> cells after the "Missing" cell are read as the missing count and
      # the total count - presumably the count / cumulative columns; verify against the page layout.
      count_cell = missing_value_row.find_next('td')
      total_cell = count_cell.find_next('td') if count_cell is not None else None
      missing_count = count_cell.text if count_cell is not None else None
      total_count = total_cell.text if total_cell is not None else None

    variables.append({'var_name':var_name, 'var_desc':var_desc, 'missing_count':missing_count, 'total_count': total_count})

  datasets_variable.append({'name': doc['name'], 'title':doc['titles'], 'doc_url': doc['doc_url'], 'variables':variables})

print(len(datasets_variable))

# Flatten to one record per (dataset, variable) and build the frame in a single step;
# concatenating one row at a time copies the whole frame on every iteration.
variable_records = [
    {
        'name': dataset['name'],
        'title': dataset['title'],
        'doc_url': dataset['doc_url'],
        'variable': var['var_name'],
        'desc': var['var_desc'],
        'missing_count': var['missing_count'],
        'total_count': var['total_count']
    }
    for dataset in datasets_variable
    for var in dataset['variables']
]
# Explicit columns keep the layout stable even when no variables were found.
dataset_variable_df = pd.DataFrame(
    variable_records,
    columns=['name', 'title', 'doc_url', 'variable', 'desc', 'missing_count', 'total_count']
)

# Make sure the output folder exists before writing the workbook.
os.makedirs(base_path + '/Dataset', exist_ok=True)
dataset_variable_df.to_excel(base_path + '/Dataset/Variable List.xlsx', index=False)

In [None]:
# Download every XPT data file into "<base_path>/Dataset/Raw XPT/<component>/<title>.XPT".
# Result: xpt_paths, a list of {'name', 'titles', 'file_path'} dicts, one per downloaded file.
# The XPT -> CSV conversion is intentionally not done here: the following cell performs it
# and is where csv_paths and the "Raw CSV" folder are created. The copy of that loop that
# used to sit in this cell referenced csv_paths before it existed.
start = time.time()
loop_num = 0

xpt_paths = []

print("Downloading Data...")

for dataset in dataset_information:
  dir_path = base_path + "/Dataset/Raw XPT/" + dataset['name']
  # makedirs also creates the missing "Dataset" / "Raw XPT" parents and is a no-op on re-runs.
  os.makedirs(dir_path, exist_ok=True)

  for title, xpt in zip(dataset['titles'], dataset['xpts']):
    # "/" inside a title would be read as a path separator, so it is replaced by a space.
    file_path = dir_path+"/"+(title.replace("/", " ").strip())+".XPT"

    time.sleep(0.5)  # short pause between consecutive downloads
    # urljoin copes with both site-relative ("/Nchs/...") and absolute hrefs.
    url = urllib.parse.urljoin("https://wwwn.cdc.gov/", xpt)
    urllib.request.urlretrieve(url, file_path)

    xpt_paths.append({'name': dataset['name'], 'titles':title, 'file_path':file_path})
    print("URL: ", url)
    print("File Path: ", file_path)
    print()
    loop_num += 1


end = time.time()
duration = end - start
print("Loop Num: ", loop_num)
print("Time Taken:", duration, "Second")

Downloading Data...
URL:  https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/DEMO_J.XPT
File Path:  /content/drive/MyDrive/Temp Folder/Dataset/Raw XPT/Demographics/Demographic Variables and Sample Weights.XPT

URL:  https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/DR1IFF_J.XPT
File Path:  /content/drive/MyDrive/Temp Folder/Dataset/Raw XPT/Dietary/Dietary Interview - Individual Foods, First Day.XPT

URL:  https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/DR2IFF_J.XPT
File Path:  /content/drive/MyDrive/Temp Folder/Dataset/Raw XPT/Dietary/Dietary Interview - Individual Foods, Second Day.XPT

URL:  https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/DR1TOT_J.XPT
File Path:  /content/drive/MyDrive/Temp Folder/Dataset/Raw XPT/Dietary/Dietary Interview - Total Nutrient Intakes, First Day.XPT

URL:  https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/DR2TOT_J.XPT
File Path:  /content/drive/MyDrive/Temp Folder/Dataset/Raw XPT/Dietary/Dietary Interview - Total Nutrient Intakes, Second Day.XPT

URL:  https://wwwn.cdc.gov/Nchs/Nhanes/2

In [None]:
# Convert every downloaded XPT file to CSV under "<base_path>/Dataset/Raw CSV/<component>/<title>.csv".
# Result: csv_paths, a list of {'name', 'titles', 'file_path'} dicts, one per written CSV.
start = time.time()
loop_num = 0

csv_paths = []
# Create the CSV root (and any missing parents) up front; no-op when it already exists.
os.makedirs(base_path + "/Dataset/Raw CSV/", exist_ok=True)

print("Transforming Data Into CSV...")

for xpt in xpt_paths:
  dir_path = base_path + "/Dataset/Raw CSV/" + xpt['name']
  os.makedirs(dir_path, exist_ok=True)

  # "/" inside a title would be read as a path separator, so it is replaced by a space.
  file_path = dir_path+"/"+(xpt['titles'].replace("/", " ").strip())+".csv"

  data_df = pd.read_sas(xpt['file_path'])

  data_df.to_csv(file_path, index=False)

  csv_paths.append({'name': xpt['name'], 'titles':xpt['titles'], 'file_path':file_path})
  # Report the source file of this conversion. The download loop's `url` variable is
  # stale here and would show the same last-downloaded URL for every file.
  print("XPT Path: ", xpt['file_path'])
  print("File Path: ", file_path)
  print()
  loop_num += 1


end = time.time()
duration = end - start
print("Loop Num: ", loop_num)
print("Time Taken:", duration, "Second")

Transforming Data Into CSV...
URL:  https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/WHQMEC_J.XPT
File Path:  /content/drive/MyDrive/Temp Folder/Dataset/Raw CSV/Demographics/Demographic Variables and Sample Weights.csv

URL:  https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/WHQMEC_J.XPT
File Path:  /content/drive/MyDrive/Temp Folder/Dataset/Raw CSV/Dietary/Dietary Interview - Individual Foods, First Day.csv

URL:  https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/WHQMEC_J.XPT
File Path:  /content/drive/MyDrive/Temp Folder/Dataset/Raw CSV/Dietary/Dietary Interview - Individual Foods, Second Day.csv

URL:  https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/WHQMEC_J.XPT
File Path:  /content/drive/MyDrive/Temp Folder/Dataset/Raw CSV/Dietary/Dietary Interview - Total Nutrient Intakes, First Day.csv

URL:  https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/WHQMEC_J.XPT
File Path:  /content/drive/MyDrive/Temp Folder/Dataset/Raw CSV/Dietary/Dietary Interview - Total Nutrient Intakes, Second Day.csv

URL:  https://wwwn.cdc.gov/N

In [None]:
# Build one table with a row per dataset: component name, title, doc link, XPT link,
# plus the local XPT and CSV paths, then save it as an Excel workbook and display it.
info_columns = ['name', 'titles', 'doc', 'xpt']
path_columns = ['name', 'titles', 'file_path']

# One frame per component, concatenated once (no empty seed frame to concatenate onto).
info_frames = [
    pd.DataFrame({'name': dataset['name'], 'titles':dataset['titles'], 'doc': dataset['docs'], 'xpt': dataset['xpts']})
    for dataset in dataset_information
]
if info_frames:
  dataset_info_df = pd.concat(info_frames, axis=0, ignore_index=True)
else:
  # pd.concat rejects an empty list; fall back to an empty table with the same columns.
  dataset_info_df = pd.DataFrame(columns=info_columns)

# Explicit columns keep the merge keys present even when a path list is empty.
xpt_path_df = pd.DataFrame(xpt_paths, columns=path_columns).rename(columns={'file_path':'xpt_path'})
csv_path_df = pd.DataFrame(csv_paths, columns=path_columns).rename(columns={'file_path':'csv_path'})

# Left joins keep every listed dataset, with empty path cells where no file was recorded.
dataset_info_df = dataset_info_df.merge(xpt_path_df, how='left', on=['name', 'titles'])
dataset_info_df = dataset_info_df.merge(csv_path_df, how='left', on=['name', 'titles'])

# Make sure the output folder exists before writing the workbook.
os.makedirs(base_path + '/Dataset/', exist_ok=True)
dataset_info_df.to_excel(base_path + '/Dataset/' + 'Dataset Info.xlsx', index=False)
dataset_info_df

Unnamed: 0,name,titles,doc,xpt,xpt_path,csv_path
0,Demographics,Demographic Variables and Sample Weights,/Nchs/Nhanes/2017-2018/DEMO_J.htm,/Nchs/Nhanes/2017-2018/DEMO_J.XPT,/content/drive/MyDrive/Temp Folder/Dataset/Raw...,/content/drive/MyDrive/Temp Folder/Dataset/Raw...
1,Dietary,"Dietary Interview - Individual Foods, First Day",/Nchs/Nhanes/2017-2018/DR1IFF_J.htm,/Nchs/Nhanes/2017-2018/DR1IFF_J.XPT,/content/drive/MyDrive/Temp Folder/Dataset/Raw...,/content/drive/MyDrive/Temp Folder/Dataset/Raw...
2,Dietary,"Dietary Interview - Individual Foods, Second Day",/Nchs/Nhanes/2017-2018/DR2IFF_J.htm,/Nchs/Nhanes/2017-2018/DR2IFF_J.XPT,/content/drive/MyDrive/Temp Folder/Dataset/Raw...,/content/drive/MyDrive/Temp Folder/Dataset/Raw...
3,Dietary,"Dietary Interview - Total Nutrient Intakes, Fi...",/Nchs/Nhanes/2017-2018/DR1TOT_J.htm,/Nchs/Nhanes/2017-2018/DR1TOT_J.XPT,/content/drive/MyDrive/Temp Folder/Dataset/Raw...,/content/drive/MyDrive/Temp Folder/Dataset/Raw...
4,Dietary,"Dietary Interview - Total Nutrient Intakes, Se...",/Nchs/Nhanes/2017-2018/DR2TOT_J.htm,/Nchs/Nhanes/2017-2018/DR2TOT_J.XPT,/content/drive/MyDrive/Temp Folder/Dataset/Raw...,/content/drive/MyDrive/Temp Folder/Dataset/Raw...
...,...,...,...,...,...,...
121,Questionnaire,Smoking - Recent Tobacco Use,/Nchs/Nhanes/2017-2018/SMQRTU_J.htm,/Nchs/Nhanes/2017-2018/SMQRTU_J.XPT,/content/drive/MyDrive/Temp Folder/Dataset/Raw...,/content/drive/MyDrive/Temp Folder/Dataset/Raw...
122,Questionnaire,Smoking - Secondhand Smoke Exposure,/Nchs/Nhanes/2017-2018/SMQSHS_J.htm,/Nchs/Nhanes/2017-2018/SMQSHS_J.XPT,/content/drive/MyDrive/Temp Folder/Dataset/Raw...,/content/drive/MyDrive/Temp Folder/Dataset/Raw...
123,Questionnaire,Volatile Toxicant,/Nchs/Nhanes/2017-2018/VTQ_J.htm,/Nchs/Nhanes/2017-2018/VTQ_J.XPT,/content/drive/MyDrive/Temp Folder/Dataset/Raw...,/content/drive/MyDrive/Temp Folder/Dataset/Raw...
124,Questionnaire,Weight History,/Nchs/Nhanes/2017-2018/WHQ_J.htm,/Nchs/Nhanes/2017-2018/WHQ_J.XPT,/content/drive/MyDrive/Temp Folder/Dataset/Raw...,/content/drive/MyDrive/Temp Folder/Dataset/Raw...


In [None]:
# Total wall-clock time since start_overall was recorded near the top of the notebook.
end_overall = time.time()
duration = end_overall - start_overall
print("Time Taken:", duration, "Second")

Time Taken: 869.2555108070374 Second
