# DATA PROJECT

In [1]:
import re
import time
import requests
import numpy as np
from bs4 import BeautifulSoup
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
from matplotlib import pyplot as plt, rcParams
from selenium import webdriver
import pandas as pd
from pydataset import data

# Gathering the data

The data I am using was available on Kaggle in CSV form. I loaded all of the CSV files and merged them into a single data frame keyed on the sequence number (`SEQN`). The data contains the results of surveying 9813 people on 1812 different questions. 

In [83]:
# Look up the scraped description for the variable code "WHD080L".
# NOTE(review): `var_info` is only assigned by the get_variable_info() call in a
# later cell, so this cell fails on a fresh top-to-bottom run.
var_info["WHD080L"]

'How did {you/SP} try to lose weight?'

In [152]:
all_data = {}

data_files = ["demographic","diet","examination", "labs", "questionnaire"]

# Load each survey component into its own frame, keyed by file stem.
# The frame is stored directly rather than bound to a variable called `data`,
# which would shadow the `data` function imported from pydataset.
for filename in data_files:
    all_data[filename] = pd.read_csv(filename + ".csv")

# Join every component on the sequence number. The key is given explicitly for
# every merge: without `on=`, pandas joins on *all* columns the two frames
# share, which can silently drop rows if another column name happens to overlap.
together = all_data[data_files[0]]
for filename in data_files[1:]:
    together = pd.merge(together, all_data[filename], on="SEQN")

together.head()

Unnamed: 0,SEQN,SDDSRVYR,RIDSTATR,RIAGENDR,RIDAGEYR,RIDAGEMN,RIDRETH1,RIDRETH3,RIDEXMON,RIDEXAGM,...,WHD080U,WHD080L,WHD110,WHD120,WHD130,WHD140,WHQ150,WHQ030M,WHQ500,WHQ520
0,73557,8,2,1,69,,4,4,1.0,,...,,40.0,270.0,200.0,69.0,270.0,62.0,,,
1,73558,8,2,1,54,,3,3,1.0,,...,,,240.0,250.0,72.0,250.0,25.0,,,
2,73559,8,2,1,72,,3,3,2.0,,...,,,180.0,190.0,70.0,228.0,35.0,,,
3,73560,8,2,1,9,,3,3,1.0,119.0,...,,,,,,,,3.0,3.0,3.0
4,73561,8,2,2,73,,3,3,1.0,,...,,,150.0,135.0,67.0,170.0,60.0,,,


# Scraping details on variables

The variable names in the data were extremely ambiguous, so I scraped descriptions for them from the official NHANES site (Centers for Disease Control and Prevention). Alongside each variable description was a classification indicating what type of data the variable represents. I scraped this classification as well and stored all variables belonging to a given classification as the values of a dictionary keyed by that classification. I also created two more dictionaries: one to store descriptions of variable names and another to store descriptions of categories. 

In [154]:
def _fetch_soup(url, timeout=30):
    """Download `url` and return its HTML parsed with BeautifulSoup.

    Raises requests.HTTPError on a non-2xx response so that an error page is
    never parsed as if it were real content, and gives up after `timeout`
    seconds instead of hanging indefinitely.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return BeautifulSoup(response.text, "html.parser")


def get_variable_info():
    """Crawl through the NHANES site to get variable names and descriptions for data.

    Starting from the 2013 landing page, follow each component link
    (Demographics, Dietary, Examination, Laboratory, Questionnaire), then that
    component's "Variable List" link, and read the variable table found there.

    Returns:
        tuple: ``(variable_info, category_info, sorted_variables, to_drop)``

        * variable_info (dict): variable name -> variable description.
        * category_info (dict): sub-category name -> sub-category description.
        * sorted_variables (dict): sub-category name -> list of the variable
          names in it ("SEQN" is excluded).
        * to_drop (list): variable names whose description matches
          ``find_drop`` or whose name matches ``var_drop``.

    Raises:
        requests.RequestException: if a page cannot be downloaded.
        IndexError: if a component page has no "Variable List" link.
    """
    variable_info = {}
    category_info = {}
    sorted_variables = {}
    to_drop = []

    # Descriptions / name prefixes that mark a variable as irrelevant here.
    find_drop = re.compile(r'(Age in months|How did|[Ww]as the|Armed Forces|National Guard)')
    var_drop = re.compile(r'^(WHQ|SX|SMQ|DR[12]IFF_H|DSBI|DSPI|DXX)')

    datatype_finder = re.compile(r"^(Demographics|Dietary|Examination|Laboratory|Questionnaire)$")
    varpage_finder = re.compile(r"^NHANES 2013-2014.*Variable List$")
    the_base = "https://wwwn.cdc.gov"
    base_url = "https://wwwn.cdc.gov/nchs/nhanes/continuousnhanes/default.aspx?BeginYear=2013"
    search_base = the_base + "/nchs/nhanes/search/"

    base_soup = _fetch_soup(base_url)
    page_tags = base_soup.find_all(name='a', href=True, string=datatype_finder)
    pages = [the_base + tag.attrs["href"] for tag in page_tags]

    for page in pages:
        time.sleep(1)  # pause between requests to the remote site
        soup = _fetch_soup(page)

        link = soup.find(name="a", string=varpage_finder, href=True)
        if link is None:
            # Same exception type the bare `[0]` lookup used to raise, but
            # with a message that says which page was the problem.
            raise IndexError("no 'Variable List' link found on " + page)
        next_page = search_base + link.attrs["href"]

        time.sleep(1)
        varlist_soup = _fetch_soup(next_page)

        # The first two <tr> elements are skipped, as in the original scrape
        # (presumably header rows — confirm against the live page).
        for row in varlist_soup.find_all(name="tr")[2:]:
            cells = row.find_all(name="td")
            if len(cells) < 4:
                # Not a data row (e.g. a header or spacer); nothing to record.
                continue

            # Strip surrounding whitespace so the anchored `var_drop` pattern
            # and the "SEQN" comparison see the bare values.
            varname, desc, subcat_name, subcat_desc = (
                cell.text.strip() for cell in cells[:4]
            )
            variable_info[varname] = desc
            category_info[subcat_name] = subcat_desc

            if varname != "SEQN":
                sorted_variables.setdefault(subcat_name, []).append(varname)

            if find_drop.search(desc) or var_drop.search(varname):
                to_drop.append(varname)

    return variable_info, category_info, sorted_variables, to_drop

# Run the crawl and unpack its four results into notebook-level names that
# other cells read (descriptions, category descriptions, variables grouped by
# category, and the list of columns to discard).
var_info, cat_info, sorted_vars, to_drop = get_variable_info()

# Cleaning the data

While scraping the variable information, I also built a list of columns that are irrelevant to this project. I will drop these columns from the data to simplify later analysis.

In [155]:
# Upper-case the scraped names and keep only those that are actual columns of
# the merged frame; DataFrame.drop raises KeyError for labels it cannot find.
merged_columns = set(together.columns)
to_drop = [name.upper() for name in to_drop if name.upper() in merged_columns]

# Rebind the result instead of mutating with inplace=True.
together = together.drop(columns=to_drop)
together.head()

Unnamed: 0,SEQN,SDDSRVYR,RIDSTATR,RIAGENDR,RIDAGEYR,RIDRETH1,RIDRETH3,RIDEXMON,DMDBORN4,DMDCITZN,...,SMAQUEX.x,SMDANY,SMAQUEX.y,WHD010,WHD020,WHD050,WHD110,WHD120,WHD130,WHD140
0,73557,8,2,1,69,4,4,1.0,1,1.0,...,2.0,1.0,2.0,69.0,180.0,210.0,270.0,200.0,69.0,270.0
1,73558,8,2,1,54,3,3,1.0,1,1.0,...,2.0,1.0,2.0,71.0,200.0,160.0,240.0,250.0,72.0,250.0
2,73559,8,2,1,72,3,3,2.0,1,1.0,...,2.0,2.0,2.0,70.0,195.0,195.0,180.0,190.0,70.0,228.0
3,73560,8,2,1,9,3,3,1.0,1,1.0,...,3.0,,,,,,,,,
4,73561,8,2,2,73,3,3,1.0,1,1.0,...,2.0,2.0,2.0,67.0,120.0,150.0,150.0,135.0,67.0,170.0


In [71]:
# Show the descriptions of the two depression-screener categories, then the
# total number of variables they contain between them.
depression_groups = ("DPQ_H", "DPQY_H_R")
for group in depression_groups:
    print(cat_info[group])
print(sum(len(sorted_vars[group]) for group in depression_groups))

Mental Health - Depression Screener
Mental Health - Depression Screener - Youth
20


# Commentary on the data

I believe this source to be reliable. The National Health and Nutrition Examination Survey (NHANES) is a program of the National Center for Health Statistics, which is part of the Centers for Disease Control and Prevention and is responsible for providing health statistics for the nation. NHANES has been around for over 50 years and has been consistently producing survey results for 20 of them. 

The data includes demographic, socioeconomic, dietary, and health-related questions. It also includes 20 questions relating to the mental health of adults and youth. 