In [1]:
from urllib.request import urlopen
from unicodedata import normalize
from io import StringIO
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [2]:
# Containers populated by the table-parsing loop further down:
#   data_attributes_dict -> one dict of attributes per variable code
#   value_catalogs       -> the value rows scraped for every variable
data_attributes_dict = {}
value_catalogs = pd.DataFrame()

url = 'https://www.cdc.gov/brfss/annual_data/2019/pdf/codebook19_llcp-v2-508.HTML'

# Parse inside a `with` block so the HTTP response is closed as soon as the
# document has been read, instead of staying open for the kernel's lifetime.
with urlopen(url) as html:
    soup = BeautifulSoup(html, 'lxml')

# Every <table class="table"> element on the page; `find_all` is the
# current spelling of the deprecated `findAll` alias.
html_tables = soup.find_all('table', {'class': 'table'})

In [3]:
def _parse_attribute_lines(text):
    """Turn lines of the form ``Key: value`` into a ``{key: value}`` dict.

    Only the first colon on a line separates the key from the value, so a
    value that itself contains a colon is kept whole. Lines that contain no
    colon at all are skipped instead of raising ``IndexError``.
    """
    attributes = {}
    for line in text.splitlines():
        key, separator, value = line.partition(':')
        if not separator:
            continue
        attributes[key.strip()] = value.strip()
    return attributes


# Collect one frame per table and concatenate once after the loop; calling
# pd.concat inside the loop re-copies all accumulated rows on every iteration.
catalog_frames = []

for table in html_tables[1:]:
    attributes_html = table.find('td')
    if attributes_html is None:
        # No cell to read attributes from.
        continue
    attributes_text = normalize('NFKD', attributes_html.get_text(separator='\n'))
    d = _parse_attribute_lines(attributes_text)

    if 'SAS Variable Name' not in d:
        continue

    # The variable name becomes the key; the remaining attributes are its record.
    code = d.pop('SAS Variable Name')
    data_attributes_dict[code] = d

    # Remove the table's first <tr> and its <colgroup> before handing the
    # markup to read_html. NOTE(review): presumably the first row is the one
    # holding the attribute cell parsed above -- confirm against the page.
    for tag_name in ('tr', 'colgroup'):
        tag = table.find(tag_name)
        if tag is not None:
            tag.decompose()

    value_catalog_df = pd.read_html(StringIO(str(table)))[0]
    value_catalog_df['Code'] = code
    catalog_frames.append(value_catalog_df.set_index(['Code', 'Value']))

# Rebuild value_catalogs from scratch so that re-running this cell does not
# append a second copy of every row.
if catalog_frames:
    value_catalogs = pd.concat(catalog_frames)

In [4]:
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only adds a stray "None" line to the output.
value_catalogs.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 1942 entries, ('_STATE', 1) to ('_AIDTST4', 'BLANK')
Data columns (total 4 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Value Label          1890 non-null   object
 1   Frequency            1942 non-null   int64 
 2   Percentage           1942 non-null   object
 3   Weighted Percentage  1942 non-null   object
dtypes: int64(1), object(3)
memory usage: 88.9+ KB
None


In [5]:
# Cells holding '.' in the two percentage columns are treated as missing
# (NaN) so that both columns can then be cast to float.
value_catalogs = (
    value_catalogs
    .replace({
        'Percentage': {'.': np.nan},
        'Weighted Percentage': {'.': np.nan},
    })
    .astype({'Percentage': float, 'Weighted Percentage': float})
)
value_catalogs.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 1942 entries, ('_STATE', 1) to ('_AIDTST4', 'BLANK')
Data columns (total 4 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Value Label          1890 non-null   object 
 1   Frequency            1942 non-null   int64  
 2   Percentage           1669 non-null   float64
 3   Weighted Percentage  1669 non-null   float64
dtypes: float64(2), int64(1), object(1)
memory usage: 88.9+ KB


In [6]:
# Distinct values of the first index level, in order of first appearance.
value_catalogs.index.unique(level=0)

Index(['_STATE', 'FMONTH', 'IDATE', 'IMONTH', 'IDAY', 'IYEAR', 'DISPCODE',
       'SEQNO', '_PSU', 'CTELENM1',
       ...
       '_VEGESU1', '_FRTLT1A', '_VEGLT1A', '_FRT16A', '_VEG23A', '_FRUITE1',
       '_VEGETE1', '_FLSHOT7', '_PNEUMO3', '_AIDTST4'],
      dtype='object', name='Code', length=342)

In [7]:
# A variable is kept only when the last row of its value catalog accounts for
# less than this share of the variable's total frequency.
# NOTE(review): presumably the last row is meant to be the BLANK row (as for
# '_AIDTST4'); for variables without one it is simply the final category --
# confirm that this is the intended criterion.
MAX_LAST_ROW_SHARE = 0.30

# One grouped pass over 'Frequency' instead of a .loc lookup per code.
# sort=False keeps the codes in their order of first appearance.
frequency_by_code = value_catalogs['Frequency'].groupby(level=0, sort=False)
last_row_share = frequency_by_code.last() / frequency_by_code.sum()
valid_codes = last_row_share[last_row_share < MAX_LAST_ROW_SHARE].index.tolist()

# Keep only the rows whose 'Code' index value is in 'valid_codes'.
value_catalogs = value_catalogs.loc[valid_codes]
value_catalogs.index.get_level_values(0).unique()

Index(['_STATE', 'FMONTH', 'IMONTH', 'IDAY', 'IYEAR', 'DISPCODE', 'GENHLTH',
       'PHYSHLTH', 'MENTHLTH', 'HLTHPLN1',
       ...
       'VEGEDA2_', '_MISFRT1', '_MISVEG1', '_FRUTSU1', '_VEGESU1', '_FRTLT1A',
       '_VEGLT1A', '_FRUITE1', '_VEGETE1', '_AIDTST4'],
      dtype='object', name='Code', length=131)

In [8]:
# One row per variable code (the dict's outer keys), one column per attribute,
# restricted to the codes listed in valid_codes.
data_attributes = (
    pd.DataFrame(data_attributes_dict)
    .transpose()
    .loc[valid_codes]
)
data_attributes

Unnamed: 0,Label,Section Name,Section Number,Question Number,Column,Type of Variable,Question Prologue,Question,Core Section Number,Module Number
_STATE,State FIPS Code,Record Identification,0,1,1-2,Num,,State FIPS Code,,
FMONTH,File Month,Record Identification,0,8,17-18,Num,,File Month,,
IMONTH,Interview Month,Record Identification,0,10,19-20,Char,,Interview Month,,
IDAY,Interview Day,Record Identification,0,11,21-22,Char,,Interview Day,,
IYEAR,Interview Year,Record Identification,0,12,23-26,Char,,Interview Year,,
...,...,...,...,...,...,...,...,...,...,...
_FRTLT1A,Consume Fruit 1 or more times per day,Calculated Variables,,13,2149,Num,,Consume Fruit 1 or more times per day,,12
_VEGLT1A,Consume Vegetables 1 or more times per day,Calculated Variables,,14,2150,Num,,Consume Vegetables 1 or more times per day,,12
_FRUITE1,Fruit Exclusion from analyses,Calculated Variables,,17,2153,Num,,Fruit Exclusion from analyses,,12
_VEGETE1,Vegetable Exclusion from analyses,Calculated Variables,,18,2154,Num,,Vegetable Exclusion from analyses,,12


Example of how to select the rows of `value_catalogs` that belong to a single value of the `Code` index level.

In [10]:
# Cross-section for one code: the 'Code' level is dropped from the result,
# leaving the rows indexed by 'Value'.
value_catalogs.xs('_STATE', level='Code').head()

Unnamed: 0_level_0,Value Label,Frequency,Percentage,Weighted Percentage
Value,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Alabama,7052,1.69,1.51
2,Alaska,2977,0.71,0.22
4,Arizona,8941,2.14,2.24
5,Arkansas,5359,1.28,0.92
6,California,11613,2.78,12.22
