In [1]:
import copy
from io import StringIO
from unicodedata import normalize
from urllib.request import urlopen

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

In [2]:
# Location of the codebook page that is scraped below
url = 'https://www.cdc.gov/brfss/annual_data/2019/pdf/codebook19_llcp-v2-508.HTML'
# Parse the page inside a `with` block so the HTTP response is closed
#  as soon as BeautifulSoup has consumed it
with urlopen(url) as html:
    soup = BeautifulSoup(html, 'lxml')
# Collect every <table> element that has the CSS class `table`
#  (`find_all` is the current name of the deprecated `findAll`)
html_tables = soup.find_all('table', {'class': 'table'})

Transform all the tables found in the codebook url to a pandas dataframe.

In [3]:
# Attributes of every SAS variable, keyed by the SAS variable name
data_attributes_dict = {}
# One dataframe of values per SAS variable; concatenated once after the loop
#  instead of growing `value_catalogs` on every iteration
catalog_frames = []

# The first table is skipped (presumably it does not describe a variable -- verify)
for table in html_tables[1:]:
    # Find the first cell in the table
    attributes_html = table.find('td')
    if attributes_html is None:
        # A table without any cell has nothing to parse
        continue
    # Extract the text from the cell, one text fragment per line
    attributes_text = normalize('NFKD', attributes_html.get_text(separator='\n'))

    # Split the text into `key: value` lines and store them in a dictionary
    #  holding the attributes of the SAS variable
    attributes = {}
    for line in attributes_text.splitlines():
        # Split on the first colon only, so values that contain a colon are
        #  kept whole; lines without a colon are not attributes and are skipped
        key, separator, value = line.partition(':')
        if not separator:
            continue
        attributes[key.strip()] = value.strip()

    # Skip to the next table if the cell doesn't contain a 'SAS Variable Name'
    if 'SAS Variable Name' not in attributes:
        continue

    # Extract the SAS Variable Name and remove it from the dictionary,
    #  it is used as the key of `data_attributes_dict` instead
    code = attributes.pop('SAS Variable Name')
    data_attributes_dict[code] = attributes

    # Work on a detached copy of the table so that `soup` is left untouched
    #  and this cell gives the same result every time it is re-run
    value_table = copy.copy(table)
    # Remove the first row (the attributes row) from the copy
    first_row = value_table.find('tr')
    if first_row is not None:
        first_row.decompose()
    # Remove the colgroup from the copy, when there is one
    colgroup = value_table.find('colgroup')
    if colgroup is not None:
        colgroup.decompose()

    # Read what is left of the table into a dataframe
    df = pd.read_html(StringIO(str(value_table)))[0]
    # Tag every row with the `code` of the current table
    df['Code'] = code
    # Index the rows by the `Code` and `Value` columns
    catalog_frames.append(df.set_index(['Code', 'Value']))

# Build the final dataframe in a single concatenation
value_catalogs = pd.concat(catalog_frames) if catalog_frames else pd.DataFrame()

Display the info of the dataframe that was built from all the tables in the codebook url.

In [4]:
# `info()` prints its own report and returns None, so it is called directly;
#  wrapping it in `print()` only appends a stray `None` to the output
value_catalogs.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 1942 entries, ('_STATE', 1) to ('_AIDTST4', 'BLANK')
Data columns (total 4 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Value Label          1890 non-null   object
 1   Frequency            1942 non-null   int64 
 2   Percentage           1942 non-null   object
 3   Weighted Percentage  1942 non-null   object
dtypes: int64(1), object(3)
memory usage: 88.9+ KB
None


Replace the '.' placeholders in the `Percentage` and `Weighted Percentage` columns of the dataframe with NaN, then convert both columns to float.

In [5]:
# Columns that hold percentages as text and use '.' where there is no value
percentage_columns = ['Percentage', 'Weighted Percentage']
# Replace the '.' placeholders with NaN and cast both columns to float in a
#  single chain that returns a new dataframe; any other non-numeric text
#  still makes `astype` fail instead of being hidden
value_catalogs = (
    value_catalogs
    .replace({column: {'.': np.nan} for column in percentage_columns})
    .astype({column: float for column in percentage_columns})
)
value_catalogs.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 1942 entries, ('_STATE', 1) to ('_AIDTST4', 'BLANK')
Data columns (total 4 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Value Label          1890 non-null   object 
 1   Frequency            1942 non-null   int64  
 2   Percentage           1669 non-null   float64
 3   Weighted Percentage  1669 non-null   float64
dtypes: float64(2), int64(1), object(1)
memory usage: 88.9+ KB


List the unique values in the `Code` level of the `value_catalogs` index; the `length` shown at the end of the output is the number of unique codes.

In [6]:
# Unique labels of the `Code` index level, in order of first appearance
value_catalogs.index.get_level_values('Code').unique()

Index(['_STATE', 'FMONTH', 'IDATE', 'IMONTH', 'IDAY', 'IYEAR', 'DISPCODE',
       'SEQNO', '_PSU', 'CTELENM1',
       ...
       '_VEGESU1', '_FRTLT1A', '_VEGLT1A', '_FRT16A', '_VEG23A', '_FRUITE1',
       '_VEGETE1', '_FLSHOT7', '_PNEUMO3', '_AIDTST4'],
      dtype='object', name='Code', length=342)

Find the codes in the `value_catalogs` dataframe for which the last item in the catalog of responses contributes less than 30% of the total responses.

This is done because the last item in the catalog of responses is usually reserved for answers like 'Unknown', 'Not asked', or 'Missing'. If the last item accounts for a larger share of the responses than that, it is likely that the last item is a 'catch-all' for these types of responses.

In [8]:
# Group the frequencies by code, keeping the codes in order of first appearance
frequency_by_code = value_catalogs['Frequency'].groupby(level=0, sort=False)
# Share of each code's responses that falls on the last item of its catalog
last_item_share = frequency_by_code.last() / frequency_by_code.sum()
# Keep the codes whose last item holds less than 30% of the responses
promising_codes = last_item_share[last_item_share < 0.30].index.tolist()

print(len(promising_codes))

131


#### Examples:

Example of how to filter the dataframe `value_catalogs` to find the meaning of the responses to the question that has the code `_STATE`.

In [16]:
# Cross-section of the `_STATE` code, reduced to its label column (first rows only)
value_catalogs.xs('_STATE', level='Code')[['Value Label']].head()

Unnamed: 0_level_0,Value Label
Value,Unnamed: 1_level_1
1,Alabama
2,Alaska
4,Arizona
5,Arkansas
6,California


Example of how to filter the dataframe `value_catalogs` to find the meaning of the response `'3'` to the question that has the code `GENHLTH`.

In [11]:
# Look up one response of the `GENHLTH` code; the key is the string '3'
#  (presumably because this catalog also holds non-numeric values -- verify)
value_catalogs.xs('GENHLTH', level='Code').loc['3', 'Value Label']

'Good'

Example of how to filter the dataframe `value_catalogs` to find the meaning of all the possible responses to the question that has the code `HLTHPLN1`.

In [15]:
# Every response of the `HLTHPLN1` code together with its label
value_catalogs.xs('HLTHPLN1', level='Code')[['Value Label']]

Unnamed: 0_level_0,Value Label
Value,Unnamed: 1_level_1
1,Yes
2,No
7,Don’t know/Not Sure
9,Refused
BLANK,Not asked or Missing
