In [1]:
from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen
import pandas as pd
from bs4 import BeautifulSoup
from io import StringIO

In [2]:
# Download the BRFSS 2019 ASCII archive and read its first member into
# `data_rows`: one UTF-8 decoded string per line, line endings included.
with urlopen('https://www.cdc.gov/brfss/annual_data/2019/files/LLCP2019ASC.zip') as resp:
    # The whole payload is buffered in memory, so the HTTP response can be
    # closed as soon as it has been read.
    asc_zip = ZipFile(BytesIO(resp.read()))
filename = asc_zip.namelist()[0]
# Iterate the member directly (instead of readlines()) so the undecoded
# bytes lines are not all held at once, and close the handle when done.
with asc_zip.open(filename) as member:
    data_rows = [line.decode('utf-8') for line in member]

In [3]:
# Put the raw fixed-width lines in a one-column frame for a quick preview.
data = pd.DataFrame({'raw': data_rows})
print(data)

                                                      raw
0       01              0101182019     11002019000001 ...
1       01              0101132019     11002019000002 ...
2       01              0101182019     11002019000003 ...
3       01              0101182019     12002019000004 ...
4       01              0101042019     11002019000005 ...
...                                                   ...
418263  72              0903152020     11002019006029 ...
418264  72              0903082020     11002019006030 ...
418265  72              0903102020     11002019006031 ...
418266  72              0903062020     11002019006032 ...
418267  72              0903052020     11002019006033 ...

[418268 rows x 1 columns]


In [4]:
# Drop the preview frame; the raw lines remain available in `data_rows`.
del data

In [5]:
# Access the CDC website and store in a data frame the table that describes the layout of the variables in the ASC file
url = 'https://www.cdc.gov/brfss/annual_data/2019/llcp_varlayout_19_onecolumn.html'
# Parse inside the `with` block so the HTTP response is closed once the
# document has been consumed.
with urlopen(url) as html:
    soup = BeautifulSoup(html, 'lxml')
html_table = soup.find('table',{'class': 'table'})
if html_table is None:
    # Fail with a clear message rather than handing the string "None" to read_html.
    raise ValueError(f'No <table class="table"> element found at {url}')
# set_index returns a new frame indexed by variable name; no in-place mutation needed.
layout = pd.read_html(StringIO(str(html_table)))[0].set_index('Variable Name')

In [6]:
# Show the layout table, a dashed separator line, then the frame summary.
print(layout, '-'*45, sep='\n')
# info() writes its report itself and returns None, so this call also prints "None".
print(layout.info())

               Starting Column  Field Length
Variable Name                               
_STATE                       1             2
FMONTH                      17             2
IDATE                       19             8
IMONTH                      19             2
IDAY                        21             2
...                        ...           ...
_FRUITE1                  2153             1
_VEGETE1                  2154             1
_FLSHOT7                  2155             1
_PNEUMO3                  2156             1
_AIDTST4                  2157             1

[342 rows x 2 columns]
---------------------------------------------
<class 'pandas.core.frame.DataFrame'>
Index: 342 entries, _STATE to _AIDTST4
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   Starting Column  342 non-null    int64
 1   Field Length     342 non-null    int64
dtypes: int64(2)
memory usage: 8.0+ KB
None


In [7]:
# Cut every raw line into its fixed-width fields using the layout table.
# `Starting Column` is 1-based, so field k occupies the half-open slice
# [start - 1, start - 1 + length) of each line.
data_dict = {}
# Walk the layout column-wise instead of with iterrows(): this avoids
# per-row Series construction and dtype upcasting, and the inner loop no
# longer reuses (shadows) the outer `row` variable.
for var_name, start_col, field_len in zip(
    layout.index, layout['Starting Column'], layout['Field Length']
):
    begin = int(start_col) - 1
    end = begin + int(field_len)
    # Values are kept as the raw substrings (no stripping or type conversion).
    data_dict[var_name] = [raw_line[begin:end] for raw_line in data_rows]

data = pd.DataFrame(data_dict)
data

Unnamed: 0,_STATE,FMONTH,IDATE,IMONTH,IDAY,IYEAR,DISPCODE,SEQNO,_PSU,CTELENM1,...,_VEGESU1,_FRTLT1A,_VEGLT1A,_FRT16A,_VEG23A,_FRUITE1,_VEGETE1,_FLSHOT7,_PNEUMO3,_AIDTST4
0,01,01,01182019,01,18,2019,1100,2019000001,2019000001,1,...,000114,1,1,1,1,0,0,2,1,2
1,01,01,01132019,01,13,2019,1100,2019000002,2019000002,1,...,000121,1,1,1,1,0,0,1,1,2
2,01,01,01182019,01,18,2019,1100,2019000003,2019000003,1,...,000164,1,1,1,1,0,0,1,2,2
3,01,01,01182019,01,18,2019,1200,2019000004,2019000004,1,...,,9,9,1,1,1,1,9,9,
4,01,01,01042019,01,04,2019,1100,2019000005,2019000005,1,...,000178,1,1,1,1,0,0,2,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
418263,72,09,03152020,03,15,2020,1100,2019006029,2019006029,,...,000043,1,2,1,1,0,0,2,2,2
418264,72,09,03082020,03,08,2020,1100,2019006030,2019006030,,...,000142,1,1,1,1,0,0,,,2
418265,72,09,03102020,03,10,2020,1100,2019006031,2019006031,,...,000055,1,2,1,1,0,0,,,1
418266,72,09,03062020,03,06,2020,1100,2019006032,2019006032,,...,000214,1,1,1,1,0,0,2,2,2
