In [1]:
import os
from io import StringIO
from urllib.parse import quote_plus
from urllib.request import urlopen

import pandas as pd
from bs4 import BeautifulSoup
from sqlalchemy import create_engine
from sqlalchemy.types import INT as sqlINT, VARCHAR as sqlVARCHAR

Read the database connection details from environment variables (populated from the repository's `secrets`) and create the `engine` that will be used to connect to the database.

In [2]:
# Connection settings are read from environment variables so that no
# credentials are stored in the notebook itself.
host = os.getenv("DATABASE_HOST")
user = os.getenv("DATABASE_USERNAME")
passwd = os.getenv("DATABASE_PASSWORD")
db = os.getenv("DATABASE")

# Fail early with a clear message instead of building a URL that contains
# the literal text "None" for any setting that is not defined.
missing_settings = [
    name
    for name, value in {
        "DATABASE_HOST": host,
        "DATABASE_USERNAME": user,
        "DATABASE_PASSWORD": passwd,
        "DATABASE": db,
    }.items()
    if not value
]
if missing_settings:
    raise RuntimeError(
        "Missing database environment variables: " + ", ".join(missing_settings)
    )

# Percent-encode the credentials: characters such as '@', ':' or '/' in a
# raw username/password would otherwise be parsed as URL delimiters.
# NOTE(review): assumes the environment variables hold the raw (not already
# URL-encoded) credentials - confirm against how the secrets are stored.
engine = create_engine(
    f'mysql+mysqlconnector://{quote_plus(user)}:{quote_plus(passwd)}@{host}/{db}',
    echo=False,
    connect_args={'ssl_ca': '/etc/ssl/certs/ca-certificates.crt'},
)

Fetch the variable-layout page from the CDC website and load the table that describes the layout of the variables in the ASC file into a dataframe.

In [3]:
# Download the BRFSS 2019 variable-layout page and parse its table into a
# dataframe.
url = 'https://www.cdc.gov/brfss/annual_data/2019/llcp_varlayout_19_onecolumn.html'

# The context manager closes the HTTP response once BeautifulSoup has read it.
with urlopen(url) as html:
    soup = BeautifulSoup(html, 'lxml')

html_table = soup.find('table', {'class': 'table'})
if html_table is None:
    # `find` returns None when nothing matches; without this check the text
    # "None" would be handed to `read_html`, hiding the real cause.
    raise ValueError(f"No <table class='table'> element found at {url}")

# `read_html` returns a list of dataframes; the single table passed in is the
# first (and only) element.
layout = pd.read_html(StringIO(str(html_table)))[0]
layout

Unnamed: 0,Starting Column,Variable Name,Field Length
0,1,_STATE,2
1,17,FMONTH,2
2,19,IDATE,8
3,19,IMONTH,2
4,21,IDAY,2
...,...,...,...
337,2153,_FRUITE1,1
338,2154,_VEGETE1,1
339,2155,_FLSHOT7,1
340,2156,_PNEUMO3,1


Rename the columns of the dataframe to a single word each, then set the column `Code` as the index of the dataframe.

In [4]:
# Rename the three scraped columns by position and index the frame by the
# variable code. The guard makes the cell safe to re-run: once `Code` is the
# index only two columns remain, so assigning three names again would raise.
if layout.index.name != 'Code':
    layout = layout.set_axis(['Start', 'Code', 'Length'], axis=1).set_index('Code')

layout.info()

<class 'pandas.core.frame.DataFrame'>
Index: 342 entries, _STATE to _AIDTST4
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   Start   342 non-null    int64
 1   Length  342 non-null    int64
dtypes: int64(2)
memory usage: 8.0+ KB


Show the head and tail of the dataframe

In [5]:
# A bare last expression is rendered by Jupyter; the middle rows of the frame
# are elided in the displayed output, leaving the first and last rows visible.
layout

Unnamed: 0_level_0,Start,Length
Code,Unnamed: 1_level_1,Unnamed: 2_level_1
_STATE,1,2
FMONTH,17,2
IDATE,19,8
IMONTH,19,2
IDAY,21,2
...,...,...
_FRUITE1,2153,1
_VEGETE1,2154,1
_FLSHOT7,2155,1
_PNEUMO3,2156,1


Get the maximum length of all the strings in the index called `Code` of the `layout` dataframe to get an idea of the most appropriate length for the data types used in the `to_sql` method.

In [6]:
# Longest variable code in the index; used to size the VARCHAR column.
longest_code = layout.index.str.len().max()
print('Max column length: ', longest_code)

Max column length:  8


Upload the dataframe to a MySQL database using the `to_sql` method and the `sqlalchemy` library, then read the table back from the database and show the head and tail of the dataframe to check that the upload was successful.

In [7]:
# SQL column types for the uploaded table. The index (`Code`) is written as a
# column because `index=True`; VARCHAR(10) leaves headroom over the longest
# code measured above.
column_types = {
    'Code': sqlVARCHAR(10),
    'Start': sqlINT,
    'Length': sqlINT
}

# Use the public DataFrame.to_sql / pd.read_sql API rather than the internal
# `pd.io.sql` namespace. The obsolete `flavor='mysql'` argument is dropped:
# it is no longer a documented parameter, and the dialect is already fixed by
# the engine's connection URL.
layout.to_sql(
    name='layout',
    con=engine,
    if_exists='replace',  # overwrite the table on every run
    index=True,
    dtype=column_types,
)

# Read the table back to confirm the upload round-trips.
pd.read_sql('SELECT * FROM layout', engine, index_col='Code')

Unnamed: 0_level_0,Start,Length
Code,Unnamed: 1_level_1,Unnamed: 2_level_1
_STATE,1,2
FMONTH,17,2
IDATE,19,8
IMONTH,19,2
IDAY,21,2
...,...,...
_FRUITE1,2153,1
_VEGETE1,2154,1
_FLSHOT7,2155,1
_PNEUMO3,2156,1
