In [1]:
from sqlalchemy import create_engine
from sqlalchemy.types import INT as sqlINT, VARCHAR as sqlVARCHAR
from zipfile import ZipFile
from io import StringIO
import pandas as pd
import requests
import os

Read the database connection details from the repository's `secrets` and create the `engine` that will be used to connect to the database.

In [2]:
host = os.getenv("DATABASE_HOST")
user = os.getenv("DATABASE_USERNAME")
passwd = os.getenv("DATABASE_PASSWORD")
db = os.getenv("DATABASE")

engine = create_engine(
    f'mysql+mysqlconnector://{user}:{passwd}@{host}/{db}',
    echo=False,
    connect_args={'ssl_ca': '/etc/ssl/certs/ca-certificates.crt'}
    )

Download the zip file that contains the data from the CDC's website to the `/data/` directory.

In [3]:
if not os.path.exists('../data/LLCP2019ASC.zip'):
    url = 'https://www.cdc.gov/brfss/annual_data/2019/files/LLCP2019ASC.zip'
    response = requests.get(url, stream=True)
    with open('../data/LLCP2019ASC.zip', 'wb') as f:
        for chunk in response.iter_content(chunk_size=512):
            if chunk: 
                f.write(chunk)
    # Delete the `response` variable to avoid memory issues.
    del response

Read the zip file and extract each line of the file into a list of strings to later convert it to a dataframe.

In [4]:
data_rows = []
asc_zip = ZipFile('../data/LLCP2019ASC.zip', 'r')
file_in_zip = asc_zip.namelist()[0]
for line in asc_zip.open(file_in_zip).readlines():
    data_rows.append(line.decode('utf-8'))
asc_zip.close()

Delete the `asc_zip` and `file_in_zip` variables to avoid memory issues.

In [5]:
del asc_zip
del file_in_zip

Convert the list of strings into a dataframe and show the head and tail of the dataframe to understand its structure.

In [6]:
data = pd.DataFrame(data_rows,columns=['raw'])
data

Unnamed: 0,raw
0,01 0101182019 11002019000001 ...
1,01 0101132019 11002019000002 ...
2,01 0101182019 11002019000003 ...
3,01 0101182019 12002019000004 ...
4,01 0101042019 11002019000005 ...
...,...
418263,72 0903152020 11002019006029 ...
418264,72 0903082020 11002019006030 ...
418265,72 0903102020 11002019006031 ...
418266,72 0903062020 11002019006032 ...


Once we've seen how the data would be structured in a dataframe, delete the `data` variable to avoid memory issues.

In [7]:
del(data)

#### Retreive the metadata from the MySQL database

Query the MySQL database to retrieve the layout table, which describes the structure of the columns. This is required to perform an action similar to the MS Excel action "Text to Columns" to split the raw data (strings) in the dataframe into multiple columns.

_Note:_ An `if` condition is used to check if the query has already been executed. This is done to avoid exhausting the amount of free queries per month provided by the MySQL database provider. 

In [8]:
if 'layout' not in locals():
    layout = pd.io.sql.read_sql('SELECT * FROM layout', engine,index_col='Code')
layout

Unnamed: 0_level_0,Start,Length
Code,Unnamed: 1_level_1,Unnamed: 2_level_1
_STATE,1,2
FMONTH,17,2
IDATE,19,8
IMONTH,19,2
IDAY,21,2
...,...,...
_FRUITE1,2153,1
_VEGETE1,2154,1
_FLSHOT7,2155,1
_PNEUMO3,2156,1


Query the MySQL database to retrieve the attributes table which, among other details, contains the section name of each 'Code'. 

_Note:_ An `if` condition is used to check if the query has already been executed. This is done to avoid exhausting the amount of free queries per month provided by the MySQL database provider. 

In [9]:
if 'attributes' not in locals():
    attributes = pd.io.sql.read_sql('SELECT * FROM attributes', engine, index_col='Code')

attributes.sample(3)

Unnamed: 0_level_0,Label,Section_Name,Section_Number,Question_Number,Column,Type_of_Variable,Question_Prologue,Question,Core_Section_Number,Module_Number
Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
LANDLINE,Do you also have a landline telephone?,Cell Phone Introduction,0.0,10,88,Num,Variable only on the cell phone survey,Do you also have a landline telephone in your ...,,
CSRVDOC1,What Type of Doctor Provides Majority of Your ...,Cancer Survivorship,,5,347-348,Num,,What type of doctor provides the majority of y...,,13.0
CRGVLNG1,How Long Provided Care For Person.,Caregiver,,3,386,Num,,For how long have you provided care for that p...,,21.0


Build the list of codes which their column 'Section_name' is equal to 'Record Identification'.

In [10]:
id_list = attributes[attributes['Section_Name']=='Record Identification'].index.get_level_values(0).tolist()
pd.DataFrame(
    ([code, attributes.loc[code,'Type_of_Variable'],layout.loc[code,'Length']] for code in id_list), 
    columns=['Code','Type','Length']
    ).set_index('Code')

Unnamed: 0_level_0,Type,Length
Code,Unnamed: 1_level_1,Unnamed: 2_level_1
_STATE,Num,2
FMONTH,Num,2
IDATE,Char,8
IMONTH,Char,2
IDAY,Char,2
IYEAR,Char,4
DISPCODE,Num,4
SEQNO,Char,10
_PSU,Num,10


#### Build the dataframe from the list of rows

Split the raw data into multiple columns according to the `variables_layout` dataframe.

This works but doesn't read all the rows

In [12]:
colspecs = [(layout.loc[code,'Start']-1, layout.loc[code,'Start']-1+layout.loc[code,'Length']) for code in layout.index.get_level_values(0)]
nrows = 10000
n = 0
chunks = []

while True:
    i = nrows * n
    j = nrows * (n + 1)
    rows = data_rows[i:j]
    n += 1
    if not rows:
        break
    
    file = StringIO('\n'.join(rows))
    file.seek(0)

    chunk = pd.read_fwf(file, names=layout.index.get_level_values(0).tolist(), colspecs=colspecs, header=None, nrows=nrows)
    chunks.append(chunk)

data = pd.concat(chunks, ignore_index=True)
data


Unnamed: 0,_STATE,FMONTH,IDATE,IMONTH,IDAY,IYEAR,DISPCODE,SEQNO,_PSU,CTELENM1,...,_VEGESU1,_FRTLT1A,_VEGLT1A,_FRT16A,_VEG23A,_FRUITE1,_VEGETE1,_FLSHOT7,_PNEUMO3,_AIDTST4
0,1,1,1182019,1,18,2019,1100,2019000001,2019000001,1.0,...,114.0,1,1,1,1,0,0,2.0,1.0,2.0
1,1,1,1132019,1,13,2019,1100,2019000002,2019000002,1.0,...,121.0,1,1,1,1,0,0,1.0,1.0,2.0
2,1,1,1182019,1,18,2019,1100,2019000003,2019000003,1.0,...,164.0,1,1,1,1,0,0,1.0,2.0,2.0
3,1,1,1182019,1,18,2019,1200,2019000004,2019000004,1.0,...,,9,9,1,1,1,1,9.0,9.0,
4,1,1,1042019,1,4,2019,1100,2019000005,2019000005,1.0,...,178.0,1,1,1,1,0,0,2.0,1.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210037,72,4,9282019,9,28,2019,1100,2019002762,2019002762,,...,30.0,2,2,1,1,0,0,,,2.0
210038,72,4,9222019,9,22,2019,1100,2019002763,2019002763,,...,57.0,2,2,1,1,0,0,,,1.0
210039,72,4,9092019,9,9,2019,1100,2019002764,2019002764,,...,92.0,1,2,1,1,0,0,,,1.0
210040,72,4,9142019,9,14,2019,1100,2019002765,2019002765,,...,164.0,1,1,1,1,0,0,2.0,1.0,2.0


This works but doesn't read all the rows either

In [None]:
asc_zip = ZipFile('../data/LLCP2019ASC.zip', 'r')
file_in_zip = asc_zip.namelist()[0]
colspecs = [(layout.loc[code,'Start']-1, layout.loc[code,'Start']-1+layout.loc[code,'Length']) for code in layout.index.get_level_values(0)]
nrows = 10000
data = pd.DataFrame()

with asc_zip.open(file_in_zip) as f:
    chunks = []
    while True:
        chunk = pd.read_fwf(f, names=layout.index.get_level_values(0).tolist(), colspecs=colspecs, header=None, nrows=nrows)
        chunks.append(chunk)
        if chunk.empty:
            break
    data = pd.concat(chunks, ignore_index=True)

asc_zip.close()

In [None]:
data

#### Upload the processed data to the database

Use the list of codes thacan be passed as an index dataframe resulting from splitting the raw data into different named columns into the MySQL database.