# Transform CDC Downloaded Excel Files to CSV File 

Perform imports

In [1]:
import pandas as pd
import os
import re
import datetime as dt

In [2]:
# Constant regex pattern used for file name parsing
pattern = re.compile('(Community_Profile_Report_)(\d{4})(\d{2})(\d{2})(.*.xlsx)', re.IGNORECASE)

'''Function which will extract the file date from the file name.  This will be added as a feature to the data set'''
def extract_date(file_name):
    match = pattern.findall(file_name)
    if len(match) > 0:
        file_name_parts = match[0]
        if len(file_name_parts) > 0:
            print(f'{file_name_parts[1]}-{file_name_parts[2]}-{file_name_parts[3]}')
            return dt.date(int(file_name_parts[1]), int(file_name_parts[2]), int(file_name_parts[3]))  
    return None

Take file names from download directory list

In [3]:
file_names = os.listdir('../download')


Iterate through files to extract Counties worksheet data to concat to a pandas dataframe.  Note that we are only getting data from 2021.  This was a decision by the project team.

In [5]:

# Note that we are only getting data from 2021

counter = 0
for file_name in file_names:
    file_date = extract_date(file_name)
    if file_date.year == 2021: 
        counties_excel = pd.read_excel(f'../download/{file_name}', sheet_name='Counties', skiprows = 1)
        counties_excel['File Date'] = file_date
        counties_excel['County only'] = counties_excel['County'].map(lambda x: x.split(',')[0])
        if counter ==0:
            counties_df = pd.DataFrame(data=counties_excel, columns = counties_excel.columns)
            #print(counties_excel.shape)  # for debug
            #print(counties_df.columns.names)
            counter = 1
        else:        
            counties_df = pd.concat([counties_excel,counties_df], ignore_index=True)
            #print(counties_df.shape) # for debug
    
counties_df.shape

2021-03-09
2021-01-30
2021-05-02
2021-02-07
2020-12-26
2021-03-15
2021-01-02
2021-04-22
2021-01-17
2021-02-12
2021-04-19
2021-04-05
2021-03-30
2021-01-25
2021-05-17
2021-02-20
2021-03-05
2021-02-17
2020-12-18
2021-05-20
2021-01-12
2021-02-25
2021-05-12
2021-01-20
2021-03-19
2021-04-15
2021-03-22
2021-04-27
2021-01-07
2021-03-11
2020-12-23
2021-03-10
2021-04-09
2021-01-29
2021-02-02
2020-12-20
2021-02-01
2021-05-18
2021-01-04
2021-01-18
2021-05-04
2021-04-16
2021-01-23
2021-05-11
2021-02-08
2020-12-29
2021-02-26
2021-03-28
2021-01-11
2021-02-14
2021-04-28
2021-03-31
2021-01-08
2021-02-23
2021-04-06
2021-01-26
2021-02-11
2021-03-03
2020-12-30
2021-01-14
2021-03-21
2021-04-21
2021-02-04
2021-03-16
2021-04-13
2021-02-18
2020-12-17
2021-01-06
2021-04-26
2021-02-03
2021-01-28
2021-04-08
2020-12-22
2021-04-14
2021-05-06
2021-03-23
2021-02-24
2021-03-18
2021-04-01
2021-01-21
2021-05-13
2021-02-16
2021-03-14
2021-03-04
2021-01-13
2020-12-19
2021-05-16
2021-01-24
2021-02-21
2021-01-16
2021-04-18

(405679, 104)

In [7]:
counties_df.head()

Unnamed: 0,County,FIPS code,County type,CBSA,CBSA type,State Abbreviation,FEMA region,Population,Population as a percent of CBSA,Population as a percent of state,...,Total RT-PCR diagnostic tests - % change (may be an underestimate due to delayed reporting).1,Testing latency - absolute change.1,% tests resulted in 3 or fewer days - absolute change.1,Viral (RT-PCR) lab test positivity rate - 15-21 days ago (may be an underestimate due to delayed reporting),Total RT-PCR diagnostic tests - 15-21 days ago (may be an underestimate due to delayed reporting),RT-PCR tests per 100k - 15-21 days ago (may be an underestimate due to delayed reporting),Median test latency - 15-21 days ago,% tests resulted in 3 or fewer days - 15-21 days ago,% Native American / Alaskan Native,% Asian
0,"Unallocated, MI",26000,,,,MI,5,,,,...,,,,,,,,,,
1,"Miami-Dade County, FL",12086,Large central metro,"Miami-Fort Lauderdale-Pompano Beach, FL",Metropolitan,FL,4,2716940.0,0.4406,0.1265,...,,,,,,,,,,
2,"Cook County, IL",17031,Large central metro,"Chicago-Naperville-Elgin, IL-IN-WI",Metropolitan,IL,5,5150233.0,0.5445,0.4064,...,,,,,,,,,,
3,"Wayne County, MI",26163,Large central metro,"Detroit-Warren-Dearborn, MI",Metropolitan,MI,5,1749343.0,0.405,0.1752,...,,,,,,,,,,
4,"Kings County, NY",36047,Large central metro,"New York-Newark-Jersey City, NY-NJ-PA",Metropolitan,NY,2,2559903.0,0.1332,0.1316,...,,,,,,,,,,


In [8]:
counties_df['File Date'].value_counts().shape

(124,)

# Save Counties Information to CSV

In [9]:
counties_df.to_csv('../data/cdc_counties.tmp.csv', index=False)