In [77]:
import pandas as pd
import numpy as np

In [78]:
# Load the three input tables (paths are relative to the notebook's working directory).
pandemic = pd.read_csv("final/Pandemic_v1.csv")
prepandemic = pd.read_csv("final/prepandemic_v1.csv")
unemployment = pd.read_csv("final/Unemployment_v1.csv")

# Drop every pandemic row whose 'Area name' equals 'Bedford city'.
# NOTE(review): the comment that used to sit here said "Remove all Puerto Rico data",
# which this filter does not do. The other cells address pandemic by 'stname'/'ctyname'
# and only the education table by 'Area name' — confirm that pandemic really has an
# 'Area name' column (otherwise this line raises KeyError) and which rows were meant
# to be removed.
pandemic = pandemic.drop(pandemic[pandemic['Area name'] == 'Bedford city'].index)

In [79]:
# Each row is identified by its state name and county name.
state, county = pandemic['stname'], pandemic['ctyname']

# Case and death counts are cumulative (per the original author's note), so a single
# column is enough: the one for the last available date, 2020-04-18.
confirmed_cases, deaths = pandemic['confirmed_cases_20200418'], pandemic['deaths_20200418']

In [80]:
# Bucket every county by its 2018 population density (the closest year available).
# The cut points are placed around the mean, in units of "stage" = mean - min.
populationdensity = prepandemic['popdensity_2018']
populationdensity_min = populationdensity.min()
populationdensity_mean = populationdensity.mean()
populationdensity_stage = populationdensity_mean - populationdensity_min


def _populationdensity_label(density):
    """Return the density bucket label for a single county.

    Parameters
    ----------
    density : float
        The county's 'popdensity_2018' value.

    Returns
    -------
    str or float
        '1_Very Low', '2_Low', '3_Medium', '4_High' or '5_Very High';
        NaN when the density itself is missing.
    """
    # Every comparison against NaN is False, so without this guard a missing
    # density would fall through all branches and be labelled as the top bucket.
    if pd.isna(density):
        return np.nan
    if density < populationdensity_mean - populationdensity_stage * 0.9:
        return '1_Very Low'
    if density < populationdensity_mean - populationdensity_stage * 0.3:
        return '2_Low'
    if density < populationdensity_mean + populationdensity_stage * 0.3:
        return '3_Medium'
    if density < populationdensity_mean + populationdensity_stage:
        return '4_High'
    # Numbered like the other four labels so that all five sort in bucket order.
    return '5_Very High'


# The helper carries its own name so that binding the result here does not overwrite
# the function. The Series is left unnamed and keeps prepandemic's index, exactly like
# the result of a row-wise DataFrame.apply.
populationdensity_category = pd.Series(
    [_populationdensity_label(value) for value in populationdensity],
    index=prepandemic.index,
)

In [84]:
# Rank a county's ethnic groups by population and pick the group at a given rank
def ethnic_group_top(hispanic, white, black, indian, asian, hawaii_na, top):
    """Return the label of the ethnic group that holds rank ``top``.

    Parameters
    ----------
    hispanic, white, black, indian, asian, hawaii_na : number
        The value for each group; only their relative order matters.
    top : int
        1-based rank to look up: 1 is the group with the largest value,
        6 the group with the smallest.

    Returns
    -------
    str
        One of 'Hispanic', 'White', 'Black', 'Indian', 'Asian', 'Hawaii and N/A'.
        Groups with equal values keep the order in which they are listed here.

    Raises
    ------
    IndexError
        If ``top`` is not between 1 and 6.
    """
    ethnic_groups = [
        ('Hispanic', hispanic),
        ('White', white),
        ('Black', black),
        ('Indian', indian),
        ('Asian', asian),
        ('Hawaii and N/A', hawaii_na),
    ]
    # Reject 0 and negative ranks explicitly: as list indices they would silently
    # count from the end and return one of the smallest groups.
    if not 1 <= top <= len(ethnic_groups):
        raise IndexError(f"top must be between 1 and {len(ethnic_groups)}, got {top}")

    def descending_missing_last(group):
        # NaN compares False against everything, which makes a plain sort on the
        # negated value unreliable; rank missing values after all real ones instead.
        value = group[1]
        missing = bool(pd.isna(value))
        return (missing, 0 if missing else -value)

    ranked = sorted(ethnic_groups, key=descending_missing_last)
    return ranked[top - 1][0]

# Columns holding each group's 2018 value, in the positional order that
# ethnic_group_top expects (hispanic, white, black, indian, asian, hawaii_na).
ETHNIC_COLUMNS_2018 = [
    'Hispanic2018',
    'nH_White_2018',
    'nH_Black_2018',
    'nH_Indian_Na_2018',
    'nH_Asian_2018',
    'nH_Hawaii_Na_2018',
]


def _ethnic_group_at_rank(top):
    """Return, for every row of prepandemic, the ethnic group at 1-based rank ``top``.

    The result is an unnamed Series indexed like prepandemic.
    """
    return prepandemic[ETHNIC_COLUMNS_2018].apply(
        lambda row: ethnic_group_top(*row, top), axis=1
    )


# One definition of the column list and the row-wise call, used for all three ranks.
ethnic_group_top1 = _ethnic_group_at_rank(1)
ethnic_group_top2 = _ethnic_group_at_rank(2)
ethnic_group_top3 = _ethnic_group_at_rank(3)

In [86]:
# Age data has 18 columns (listed below); we group them into four groups: children (columns 1-4), young adult (columns 5-9), middle adult (columns 10-13) and senior (columns 14-18)
# column1:  age 0-4 prop.
# column2:  age 5-9 prop.
# column3:  age 10-14 prop.
# column4:  age 15-19 prop.
# column5:  age 20-24 prop.
# column6:  age 25-29 prop.
# column7:  age 30-34 prop.
# column8:  age 35-39 prop.
# column9:  age 40-44 prop.
# column10: age 45-49 prop.
# column11: age 50-54 prop.
# column12: age 55-59 prop.
# column13: age 60-64 prop.
# column14: age 65-69 prop.
# column15: age 70-74 prop.
# column16: age 75-79 prop.
# column17: age 80-84 prop.
# column18: age 85+ prop.
def age_group_top(age, top):
    """Return the label of the age group that holds rank ``top``.

    The 18 age-band values are summed into four groups by position:
    'Children' (entries 0-3), 'Young Adult' (entries 4-8),
    'Middle Adult' (entries 9-12) and 'Senior' (entries 13-17).

    Parameters
    ----------
    age : sequence of numbers
        The 18 age-band values in band order; entries past the 18th are ignored.
    top : int
        1-based rank to look up: 1 is the group with the largest sum,
        4 the group with the smallest.

    Returns
    -------
    str
        'Children', 'Young Adult', 'Middle Adult' or 'Senior'. Groups with equal
        sums keep that listing order.

    Raises
    ------
    IndexError
        If ``age`` has fewer than 18 entries or ``top`` is not between 1 and 4.
    """
    band_values = list(age)
    # Slices do not fail on a short input the way direct indexing does, so check
    # the length explicitly.
    if len(band_values) < 18:
        raise IndexError(f"age must hold 18 values, got {len(band_values)}")

    group_bounds = (
        ('Children', 0, 4),
        ('Young Adult', 4, 9),
        ('Middle Adult', 9, 13),
        ('Senior', 13, 18),
    )
    age_group_list = [
        (label, sum(band_values[start:stop])) for label, start, stop in group_bounds
    ]
    # Reject 0 and negative ranks explicitly: as list indices they would silently
    # count from the end and return one of the smallest groups.
    if not 1 <= top <= len(age_group_list):
        raise IndexError(f"top must be between 1 and {len(age_group_list)}, got {top}")

    def descending_missing_last(group):
        # A NaN band makes the whole group sum NaN, and NaN compares False against
        # everything; rank such groups after all groups with a real sum.
        total = group[1]
        missing = bool(pd.isna(total))
        return (missing, 0 if missing else -total)

    ranked = sorted(age_group_list, key=descending_missing_last)
    return ranked[top - 1][0]
# The 18 age-band columns, 'ageg1_2018' through 'ageg18_2018', in band order.
AGE_COLUMNS_2018 = [f'ageg{band}_2018' for band in range(1, 19)]


def _age_group_at_rank(top):
    """Return, for every row of prepandemic, the age group at 1-based rank ``top``.

    The result is an unnamed Series indexed like prepandemic.
    """
    return prepandemic[AGE_COLUMNS_2018].apply(
        lambda row: age_group_top(list(row), top), axis=1
    )


# One definition of the column list and the row-wise call, used for all four ranks.
age_groups_top1 = _age_group_at_rank(1)
age_groups_top2 = _age_group_at_rank(2)
age_groups_top3 = _age_group_at_rank(3)
age_groups_top4 = _age_group_at_rank(4)

In [107]:
# Load the education table and keep only the rows that carry a
# '2013 Rural-urban Continuum Code'.
# NOTE(review): presumably the rows without a code are not individual counties
# (e.g. state or national totals) — confirm against the spreadsheet.
education = pd.read_excel("final/Education_By_County.xls")
education = education.dropna(subset=['2013 Rural-urban Continuum Code'])

In [155]:
# State lookup table: column 0 is matched against education['State'] and
# column 1 against pandemic['stname'].
states_map = pd.read_csv("final/states.csv")

# Count the rows per state once, rather than filtering both tables for every state.
pandemic_rows_by_state = pandemic['stname'].value_counts().to_dict()
education_rows_by_state = education['State'].value_counts().to_dict()

# Report every state whose row count differs between the two tables.
# itertuples(index=False, name=None) yields plain tuples, so the two columns can be
# read by position; doing that on the labelled Series from iterrows() relies on
# positional Series indexing, which pandas has deprecated.
for row in states_map.itertuples(index=False, name=None):
    ec = education_rows_by_state.get(row[0], 0)
    pc = pandemic_rows_by_state.get(row[1], 0)
    if pc != ec:
        print(f"In education data set[{row[0]}]: {ec} rows, in pandemic dataset[{row[1]}]: {pc} rows")

In education data set[VA]: 134 rows, in pandemic dataset[Virginia]: 133 rows


In [156]:
# Narrow both tables to Virginia, the one state whose row counts disagreed.
va_pandemic = pandemic[pandemic['stname'] == 'Virginia']
va_education = education[education['State'] == 'VA']

# Print every Virginia area in the education table whose name matches no
# 'ctyname' in the Virginia pandemic rows.
for area in va_education['Area name']:
    if not (va_pandemic['ctyname'] == area).any():
        print(f"{area} is missing in va_pandemic")

Bedford city is missing in va_pandemic


In [173]:
# Keep every education row except 'Bedford city', the area reported as missing from
# the Virginia pandemic rows, then summarise the remaining numeric columns.
education = education[education['Area name'] != 'Bedford city']
education.describe()

Unnamed: 0,FIPS Code,2013 Rural-urban Continuum Code,2013 Urban Influence Code,"Less than a high school diploma, 2014-18","High school diploma only, 2014-18","Some college or associate's degree, 2014-18","Bachelor's degree or higher, 2014-18","Percent of adults with less than a high school diploma, 2014-18","Percent of adults with a high school diploma only, 2014-18","Percent of adults completing some college or associate's degree, 2014-18","Percent of adults with a bachelor's degree or higher, 2014-18"
count,3220.0,3220.0,3220.0,3220.0,3220.0,3220.0,3220.0,3220.0,3220.0,3220.0,3220.0
mean,31393.60528,4.937888,5.18882,8548.536,18610.24,19842.49,21573.41,13.743913,34.182484,30.506398,21.566149
std,16292.078954,2.724344,3.506848,36787.15,50235.28,60152.69,78830.98,6.678021,7.173804,5.364564,9.36267
min,1001.0,1.0,1.0,4.0,15.0,24.0,0.0,1.2,5.5,5.8,0.0
25%,19032.5,2.0,2.0,1019.0,2864.75,2311.5,1231.75,8.8,29.7,27.0,15.1
50%,30024.0,6.0,5.0,2579.0,6415.0,5270.0,3207.5,12.3,34.4,30.5,19.3
75%,46105.5,7.0,8.0,5881.0,14897.5,14138.75,10177.25,17.7,39.2,34.1,25.5
max,72153.0,9.0,12.0,1460718.0,1416482.0,1790808.0,2177481.0,66.3,55.6,57.3,78.5


In [167]:
# Put the per-county pieces side by side, one column each. The Series derived from
# prepandemic are unnamed, so they receive integer column labels (0, 1, 2, ...).
# NOTE(review): concat aligns the pieces on their row index; state/county come from
# pandemic while the rest come from prepandemic, which presumes both files list the
# counties in the same row order — confirm.
result = pd.concat(
    objs=[
        state,
        county,
        populationdensity_category,
        ethnic_group_top1,
        ethnic_group_top2,
        ethnic_group_top3,
        age_groups_top1,
        age_groups_top2,
        age_groups_top3,
        age_groups_top4,
    ],
    axis='columns',
)

In [169]:
# Display the combined per-county table as this cell's output.
result

Unnamed: 0,stname,ctyname,0,1,2,3,4,5,6,7
0,Alabama,Autauga County,2_Low,White,Black,Hispanic,Young Adult,Middle Adult,Children,Senior
1,Alabama,Baldwin County,2_Low,White,Black,Hispanic,Young Adult,Middle Adult,Children,Senior
2,Alabama,Barbour County,1_Very Low,Black,White,Hispanic,Young Adult,Middle Adult,Children,Senior
3,Alabama,Bibb County,1_Very Low,White,Black,Hispanic,Young Adult,Middle Adult,Children,Senior
4,Alabama,Blount County,2_Low,White,Hispanic,Black,Young Adult,Middle Adult,Children,Senior
5,Alabama,Bullock County,1_Very Low,Black,White,Hispanic,Young Adult,Middle Adult,Children,Senior
6,Alabama,Butler County,1_Very Low,White,Black,Hispanic,Young Adult,Middle Adult,Children,Senior
7,Alabama,Calhoun County,2_Low,White,Black,Hispanic,Young Adult,Middle Adult,Children,Senior
8,Alabama,Chambers County,2_Low,White,Black,Hispanic,Young Adult,Middle Adult,Children,Senior
9,Alabama,Cherokee County,2_Low,White,Black,Hispanic,Middle Adult,Young Adult,Senior,Children
