## Gathering state-level features

The purpose of this section is to gather state-level features that may affect the degree to which a given state is susceptible or resistant to a virus such as the flu or Covid-19. Collecting these state-level characteristics can help us identify which features are responsible for the correlation in viral infection rates between states, and thus can also be used to quantify the correlation between states based on fundamental attributes of the states rather than just the raw wILI time series. 

In [466]:
import json
import numpy as np
import pandas as pd
# show every column when a DataFrame is displayed instead of truncating wide tables
pd.set_option('display.max_columns', None)

The density of a state is a natural feature to include because the denser a location, the more easily a virus can spread (look no further than NYC right now). However, it wouldn't make sense to report the overall density of a state (total population divided by total area) because, for example, the high population density in Manhattan shouldn't be diluted by the fact that upstate New York has a massive amount of sparsely populated land. Instead, a more sensible measure is a weighted average of the densities of each county in a given state, where the weights are the fraction of the state population that lives in the given county. 

In [467]:
# county-level inputs for every U.S. county: area in square miles
# ('land_area.csv') and population ('population.csv')
land_df, popn_df = (pd.read_csv(csv_path)
                    for csv_path in ('land_area.csv', 'population.csv'))

In [468]:
land_df.head()

Unnamed: 0,Areaname,STCOU,LND010190F,LND010190D,LND010190N1,LND010190N2,LND010200F,LND010200D,LND010200N1,LND010200N2,LND110180F,LND110180D,LND110180N1,LND110180N2,LND110190F,LND110190D,LND110190N1,LND110190N2,LND110200F,LND110200D,LND110200N1,LND110200N2,LND110210F,LND110210D,LND110210N1,LND110210N2,LND210190F,LND210190D,LND210190N1,LND210190N2,LND210200F,LND210200D,LND210200N1,LND210200N2
0,UNITED STATES,0,0,3787425.08,0,0,0,3794083.06,0,0,0,3539289.16,0,0,0,3536341.73,0,0,0,3537438.44,0,0,0,3531905.43,0,0,0,251083.35,0,0,0,256644.62,0,0
1,ALABAMA,1000,0,52422.94,0,0,0,52419.02,0,0,0,50767.18,0,0,0,50750.23,0,0,0,50744.0,0,0,0,50645.33,0,0,0,1672.71,0,0,0,1675.01,0,0
2,"Autauga, AL",1001,0,604.49,0,0,0,604.45,0,0,0,597.04,0,0,0,596.01,0,0,0,595.97,0,0,0,594.44,0,0,0,8.48,0,0,0,8.48,0,0
3,"Baldwin, AL",1003,0,2027.08,0,0,0,2026.93,0,0,0,1589.42,0,0,0,1596.53,0,0,0,1596.35,0,0,0,1589.78,0,0,0,430.55,0,0,0,430.58,0,0
4,"Barbour, AL",1005,0,904.59,0,0,0,904.52,0,0,0,883.89,0,0,0,885.0,0,0,0,884.9,0,0,0,884.88,0,0,0,19.59,0,0,0,19.61,0,0


In [469]:
popn_df.head()

Unnamed: 0,Areaname,STCOU,PST045200F,PST045200D,PST045200N1,PST045200N2,PST045201F,PST045201D,PST045201N1,PST045201N2,PST045202F,PST045202D,PST045202N1,PST045202N2,PST045203F,PST045203D,PST045203N1,PST045203N2,PST045204F,PST045204D,PST045204N1,PST045204N2,PST045205F,PST045205D,PST045205N1,PST045205N2,PST045206F,PST045206D,PST045206N1,PST045206N2,PST045207F,PST045207D,PST045207N1,PST045207N2,PST045208F,PST045208D,PST045208N1,PST045208N2,PST045209F,PST045209D,PST045209N1,PST045209N2
0,UNITED STATES,0,0,282171957,0,0,0,285081556,0,0,0,287803914,0,0,0,290326418,0,0,0,293045739,0,0,0,295753151,0,0,0,298593212,0,0,0,301579895,0,0,0,304374846,0,0,0,307006550,0,0
1,ALABAMA,1000,0,4451849,0,0,0,4464034,0,0,0,4472420,0,0,0,4490591,0,0,0,4512190,0,0,0,4545049,0,0,0,4597688,0,0,0,4637904,0,0,0,4677464,0,0,0,4708708,0,0
2,"Autauga, AL",1001,0,43872,0,0,0,44434,0,0,0,45157,0,0,0,45762,0,0,0,46933,0,0,0,47870,0,0,0,49105,0,0,0,49834,0,0,0,50354,0,0,0,50756,0,0
3,"Baldwin, AL",1003,0,141358,0,0,0,144988,0,0,0,148141,0,0,0,151707,0,0,0,156573,0,0,0,162564,0,0,0,168516,0,0,0,172815,0,0,0,176212,0,0,0,179878,0,0
4,"Barbour, AL",1005,0,29035,0,0,0,29223,0,0,0,29289,0,0,0,29480,0,0,0,29458,0,0,0,29452,0,0,0,29556,0,0,0,29736,0,0,0,29836,0,0,0,29737,0,0


In [470]:
# keep only the county name and one measurement column from each dataset
# NOTE(review): in the preview above LND010190D equals LND110190D + LND210190D
# (for the U.S. row, 3536341.73 + 251083.35 = 3787425.08), so the LND010* columns
# look like total area (land + water) rather than land-only area -- confirm that
# this is the intended denominator for the density calculation
land_df = land_df[['Areaname', 'LND010190D']]
popn_df = popn_df[['Areaname', 'PST045200D']]

In [471]:
# limit analysis to the Lower 48 (contiguous) states
lower_48 = ("AL AZ AR CA CO CT DE FL GA "
            "ID IL IN IA KS KY LA ME MD "
            "MA MI MN MS MO MT NE NV NH NJ "
            "NM NY NC ND OH OK OR PA RI SC "
            "SD TN TX UT VT VA WA WV WI WY").split()

# ', XX' suffixes used to recognise the county rows of these states by Areaname
state_end = tuple(', {}'.format(abbrev) for abbrev in lower_48)

In [472]:
# keep only county rows of the Lower 48: this ignores AK and HI, and also drops
# rows such as 'UNITED STATES' and 'ALABAMA' whose names carry no ', XX' suffix
# each dataset is filtered with a mask built from its own Areaname column, so the
# result does not depend on the two files having identical row order
filtered_land_df = land_df[land_df.Areaname.str.endswith(state_end)]
filtered_popn_df = popn_df[popn_df.Areaname.str.endswith(state_end)]

In [473]:
filtered_popn_df.shape

(3111, 2)

In [475]:
# There are 5 counties in Virginia that are included twice in both the land area
# and population datasets. Keep the first occurrence of each and drop the repeats,
# so that the later merge on Areaname does not multiply those rows.
def _repeated_virginia_rows(df):
    """Return a boolean mask of rows whose ', VA' Areaname already appeared earlier in df."""
    names = df.Areaname
    return names.str.endswith(', VA') & names.duplicated(keep='first')


virginia_counties_df = filtered_land_df[filtered_land_df.Areaname.str.endswith(', VA')]
indices_to_delete = list(filtered_land_df.index[_repeated_virginia_rows(filtered_land_df)])

# each frame is de-duplicated from its own Areaname column rather than by reusing
# the land-area row labels, so the two frames need not share identical indices
filtered_land_df = filtered_land_df[~_repeated_virginia_rows(filtered_land_df)]
filtered_popn_df = filtered_popn_df[~_repeated_virginia_rows(filtered_popn_df)]

In [477]:
len(filtered_popn_df)

3106

In [478]:
# inner-join the county areas with the county populations on the shared county name
combined_df = filtered_land_df.merge(filtered_popn_df, how='inner', on='Areaname')

In [480]:
# the last two characters of Areaname are the state abbreviation
combined_df['state'] = combined_df['Areaname'].str.slice(start=-2)
combined_df.head()

In [482]:
# give the columns short, descriptive names
combined_df = combined_df.rename(
    columns={'Areaname': 'county', 'LND010190D': 'area', 'PST045200D': 'popn'})

In [483]:
# the area of Broomfield, CO is missing from the dataset, so fill in the value
# given on its Wikipedia page
combined_df.loc[combined_df['county'].eq('Broomfield, CO'), 'area'] = 33.00

In [484]:
# county density = county population divided by county area
combined_df['density'] = combined_df['popn'].div(combined_df['area'])

In [485]:
# calculate total population of each state across all of its counties
# (GroupBy.sum() is used directly; passing the builtin `sum` to agg() is
# deprecated in recent pandas versions)
state2pop = combined_df.groupby('state')['popn'].sum().to_dict()

# attach the state total to every county row so county weights can be computed
combined_df['state_popn'] = combined_df['state'].map(state2pop)
combined_df.head()

In [487]:
# calculate the density metric for each state: a weighted average of the county
# densities, where each county's weight is the fraction of the state population
# that lives in that county
# the weighted terms are summed at full precision and the state total is rounded
# once at the end; rounding every county term before summing would let the
# rounding errors accumulate across a state's counties
weighted_density = combined_df['popn'] * combined_df['density'] / combined_df['state_popn']
state2density_metric = (weighted_density.groupby(combined_df['state'])
                        .sum().round(1).to_dict())

In [488]:
# sort states in order of decreasing density
sorted_density_metrics = sorted(state2density_metric.values(), reverse=True)
density_metric2state = {v: k for k, v in state2density_metric.items()}

# inverting the mapping keeps only one state per metric value, so fail loudly if
# two states share a value instead of silently dropping a state from the features
if len(density_metric2state) != len(state2density_metric):
    raise ValueError('two or more states share the same density metric; '
                     'cannot key the ordered mapping by metric value')

# metric -> state, with entries inserted from densest to least dense
ordered_density_metric2state = {x: density_metric2state[x] for x in sorted_density_metrics}

In [491]:
# start the state-level feature table: one row per state (densest first) with the
# density metric as its first column
state_stats_df = pd.DataFrame(
    list(ordered_density_metric2state.keys()),
    index=list(ordered_density_metric2state.values()),
    columns=['density_metric'],
)

In [492]:
state_stats_df.head()

Unnamed: 0,density_metric
NY,10711.4
NJ,2789.6
PA,1957.6
IL,1761.9
MD,1737.6


In [493]:
# dataset that lists the average latitude of each state
# (its 'State' column is matched against the state abbreviations of the feature
# table in the merge that follows)
latlong_df = pd.read_csv('statelatlong.csv')
latlong_df.head()

In [495]:
# add each state's latitude to the feature table by matching the table's state
# abbreviations against the 'State' column, which is dropped again afterwards
state_stats_df1 = (state_stats_df
                   .merge(latlong_df[['Latitude', 'State']], left_index=True, right_on='State')
                   .drop(columns=['State']))
# label the rows with the state abbreviations again (densest state first)
state_stats_df1.index = ordered_density_metric2state.values()

In [496]:
# Lower 48 states treated as lying on the Atlantic or Pacific Ocean. This can
# potentially be an important feature because tourists and immigrants usually fly
# into the country in a coastal location.
coastal_states = {'ME', 'NH', 'MA', 'RI', 'CT', 'NY', 'NJ', 'PA', 'MD',
                  'DE', 'VA', 'NC', 'SC', 'GA', 'FL', 'WA', 'OR', 'CA'}

# 1 for coastal states, 0 otherwise, in the row order of the feature table
state_stats_df1['is_coastal'] = [1 if state in coastal_states else 0
                                 for state in state_stats_df.index]

A potentially important state-level feature is the number of airline passengers arriving in the state. As we've seen with Covid-19, clusters have started in particular locations because visitors have come into these places with the virus from foreign countries. The most readily available source for this data is the 'List of airports in [state]' Wikipedia article for each state. Each of these pages contains the number of commercial passenger boardings in 2016 for each airport in the state. Although commercial passenger arrivals are not included, it's reasonable to assume that the number of boardings and arrivals are closely related to each other. The values in the dictionary below represent the sum of the number of commercial passenger boardings for the major airports in each state. Note: the number of major airports varies by state (e.g. the only major airport in Massachusetts is Logan, there are no major airports in Delaware, and there are three major airports in Kentucky: Cincinnati, Louisville and Lexington). Finally, the number of annual boardings in each state is normalized by the population of the given state, as this metric represents the relative influence of air traffic on the given state. 

In [498]:
# total 2016 commercial passenger boardings per state, entered by hand as a sum
# over the state's major airports (presumably one addend per airport)
# NOTE(review): the IL entry halves both of its airport figures, unlike every
# other state -- confirm this is intentional
state2passengers = {'NY': 50868391, 
                    'PA': 15285948 + 4670954 + 636916, 
                    'NJ': 19923009 + 589091,
                    'MD': 13371816,
                    'IL': round((83245472 / 2) + (22027737 / 2)),
                    'MA': 17759044,
                    'VA': 11470854 + 10596942 + 1777648 + 1602631,
                    'MO': 6793076 + 5391557 + 462126,
                    'CA': (39636042 + 25707101 + 10340164 + 5934639 + 5321603 + 5217242 
                           + 4969366 + 2104625 + 2077892 + 1386357 + 995801 + 761298),
                    'MI': 16847135 + 1334979 + 398508,
                    'CO': 28267394 + 657694,
                    'MN': 18123844,
                    'TX': 31283579 + 20062072 + 7554596 + 6285181 + 6095545 + 4179994 + 1414376,
                    'RI': 1803000,
                    'GA': 50501858 + 1056265,
                    'OH': 4083476 + 3567864 + 1019922 + 685553,
                    'CT': 2982194,
                    'IN': 4216766 + 360369 + 329957 + 204352,
                    'DE': 0,
                    'KY': 3269979 + 1631494 + 638316,
                    'FL': (20875813 + 20283541 + 14263270 + 9194994 + 4239261 + 3100624 + 2729129 
                           + 1321675 + 986766 + 915672 + 589860),
                    'NE': 2127387 + 162876,
                    'UT': 11143738,
                    'OR': 9071154,
                    'TN': 6338517 + 2016089 + 887103,
                    'LA': 5569705 + 364200,
                    'OK': 1796473 + 1342315,
                    'NC': 21511880 + 5401714 + 848261,
                    'KS': 781944,
                    'WA': 21887110 + 1570652,
                    'WI': 3496724 + 1043185 + 348026 + 314909,
                    'NH': 995403,
                    'AL': 1304467 + 527801 + 288209 + 173210,
                    'NM': 2341719,
                    'IA': 1216357 + 547786,
                    'AZ': 20896265 + 1594594 + 705731,
                    'SC': 1811695 + 991276 + 944849 + 553658,
                    'AR': 958824 + 673810,
                    'WV': 213412,
                    'ID': 1633507,
                    'NV': 22833267 + 1771864,
                    'ME': 886343 + 269013,
                    'MS': 491464 + 305157,
                    'VT': 593311,
                    'SD': 510105 + 272537,
                    'ND': 402976 + 273980 + 150634 + 132557 + 68829,
                    'MT': 553245 + 423213 + 381582 + 247816 + 176730 + 103239,
                    'WY': 342044 + 92805}


In [499]:
# population of each Lower 48 state according to the 2010 census, keyed by state
# abbreviation; used as the denominator when per-state totals (airport boardings,
# commuter flows) are normalized by population
state2popn_2010 = {
        'AL': 4779736,
        'AR': 2915918,
        'AZ': 6392017,
        'CA': 37253956,
        'CO': 5029196,
        'CT': 3574097,
        'DE': 897934,
        'FL': 18801310,
        'GA': 9687653,
        'IA': 3046355,
        'ID': 1567582,
        'IL': 12830632,
        'IN': 6483802,
        'KS': 2853118,
        'KY': 4339367,
        'LA': 4533372,
        'MA': 6547629,
        'MD': 5773552,
        'ME': 1328361,
        'MI': 9883640,
        'MN': 5303925,
        'MO': 5988927,
        'MS': 2967297,
        'MT': 989415,
        'NC': 9535483,
        'ND': 672591,
        'NE': 1826341,
        'NH': 1316470,
        'NJ': 8791894,
        'NM': 2059179,
        'NV': 2700551,
        'NY': 19378102,
        'OH': 11536504,
        'OK': 3751351,
        'OR': 3831074,
        'PA': 12702379,
        'RI': 1052567,
        'SC': 4625364,
        'SD': 814180,
        'TN': 6346105,
        'TX': 25145561,
        'UT': 2763885,
        'VA': 8001024,
        'VT': 625741,
        'WA': 6724540,
        'WI': 5686986,
        'WV': 1852994,
        'WY': 563626
}

In [500]:
# annual boardings per resident: each state's boardings total divided by its
# 2010 population, in the row order of the feature table
boardings_per_resident = pd.Series(state2passengers) / pd.Series(state2popn_2010)
state_stats_df1['airport_boardings'] = (boardings_per_resident
                                        .loc[list(state_stats_df.index)]
                                        .to_numpy())

In [501]:
state_stats_df1.head()

Unnamed: 0,density_metric,Latitude,is_coastal,airport_arrivals
NY,10711.4,40.705626,1,2.625045
NJ,2789.6,40.143006,1,2.33307
PA,1957.6,40.994593,1,1.621257
IL,1761.9,39.739318,0,4.102417
MD,1737.6,38.806352,1,2.316047


In [504]:
# two-letter postal abbreviation -> full name; besides the states this includes
# DC, several territories and a 'NA' -> 'National' entry
abbrev2state = {
        'AK': 'Alaska',
        'AL': 'Alabama',
        'AR': 'Arkansas',
        'AS': 'American Samoa',
        'AZ': 'Arizona',
        'CA': 'California',
        'CO': 'Colorado',
        'CT': 'Connecticut',
        'DC': 'District of Columbia',
        'DE': 'Delaware',
        'FL': 'Florida',
        'GA': 'Georgia',
        'GU': 'Guam',
        'HI': 'Hawaii',
        'IA': 'Iowa',
        'ID': 'Idaho',
        'IL': 'Illinois',
        'IN': 'Indiana',
        'KS': 'Kansas',
        'KY': 'Kentucky',
        'LA': 'Louisiana',
        'MA': 'Massachusetts',
        'MD': 'Maryland',
        'ME': 'Maine',
        'MI': 'Michigan',
        'MN': 'Minnesota',
        'MO': 'Missouri',
        'MP': 'Northern Mariana Islands',
        'MS': 'Mississippi',
        'MT': 'Montana',
        'NA': 'National',
        'NC': 'North Carolina',
        'ND': 'North Dakota',
        'NE': 'Nebraska',
        'NH': 'New Hampshire',
        'NJ': 'New Jersey',
        'NM': 'New Mexico',
        'NV': 'Nevada',
        'NY': 'New York',
        'OH': 'Ohio',
        'OK': 'Oklahoma',
        'OR': 'Oregon',
        'PA': 'Pennsylvania',
        'PR': 'Puerto Rico',
        'RI': 'Rhode Island',
        'SC': 'South Carolina',
        'SD': 'South Dakota',
        'TN': 'Tennessee',
        'TX': 'Texas',
        'UT': 'Utah',
        'VA': 'Virginia',
        'VI': 'Virgin Islands',
        'VT': 'Vermont',
        'WA': 'Washington',
        'WI': 'Wisconsin',
        'WV': 'West Virginia',
        'WY': 'Wyoming'
}

# reverse lookup: full name -> abbreviation
state2abbrev = {v: k for k, v in abbrev2state.items()}

The age profile of each state is a potentially important feature, as susceptibility to a virus varies by age group (e.g. the elderly are at a heightened risk).

In [None]:
# dataframe that reports the fraction of each state's population that falls into
# each of a set of age categories (one row per 'Location')
age_df = pd.read_csv('age.csv')

In [506]:
# convert the full state names in 'Location' to abbreviations, then join the
# age-group columns onto the feature table built so far
age_df['Location'] = age_df['Location'].apply(lambda name: state2abbrev[name])
state_stats_df2 = (state_stats_df1
                   .merge(age_df, left_index=True, right_on='Location')
                   .drop(columns=['Location']))
# label the rows with the state abbreviations again (densest state first)
state_stats_df2.index = ordered_density_metric2state.values()

In [507]:
state_stats_df2.head()

Unnamed: 0,density_metric,Latitude,is_coastal,airport_arrivals,Children 0-18,Adults 19-25,Adults 26-34,Adults 35-54,Adults 55-64,65+
NY,10711.4,40.705626,1,2.625045,0.22,0.09,0.13,0.26,0.14,0.16
NJ,2789.6,40.143006,1,2.33307,0.23,0.08,0.11,0.27,0.14,0.16
PA,1957.6,40.994593,1,1.621257,0.22,0.08,0.12,0.25,0.14,0.18
IL,1761.9,39.739318,0,4.102417,0.24,0.09,0.12,0.26,0.13,0.15
MD,1737.6,38.806352,1,2.316047,0.23,0.08,0.12,0.27,0.14,0.15


Viruses tend to spread more in colder weather, so the climate of each state is important to consider.

In [508]:
# dataset that reports the average temperature of each state during each of the
# four seasons of the year (one row per 'State')
temps_df = pd.read_csv('temps.csv')

In [509]:
# convert the full state names in 'State' to two-letter abbreviations
temps_df['State'] = temps_df['State'].apply(lambda name: state2abbrev[name])

In [510]:
# join the seasonal temperature columns onto the feature table built so far
state_stats_df3 = (state_stats_df2
                   .merge(temps_df, left_index=True, right_on='State')
                   .drop(columns=['State']))
# label the rows with the state abbreviations again (densest state first)
state_stats_df3.index = ordered_density_metric2state.values()

In [511]:
state_stats_df3.head()

Unnamed: 0,density_metric,Latitude,is_coastal,airport_arrivals,Children 0-18,Adults 19-25,Adults 26-34,Adults 35-54,Adults 55-64,65+,spring,summer,fall,winter
NY,10711.4,40.705626,1,2.625045,0.22,0.09,0.13,0.26,0.14,0.16,43.6,66.5,48.1,23.3
NJ,2789.6,40.143006,1,2.33307,0.23,0.08,0.11,0.27,0.14,0.16,50.6,72.2,54.8,33.0
PA,1957.6,40.994593,1,1.621257,0.22,0.08,0.12,0.25,0.14,0.18,47.4,68.6,50.9,28.4
IL,1761.9,39.739318,0,4.102417,0.24,0.09,0.12,0.26,0.13,0.15,51.6,73.4,53.8,28.3
MD,1737.6,38.806352,1,2.316047,0.23,0.08,0.12,0.27,0.14,0.15,52.8,73.3,56.1,34.7


It's possible that state-level political policies have an impact on the proliferation of virus infections. The Cook Partisan Voting Index taken from Wikipedia assigns a number to each state that indicates how strongly the state leans toward the Republican or Democratic Party based on recent state and federal elections. In our convention, a positive value signifies leaning Republican, while a negative value signifies leaning Democratic. 

In [512]:
# Cook Partisan Voting Index for each Lower 48 state, keyed by state abbreviation
# sign convention: positive = leans Republican, negative = leans Democratic
state2partisan_score = {
        'AL': 14,
        'AR': 15,
        'AZ': 5,
        'CA': -12,
        'CO': 1,
        'CT': -6,
        'DE': -6,
        'FL': 2,
        'GA': 5,
        'IA': 3,
        'ID': 19,
        'IL': -7,
        'IN': 9,
        'KS': 13,
        'KY': 15,
        'LA': 11,
        'MA': -12,
        'MD': -12,
        'ME': -3,
        'MI': -1,
        'MN': -1,
        'MO': 9,
        'MS': 9,
        'MT': 11,
        'NC': 3,
        'ND': 17,
        'NE': 14,
        'NH': 0,
        'NJ': -7,
        'NM': -3,
        'NV': -1,
        'NY': -12,
        'OH': 3,
        'OK': 20,
        'OR': -5,
        'PA': 0,
        'RI': -10,
        'SC': 8,
        'SD': 15,
        'TN': 14,
        'TX': 8,
        'UT': 20,
        'VA': -1,
        'VT': -15,
        'WA': -7,
        'WI': 0,
        'WV': 19,
        'WY': 25
}

In [513]:
# look up the partisan score of every state in the row order of the feature table
state_stats_df3['partisan_score'] = list(
    map(state2partisan_score.__getitem__, state_stats_df3.index))

The following dataset was taken from a Stat139 problem set last semester and contains a range of socioeconomic, demographic and health indicators. These include:

Cancer: prevalence of cancer per 100,000 individuals

Hispanic: percent of adults that are hispanic

Minority: percent of adults that are nonwhite

Female: percent of adults that are female

Income: median income

Nodegree: percent of adults who have not completed high school

Bachelor: percent of adults with a bachelor’s degree

Inactive: percent of adults who do not exercise in their leisure time

Obesity: percent of individuals with BMI > 30

Cancer: prevalence of cancer per 100,000 individuals

We're not considering unemployment rate, as these rates are likely no longer accurate for many states.

Just as with the density metric, the state-level value for each of these features is determined by calculating a weighted average of the measurements for each county, where the weights are the fraction of the state population that lives in the given county. 

In [514]:
# county-level socioeconomic, demographic and health indicators
county_metrics_df = pd.read_csv('county_metrics.csv')

In [515]:
county_metrics_df.head()

Unnamed: 0,state,fipscode,county,population,hispanic,minority,female,unemployed,income,nodegree,bachelor,inactivity,obesity,density,cancer
0,Colorado,8117,Summit County,27239,15.173,4.918,45.996,2.5,68352,5.4,48.1,8.1,13.1,46.0,46.2
1,Colorado,8037,Eagle County,53653,30.04,5.169,47.231,3.1,76661,10.1,47.3,9.4,11.8,31.0,47.1
2,Idaho,16067,Minidoka County,19226,34.07,5.611,49.318,3.7,46332,24.1,11.8,18.3,34.2,80.0,61.8
3,Colorado,8113,San Miguel County,7558,10.154,4.747,46.808,3.7,59603,4.7,54.4,12.4,16.7,5.7,62.6
4,Utah,49051,Wasatch County,21600,13.244,4.125,48.812,3.4,65207,9.5,34.4,13.9,23.0,257.8,68.3


In [516]:
# convert the full state names in 'state' to two-letter abbreviations
county_metrics_df['state'] = county_metrics_df['state'].apply(
    lambda name: state2abbrev[name])

In [517]:
# keep only the counties that belong to one of the Lower 48 states
county_metrics_df = county_metrics_df[county_metrics_df.state.isin(lower_48)]

In [518]:
county_metrics_df.head()

Unnamed: 0,state,fipscode,county,population,hispanic,minority,female,unemployed,income,nodegree,bachelor,inactivity,obesity,density,cancer
0,CO,8117,Summit County,27239,15.173,4.918,45.996,2.5,68352,5.4,48.1,8.1,13.1,46.0,46.2
1,CO,8037,Eagle County,53653,30.04,5.169,47.231,3.1,76661,10.1,47.3,9.4,11.8,31.0,47.1
2,ID,16067,Minidoka County,19226,34.07,5.611,49.318,3.7,46332,24.1,11.8,18.3,34.2,80.0,61.8
3,CO,8113,San Miguel County,7558,10.154,4.747,46.808,3.7,59603,4.7,54.4,12.4,16.7,5.7,62.6
4,UT,49051,Wasatch County,21600,13.244,4.125,48.812,3.4,65207,9.5,34.4,13.9,23.0,257.8,68.3


In [519]:
# total population of each state, summed over the counties present in this dataset
# (GroupBy.sum() is used directly; passing the builtin `sum` to agg() is
# deprecated in recent pandas versions)
state2pop_ = county_metrics_df.groupby('state')['population'].sum().to_dict()

# attach the state total to every county row; assign() builds a new frame instead
# of writing a column into the filtered frame, which avoids pandas'
# SettingWithCopyWarning
county_metrics_df = county_metrics_df.assign(
    state_popn=county_metrics_df['state'].map(state2pop_))

In [520]:
county_metrics_df.head()

Unnamed: 0,state,fipscode,county,population,hispanic,minority,female,unemployed,income,nodegree,bachelor,inactivity,obesity,density,cancer,state_popn
0,CO,8117,Summit County,27239,15.173,4.918,45.996,2.5,68352,5.4,48.1,8.1,13.1,46.0,46.2,5022460
1,CO,8037,Eagle County,53653,30.04,5.169,47.231,3.1,76661,10.1,47.3,9.4,11.8,31.0,47.1,5022460
2,ID,16067,Minidoka County,19226,34.07,5.611,49.318,3.7,46332,24.1,11.8,18.3,34.2,80.0,61.8,1351143
3,CO,8113,San Miguel County,7558,10.154,4.747,46.808,3.7,59603,4.7,54.4,12.4,16.7,5.7,62.6,5022460
4,UT,49051,Wasatch County,21600,13.244,4.125,48.812,3.4,65207,9.5,34.4,13.9,23.0,257.8,68.3,2481585


In [521]:
# population-weighted average of the county 'hispanic' values for each state
# the weighted terms are summed at full precision and the state total is rounded
# once; rounding each county term before summing lets rounding errors accumulate
# NOTE(review): the metrics loop that follows recomputes 'hispanic', and this
# dictionary is not read again in the visible code -- confirm it is still needed
hispanic_weighted = (county_metrics_df['population'] * county_metrics_df['hispanic']
                     / county_metrics_df['state_popn'])
state2hispanic = (hispanic_weighted.groupby(county_metrics_df['state'])
                  .sum().round(1).to_dict())

In [522]:
metrics = ['hispanic', 'minority', 'female', 'unemployed', 'income', 'nodegree', 'bachelor', 'inactivity',
          'obesity', 'cancer']

# fraction of its state's population that lives in each county
county_weight = county_metrics_df['population'] / county_metrics_df['state_popn']

for metric in metrics:
    # state-level value = population-weighted average of the county values; the
    # weighted terms are summed at full precision and rounded once per state,
    # because rounding every county term first lets the errors accumulate
    state2metric = ((county_metrics_df[metric] * county_weight)
                    .groupby(county_metrics_df['state']).sum().round(3).to_dict())

    # income is scaled down by 1000; every other metric is stored unchanged
    denom = 1000 if metric == 'income' else 1
    state_stats_df3[metric] = [state2metric[state] / denom for state in state_stats_df3.index]

The more people travel between states, the more closely related the states should be in terms of rate of virus infections. The Census Bureau Journey to Work dataset reports the number of people that commute from any given county in the country to any other county in the country. This means we can aggregate these county-to-county commuting flows to determine the number of people that commute between any two states. From this data, we can create a symmetric matrix where the $i,j$ and $j,i$ elements represent the number of people that commute from state $i$ to state $j$ plus the number of people that commute from state $j$ to state $i$. However, just as with the number of annual boardings in each state, the final value of the number of people who commute between two states is normalized by the population of the given state. This means that this commuting matrix is no longer symmetric because the populations of state $i$ and state $j$ are different. 

In [523]:
# county-to-county commuting flows (one row per home county / work county pair)
commuting_df_complete = pd.read_csv('commuting.csv')

In [524]:
commuting_df_complete.columns

Index(['State FIPS Code', 'County FIPS Code', 'State Name', 'County Name',
       'State FIPS Code.1', 'County FIPS Code.1', 'State Name.1',
       'County Name.1', 'Workers in Commuting Flow', ' Margin of Error',
       'Unnamed: 10', 'Unnamed: 11', 'Unnamed: 12', 'Unnamed: 13'],
      dtype='object')

In [525]:
# keep only the two state-name columns and the number of commuters; the first
# state name is renamed to home_state and the '.1' one to work_state below
commuting_df = commuting_df_complete[['State Name', 'State Name.1', 'Workers in Commuting Flow']]

In [526]:
# give the columns short names; rename() returns a new frame here instead of
# renaming in place, because commuting_df is a column subset of
# commuting_df_complete and in-place edits on such a subset trigger pandas'
# SettingWithCopyWarning
commuting_df = commuting_df.rename(columns={'State Name': 'home_state',
                                            'State Name.1': 'work_state',
                                            'Workers in Commuting Flow': 'commuters'})

In [527]:
# full names of the Lower 48 states, used to keep only the flows whose workplace
# lies in one of those states
lower_48_full_name = [abbrev2state[code] for code in lower_48]
commuting_df = commuting_df[commuting_df['work_state'].isin(lower_48_full_name)]

In [528]:
# replace the full state names with abbreviations at both ends of each flow;
# assign() returns a new frame, so no column is written into the filtered frame
# (which would trigger pandas' SettingWithCopyWarning)
commuting_df = commuting_df.assign(
    home_state=[state2abbrev[state] for state in commuting_df.home_state],
    work_state=[state2abbrev[state] for state in commuting_df.work_state],
)

In [529]:
commuting_df.head(10)

Unnamed: 0,home_state,work_state,commuters
0,AL,AL,8828
1,AL,AL,22
2,AL,AL,7
3,AL,AL,309
4,AL,AL,17
5,AL,AL,11
6,AL,AL,210
7,AL,AL,2244
8,AL,AL,27
9,AL,AL,35


In [530]:
def _parse_commuter_count(value):
    """Return a commuter count as an int.

    Strings keep only their digit characters (so separators such as commas are
    ignored); values that are already numeric are converted directly, which
    keeps this cell safe to run again after the column has been converted.
    """
    if isinstance(value, str):
        return int(''.join(ch for ch in value if ch.isdigit()))
    return int(value)


# convert the commuter counts to integers
commuting_df = commuting_df.assign(
    commuters=commuting_df['commuters'].map(_parse_commuter_count))

In [531]:
# total number of commuters for every (work state, home state) combination
commuting_groupby_df = (commuting_df
                        .groupby(['work_state', 'home_state'], as_index=False)['commuters']
                        .sum())

In [532]:
# calculate the number of commuters between two states for all pairs of states
# each directed flow is looked up in a dictionary keyed by (work_state,
# home_state); a direction with no recorded commuters counts as 0 on its own, so
# a pair with commuters in only one direction is not zeroed out entirely
flow_lookup = (commuting_groupby_df
               .set_index(['work_state', 'home_state'])['commuters']
               .to_dict())

for work_state in state_stats_df3.index:
    vals = []
    for home_state in state_stats_df3.index:
        num1 = flow_lookup.get((work_state, home_state), 0)
        num2 = flow_lookup.get((home_state, work_state), 0)
        # NOTE(review): when work_state == home_state both lookups return the same
        # within-state flow, so the diagonal entry counts it twice -- confirm that
        # this is intended
        num = num1 + num2

        # normalize by the population of the state that names the column
        num /= state2popn_2010[work_state]

        vals.append(num)

    # one column per state; rows follow the state order of the feature table
    state_stats_df3[work_state + '_dest'] = vals

In [533]:
state_stats_df3.head()

Unnamed: 0,density_metric,Latitude,is_coastal,airport_arrivals,Children 0-18,Adults 19-25,Adults 26-34,Adults 35-54,Adults 55-64,65+,spring,summer,fall,winter,partisan_score,hispanic,minority,female,unemployed,income,nodegree,bachelor,inactivity,obesity,cancer,NY_dest,NJ_dest,PA_dest,IL_dest,MD_dest,MA_dest,VA_dest,CA_dest,RI_dest,MI_dest,TX_dest,MO_dest,MN_dest,CT_dest,GA_dest,OH_dest,CO_dest,DE_dest,FL_dest,IN_dest,UT_dest,KY_dest,NE_dest,TN_dest,OR_dest,LA_dest,NC_dest,OK_dest,WA_dest,KS_dest,WI_dest,NH_dest,AZ_dest,SC_dest,AL_dest,IA_dest,NM_dest,WV_dest,NV_dest,AR_dest,ID_dest,ME_dest,MS_dest,VT_dest,SD_dest,ND_dest,MT_dest,WY_dest
NY,10711.4,40.705626,1,2.625045,0.22,0.09,0.13,0.26,0.14,0.16,43.6,66.5,48.1,23.3,-12,18.737,29.756,51.436,5.347,61.622797,14.719,33.425,24.046,24.675,200.165,0.911538,0.061864,0.004793,0.000216,0.000722,0.001985,0.000495,0.000199,0.001692,0.000196,0.000148,0.000147,0.000207,0.033303,0.000321,0.000261,0.000237,0.002078,0.00057,0.000153,0.000171,0.000122,0.000156,0.000232,9.8e-05,0.000264,0.000368,8.9e-05,0.000126,0.000117,0.000155,0.001047,0.000143,0.000251,0.000122,0.000119,8.8e-05,0.000168,0.000233,9.9e-05,8e-05,0.000857,0.000131,0.012476,8.1e-05,0.0,0.000105,0.000209
NJ,2789.6,40.143006,1,2.33307,0.23,0.08,0.11,0.27,0.14,0.16,50.6,72.2,54.8,33.0,-7,19.372,27.143,51.199,5.676,72.829584,11.59,36.24,22.909,25.672,200.319,0.028068,0.823014,0.019686,0.000128,0.000793,0.000447,0.000315,7.2e-05,0.000472,8.9e-05,9.9e-05,0.000112,7.9e-05,0.001903,0.000142,0.000171,0.000154,0.016296,0.000245,8.1e-05,6.7e-05,9.1e-05,6.5e-05,9.8e-05,5.2e-05,8e-05,0.000205,5.9e-05,6.7e-05,6e-05,6.9e-05,0.000368,0.000132,0.000183,5.1e-05,1.5e-05,2.5e-05,0.000137,8.1e-05,4.5e-05,4.1e-05,0.000325,3.4e-05,0.000566,0.0,0.0,0.000114,0.0
PA,1957.6,40.994593,1,1.621257,0.22,0.08,0.12,0.25,0.14,0.18,47.4,68.6,50.9,28.4,0,6.767,17.371,51.067,5.199,54.9607,11.11,28.104,23.155,29.331,230.835,0.003142,0.028442,0.877457,0.000199,0.012778,0.000454,0.000731,7.9e-05,0.000352,0.000216,0.000143,0.000188,0.000158,0.000587,0.000222,0.002901,0.000164,0.069599,0.000248,0.000159,0.000125,0.000229,5.8e-05,0.000202,5.3e-05,0.00018,0.000314,0.000203,5.8e-05,0.000106,0.000117,0.000366,0.000156,0.00025,0.000151,9.1e-05,0.0001,0.011997,0.000114,0.000207,6.5e-05,0.000299,0.000101,0.000294,8e-05,0.000137,0.000137,0.000232
IL,1761.9,39.739318,0,4.102417,0.24,0.09,0.12,0.26,0.13,0.15,51.6,73.4,53.8,28.3,-7,16.918,22.729,50.902,5.963,59.266824,12.397,32.078,21.199,27.34,225.471,0.000143,0.000187,0.000201,0.900515,0.00015,0.000173,0.000197,0.000141,0.000231,0.000784,0.000194,0.018036,0.000492,0.00021,0.000285,0.000342,0.000297,0.000204,0.000304,0.015667,0.000116,0.001515,0.000491,0.000354,9.2e-05,0.000159,0.000204,0.000159,0.000138,0.000288,0.011498,0.000137,0.000223,0.000114,0.000131,0.018256,7.1e-05,0.000108,0.000336,0.000304,0.00019,9.6e-05,0.000181,0.000142,0.000141,0.000346,0.000173,0.000213
MD,1737.6,38.806352,1,2.316047,0.23,0.08,0.12,0.27,0.14,0.15,52.8,73.3,56.1,34.7,-12,9.407,40.312,51.537,5.266,75.296062,11.089,37.129,21.545,28.94,225.486,0.000215,0.000521,0.005808,6.8e-05,0.845245,0.000221,0.024575,3.8e-05,0.000227,6.7e-05,7.6e-05,9.7e-05,7.7e-05,0.00014,0.000167,0.000103,6.1e-05,0.045597,0.000184,7.6e-05,4.1e-05,7.4e-05,4.3e-05,8.7e-05,3.7e-05,6.8e-05,0.000281,1.5e-05,7.6e-05,4.5e-05,5.4e-05,0.000109,2.5e-05,0.000143,6.5e-05,2.3e-05,1.6e-05,0.015275,5e-05,4.2e-05,0.0,0.00014,5.9e-05,8.3e-05,0.0,0.000113,1.1e-05,0.0


States that are in close proximity may be similarly affected by viruses. Therefore, we include a column for each state in the design matrix that denotes whether the given state borders (or is very close to) each of the other states. 

In [None]:
# dictionary that maps each state in the Lower 48 to the states that directly border it or are not contiguous
# but are very close (e.g. NJ and CT); every set also contains the state itself
# NOTE(review): the relation is not symmetric for every pair (e.g. 'AL' lists 'NC' but 'NC' does not list
# 'AL', and 'LA' lists 'AL' but 'AL' does not list 'LA') -- confirm whether that is intended
state2neighbors = {'AL': {'AL', 'MS', 'TN', 'FL', 'GA', 'NC', 'SC'},
                  'GA': {'GA', 'TN', 'FL', 'AL', 'SC', 'NC', 'MS'},
                  'FL': {'FL', 'GA', 'AL', 'MS', 'SC'},
                  'MS': {'MS', 'AL', 'TN', 'FL', 'LA', 'AR', 'GA'},
                  'LA': {'LA', 'TX', 'AR', 'MS', 'OK', 'AL'},
                  'SC': {'SC', 'FL', 'GA', 'NC', 'TN'},
                  'NC': {'NC', 'SC', 'GA', 'TN', 'VA', 'KY'},
                  'AR': {'AR', 'LA', 'TX', 'MS', 'TN', 'OK', 'MO', 'KY'},
                  'VA': {'VA', 'NC', 'KY', 'WV', 'TN', 'DC', 'MD', 'DE'},
                  'MD': {'MD', 'DC', 'VA', 'WV', 'DE', 'NJ', 'PA'},
                  'DE': {'DE', 'MD', 'DC', 'NJ', 'PA'},
                  'NJ': {'NJ', 'DE', 'MD', 'PA', 'NY', 'NJ', 'CT'},
                  'NY': {'NY', 'NJ', 'PA', 'CT', 'MA', 'VT'},
                  'CT': {'CT', 'NY', 'RI', 'MA', 'NJ'},
                  'RI': {'RI', 'CT', 'MA'},
                  'MA': {'MA', 'CT', 'RI', 'NH', 'VT', 'NY'},
                  'NH': {'NH', 'VT', 'ME', 'MA'},
                  'ME': {'ME', 'NH', 'MA', 'VT'},
                  'VT': {'VT', 'NH', 'NY', 'MA'},
                  'PA': {'PA', 'NY', 'NJ', 'MD', 'WV', 'OH', 'DE'},
                  'WV': {'WV', 'DC', 'MD', 'PA', 'OH', 'KY', 'VA'},
                  'OH': {'OH', 'PA', 'WV', 'MI', 'IN', 'KY'},
                  'MI': {'MI', 'OH', 'WI', 'IN', 'IL'},
                  'KY': {'KY', 'WV', 'OH', 'IN', 'IL', 'MO', 'TN', 'VA', 'AR', 'NC'},
                  'TN': {'TN', 'KY', 'VA', 'NC', 'SC', 'GA', 'AL', 'MS', 'AR', 'MO', 'IL'},
                  'IN': {'IN', 'KY', 'OH', 'MI', 'IL', 'WI'},
                  'IL': {'IL', 'IN', 'MI', 'WI', 'IA', 'MO', 'KY', 'TN'},
                  'WI': {'WI', 'IL', 'MN', 'MI', 'IA'},
                  'MN': {'MN', 'MI', 'WI', 'IA', 'ND', 'SD', 'NE', 'IL'},
                  'IA': {'IA', 'WI', 'MN', 'IL', 'MO', 'KS', 'NE', 'SD'},
                  'MO': {'MO', 'IA', 'IL', 'KY', 'TN', 'AR', 'OK', 'KS', 'NE'},
                  'ND': {'ND', 'SD', 'MN', 'MT', 'WY'},
                  'SD': {'SD', 'ND', 'MN', 'IA', 'NE', 'MT', 'WY'},
                  'NE': {'NE', 'SD', 'IA', 'MO', 'KS', 'WY', 'CO'},
                  'KS': {'KS', 'NE', 'IA', 'MO', 'AR', 'OK', 'CO', 'TX', 'NM'},
                  'OK': {'OK', 'KS', 'MO', 'AR', 'TX', 'NM', 'CO', 'LA'},
                  'TX': {'TX', 'LA', 'AR', 'OK', 'NM', 'CO'},
                  'MT': {'MT', 'ND', 'SD', 'WY', 'ID'},
                  'WY': {'WY', 'MT', 'ND', 'SD', 'NE', 'CO', 'UT', 'ID'},
                  'CO': {'CO', 'WY', 'NE', 'KS', 'OK', 'TX', 'NM', 'UT', 'AZ'},
                  'NM': {'NM', 'CO', 'KS', 'OK', 'TX', 'AZ', 'UT'},
                  'ID': {'ID', 'MT', 'WY', 'UT', 'NV', 'WA', 'OR'},
                  'UT': {'UT', 'ID', 'WY', 'CO', 'NM', 'AZ', 'NV'},
                  'AZ': {'AZ', 'NM', 'CO', 'UT', 'NV', 'CA'},
                  'WA': {'WA', 'ID', 'OR'},
                  'OR': {'OR', 'WA', 'ID', 'NV', 'CA'},
                  'NV': {'NV', 'ID', 'OR', 'UT', 'AZ', 'CA'},
                  'CA': {'CA', 'OR', 'NV', 'AZ'}
                 }

In [534]:
# Add one indicator column per state: column '<XX>_is_neighbor' holds 1 in the rows
# of the states whose neighbor set (state2neighbors) contains XX, and 0 in all others.
row_states = state_stats_df3.index
for candidate in row_states:
    column_name = candidate + '_is_neighbor'
    state_stats_df3[column_name] = [
        1 if candidate in state2neighbors[row_state] else 0
        for row_state in row_states
    ]

In [535]:
# Preview the first rows to check that the '<XX>_is_neighbor' indicator columns were added.
state_stats_df3.head()

Unnamed: 0,density_metric,Latitude,is_coastal,airport_arrivals,Children 0-18,Adults 19-25,Adults 26-34,Adults 35-54,Adults 55-64,65+,spring,summer,fall,winter,partisan_score,hispanic,minority,female,unemployed,income,nodegree,bachelor,inactivity,obesity,cancer,NY_dest,NJ_dest,PA_dest,IL_dest,MD_dest,MA_dest,VA_dest,CA_dest,RI_dest,MI_dest,TX_dest,MO_dest,MN_dest,CT_dest,GA_dest,OH_dest,CO_dest,DE_dest,FL_dest,IN_dest,UT_dest,KY_dest,NE_dest,TN_dest,OR_dest,LA_dest,NC_dest,OK_dest,WA_dest,KS_dest,WI_dest,NH_dest,AZ_dest,SC_dest,AL_dest,IA_dest,NM_dest,WV_dest,NV_dest,AR_dest,ID_dest,ME_dest,MS_dest,VT_dest,SD_dest,ND_dest,MT_dest,WY_dest,NY_is_neighbor,NJ_is_neighbor,PA_is_neighbor,IL_is_neighbor,MD_is_neighbor,MA_is_neighbor,VA_is_neighbor,CA_is_neighbor,RI_is_neighbor,MI_is_neighbor,TX_is_neighbor,MO_is_neighbor,MN_is_neighbor,CT_is_neighbor,GA_is_neighbor,OH_is_neighbor,CO_is_neighbor,DE_is_neighbor,FL_is_neighbor,IN_is_neighbor,UT_is_neighbor,KY_is_neighbor,NE_is_neighbor,TN_is_neighbor,OR_is_neighbor,LA_is_neighbor,NC_is_neighbor,OK_is_neighbor,WA_is_neighbor,KS_is_neighbor,WI_is_neighbor,NH_is_neighbor,AZ_is_neighbor,SC_is_neighbor,AL_is_neighbor,IA_is_neighbor,NM_is_neighbor,WV_is_neighbor,NV_is_neighbor,AR_is_neighbor,ID_is_neighbor,ME_is_neighbor,MS_is_neighbor,VT_is_neighbor,SD_is_neighbor,ND_is_neighbor,MT_is_neighbor,WY_is_neighbor
NY,10711.4,40.705626,1,2.625045,0.22,0.09,0.13,0.26,0.14,0.16,43.6,66.5,48.1,23.3,-12,18.737,29.756,51.436,5.347,61.622797,14.719,33.425,24.046,24.675,200.165,0.911538,0.061864,0.004793,0.000216,0.000722,0.001985,0.000495,0.000199,0.001692,0.000196,0.000148,0.000147,0.000207,0.033303,0.000321,0.000261,0.000237,0.002078,0.00057,0.000153,0.000171,0.000122,0.000156,0.000232,9.8e-05,0.000264,0.000368,8.9e-05,0.000126,0.000117,0.000155,0.001047,0.000143,0.000251,0.000122,0.000119,8.8e-05,0.000168,0.000233,9.9e-05,8e-05,0.000857,0.000131,0.012476,8.1e-05,0.0,0.000105,0.000209,1,1,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
NJ,2789.6,40.143006,1,2.33307,0.23,0.08,0.11,0.27,0.14,0.16,50.6,72.2,54.8,33.0,-7,19.372,27.143,51.199,5.676,72.829584,11.59,36.24,22.909,25.672,200.319,0.028068,0.823014,0.019686,0.000128,0.000793,0.000447,0.000315,7.2e-05,0.000472,8.9e-05,9.9e-05,0.000112,7.9e-05,0.001903,0.000142,0.000171,0.000154,0.016296,0.000245,8.1e-05,6.7e-05,9.1e-05,6.5e-05,9.8e-05,5.2e-05,8e-05,0.000205,5.9e-05,6.7e-05,6e-05,6.9e-05,0.000368,0.000132,0.000183,5.1e-05,1.5e-05,2.5e-05,0.000137,8.1e-05,4.5e-05,4.1e-05,0.000325,3.4e-05,0.000566,0.0,0.0,0.000114,0.0,1,1,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
PA,1957.6,40.994593,1,1.621257,0.22,0.08,0.12,0.25,0.14,0.18,47.4,68.6,50.9,28.4,0,6.767,17.371,51.067,5.199,54.9607,11.11,28.104,23.155,29.331,230.835,0.003142,0.028442,0.877457,0.000199,0.012778,0.000454,0.000731,7.9e-05,0.000352,0.000216,0.000143,0.000188,0.000158,0.000587,0.000222,0.002901,0.000164,0.069599,0.000248,0.000159,0.000125,0.000229,5.8e-05,0.000202,5.3e-05,0.00018,0.000314,0.000203,5.8e-05,0.000106,0.000117,0.000366,0.000156,0.00025,0.000151,9.1e-05,0.0001,0.011997,0.000114,0.000207,6.5e-05,0.000299,0.000101,0.000294,8e-05,0.000137,0.000137,0.000232,1,1,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
IL,1761.9,39.739318,0,4.102417,0.24,0.09,0.12,0.26,0.13,0.15,51.6,73.4,53.8,28.3,-7,16.918,22.729,50.902,5.963,59.266824,12.397,32.078,21.199,27.34,225.471,0.000143,0.000187,0.000201,0.900515,0.00015,0.000173,0.000197,0.000141,0.000231,0.000784,0.000194,0.018036,0.000492,0.00021,0.000285,0.000342,0.000297,0.000204,0.000304,0.015667,0.000116,0.001515,0.000491,0.000354,9.2e-05,0.000159,0.000204,0.000159,0.000138,0.000288,0.011498,0.000137,0.000223,0.000114,0.000131,0.018256,7.1e-05,0.000108,0.000336,0.000304,0.00019,9.6e-05,0.000181,0.000142,0.000141,0.000346,0.000173,0.000213,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
MD,1737.6,38.806352,1,2.316047,0.23,0.08,0.12,0.27,0.14,0.15,52.8,73.3,56.1,34.7,-12,9.407,40.312,51.537,5.266,75.296062,11.089,37.129,21.545,28.94,225.486,0.000215,0.000521,0.005808,6.8e-05,0.845245,0.000221,0.024575,3.8e-05,0.000227,6.7e-05,7.6e-05,9.7e-05,7.7e-05,0.00014,0.000167,0.000103,6.1e-05,0.045597,0.000184,7.6e-05,4.1e-05,7.4e-05,4.3e-05,8.7e-05,3.7e-05,6.8e-05,0.000281,1.5e-05,7.6e-05,4.5e-05,5.4e-05,0.000109,2.5e-05,0.000143,6.5e-05,2.3e-05,1.6e-05,0.015275,5e-05,4.2e-05,0.0,0.00014,5.9e-05,8.3e-05,0.0,0.000113,1.1e-05,0.0,0,1,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


The proportion of each state that is vaccinated may affect the number of people who are infected with the flu. Therefore, we include information on the adult and child vaccination rate for each state.

In [536]:
# Load the per-state flu vaccination data. The cells below rely on it having a
# 'State' column, which is used as the merge key.
flu_df = pd.read_csv('flu.csv')

In [537]:
# Translate every entry of the 'State' column through state2abbrev so it can be matched
# against the state abbreviations used as the index of the feature table.
# An entry missing from state2abbrev raises KeyError.
flu_df['State'] = [state2abbrev[state_name] for state_name in flu_df['State']]

In [538]:
# Attach the vaccination columns by matching the state abbreviations in the index of
# state_stats_df3 against the 'State' column of flu_df (pd.merge defaults to an inner join).
merged_df = pd.merge(state_stats_df3, flu_df, left_index=True, right_on='State')

# The index is overwritten positionally below, which is only correct when the merged rows
# are exactly the states of state_stats_df3, once each and in the same order. Check that
# explicitly so a missing or duplicated state in flu_df cannot silently mislabel rows.
if list(merged_df['State']) != list(state_stats_df3.index):
    raise ValueError('flu_df must contain exactly one row for every state in state_stats_df3')

state_stats_df4 = merged_df.drop(columns=['State'])
state_stats_df4.index = state_stats_df3.index

In [539]:
# Preview the merged table; the vaccination-rate columns should appear as the last columns.
state_stats_df4.head()

Unnamed: 0,density_metric,Latitude,is_coastal,airport_arrivals,Children 0-18,Adults 19-25,Adults 26-34,Adults 35-54,Adults 55-64,65+,spring,summer,fall,winter,partisan_score,hispanic,minority,female,unemployed,income,nodegree,bachelor,inactivity,obesity,cancer,NY_dest,NJ_dest,PA_dest,IL_dest,MD_dest,MA_dest,VA_dest,CA_dest,RI_dest,MI_dest,TX_dest,MO_dest,MN_dest,CT_dest,GA_dest,OH_dest,CO_dest,DE_dest,FL_dest,IN_dest,UT_dest,KY_dest,NE_dest,TN_dest,OR_dest,LA_dest,NC_dest,OK_dest,WA_dest,KS_dest,WI_dest,NH_dest,AZ_dest,SC_dest,AL_dest,IA_dest,NM_dest,WV_dest,NV_dest,AR_dest,ID_dest,ME_dest,MS_dest,VT_dest,SD_dest,ND_dest,MT_dest,WY_dest,NY_is_neighbor,NJ_is_neighbor,PA_is_neighbor,IL_is_neighbor,MD_is_neighbor,MA_is_neighbor,VA_is_neighbor,CA_is_neighbor,RI_is_neighbor,MI_is_neighbor,TX_is_neighbor,MO_is_neighbor,MN_is_neighbor,CT_is_neighbor,GA_is_neighbor,OH_is_neighbor,CO_is_neighbor,DE_is_neighbor,FL_is_neighbor,IN_is_neighbor,UT_is_neighbor,KY_is_neighbor,NE_is_neighbor,TN_is_neighbor,OR_is_neighbor,LA_is_neighbor,NC_is_neighbor,OK_is_neighbor,WA_is_neighbor,KS_is_neighbor,WI_is_neighbor,NH_is_neighbor,AZ_is_neighbor,SC_is_neighbor,AL_is_neighbor,IA_is_neighbor,NM_is_neighbor,WV_is_neighbor,NV_is_neighbor,AR_is_neighbor,ID_is_neighbor,ME_is_neighbor,MS_is_neighbor,VT_is_neighbor,SD_is_neighbor,ND_is_neighbor,MT_is_neighbor,WY_is_neighbor,overall_vacc_rate,child_vacc_rate
NY,10711.4,40.705626,1,2.625045,0.22,0.09,0.13,0.26,0.14,0.16,43.6,66.5,48.1,23.3,-12,18.737,29.756,51.436,5.347,61.622797,14.719,33.425,24.046,24.675,200.165,0.911538,0.061864,0.004793,0.000216,0.000722,0.001985,0.000495,0.000199,0.001692,0.000196,0.000148,0.000147,0.000207,0.033303,0.000321,0.000261,0.000237,0.002078,0.00057,0.000153,0.000171,0.000122,0.000156,0.000232,9.8e-05,0.000264,0.000368,8.9e-05,0.000126,0.000117,0.000155,0.001047,0.000143,0.000251,0.000122,0.000119,8.8e-05,0.000168,0.000233,9.9e-05,8e-05,0.000857,0.000131,0.012476,8.1e-05,0.0,0.000105,0.000209,1,1,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,81.7,69.6
NJ,2789.6,40.143006,1,2.33307,0.23,0.08,0.11,0.27,0.14,0.16,50.6,72.2,54.8,33.0,-7,19.372,27.143,51.199,5.676,72.829584,11.59,36.24,22.909,25.672,200.319,0.028068,0.823014,0.019686,0.000128,0.000793,0.000447,0.000315,7.2e-05,0.000472,8.9e-05,9.9e-05,0.000112,7.9e-05,0.001903,0.000142,0.000171,0.000154,0.016296,0.000245,8.1e-05,6.7e-05,9.1e-05,6.5e-05,9.8e-05,5.2e-05,8e-05,0.000205,5.9e-05,6.7e-05,6e-05,6.9e-05,0.000368,0.000132,0.000183,5.1e-05,1.5e-05,2.5e-05,0.000137,8.1e-05,4.5e-05,4.1e-05,0.000325,3.4e-05,0.000566,0.0,0.0,0.000114,0.0,1,1,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,79.4,72.8
PA,1957.6,40.994593,1,1.621257,0.22,0.08,0.12,0.25,0.14,0.18,47.4,68.6,50.9,28.4,0,6.767,17.371,51.067,5.199,54.9607,11.11,28.104,23.155,29.331,230.835,0.003142,0.028442,0.877457,0.000199,0.012778,0.000454,0.000731,7.9e-05,0.000352,0.000216,0.000143,0.000188,0.000158,0.000587,0.000222,0.002901,0.000164,0.069599,0.000248,0.000159,0.000125,0.000229,5.8e-05,0.000202,5.3e-05,0.00018,0.000314,0.000203,5.8e-05,0.000106,0.000117,0.000366,0.000156,0.00025,0.000151,9.1e-05,0.0001,0.011997,0.000114,0.000207,6.5e-05,0.000299,0.000101,0.000294,8e-05,0.000137,0.000137,0.000232,1,1,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,82.5,69.7
IL,1761.9,39.739318,0,4.102417,0.24,0.09,0.12,0.26,0.13,0.15,51.6,73.4,53.8,28.3,-7,16.918,22.729,50.902,5.963,59.266824,12.397,32.078,21.199,27.34,225.471,0.000143,0.000187,0.000201,0.900515,0.00015,0.000173,0.000197,0.000141,0.000231,0.000784,0.000194,0.018036,0.000492,0.00021,0.000285,0.000342,0.000297,0.000204,0.000304,0.015667,0.000116,0.001515,0.000491,0.000354,9.2e-05,0.000159,0.000204,0.000159,0.000138,0.000288,0.011498,0.000137,0.000223,0.000114,0.000131,0.018256,7.1e-05,0.000108,0.000336,0.000304,0.00019,9.6e-05,0.000181,0.000142,0.000141,0.000346,0.000173,0.000213,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,83.0,60.1
MD,1737.6,38.806352,1,2.316047,0.23,0.08,0.12,0.27,0.14,0.15,52.8,73.3,56.1,34.7,-12,9.407,40.312,51.537,5.266,75.296062,11.089,37.129,21.545,28.94,225.486,0.000215,0.000521,0.005808,6.8e-05,0.845245,0.000221,0.024575,3.8e-05,0.000227,6.7e-05,7.6e-05,9.7e-05,7.7e-05,0.00014,0.000167,0.000103,6.1e-05,0.045597,0.000184,7.6e-05,4.1e-05,7.4e-05,4.3e-05,8.7e-05,3.7e-05,6.8e-05,0.000281,1.5e-05,7.6e-05,4.5e-05,5.4e-05,0.000109,2.5e-05,0.000143,6.5e-05,2.3e-05,1.6e-05,0.015275,5e-05,4.2e-05,0.0,0.00014,5.9e-05,8.3e-05,0.0,0.000113,1.1e-05,0.0,0,1,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,81.9,74.5


Smoking may also affect susceptibility to viruses such as the flu and Covid-19, so we include a feature that reports the fraction of adults who smoke in each state.

In [541]:
# Percentage of adults who smoke, keyed by state abbreviation (Lower 48 only).
# Values are percentages; they are divided by 100 when the feature column is built.
state2smoking_rate = {
    'AL': 20.9, 'AR': 22.3, 'AZ': 15.6, 'CA': 11.3,
    'CO': 14.6, 'CT': 12.7, 'DE': 17.0, 'FL': 16.1,
    'GA': 17.5, 'IA': 17.1, 'ID': 14.3, 'IL': 15.5,
    'IN': 21.8, 'KS': 17.4, 'KY': 24.6, 'LA': 23.1,
    'MA': 13.7, 'MD': 13.8, 'ME': 17.3, 'MI': 19.3,
    'MN': 14.5, 'MO': 20.8, 'MS': 22.2, 'MT': 17.2,
    'NC': 17.2, 'ND': 18.3, 'NE': 15.4, 'NH': 15.7,
    'NJ': 13.7, 'NM': 17.5, 'NV': 17.6, 'NY': 14.1,
    'OH': 21.1, 'OK': 20.1, 'OR': 16.1, 'PA': 18.7,
    'RI': 14.9, 'SC': 18.8, 'SD': 19.3, 'TN': 22.6,
    'TX': 15.7, 'UT': 8.9, 'VA': 16.4, 'VT': 15.8,
    'WA': 13.5, 'WI': 16, 'WV': 26, 'WY': 18.7,
}

In [542]:
# Look up each state's smoking percentage and store it as a fraction (percentage / 100).
# A state missing from state2smoking_rate raises KeyError.
smoking_fractions = []
for abbrev in state_stats_df4.index:
    smoking_fractions.append(state2smoking_rate[abbrev] / 100)
state_stats_df4['smoking_rate'] = smoking_fractions

In [543]:
# Write the final state-level feature table to disk. Per the pandas docs, index_label=False
# omits the header field for the index column (the state abbreviations are still written
# as the first value of every row).
state_stats_df4.to_csv('state_stats.csv', index_label=False)