# Urban scaling and coronavirus #

This notebook attempts to replicate the arXiv preprint by Stier et al. [1]. It brings together MSA definitions and census data so that demographic calculations can be made for MSAs in relation to the coronavirus outbreak. MSAs are defined at the county level and are delineated by the Census Bureau (see https://www.census.gov/programs-surveys/metro-micro/about/delineation-files.html). The coronavirus outbreak data are provided by USAFacts (https://usafacts.org/visualizations/coronavirus-covid-19-spread-map/, last downloaded March 25th, 2020 14:34 EDT).

.. [1] Andrew J. Stier, Marc G. Berman, and Luis M. A. Bettencourt. March 23rd, 2020. COVID-19 attack rate increases with city size. arXiv:2003.10376v1 [q-bio.PE]


In [None]:
#import key packages
import numpy as np
import pandas
import datetime #the covid dataset uses datetimes as column indices
import scipy.stats
import matplotlib.pyplot as plt

In [None]:
#Read the four input workbooks into DataFrames: the MSA delineations, the 2018 county
#population estimates, and the USAFacts confirmed-case and death tables.
msas = pandas.read_excel(
    r'data/USCensus/Delineations/list1_Sep_2018.xls',
    header=2,  #column labels come from the sheet's third row (0-indexed row 2)
)
pop = pandas.read_excel(
    r'data/USCensus/Population/co-est2018-alldata.xls',
)
covid = pandas.read_excel(
    r'data/Coronavirus/USAFacts/covid_confirmed_usafacts.xls',
)
deaths = pandas.read_excel(
    r'data/Coronavirus/USAFacts/covid_deaths_usafacts.xls',
)

In [None]:
#Some tests for exploring the data, feel free to run to get a sense of what these tables contain.
#print(msas.iloc[0])
#print(pop.iloc[0])
#print(covid.iloc[0])
#msas['CBSA Code'].unique()

In [None]:
#Assemble the state, metro/micro areas, pop data, and covid cases.
#The result is `df`, with one row per CBSA holding: the CBSA code and title, the metro/micro
#designation, the summed 2018 population of its counties, confirmed cases minus deaths on
#dateEnd, the fitted exponential growth rate over the window (stored in the 'AttackRate'
#column), and the correlation coefficient of that fit ('r').
data = []
popnotfound = []    #county FIPS codes with no matching row in the population table
covidnotfound = []  #county FIPS codes with no matching row in the confirmed-case table
covidnotincl = []   #CBSA codes that had too few cases for a growth-rate fit
dateStart = datetime.datetime(2020, 3, 13, 0, 0)
dateEnd = datetime.datetime(2020, 3, 19, 0, 0) #INCLUSIVE; note this date was chosen for comparability with the arxiv preprint
days = (dateEnd - dateStart).days+1
#The fitting window is identical for every county, so build it once rather than per county
fit_dates = pandas.date_range(dateStart,dateEnd)

col_names = ['CBSA','Title','MetroMicro','Pop2018','COVIDEnd','AttackRate','r']
#We have to leave out Puerto Rico (state FIPS 72) because it is not in the census data; I'm sorry!
#NOTE(review): the index cutoff of 1914 presumably drops trailing non-data rows of the
#delineation sheet - confirm against the file before changing the input.
for cbsa in msas.loc[(msas['FIPS State Code'] != 72) & (msas.index <= 1914)]['CBSA Code'].unique():
    #Get the MSA information
    #cbsa = '10740' #ABQ metro for testing
    counties = msas.loc[msas['CBSA Code'] == cbsa]
    #Title and designation are read from the first county row of the CBSA
    first_county = counties.iloc[0]
    row = [cbsa,first_county['CBSA Title'],first_county['Metropolitan/Micropolitan Statistical Area']]
    pop_total = 0
    covid_last = 0
    covid_series = [0]*days #This stores just cases, not people who died
    #Loop through every constituent county to get the population as well as the COVID cases
    for s, c in zip(counties['FIPS State Code'],counties['FIPS County Code']):
        #Five-digit county FIPS: state code followed by the zero-padded three-digit county code
        fips = int(s*1000 + c)
        #Filter the population table once and reuse the result for both the check and the lookup
        pop_match = pop.loc[(pop.STATE == int(s)) & (pop.COUNTY == int(c))]
        if not pop_match.empty:
            #Take the matched row's value explicitly; calling int() on a Series is deprecated in pandas
            pop_total += int(pop_match['POPESTIMATE2018'].iloc[0])
        else:
            print(str(fips) + ' was not found in the ACS data.')
            popnotfound.append(fips)
        #Filter this county's rows once, instead of re-scanning both tables for every day
        county_cases = covid.loc[covid.countyFIPS == fips]
        if not county_cases.empty:
            #A county absent from the deaths table yields an empty selection, which sums to 0
            county_deaths = deaths.loc[deaths.countyFIPS == fips]
            covid_last += int(sum(county_cases[dateEnd]))-int(sum(county_deaths[dateEnd]))
            for i,d in enumerate(fit_dates):
                covid_series[i] += int(sum(county_cases[d])) - int(sum(county_deaths[d]))
        else:
            #print(str(fips) + ' was not found in the COVID data.')
            covidnotfound.append(fips)
    row.append(pop_total)
    row.append(covid_last)
    #Now calculate the r
    #A CBSA is only fitted if it has more than 3 cases on the last day and at least 1 on the
    #first day (the first-day count is the normalisation divisor below).
    if (covid_series[-1] <= 3) or (covid_series[0]<=0):
        print(row[1] + ' had too few cases for inclusion')
        covidnotincl.append(cbsa)
        row.append(np.nan)
        row.append(np.nan)
    else:
        #normalize the covid_series so that the March 13th data is 1
        #(this only shifts the intercept of the log-linear fit, not its slope)
        covid_series = [cs * 1. / covid_series[0] for cs in covid_series]
        #run a regression of log(normalized cases) against day number; the slope is the
        #estimated daily exponential growth rate
        slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(range(days),[np.log(cs) for cs in covid_series])
        row.append(slope)
        row.append(r_value)
    data.append(row)
df = pandas.DataFrame(data,columns=col_names)



In [None]:
#Histogram of the per-CBSA fit correlations; CBSAs without a fit (NaN r) are left out
plt.hist(df.r[df.r.notna()])
plt.title('Correlations are overall pretty good for the city-by-city estimates')
plt.show()

In [None]:
#Plot the chart from figure 1a from the arxiv preprint:
#log(estimated attack rate) against log(MSA population), with a fitted scaling line.
fig = plt.figure(figsize = (10,10))
ax = fig.add_subplot(2,1,1)

#Logs are only defined for strictly positive values. A CBSA whose counties were not found in
#the population table has Pop2018 == 0, and a fitted growth rate can be zero or negative;
#either would turn into -inf/NaN and make every regression output NaN. NaN attack rates
#(CBSAs with too few cases) also fail the `> 0` test, so they are excluded here as well.
has_pop = df.Pop2018 > 0
usable = has_pop & (df.AttackRate > 0)
points, = ax.plot(np.log(df.Pop2018[usable].values),np.log(df.AttackRate[usable].values),'bo')

#Now run a linear regression
#Only include those where an attack rate could be estimated and it's a Metropolitan Area (not Micro)
which = df.index[usable & (df.MetroMicro == 'Metropolitan Statistical Area')]
slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(np.log(df.Pop2018[which].values),np.log(df.AttackRate[which].values))
#Draw the fitted line across the full population range, including CBSAs with no plotted point
log_pop = np.log(df.Pop2018[has_pop].values)
line, = ax.plot(log_pop,log_pop*slope + intercept,'k-')
plt.xlabel('log(MSA Population)')
plt.ylabel('log(Estimated attack rate)')
plt.title('Coronavirus attack rate correlation with population size')
print('Correlation: %f, p-value: %f, slope: %f' % (r_value,p_value,slope))