## Combining Demographic Data with Johns Hopkins COVID-19 Data

This is the second notebook in data set creation, and uses the output from demographic_data.ipynb.

COVID-19 data was downloaded from https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data:
+ US county-level confirmed cases data: https://github.com/CSSEGISandData/COVID-19/blob/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_US.csv
+ US county-level deaths data: https://github.com/CSSEGISandData/COVID-19/blob/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_US.csv

Additionally, since the death and confirmed case numbers for the counties that make up New York City are combined under New York County in the Johns Hopkins dataset, I manually updated the confirmed case and death numbers for each of the 5 counties (New York, Queens, Kings, Richmond, and Bronx counties) with data released by the NYC Health department for the individual dates used.
+ 03/24/20 (earliest date available. data from 03/24, published on 03/25): cases- https://www1.nyc.gov/assets/doh/downloads/pdf/imm/covid-19-daily-data-summary-03252020-1.pdf; deaths- https://www1.nyc.gov/assets/doh/downloads/pdf/imm/covid-19-daily-data-summary-deaths-03252020-1.pdf
+ 04/23/20: cases (data from 04/23, published on 04/24)- https://www1.nyc.gov/assets/doh/downloads/pdf/imm/covid-19-daily-data-summary-04242020-1.pdf; deaths- https://www1.nyc.gov/assets/doh/downloads/pdf/imm/covid-19-daily-data-summary-deaths-04242020-1.pdf
+ 5/11/20: cases , deaths: https://www1.nyc.gov/assets/doh/downloads/pdf/imm/covid-19-daily-data-summary-deaths-05122020-1.pdf

In [1]:
import pandas as pd
from datetime import date
from datetime import datetime
import re

In [2]:
pd.set_option('display.expand_frame_repr', False) # the frame will be huge, don't expand
pd.set_option('display.precision', 4)

In [3]:
demo = pd.read_csv("counties.csv", dtype={'FIPS':float})
# looks like JH data doesn't have leading zeros in FIPS codes
confirmed = pd.read_csv("time_series_covid19_confirmed_US_20200512.csv")
deaths = pd.read_csv("time_series_covid19_deaths_US_20200512.csv")

In [4]:
demo.head()

Unnamed: 0.1,Unnamed: 0,household_size,empl_agriculture,empl_professional,empl_social,empl_services,empl_manufacturing,empl_retail,empl_transp_utilities,employed,...,nh_weighted_health_score,nh_num_beds,nh_prc_occupied,nh_nurse_hours,nh_total_fines,nh_overall_rating,incarcerated,domestic_passengers,intl_passengers,order started
0,"Morgan County, Alabama: Summary level: 050, st...",2.56,1.0792,11.1812,19.4094,8.3231,22.2135,10.2471,5.6548,53742,...,22.7825,634.0,93.6435,3.776,11365.9164,3.0521,604.0,580000,0,04/04/20
1,"Kings County, California: Summary level: 050, ...",3.15,14.8108,7.4102,21.6017,8.9412,7.1271,9.3059,4.4241,52644,...,65.9978,327.0,77.8899,4.1673,0.0,3.3792,465.0,0,0,03/19/20
2,"Monterey County, California: Summary level: 05...",3.31,15.99,10.0846,19.6731,10.8732,6.5126,8.9714,3.8882,190707,...,62.9447,993.0,86.1531,4.1465,9445.8439,3.6959,929.0,186000,0,03/19/20
3,"Nevada County, California: Summary level: 050,...",2.37,1.3392,16.3689,20.6696,11.5335,3.5097,10.5988,6.7902,44505,...,46.2379,429.0,86.1305,4.4692,1140.6667,4.6317,197.0,0,0,03/19/20
4,"Shasta County, California: Summary level: 050,...",2.59,1.0668,9.3942,25.462,11.4847,4.4179,12.8545,5.008,69649,...,73.238,1055.0,84.3791,4.5562,23127.3251,3.7697,339.0,0,0,03/19/20


In [5]:
confirmed.head()

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,...,5/2/20,5/3/20,5/4/20,5/5/20,5/6/20,5/7/20,5/8/20,5/9/20,5/10/20,5/11/20
0,16,AS,ASM,16,60.0,,American Samoa,US,-14.271,-170.132,...,0,0,0,0,0,0,0,0,0,0
1,316,GU,GUM,316,66.0,,Guam,US,13.4443,144.7937,...,145,145,145,145,149,149,151,151,151,151
2,580,MP,MNP,580,69.0,,Northern Mariana Islands,US,15.0979,145.6739,...,14,14,14,14,15,15,15,16,16,19
3,630,PR,PRI,630,72.0,,Puerto Rico,US,18.2208,-66.5901,...,1757,1808,1843,1924,1968,2031,2156,2173,2198,2256
4,850,VI,VIR,850,78.0,,Virgin Islands,US,18.3358,-64.8963,...,66,66,66,66,66,66,68,68,69,69


In [6]:
last_date = confirmed.columns.values[-1]
last_date

'5/11/20'

In [7]:
# get date of at least 10 confirmed cases...
date_10_cases = list()

for x in range(confirmed.shape[0]):
    trans = confirmed.iloc[x].T
    trans = trans.iloc[11:] # just use the date fields
    trans = trans[trans >= 10]
    
    if len(trans) > 0:
        date_10_cases.append(trans.keys()[0])
    else:
        date_10_cases.append(last_date) # if county hasn't yet reached 10 cases, use the latest date in the data

In [8]:
confirmed["ten plus cases"] = date_10_cases

In [9]:
confirmed.head()

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,...,5/3/20,5/4/20,5/5/20,5/6/20,5/7/20,5/8/20,5/9/20,5/10/20,5/11/20,ten plus cases
0,16,AS,ASM,16,60.0,,American Samoa,US,-14.271,-170.132,...,0,0,0,0,0,0,0,0,0,5/11/20
1,316,GU,GUM,316,66.0,,Guam,US,13.4443,144.7937,...,145,145,145,149,149,151,151,151,151,3/19/20
2,580,MP,MNP,580,69.0,,Northern Mariana Islands,US,15.0979,145.6739,...,14,14,14,15,15,15,16,16,19,4/8/20
3,630,PR,PRI,630,72.0,,Puerto Rico,US,18.2208,-66.5901,...,1808,1843,1924,1968,2031,2156,2173,2198,2256,3/20/20
4,850,VI,VIR,850,78.0,,Virgin Islands,US,18.3358,-64.8963,...,66,66,66,66,66,68,68,69,69,3/24/20


In [10]:
confirmed[(confirmed["Province_State"] == "New York") & 
         ((confirmed["Admin2"] == "Bronx") | (confirmed["Admin2"] == "Kings") |
          (confirmed["Admin2"] == "New York") | (confirmed["Admin2"] == "Queens") |
          (confirmed["Admin2"] == "Richmond"))]
# looks like NYC counties (bronx, kings, queens, etc. all reported under New York County

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,...,5/3/20,5/4/20,5/5/20,5/6/20,5/7/20,5/8/20,5/9/20,5/10/20,5/11/20,ten plus cases
1835,84036005,US,USA,840,36005.0,Bronx,New York,US,40.8521,-73.8628,...,0,0,0,0,0,0,0,0,0,5/11/20
1856,84036047,US,USA,840,36047.0,Kings,New York,US,40.6362,-73.9494,...,0,0,0,0,0,0,0,0,0,5/11/20
1863,84036061,US,USA,840,36061.0,New York,New York,US,40.7673,-73.9715,...,174331,175651,176874,178351,180216,181783,183289,184417,185357,3/6/20
1873,84036081,US,USA,840,36081.0,Queens,New York,US,40.7109,-73.8168,...,0,0,0,0,0,0,0,0,0,5/11/20
1875,84036085,US,USA,840,36085.0,Richmond,New York,US,40.5858,-74.1481,...,0,0,0,0,0,0,0,0,0,5/11/20


In [11]:
deaths.head()

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,...,5/2/20,5/3/20,5/4/20,5/5/20,5/6/20,5/7/20,5/8/20,5/9/20,5/10/20,5/11/20
0,16,AS,ASM,16,60.0,,American Samoa,US,-14.271,-170.132,...,0,0,0,0,0,0,0,0,0,0
1,316,GU,GUM,316,66.0,,Guam,US,13.4443,144.7937,...,5,5,5,5,5,5,5,5,5,5
2,580,MP,MNP,580,69.0,,Northern Mariana Islands,US,15.0979,145.6739,...,2,2,2,2,2,2,2,2,2,2
3,630,PR,PRI,630,72.0,,Puerto Rico,US,18.2208,-66.5901,...,95,97,97,99,99,102,107,108,111,113
4,850,VI,VIR,850,78.0,,Virgin Islands,US,18.3358,-64.8963,...,4,4,4,4,4,4,4,4,4,5


In [12]:
deaths.shape

(3261, 123)

In [13]:
deaths.columns

Index(['UID', 'iso2', 'iso3', 'code3', 'FIPS', 'Admin2', 'Province_State',
       'Country_Region', 'Lat', 'Long_',
       ...
       '5/2/20', '5/3/20', '5/4/20', '5/5/20', '5/6/20', '5/7/20', '5/8/20',
       '5/9/20', '5/10/20', '5/11/20'],
      dtype='object', length=123)

In [14]:
# nyc_counties = ["Bronx", "Kings", "New York", "Queens", "Richmond"]
deaths[(deaths["Province_State"] == "New York") & ((deaths["Admin2"] == "Bronx") | (deaths["Admin2"] == "Kings") |
                                                  (deaths["Admin2"] == "New York") | (deaths["Admin2"] == "Queens") |
                                                  (deaths["Admin2"] == "Richmond"))]
# it appears that all 5 counties in NYC are all being reported en masse as New York County
# SO will need to aggregate some data for NYC instead of leaving each of these counties individually in the data

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,...,5/2/20,5/3/20,5/4/20,5/5/20,5/6/20,5/7/20,5/8/20,5/9/20,5/10/20,5/11/20
1835,84036005,US,USA,840,36005.0,Bronx,New York,US,40.8521,-73.8628,...,0,0,0,0,0,0,0,0,0,0
1856,84036047,US,USA,840,36047.0,Kings,New York,US,40.6362,-73.9494,...,0,0,0,0,0,0,0,0,0,0
1863,84036061,US,USA,840,36061.0,New York,New York,US,40.7673,-73.9715,...,18491,18925,19057,19067,19174,19626,19561,19818,19789,20056
1873,84036081,US,USA,840,36081.0,Queens,New York,US,40.7109,-73.8168,...,0,0,0,0,0,0,0,0,0,0
1875,84036085,US,USA,840,36085.0,Richmond,New York,US,40.5858,-74.1481,...,0,0,0,0,0,0,0,0,0,0


In [15]:
merged = pd.merge(demo, deaths, how='inner', on="FIPS", 
                  left_index=False, right_index=False)

In [16]:
merged.head()

Unnamed: 0.1,Unnamed: 0,household_size,empl_agriculture,empl_professional,empl_social,empl_services,empl_manufacturing,empl_retail,empl_transp_utilities,employed,...,5/2/20,5/3/20,5/4/20,5/5/20,5/6/20,5/7/20,5/8/20,5/9/20,5/10/20,5/11/20
0,"Morgan County, Alabama: Summary level: 050, st...",2.56,1.0792,11.1812,19.4094,8.3231,22.2135,10.2471,5.6548,53742,...,0,0,0,0,0,0,0,0,0,0
1,"Kings County, California: Summary level: 050, ...",3.15,14.8108,7.4102,21.6017,8.9412,7.1271,9.3059,4.4241,52644,...,1,1,1,1,1,1,1,1,1,1
2,"Monterey County, California: Summary level: 05...",3.31,15.99,10.0846,19.6731,10.8732,6.5126,8.9714,3.8882,190707,...,5,6,6,6,6,6,6,6,6,6
3,"Nevada County, California: Summary level: 050,...",2.37,1.3392,16.3689,20.6696,11.5335,3.5097,10.5988,6.7902,44505,...,1,1,1,1,1,1,1,1,1,1
4,"Shasta County, California: Summary level: 050,...",2.59,1.0668,9.3942,25.462,11.4847,4.4179,12.8545,5.008,69649,...,4,4,4,4,4,4,4,4,4,4


In [17]:
merged.describe()

Unnamed: 0,household_size,empl_agriculture,empl_professional,empl_social,empl_services,empl_manufacturing,empl_retail,empl_transp_utilities,employed,prc_fam_poverty,...,5/2/20,5/3/20,5/4/20,5/5/20,5/6/20,5/7/20,5/8/20,5/9/20,5/10/20,5/11/20
count,827.0,827.0,827.0,827.0,827.0,827.0,827.0,827.0,827.0,827.0,...,827.0,827.0,827.0,827.0,827.0,827.0,827.0,827.0,827.0,827.0
mean,2.5905,1.9294,10.1016,23.9089,9.6716,11.0223,11.5296,5.3078,163970.0,9.0312,...,76.0713,77.5369,78.8271,81.2648,83.8888,86.3567,88.0556,89.8609,90.6904,91.9528
std,0.2503,2.7136,3.5975,4.7552,2.7863,5.855,2.02,1.913,293100.0,4.2493,...,664.3899,679.6519,684.8331,686.607,692.3093,708.7927,707.7874,717.5306,717.3804,726.9738
min,1.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17763.0,1.3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.41,0.5405,7.6825,20.9051,7.968,6.7159,10.2838,3.9946,44176.0,5.9,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
50%,2.55,1.0454,9.5856,23.2224,9.3013,10.0312,11.4295,5.0199,75494.0,8.4,...,8.0,8.0,8.0,8.0,9.0,9.0,10.0,10.0,10.0,10.0
75%,2.71,2.2877,11.926,26.3873,10.8736,14.3158,12.6416,6.3624,168630.0,11.4,...,26.5,27.0,28.0,28.0,30.0,31.0,31.5,33.0,34.0,34.0
max,4.11,25.7236,30.1044,46.2286,31.9878,43.88,21.3085,17.1676,5001400.0,29.4,...,18491.0,18925.0,19057.0,19067.0,19174.0,19626.0,19561.0,19818.0,19789.0,20056.0


In [18]:
merged.columns.values

array(['Unnamed: 0', 'household_size', 'empl_agriculture',
       'empl_professional', 'empl_social', 'empl_services',
       'empl_manufacturing', 'empl_retail', 'empl_transp_utilities',
       'employed', 'prc_fam_poverty', 'avg_income', 'prc_public_transp',
       'population', 'pop_65_plus', 'health_ins', 'aa_and_mixed', 'aa',
       'latino', 'immigrant', 'county', 'state', 'FIPS', 'area',
       'prc_obese', 'nh_weighted_health_score', 'nh_num_beds',
       'nh_prc_occupied', 'nh_nurse_hours', 'nh_total_fines',
       'nh_overall_rating', 'incarcerated', 'domestic_passengers',
       'intl_passengers', 'order started', 'UID', 'iso2', 'iso3', 'code3',
       'Admin2', 'Province_State', 'Country_Region', 'Lat', 'Long_',
       'Combined_Key', 'Population', '1/22/20', '1/23/20', '1/24/20',
       '1/25/20', '1/26/20', '1/27/20', '1/28/20', '1/29/20', '1/30/20',
       '1/31/20', '2/1/20', '2/2/20', '2/3/20', '2/4/20', '2/5/20',
       '2/6/20', '2/7/20', '2/8/20', '2/9/20', '2/10/20

In [19]:
merged.shape

(827, 157)

In [20]:
merged[["population", "Population"]] 
# ACS population and that used by JH data very close, though not exactly the same
# just use ACS population for consistency
# ultimately drop columns 'UID', 'iso2', 'iso3', 'code3', 'Admin2', 'Province_State',
#        'Country_Region', 'Lat', 'Long_', 'Combined_Key', 'Population', "Unnamed: 0"
# and drop/ignore for model "county", "state", "FIPS"

Unnamed: 0,population,Population
0,119089,119679
1,151366,152940
2,435594,434061
3,99696,99755
4,180040,180080
...,...,...
822,814901,822083
823,85129,84769
824,948201,945726
825,187365,187885


In [21]:
merged = merged.drop(['UID', 'iso2', 'iso3', 'code3', 'Admin2', 'Province_State','Country_Region', 
                      'Lat', 'Long_', 'Combined_Key', 'Population', "Unnamed: 0"], axis=1)

In [22]:
confirmed[["3/24/20", "4/1/20", "4/11/20", "4/23/20", "5/11/20"]].describe() 
# in first pass, found that number of ases 2 weeks prior was almost perfectly correlated with 
# number of deaths, so will just use the count of confirmed cases 4 weeks prior.  I would
# have used earlier data, too, if I could get the break-down by county for NYC

Unnamed: 0,3/24/20,4/1/20,4/11/20,4/23/20,5/11/20
count,3261.0,3261.0,3261.0,3261.0,3261.0
mean,16.4784,65.3916,161.4216,266.5348,413.3336
std,280.9777,907.7799,1924.1117,2888.5484,3804.3598
min,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,1.0,2.0,4.0
50%,0.0,2.0,7.0,12.0,23.0
75%,2.0,11.0,31.0,59.0,107.0
max,14904.0,47439.0,98308.0,145855.0,185357.0


In [23]:
# add the date of 10+ confirmed deaths as a column
confirmed = confirmed[["FIPS", "3/23/20", "4/1/20", "4/11/20", "4/23/20", "4/25/20", "5/11/20", "ten plus cases"]]
confirmed.columns = ["FIPS", "cases_march24", "cases_april1", "cases_april11", "cases_april23", "cases_april25", "cases_may11", "ten plus cases"]

In [24]:
merged = pd.merge(merged, confirmed, how='inner', on="FIPS",
                 left_index=False, right_index=False)

In [25]:
merged.head()

Unnamed: 0,household_size,empl_agriculture,empl_professional,empl_social,empl_services,empl_manufacturing,empl_retail,empl_transp_utilities,employed,prc_fam_poverty,...,5/9/20,5/10/20,5/11/20,cases_march24,cases_april1,cases_april11,cases_april23,cases_april25,cases_may11,ten plus cases
0,2.56,1.0792,11.1812,19.4094,8.3231,22.2135,10.2471,5.6548,53742,9.9,...,0,0,0,0,19,37,50,55,95,3/28/20
1,3.15,14.8108,7.4102,21.6017,8.9412,7.1271,9.3059,4.4241,52644,15.6,...,1,1,1,0,4,9,35,47,319,4/12/20
2,3.31,15.99,10.0846,19.6731,10.8732,6.5126,8.9714,3.8882,190707,10.5,...,6,6,6,14,42,79,154,169,279,3/21/20
3,2.37,1.3392,16.3689,20.6696,11.5335,3.5097,10.5988,6.7902,44505,5.1,...,1,1,1,2,26,34,36,37,41,3/28/20
4,2.59,1.0668,9.3942,25.462,11.4847,4.4179,12.8545,5.008,69649,9.5,...,4,4,4,2,7,24,28,30,32,4/3/20


In [26]:
merged.shape

(827, 152)

#### Manually Update Numbers for NYC Counties

Combining the data for the 5 NYC counties since infections and data is only being reported through New York County instead of each county (New York County, Bronx County, Kings County, Queens County, Richmond County) individually

In [27]:
merged[(merged["state"] == "NY") & ((merged["county"] == "Bronx County") | 
                                   (merged["county"] == "Kings County") |
                                   (merged["county"] == "New York County") | 
                                   (merged["county"] == "Queens County") |
                                   (merged["county"] == "Richmond County"))][["FIPS", "county", "state",
                                                                              "ten plus cases", "3/24/20",
                                                                              "4/1/20", "4/11/20", "4/23/20", 
                                                                              "4/25/20", "5/11/20", 
                                                                              "cases_march24", "cases_april1", 
                                                                              "cases_april11", "cases_april23", 
                                                                              "cases_april25", "cases_may11"]]

Unnamed: 0,FIPS,county,state,ten plus cases,3/24/20,4/1/20,4/11/20,4/23/20,4/25/20,5/11/20,cases_march24,cases_april1,cases_april11,cases_april23,cases_april25,cases_may11
284,36005.0,Bronx County,NY,5/11/20,0,0,0,0,0,0,0,0,0,0,0,0
489,36061.0,New York County,NY,3/6/20,433,3115,10238,16158,17126,20056,12305,47439,98308,145855,155113,185357
588,36047.0,Kings County,NY,5/11/20,0,0,0,0,0,0,0,0,0,0,0,0
688,36081.0,Queens County,NY,5/11/20,0,0,0,0,0,0,0,0,0,0,0,0
689,36085.0,Richmond County,NY,5/11/20,0,0,0,0,0,0,0,0,0,0,0,0


In [28]:
# update so all have same "ten plus cases" of 3/6/20
for a in [284, 588, 688, 689]:
    merged.at[a,'ten plus cases'] = "3/6/20"

In [29]:
merged[(merged["state"] == "NY") & ((merged["county"] == "Bronx County") | 
                                   (merged["county"] == "Kings County") |
                                   (merged["county"] == "New York County") | 
                                   (merged["county"] == "Queens County") |
                                   (merged["county"] == "Richmond County"))][["FIPS", "county", "state",
                                                                              "ten plus cases", "3/24/20",
                                                                              "4/1/20", "4/11/20", "4/23/20", 
                                                                              "4/25/20", "5/11/20", 
                                                                              "cases_march24", "cases_april1", 
                                                                              "cases_april11", "cases_april23", 
                                                                              "cases_april25", "cases_may11"]]

Unnamed: 0,FIPS,county,state,ten plus cases,3/24/20,4/1/20,4/11/20,4/23/20,4/25/20,5/11/20,cases_march24,cases_april1,cases_april11,cases_april23,cases_april25,cases_may11
284,36005.0,Bronx County,NY,3/6/20,0,0,0,0,0,0,0,0,0,0,0,0
489,36061.0,New York County,NY,3/6/20,433,3115,10238,16158,17126,20056,12305,47439,98308,145855,155113,185357
588,36047.0,Kings County,NY,3/6/20,0,0,0,0,0,0,0,0,0,0,0,0
688,36081.0,Queens County,NY,3/6/20,0,0,0,0,0,0,0,0,0,0,0,0
689,36085.0,Richmond County,NY,3/6/20,0,0,0,0,0,0,0,0,0,0,0,0


In [30]:
# deaths for 4/23: https://www1.nyc.gov/assets/doh/downloads/pdf/imm/covid-19-daily-data-summary-deaths-04242020-1.pdf
merged.at[284, "4/23/20"] = 2342 
merged.at[489, "4/23/20"] = 1390
merged.at[588, "4/23/20"] = 3190
merged.at[688, "4/23/20"] = 3304
merged.at[689, "4/23/20"] = 515

In [31]:
# deaths for 4/1: https://www1.nyc.gov/assets/doh/downloads/pdf/imm/covid-19-daily-data-summary-deaths-04022020-1.pdf
merged.at[284, "4/1/20"] = 382
merged.at[489, "4/1/20"] = 165
merged.at[588, "4/1/20"] = 328
merged.at[688, "4/1/20"] = 448
merged.at[689, "4/1/20"] = 67

In [32]:
# deaths for 3/24: https://www1.nyc.gov/assets/doh/downloads/pdf/imm/covid-19-daily-data-summary-deaths-03252020-1.pdf
merged.at[284, "3/24/20"] = 43
merged.at[489, "3/24/20"] = 30
merged.at[588, "3/24/20"] = 43
merged.at[688, "3/24/20"] = 65
merged.at[689, "3/24/20"] = 18

In [33]:
# deaths for 4/11: https://www1.nyc.gov/assets/doh/downloads/pdf/imm/covid-19-daily-data-summary-deaths-04122020-1.pdf
merged.at[284, "4/11/20"] = 1350
merged.at[489, "4/11/20"] = 664
merged.at[588, "4/11/20"] = 1620
merged.at[688, "4/11/20"] = 1849
merged.at[689, "4/11/20"] = 305

In [34]:
# deaths for 4/25: https://www1.nyc.gov/assets/doh/downloads/pdf/imm/covid-19-daily-data-summary-deaths-04262020-1.pdf
merged.at[284, "4/25/20"] = 2480
merged.at[489, "4/25/20"] = 1487
merged.at[588, "4/25/20"] = 3420
merged.at[688, "4/25/20"] = 3511
merged.at[689, "4/25/20"] = 556

In [35]:
# deaths for 5/11: https://www1.nyc.gov/assets/doh/downloads/pdf/imm/covid-19-daily-data-summary-deaths-05122020-1.pdf
merged.at[284, "5/11/20"] = 3247
merged.at[489, "5/11/20"] = 2000
merged.at[588, "5/11/20"] = 4596
merged.at[688, "5/11/20"] = 4529
merged.at[689, "5/11/20"] = 721

In [36]:
merged[(merged["state"] == "NY") & ((merged["county"] == "Bronx County") | 
                                   (merged["county"] == "Kings County") |
                                   (merged["county"] == "New York County") | 
                                   (merged["county"] == "Queens County") |
                                   (merged["county"] == "Richmond County"))][["FIPS", "county", "state",
                                                                              "ten plus cases", "3/24/20",
                                                                              "4/1/20", "4/11/20", "4/23/20", 
                                                                              "4/25/20", "5/11/20", 
                                                                              "cases_march24", "cases_april1", 
                                                                              "cases_april11", "cases_april23", 
                                                                              "cases_april25", "cases_may11"]]

Unnamed: 0,FIPS,county,state,ten plus cases,3/24/20,4/1/20,4/11/20,4/23/20,4/25/20,5/11/20,cases_march24,cases_april1,cases_april11,cases_april23,cases_april25,cases_may11
284,36005.0,Bronx County,NY,3/6/20,43,382,1350,2342,2480,3247,0,0,0,0,0,0
489,36061.0,New York County,NY,3/6/20,30,165,664,1390,1487,2000,12305,47439,98308,145855,155113,185357
588,36047.0,Kings County,NY,3/6/20,43,328,1620,3190,3420,4596,0,0,0,0,0,0
688,36081.0,Queens County,NY,3/6/20,65,448,1849,3304,3511,4529,0,0,0,0,0,0
689,36085.0,Richmond County,NY,3/6/20,18,67,305,515,556,721,0,0,0,0,0,0


"cases_march24", "cases_april1", "cases_april11", 
"cases_april23",  "cases_april25", "cases_may11"

In [37]:
# cases march 24: https://www1.nyc.gov/assets/doh/downloads/pdf/imm/covid-19-daily-data-summary-03252020-1.pdf

merged.at[284, "cases_march24"] = 2789
merged.at[489, "cases_march24"] = 3187
merged.at[588, "cases_march24"] = 4656
merged.at[688, "cases_march24"] = 5066
merged.at[689, "cases_march24"] = 1084

In [38]:
# cases april 1: https://www1.nyc.gov/assets/doh/downloads/pdf/imm/covid-19-daily-data-summary-04022020-1.pdf

merged.at[284, "cases_april1"] = 9107
merged.at[489, "cases_april1"] = 7278
merged.at[588, "cases_april1"] = 12983
merged.at[688, "cases_april1"] = 16336
merged.at[689, "cases_april1"] = 2723

In [39]:
# cases april 23: https://www1.nyc.gov/assets/doh/downloads/pdf/imm/covid-19-daily-data-summary-04242020-1.pdf

merged.at[284, "cases_april23"] = 32862
merged.at[489, "cases_april23"] = 18252
merged.at[588, "cases_april23"] = 38727
merged.at[688, "cases_april23"] = 45313
merged.at[689, "cases_april23"] = 10917

In [40]:
# cases april 11: https://www1.nyc.gov/assets/doh/downloads/pdf/imm/covid-19-daily-data-summary-04122020-1.pdf

merged.at[284, "cases_april11"] = 22325
merged.at[489, "cases_april11"] = 13280
merged.at[588, "cases_april11"] = 25956
merged.at[688, "cases_april11"] = 32123
merged.at[689, "cases_april11"] = 7784

In [41]:
# cases april 25: https://www1.nyc.gov/assets/doh/downloads/pdf/imm/covid-19-daily-data-summary-04262020-1.pdf

merged.at[284, "cases_april25"] = 34711
merged.at[489, "cases_april25"] = 19046
merged.at[588, "cases_april25"] = 40593
merged.at[688, "cases_april25"] = 47511
merged.at[689, "cases_april25"] = 11275

In [42]:
# cases may 11: https://www1.nyc.gov/assets/doh/downloads/pdf/imm/covid-19-daily-data-summary-05122020-1.pdf

merged.at[284, "cases_may11"] = 41746
merged.at[489, "cases_may11"] = 22771
merged.at[588, "cases_may11"] = 50079
merged.at[688, "cases_may11"] = 56899
merged.at[689, "cases_may11"] = 12733

In [43]:
merged[(merged["state"] == "NY") & ((merged["county"] == "Bronx County") | 
                                   (merged["county"] == "Kings County") |
                                   (merged["county"] == "New York County") | 
                                   (merged["county"] == "Queens County") |
                                   (merged["county"] == "Richmond County"))][["FIPS", "county", "state",
                                                                              "ten plus cases", "3/24/20",
                                                                              "4/1/20", "4/11/20", "4/23/20", 
                                                                              "4/25/20", "5/11/20", 
                                                                              "cases_march24", "cases_april1", 
                                                                              "cases_april11", "cases_april23", 
                                                                              "cases_april25", "cases_may11"]]

Unnamed: 0,FIPS,county,state,ten plus cases,3/24/20,4/1/20,4/11/20,4/23/20,4/25/20,5/11/20,cases_march24,cases_april1,cases_april11,cases_april23,cases_april25,cases_may11
284,36005.0,Bronx County,NY,3/6/20,43,382,1350,2342,2480,3247,2789,9107,22325,32862,34711,41746
489,36061.0,New York County,NY,3/6/20,30,165,664,1390,1487,2000,3187,7278,13280,18252,19046,22771
588,36047.0,Kings County,NY,3/6/20,43,328,1620,3190,3420,4596,4656,12983,25956,38727,40593,50079
688,36081.0,Queens County,NY,3/6/20,65,448,1849,3304,3511,4529,5066,16336,32123,45313,47511,56899
689,36085.0,Richmond County,NY,3/6/20,18,67,305,515,556,721,1084,2723,7784,10917,11275,12733


remove all the unused date (death counts) columns

In [44]:
to_drop = ['1/22/20', '1/23/20','1/24/20', '1/25/20', '1/26/20', '1/27/20', '1/28/20', '1/29/20',
           '1/30/20', '1/31/20', '2/1/20', '2/2/20', '2/3/20', '2/4/20',
           '2/5/20', '2/6/20', '2/7/20', '2/8/20', '2/9/20', '2/10/20',
           '2/11/20', '2/12/20', '2/13/20', '2/14/20', '2/15/20', '2/16/20',
           '2/17/20', '2/18/20', '2/19/20', '2/20/20', '2/21/20', '2/22/20',
           '2/23/20', '2/24/20', '2/25/20', '2/26/20', '2/27/20', '2/28/20', 
           '2/29/20', '3/1/20', '3/2/20', '3/3/20', '3/4/20', '3/5/20',
           '3/6/20', '3/7/20', '3/8/20', '3/9/20', '3/10/20', '3/11/20',
           '3/12/20', '3/13/20', '3/14/20', '3/15/20', '3/16/20', '3/17/20',
           '3/18/20', '3/19/20', '3/20/20', '3/21/20', '3/22/20', '3/23/20', 
           '3/25/20', '3/26/20', '3/27/20', '3/28/20', '3/29/20',
           '3/30/20', '3/31/20', '4/2/20', '4/3/20', '4/4/20',
           '4/5/20', '4/6/20', '4/7/20', '4/8/20', '4/9/20', '4/10/20',
           '4/12/20', '4/13/20', '4/14/20', '4/15/20', '4/16/20',
           '4/17/20', '4/18/20', '4/19/20', '4/20/20', '4/21/20', '4/22/20', 
           '4/24/20', '4/26/20', '4/27/20', '4/28/20', '4/29/20', '4/30/20', 
           '5/1/20', '5/2/20', '5/3/20', '5/4/20', '5/5/20', '5/6/20', '5/7/20',
           '5/8/20', '5/9/20', '5/10/20']
merged.drop(to_drop, axis=1, inplace=True)

In [45]:
merged.columns.values

array(['household_size', 'empl_agriculture', 'empl_professional',
       'empl_social', 'empl_services', 'empl_manufacturing',
       'empl_retail', 'empl_transp_utilities', 'employed',
       'prc_fam_poverty', 'avg_income', 'prc_public_transp', 'population',
       'pop_65_plus', 'health_ins', 'aa_and_mixed', 'aa', 'latino',
       'immigrant', 'county', 'state', 'FIPS', 'area', 'prc_obese',
       'nh_weighted_health_score', 'nh_num_beds', 'nh_prc_occupied',
       'nh_nurse_hours', 'nh_total_fines', 'nh_overall_rating',
       'incarcerated', 'domestic_passengers', 'intl_passengers',
       'order started', '3/24/20', '4/1/20', '4/11/20', '4/23/20',
       '4/25/20', '5/11/20', 'cases_march24', 'cases_april1',
       'cases_april11', 'cases_april23', 'cases_april25', 'cases_may11',
       'ten plus cases'], dtype=object)

In [46]:
merged.to_csv("../combined_data.csv")