In [1]:
import pandas as pd
from datetime import date, timedelta
import json

# Notebook display settings: keep wide frames on a single line and lift the
# column/row truncation limits so frames are rendered in full.
pd.options.display.expand_frame_repr = False
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [2]:
# Run parameters: the state being processed and the date stamp used for the export.
state = 'Ohio'
state_fips = '39'
today = str(date.today())
print("Today's date:", today)

# County lookup (county name -> GEOID) restricted to the selected state.
# FIPS-style codes are read as strings so leading zeros survive.
cnty_join = pd.read_csv('../data/tl_2019_us_county.csv',dtype={'STATEFP':str,'COUNTYFP':str,'GEOID':str})
# .copy() gives an independent frame, so the column assignment below does not
# write into a slice of the full table (which raises SettingWithCopyWarning).
cnty_join = cnty_join.loc[cnty_join['STATEFP'] == state_fips, ['STATEFP','GEOID','NAMELSAD']].copy()

# Strip the " County" suffix so names line up with the 'County' column of the
# case file. regex=False makes the literal replacement explicit instead of
# relying on the pandas-version-dependent default.
cnty_join['NAMELSAD'] = cnty_join['NAMELSAD'].str.replace(' County', '', regex=False)

case_data = pd.read_csv('../../data/case-data/oh/20200422-ohio-COVIDSummaryData.csv',thousands=',')
# Derive the working frame without touching case_data, so the raw file contents
# stay available unchanged. Values in 'Onset Date' that do not parse as m/d/Y
# become NaT (errors='coerce'); those rows are left out of the per-day groupby
# below and therefore do not contribute to the printed totals.
state_cases = case_data.assign(
    date=pd.to_datetime(case_data['Onset Date'], format='%m/%d/%Y', errors='coerce')
)
# Attach GEOID by county name. validate='many_to_one' fails loudly if a county
# name appears more than once in the lookup, which would otherwise silently
# duplicate case rows and inflate the counts.
state_cases = state_cases.merge(
    cnty_join,
    how='left',
    left_on='County',
    right_on='NAMELSAD',
    validate='many_to_one',
)

# Statewide totals per onset date.
cases_by_day = state_cases.groupby('date').agg({'Case Count':'sum','Death Count':'sum'}).reset_index()
most_recent = cases_by_day.sort_values('date',ascending=False).iloc[0]
most_recent_date = most_recent['date']

print('Total cases reported for',state,':',cases_by_day['Case Count'].sum())
print('Total deaths reported for',state,':',cases_by_day['Death Count'].sum())
# NOTE(review): the label says "case confirmation date" but the value is the
# latest 'Onset Date' in the file -- confirm the intended wording.
print('Most recent case confirmation date:',most_recent_date)
display(cases_by_day)

Today's date: 2020-04-22
Total cases reported for Ohio : 14117
Total deaths reported for Ohio : 610
Most recent case confirmation date: 2020-04-21 00:00:00


Unnamed: 0,date,Case Count,Death Count
0,2020-02-12,1,0
1,2020-02-15,2,0
2,2020-02-16,2,0
3,2020-02-19,1,0
4,2020-02-20,1,0
5,2020-02-21,2,0
6,2020-02-23,1,0
7,2020-02-24,1,0
8,2020-02-25,4,0
9,2020-02-26,2,0


## County + day dataset

The table above only has rows for dates on which cases were recorded. To build a complete time series, we also need a row for every county on every day, including the county-days that have no cases or deaths.

In [3]:
# Calendar for the export: one entry per day from the first tracked day
# through the most recent onset date, inclusive.
covid_start = date(2020, 1, 21)   # start date
delta = most_recent_date.date() - covid_start       # as timedelta
days = [covid_start + timedelta(days=i) for i in range(delta.days + 1)]

# Every (day, county) pair, ordered by day and then by county. The rows are
# collected first and the frame is built once: calling DataFrame.append per row
# copied the whole frame on every iteration, and the method was removed in
# pandas 2.0.
county_keys = list(zip(cnty_join['GEOID'], cnty_join['NAMELSAD']))
fullDateTime = pd.DataFrame(
    [(day, fips, county) for day in days for fips, county in county_keys],
    columns=['date', 'fips', 'county'],
)
# Convert the datetime.date objects to pandas timestamps so the column can be
# matched against the datetime 'date' column of the case data.
fullDateTime['date'] = pd.to_datetime(fullDateTime['date'])

# New cases and deaths per county per onset date.
by_county = (
    state_cases
    .groupby(['GEOID', 'date'])
    .agg({'Case Count': 'sum', 'Death Count': 'sum'})
    .reset_index()
)

# Left-join the counts onto the full grid using the real key columns (date and
# county FIPS) rather than a concatenated string key. County-days without a
# match get NaN counts, which are turned into 0 below.
fullDT_joined = fullDateTime.merge(
    by_county.rename(columns={'GEOID': 'fips'}),
    how='left',
    on=['date', 'fips'],
)
count_cols = ['Case Count', 'Death Count']
fullDT_joined[count_cols] = fullDT_joined[count_cols].fillna(0)

# The running totals must be computed after the merge onto every day and every
# county, with each county's rows in date order, so that days without new
# cases carry the previous total forward.
fullDT_joined = fullDT_joined.sort_values(['fips', 'date'])
running_totals = fullDT_joined.groupby('fips')[count_cols].cumsum()
fullDT_joined['cases'] = running_totals['Case Count']
fullDT_joined['deaths'] = running_totals['Death Count']

fullDT_joined = fullDT_joined[['date', 'fips', 'county', 'cases', 'deaths']]

# Archive the cumulative county/day table, stamped with today's date.
fullDT_joined.to_csv('../app/ohio/data/archive/'+str(today)+'-'+state+'-export.csv',index=False)

In [4]:
# Sanity check: sum the cumulative 'cases' column over all counties on the
# most recent date.
on_latest_date = fullDT_joined['date'] == most_recent_date
fullDT_joined.loc[on_latest_date, 'cases'].sum()

14117.0

In [5]:
# Sanity check: sum the cumulative 'deaths' column over all counties on the
# most recent date.
on_latest_date = fullDT_joined['date'] == most_recent_date
fullDT_joined.loc[on_latest_date, 'deaths'].sum()

610.0