In [1]:
import pandas as pd
from datetime import date, timedelta
import json

# Render DataFrames in full: keep wide frames on one line and never truncate
# the columns or rows that are shown.
for option_name, option_value in (
    ('display.expand_frame_repr', False),
    ('display.max_columns', None),
    ('display.max_rows', None),
):
    pd.set_option(option_name, option_value)

In [13]:
# Per-state configuration: these are the variables you change depending on your data.
state = 'California'            # used in the printed summary and in the export file name
state_fips = '06'               # matched against STATEFP in the county file; a string, so the leading zero is kept
state_data_file = '../data/case-data/20200325-CA-cases-raw.csv'
data_date_format = '%m/%d/%Y'   # strptime format of the `date` column in state_data_file

today = str(date.today())
print("Today's date:", today)

# County lookup table restricted to the selected state. The code columns are
# read as strings so that leading zeros survive.
cnty_join = pd.read_csv('../data/tl_2019_us_county.csv',dtype={'STATEFP':str,'COUNTYFP':str,'GEOID':str})
cnty_join = cnty_join.loc[cnty_join['STATEFP'] == state_fips]
cnty_join = cnty_join[['STATEFP','COUNTYFP','GEOID','NAMELSAD']]

# `county_fips` is the column that is later concatenated into the join key, so
# it must be read as a string to keep its leading zero. The original `fips`
# entry is kept for files that use that column name instead.
case_data = pd.read_csv(state_data_file,dtype={'fips':str,'county_fips':str})

# Parse strictly: a date that does not match data_date_format raises here,
# instead of silently leaving the column as strings (which only fails later,
# at the first `.dt` access).
case_data['date'] = pd.to_datetime(case_data['date'], format=data_date_format)

# Statewide totals per date; the summary below reports the latest date only.
cases_by_day = case_data.groupby('date').agg({'cases':'sum','deaths':'sum'}).reset_index()
most_recent = cases_by_day.sort_values('date',ascending=False).iloc[0]

# NOTE(review): blank case/death cells sum to 0, so a latest date whose rows
# are not filled in yet reports 0 here - confirm the newest date is populated.
print('Total cases reported for',state,':',most_recent['cases'])
print('Total deaths reported for',state,':',most_recent['deaths'])
print('Most recent case confirmation date:',most_recent['date'])

Today's date: 2020-03-25
Total cases reported for California : 0.0
Total deaths reported for California : 0.0
Most recent case confirmation date: 2020-03-25 00:00:00


## County + day dataset

In addition to the cases per county per day loaded above, we need a row for every county on every day — including county/day combinations with no reported cases or deaths — so that each county has a complete, gap-free time series.

In [14]:
# Build one row per (calendar day, county) from covid_start through today,
# left-join the reported case rows onto that grid, zero-fill the gaps, and
# export the result as CSV.
covid_start = date(2020, 1, 21)   # start date
today = date.today()              # end date (rebinds `today` as a date object)
delta = today - covid_start       # as timedelta

# Every calendar day from covid_start through today, inclusive.
days = [covid_start + timedelta(days=i) for i in range(delta.days + 1)]

# Create the whole day x county grid in a single DataFrame construction.
# Appending one row at a time copies the frame on every iteration, and
# DataFrame.append no longer exists in pandas 2.x.
county_pairs = list(zip(cnty_join['GEOID'], cnty_join['NAMELSAD']))
fullDateTime = pd.DataFrame(
    [{'date': day, 'fips': geoid, 'county': county_name}
     for day in days
     for geoid, county_name in county_pairs],
    columns=['date','fips','county'],
)

# Join key is "<YYYY-MM-DD>-<county fips>" on both sides of the merge.
fullDateTime['date'] = pd.to_datetime(fullDateTime['date'])
fullDateTime['date_str'] = fullDateTime['date'].dt.strftime('%Y-%m-%d')
fullDateTime['join_field'] = fullDateTime['date_str'] + '-' + fullDateTime['fips']

# Work on a copy so the helper columns are not written back into case_data.
by_county = case_data.copy()
by_county['date_str'] = by_county['date'].dt.strftime('%Y-%m-%d')
by_county['join_field'] = by_county['date_str'] + '-' + by_county['county_fips']

# Left join keeps every grid row; county/days without a report come back as NaN.
fullDT_joined = fullDateTime.merge(by_county,how='left',on='join_field')

# Select the export columns first, then zero-fill only those, so unused merged
# columns (for example the right-hand date) are never touched by fillna.
# NOTE(review): this also writes 0 into local_pub_health_url when it is
# missing; kept as-is so the exported file is unchanged - confirm that is what
# the consumer of the CSV expects.
fullDT_joined = (
    fullDT_joined[['date_x','fips','county_x','cases','deaths',
                   'local_pub_health_url']]
    .fillna(0)
    .rename(columns={'date_x':'date','county_x':'county'})
)

fullDT_joined.to_csv('../app/assets/data/'+str(today)+'-'+state+'-export.csv',index=False)

In [17]:
# Statewide totals per calendar day: add up cases and deaths across all counties.
by_date = fullDT_joined.groupby('date')[['cases', 'deaths']].sum()
by_date

Unnamed: 0_level_0,cases,deaths
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-01-21,0.0,0.0
2020-01-22,0.0,0.0
2020-01-23,0.0,0.0
2020-01-24,0.0,0.0
2020-01-25,0.0,0.0
2020-01-26,2.0,0.0
2020-01-27,2.0,0.0
2020-01-28,2.0,0.0
2020-01-29,2.0,0.0
2020-01-30,2.0,0.0


In [15]:
# Show only the county/day rows that report at least one case.
has_cases = fullDT_joined['cases'].gt(0)
fullDT_joined[has_cases]

Unnamed: 0,date,fips,county,cases,deaths,local_pub_health_url
295,2020-01-26,6037,Los Angeles County,1.0,0.0,0
337,2020-01-26,6059,Orange County,1.0,0.0,0
353,2020-01-27,6037,Los Angeles County,1.0,0.0,0
395,2020-01-27,6059,Orange County,1.0,0.0,0
411,2020-01-28,6037,Los Angeles County,1.0,0.0,0
453,2020-01-28,6059,Orange County,1.0,0.0,0
469,2020-01-29,6037,Los Angeles County,1.0,0.0,0
511,2020-01-29,6059,Orange County,1.0,0.0,0
527,2020-01-30,6037,Los Angeles County,1.0,0.0,0
569,2020-01-30,6059,Orange County,1.0,0.0,0


In [None]:
# fullDT_joined has no POSITIVE column (its columns are date, fips, county,
# cases, deaths and local_pub_health_url), so attribute access on it raises
# AttributeError. `cases` is the case-count column in this frame.
fullDT_joined['cases'].sum()