In [1]:
import pandas as pd
from datetime import date, timedelta
import json

# Notebook-wide pandas display settings: never wrap wide frames across
# lines, and never truncate the columns or rows that get rendered.
_display_options = {
    'display.expand_frame_repr': False,
    'display.max_columns': None,
    'display.max_rows': None,
}
for _option_name, _option_value in _display_options.items():
    pd.set_option(_option_name, _option_value)

In [2]:
# Outcome strings (upper-case) that are counted as a death.
death_words = ['DEATH','DEAD','DIED']

def clean_outcome(row):
    """Collapse a case row's free-text outcome into 'DEATH' or 'POSITIVE'.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row or a dict)
        Must provide an 'outcome' entry.

    Returns
    -------
    str
        'DEATH' when the outcome, ignoring case and surrounding whitespace,
        is one of ``death_words``; 'POSITIVE' for everything else, including
        missing (NaN/None) or non-string outcomes.
    """
    outcome = row['outcome']
    # Normalize here so the result does not depend on the caller having
    # upper-cased / stripped the column beforehand.
    if isinstance(outcome, str) and outcome.strip().upper() in death_words:
        return 'DEATH'
    return 'POSITIVE'

In [3]:
# County lookup table; FIPS/GEOID codes are read as strings so that
# leading zeros are preserved.
cnty_join = pd.read_csv('../data/tl_2019_us_county.csv',dtype={'STATEFP':str,'COUNTYFP':str,'GEOID':str})
cnty_join = cnty_join[['STATEFP','COUNTYFP','GEOID','NAMELSAD']]

state = 'California'
state_fips = '06'
today = str(date.today())
print("Today's date:", today)

case_data = pd.read_csv('../data/case-data/outside_Hubei.data.19032020T011105.csv')
us_cases = case_data.loc[case_data['country'] == 'United States']
# .copy() makes state_cases an independent frame, so the column assignments
# below modify it directly instead of raising SettingWithCopyWarning.
state_cases = us_cases.loc[us_cases['province'] == state].copy()
# errors='coerce' turns unparseable dates into NaT so the column is always
# datetime64 (errors='ignore' would hand back the raw strings and break the
# later .dt accessors). NOTE: rows with NaT dates are excluded from the pivot.
state_cases['date_confirm_clean'] = pd.to_datetime(state_cases['date_confirmation'], format='%d.%m.%Y', errors='coerce')

state_cases['outcome'] = state_cases['outcome'].str.upper()
state_cases['outcome_clean'] = state_cases.apply(clean_outcome, axis=1)

#join county fips
cnty_join = cnty_join.loc[cnty_join['STATEFP'] == state_fips]
cnty_join = cnty_join[['NAMELSAD','COUNTYFP']]
cnty_join_dict = dict(zip(cnty_join.NAMELSAD, cnty_join.COUNTYFP))
# Rows whose 'city' is not an exact county name (NAMELSAD) get a NaN fips
# and therefore do not appear in the pivot below.
state_cases['fips'] = state_cases['city'].map(cnty_join_dict)

#pivot on county, agg deaths positive cases
by_county = pd.pivot_table(state_cases, values='ID', index=['province','fips','city','date_confirm_clean'],
                           columns=['outcome_clean'], aggfunc='count')

print('Total cases reported for',state,':',len(state_cases))
print('Total deaths reported for',state,':',len(state_cases.loc[state_cases['outcome_clean'] == 'DEATH']))
print('Most recent case confirmation date:',state_cases.date_confirm_clean.max())
display(by_county)

Today's date: 2020-03-22
Total cases reported for California : 471
Total deaths reported for California : 0
Most recent case confirmation date: 2020-03-16 00:00:00


  interactivity=interactivity, compiler=compiler, result=result)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,outcome_clean,POSITIVE
province,fips,city,date_confirm_clean,Unnamed: 4_level_1
California,1,Alameda County,2020-03-01,1
California,1,Alameda County,2020-03-06,1
California,1,Alameda County,2020-03-10,1
California,1,Alameda County,2020-03-12,2
California,1,Alameda County,2020-03-15,11
California,1,Alameda County,2020-03-16,3
California,9,Calaveras County,2020-03-10,2
California,13,Contra Costa County,2020-03-03,1
California,13,Contra Costa County,2020-03-06,6
California,13,Contra Costa County,2020-03-08,5


In [10]:
# Preview the first five cleaned case rows.
state_cases.head(n=5)

Unnamed: 0,ID,age,sex,city,province,country,wuhan(0)_not_wuhan(1),latitude,longitude,geo_resolution,date_onset_symptoms,date_admission_hospital,date_confirmation,symptoms,lives_in_Wuhan,travel_history_dates,travel_history_location,reported_market_exposure,additional_information,chronic_disease_binary,chronic_disease,source,sequence_available,outcome,date_death_or_discharge,notes_for_discussion,location,admin3,admin2,admin1,country_new,admin_id,data_moderator_initials,date_confirm_clean,outcome_clean,fips
644,677,50.0,male,Orange County,California,United States,1.0,33.70329,-117.761,admin2,,,26.01.2020,,,,Wuhan,,,,,https://www.cnbc.com/2020/01/26/third-us-case-...,,,,https://laist.com/2020/01/26/coronavirus-orang...,,,Orange County,California,United States,213,,2020-01-26,POSITIVE,59.0
1032,1065,,,Los Angeles,California,United States,1.0,34.05,-118.25,point,22.01.2020,22.01.2020,26.01.2020,,yes,,Wuhan,,"From Wuhan, went through LAX to a vacation (po...",,,https://www.latimes.com/california/story/2020-...,,,,Most details from listening to LA County Publi...,Los Angeles,,,California,United States,5,,2020-01-26,POSITIVE,
4750,4873,,male,Santa Clara County,California,United States,1.0,37.23166,-121.693,admin2,,,01.02.2020,,,24.01.2020,Wuhan,,,,,https://abc7news.com/health/bay-areas-1st-case...,,,,,,,Santa Clara County,California,United States,226,,2020-02-01,POSITIVE,85.0
4781,4904,,female,Santa Clara County,California,United States,1.0,37.23166,-121.693,admin2,,,02.02.2020,,,,Wuhan,,,,,https://www.cbsnews.com/live-updates/coronavir...,,,,,,,Santa Clara County,California,United States,226,,2020-02-02,POSITIVE,85.0
5306,5432,57.0,female,San Benito County,California,United States,1.0,36.6064,-121.074,admin2,,,02.02.2020,,no,,none,,husband and wife recent travle to china,,,https://bnonews.com/wp-content/uploads/2020/02...,,,,,,,San Benito County,California,United States,218,,2020-02-02,POSITIVE,69.0


## County + day dataset

In addition to the per-county, per-day case counts above, we also need a row for every county-day combination that has no cases or deaths, so that the time series is complete.

In [13]:
covid_start = date(2020, 1, 21)   # start date
today = date.today()   # end date
delta = today - covid_start       # as timedelta

# Every calendar day from covid_start through today, inclusive.
days = [covid_start + timedelta(days=i) for i in range(delta.days + 1)]

# One row per (day, county). The records are collected in a list and the
# frame is built once: DataFrame.append in a loop is quadratic and was
# removed in pandas 2.0.
county_day_records = [
    {'date': day, 'fips': fips, 'county': county}
    for day in days
    for fips, county in zip(cnty_join['COUNTYFP'], cnty_join['NAMELSAD'])
]
fullDateTime = pd.DataFrame(county_day_records, columns=['date','fips','county'])

# 'date' holds datetime.date objects, so no format/errors handling is needed.
fullDateTime['date_confirm_clean'] = pd.to_datetime(fullDateTime['date'])
fullDateTime['date_str'] = fullDateTime['date_confirm_clean'].dt.strftime('%Y-%m-%d')
# Composite "<YYYY-MM-DD>-<county fips>" key used to line up both frames.
fullDateTime['join_field'] = fullDateTime['date_str'] + '-' + fullDateTime['fips']

by_county_min = by_county.reset_index()
by_county_min['date_str'] = by_county_min['date_confirm_clean'].dt.strftime('%Y-%m-%d')
by_county_min['join_field'] = by_county_min['date_str'] + '-' + by_county_min['fips']

# 'fips' exists in both frames, so the merge suffixes the left one as 'fips_x'.
# The DEATH column only exists when at least one death was pivoted.
if 'DEATH' in by_county_min.columns:
    keep_cols = ['date', 'fips_x', 'county','POSITIVE','DEATH']
else:
    keep_cols = ['date', 'fips_x', 'county','POSITIVE']

fullDT_joined = fullDateTime.merge(by_county_min,how='left',on='join_field')
# Select the export columns first so that fillna(0) only touches the count
# columns (county-days without a match become 0) rather than also writing 0
# into the unmatched datetime/string columns of the merged frame.
fullDT_joined = fullDT_joined[keep_cols]
fullDT_joined = fullDT_joined.fillna(0)

fullDT_joined.to_csv('../app/assets/data/'+str(today)+'-'+state+'-export.csv',index=False)

In [None]:
# Show only the county-day rows that recorded at least one positive case.
has_positive = fullDT_joined['POSITIVE'] > 0
fullDT_joined[has_positive]

In [14]:
# Total positive cases across all county-day rows of the exported frame.
# NOTE(review): this can be lower than len(state_cases), presumably because
# cases whose 'city' did not map to a county FIPS never reach the pivot - confirm.
fullDT_joined['POSITIVE'].sum()

438.0