In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from collections import Counter
from datetime import timedelta

# Never truncate columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

In [2]:
# Johns Hopkins CSSE COVID-19 US time series: confirmed cases (`url`) and deaths (`url2`).
url = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_US.csv'
url2 = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_US.csv'

# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0.
# `on_bad_lines='warn'` (pandas >= 1.3) keeps the same behaviour the old flag had
# with its default `warn_bad_lines=True`: malformed rows are skipped with a warning.
df_usconf = pd.read_csv(url, on_bad_lines='warn')
df_usdead = pd.read_csv(url2, on_bad_lines='warn')

In [3]:
# State-level daily testing data served by the covidtracking.com API.
url = 'https://covidtracking.com/api/v1/states/daily.csv'

# `error_bad_lines=False` was removed in pandas 2.0; `on_bad_lines='warn'` (pandas >= 1.3)
# keeps the old skip-and-warn handling of malformed rows.
df_testing = pd.read_csv(url, on_bad_lines='warn')
df_testing = df_testing.rename(columns={'date': 'Date', 'state': 'Province_State'})

# Filling missing starting values: on CA's 2020-03-04 row the daily-increase columns are
# set to that day's cumulative totals. May want to do this as a function for all states.
# 'Date' is still the raw integer (YYYYMMDD) here, hence the integer comparison.
ca_first_day = (df_testing['Province_State'] == 'CA') & (df_testing['Date'] == 20200304)
if ca_first_day.any():  # nothing to back-fill if the feed no longer contains that row
    df_testing.loc[
        ca_first_day,
        ['positiveIncrease', 'negativeIncrease', 'totalTestResultsIncrease'],
    ] = df_testing.loc[ca_first_day, ['positive', 'negative', 'totalTestResults']].values.tolist()

# Parse the integer YYYYMMDD dates with an explicit format instead of relying on inference.
df_testing['Date'] = pd.to_datetime(df_testing['Date'].astype(str), format='%Y%m%d')

# California-only slice of the metrics used downstream.
ca_testing_columns = [
    'Date', 'Province_State',
    'positive', 'negative',
    'hospitalizedCurrently', 'inIcuCurrently',
    'death',
    'positiveIncrease', 'negativeIncrease',
    'totalTestResults', 'totalTestResultsIncrease',
    'deathIncrease',
]
ca_testing = df_testing.loc[df_testing['Province_State'] == 'CA', ca_testing_columns].copy()

# Use the full state name so the frame can be joined on 'Province_State' with data
# that identifies the state as 'California'.
ca_testing['Province_State'] = 'California'
ca_testing = ca_testing.sort_values(by=['Province_State', 'Date']).reset_index(drop=True)

In [4]:
# Positional layout this cell relies on: value columns start at position 11 in both
# JHU frames. In the confirmed frame those are all dates; in the deaths frame position
# 11 is 'Population' and the dates start at position 12.
JHU_FIRST_VALUE_COL = 11
JHU_CONF_FIRST_DATE_COL = 11
JHU_DEAD_FIRST_DATE_COL = 12


def _daily_by_state(cumulative, first_date_col, value_name):
    """Turn a wide cumulative JHU frame into long-format daily totals per state.

    Parameters
    ----------
    cumulative : pandas.DataFrame
        Wide frame with a 'Province_State' column and cumulative counts in every
        column from position ``first_date_col`` onward. It is not modified.
    first_date_col : int
        Position of the first date column.
    value_name : str
        Name of the column holding the daily values in the result.

    Returns
    -------
    pandas.DataFrame
        Columns: 'Province_State', any summed non-date value columns (e.g.
        'Population'), 'Date' (the original column label, still a string) and
        ``value_name``. Rows are sorted by 'Province_State' then 'Date'; because
        'Date' is a string here, that ordering is lexical, not chronological.
    """
    daily = cumulative.copy()
    date_cols = daily.columns[first_date_col:].to_list()

    increments = daily[date_cols].diff(axis=1)
    # diff() leaves the first date as NaN. Seed it with that day's cumulative count so
    # the daily values add up to the cumulative total (filling it with 0 would drop
    # whatever had already been reported on the first date).
    increments[date_cols[0]] = daily[date_cols[0]]
    daily[date_cols] = increments.fillna(0).astype('int')

    # Sum every value column (dates, plus 'Population' when present) per state.
    value_cols = daily.columns[JHU_FIRST_VALUE_COL:].to_list()
    by_state = daily.groupby(['Province_State'])[value_cols].agg('sum').reset_index()

    # Identifier columns after aggregation: 'Province_State' plus any non-date value
    # columns that precede the first date.
    id_cols = by_state.columns[:1 + first_date_col - JHU_FIRST_VALUE_COL].to_list()
    return (
        by_state
        .melt(id_vars=id_cols, var_name='Date', value_name=value_name)
        .sort_values(by=['Province_State', 'Date'])
        .reset_index(drop=True)
    )


us_conf_daily = _daily_by_state(df_usconf, JHU_CONF_FIRST_DATE_COL, 'Daily Confirmed Cases')
us_dead_daily = _daily_by_state(df_usdead, JHU_DEAD_FIRST_DATE_COL, 'Daily Dead')

# Attach the state population (only present in the deaths frame) to each state/date.
# suffixes=(False, False) makes the merge raise if any non-key column name overlaps.
us_daily = us_conf_daily.merge(
    us_dead_daily[['Province_State', 'Population', 'Date']],
    on=['Province_State', 'Date'],
    how='left',
    suffixes=(False, False),
)

# Only state, population and date are kept; the daily counts stay available in
# `us_conf_daily` / `us_dead_daily`.
us_daily = (
    us_daily[['Province_State', 'Population', 'Date']]
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .sort_values(by=['Province_State', 'Date'])
    .reset_index(drop=True)
)

In [5]:
# The JHU daily confirmed counts and the CDHP figures will not agree exactly.
# (NOTE(review): "CDHP" is kept from the original note; presumably CDPH, TODO confirm.)
# It is probably best to take every CA metric from CDHP and only the population from JHU.
is_california = us_daily['Province_State'] == 'California'
ca_daily = us_daily.loc[is_california].copy().reset_index(drop=True)

# Testing columns brought in from `ca_testing`, in the order they appear in the result.
ca_testing_fields = [
    'totalTestResultsIncrease',
    'positiveIncrease',
    'negativeIncrease',
    'totalTestResults',
    'positive',
    'negative',
    'hospitalizedCurrently',
    'inIcuCurrently',
]

# Left join keeps every JHU date; dates with no testing row get NaN in the testing columns.
ca_daily = ca_daily.merge(
    ca_testing[['Province_State', 'Date'] + ca_testing_fields],
    on=['Province_State', 'Date'],
    how='left',
    suffixes=(False, False),
)

In [9]:
# Write the merged California frame to disk (the index is written as the first column).
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
ca_daily_csv_path = r'X:\AC\Documents\Datasets\CA_daily_pivot.csv'
ca_daily.to_csv(ca_daily_csv_path)

# Data Exploration

In [14]:
# Inspect rows 70-99 (by position) of the merged California frame.
ca_daily.iloc[70:100]

Unnamed: 0,Province_State,Population,Date,totalTestResultsIncrease,positiveIncrease,negativeIncrease,totalTestResults,positive,negative,hospitalizedCurrently,inIcuCurrently
70,California,39512223,2020-04-01,673.0,673.0,0.0,29927.0,8155.0,21772.0,1855.0,774.0
71,California,39512223,2020-04-02,3073.0,1036.0,2037.0,33000.0,9191.0,23809.0,1922.0,816.0
72,California,39512223,2020-04-03,2300.0,1510.0,790.0,35300.0,10701.0,24599.0,2188.0,901.0
73,California,39512223,2020-04-04,78400.0,1325.0,77075.0,113700.0,12026.0,101674.0,2300.0,1008.0
74,California,39512223,2020-04-05,2833.0,1412.0,1421.0,116533.0,13438.0,103095.0,2398.0,1040.0
75,California,39512223,2020-04-06,898.0,898.0,0.0,117431.0,14336.0,103095.0,2509.0,1085.0
76,California,39512223,2020-04-07,13798.0,1529.0,12269.0,131229.0,15865.0,115364.0,2611.0,1108.0
77,California,39512223,2020-04-08,13035.0,1092.0,11943.0,144264.0,16957.0,127307.0,2714.0,1154.0
78,California,39512223,2020-04-09,19236.0,1352.0,17884.0,163500.0,18309.0,145191.0,2825.0,1132.0
79,California,39512223,2020-04-10,1363.0,1163.0,200.0,164863.0,19472.0,145391.0,2897.0,1145.0


In [21]:
# Number of dates with no value for the daily test-result increase.
ca_daily['totalTestResultsIncrease'].isnull().sum()

42

In [6]:
# Headline California statistics from the testing feed: the most recent report plus
# totals over the 14 most recent calendar days.
ca_rows = df_testing.loc[df_testing['Province_State'] == 'CA']

latest_date = ca_rows['Date'].max()
# The latest date and the 13 days before it.
last14_dates = [latest_date - timedelta(days=x) for x in range(14)]

most_recent = ca_rows.loc[ca_rows['Date'] == latest_date]
last14 = ca_rows.loc[ca_rows['Date'].isin(last14_dates)]

current_hosp = int(most_recent['hospitalizedCurrently'].iloc[0])
perc_pos = (most_recent['positive'] / most_recent['totalTestResults']).iloc[0] * 100
total_tests = most_recent['totalTestResults'].iloc[0]
# Positional access: take the first row regardless of how the index is labelled.
population = ca_daily['Population'].iloc[0]
testsper1000 = total_tests / population * 1000

last14tests = last14['totalTestResultsIncrease'].sum()
last14perc_pos = last14['positiveIncrease'].sum() / last14tests * 100

# `pd.datetime` was removed in pandas 2.0; format the Timestamp directly instead.
print('California Stats as of: ', latest_date.strftime('%m/%d/%y'))
print('Total population in CA: %d' %population)
print('\n')
print('Total tests conducted: %d' %total_tests)
print('Total tests conducted per 1000 people in CA: %.2f' %testsper1000)
print('Percentage of total tests confirmed positive: %.2f' %perc_pos,'%')
print('Percentage of total tests confirmed negative: %.2f' %(100 - perc_pos),'%')
print(f'Currently hospitalized: {current_hosp}')
print('\n')
print('Last 14 days:')
print('Total tests conducted in last 14 days: %d' %last14tests)
print('Percentage of tests in last 14 days confirmed positive: %.2f' %last14perc_pos,'%')

California Stats as of:  07/14/20
Total population in CA: 39512223


Total tests conducted: 5674955
Total tests conducted per 1000 people in CA: 143.63
Percentage of total tests confirmed positive: 5.93 %
Percentage of total tests confirmed negative: 94.07 %
Currently hospitalized: 8145


Last 14 days:
Total tests conducted in last 14 days: 1507816
Percentage of tests in last 14 days confirmed positive: 7.53 %


# Appendix

In [None]:
#apply date format and convert to string
def format_dates(dates, fmt='%m/%d/%y'):
    """Format a datetime Series as strings.

    Runnable replacement for the original dangling ``.apply(...)`` fragment, which
    was not a valid statement on its own and relied on ``pd.datetime`` (removed in
    pandas 2.0).

    Parameters
    ----------
    dates : pandas.Series
        Series of datetime64 values.
    fmt : str, default '%m/%d/%y'
        ``strftime`` format applied to every element.

    Returns
    -------
    pandas.Series
        The formatted date strings, with the same index as ``dates``.
    """
    return dates.dt.strftime(fmt)