In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from collections import Counter
from datetime import timedelta

pd.options.display.max_columns = None

#data dictionaries
import state_abbreviations
import statepop_2019est

In [2]:
# Johns Hopkins CSSE US time series: confirmed cases (url) and deaths (url2).
url = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_US.csv'
url2 = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_US.csv'
# Per-state daily testing CSV.
url3 = 'https://covidtracking.com/api/v1/states/daily.csv'

# error_bad_lines=False: malformed rows are skipped instead of raising.
df_usconf, df_usdead = (pd.read_csv(source, error_bad_lines=False) for source in (url, url2))

# Rename the key columns so they match the JHU frames.
df_testing = pd.read_csv(url3, error_bad_lines=False).rename(
    columns={'date': 'Date', 'state': 'Province_State'}
)

# Missing counts are treated as zero so the cumulative checks further down can compare them.
count_cols = [
    'totalTestResultsIncrease', 'positiveIncrease', 'negativeIncrease',
    'totalTestResults', 'positive', 'negative',
]
df_testing[count_cols] = df_testing[count_cols].fillna(0)

In [3]:
# Daily-increment columns and, position by position, the cumulative column each should sum to.
to_fix = ['totalTestResultsIncrease','positiveIncrease','negativeIncrease']
actual = ['totalTestResults','positive','negative']
# Kept as a module-level name for other cells; fill_func itself no longer needs it.
columns = df_testing.columns
def fill_func(states):
    """Back-fill the first misaligned daily increment for each state, in place.

    For every state in ``states`` and every (increment, cumulative) pair from
    ``to_fix`` / ``actual``, the state's rows are sorted by ``Date`` and the
    running sum of the increment column is compared with the cumulative column.
    If they differ anywhere, the increment on the earliest differing row is
    overwritten with that row's cumulative value.

    After review, the testing df is only missing the first daily increments, so
    a single fill per column is expected to be enough. This function does not
    re-check afterwards.

    Parameters
    ----------
    states : iterable
        Values of ``df_testing['Province_State']`` to process.

    Returns
    -------
    None
        ``df_testing`` (module-level) is mutated.

    Notes
    -----
    The row to patch is addressed by index *label* via ``.loc``. The previous
    version fed that label to ``.iloc`` (positional), which is only correct
    while ``df_testing`` carries a default 0..n-1 index.
    """
    for state in states:
        # Sorting keeps the original index labels, so the two series below align row for row.
        state_rows = df_testing.loc[df_testing['Province_State'] == state].sort_values(by='Date')
        for increment_col, cumulative_col in zip(to_fix, actual):
            running_total = state_rows[increment_col].cumsum()
            mismatched = running_total.ne(state_rows[cumulative_col])
            if mismatched.any():
                # First mismatch in date order; its index label identifies the row in df_testing.
                earliest_label = mismatched[mismatched].index[0]
                df_testing.loc[earliest_label, increment_col] = df_testing.loc[earliest_label, cumulative_col]

In [4]:
# Patch the first-day increments for every state value present in the feed.
state_codes = df_testing['Province_State'].unique()
fill_func(state_codes)

# Swap state abbreviations for full state names.
df_testing = df_testing.replace({'Province_State': state_abbreviations.state_abbrev})

# Attach the 2019 population estimate that matches each state name.
population_lookup = statepop_2019est.statepop_2019est
df_testing['Population'] = df_testing['Province_State'].map(population_lookup)

# Parse Date via its string form.
date_text = df_testing['Date'].astype(dtype = 'str')
df_testing['Date'] = pd.to_datetime(date_text)

In [5]:
# California-only slice with the hospital / death columns used later.
ca_columns = [
    'Date', 'Province_State', 'positive', 'negative',
    'hospitalizedCurrently', 'inIcuCurrently', 'death',
    'positiveIncrease', 'negativeIncrease',
    'totalTestResults', 'totalTestResultsIncrease', 'deathIncrease',
]
california_rows = df_testing['Province_State'] == 'California'
ca_testing = df_testing.loc[california_rows, ca_columns].copy().reset_index(drop=True)
ca_testing['Province_State'] = 'California'
ca_testing = ca_testing.sort_values(by=['Province_State', 'Date']).reset_index(drop=True)

# All-state slice limited to the testing columns needed for the per-capita features.
us_columns = [
    'Province_State', 'Date', 'Population',
    'positiveIncrease', 'totalTestResultsIncrease',
    'totalTestResults', 'positive', 'negative',
]
us_testing = df_testing[us_columns].copy()

In [6]:
#Feature creation
# Cumulative tests per 1000 residents and cumulative share of tests that were positive (in %).
us_testing['Tests per 1000 Capita'] = us_testing['totalTestResults']/us_testing['Population']*1000
us_testing['Total Positivity Rate'] = us_testing['positive']/us_testing['totalTestResults']*100

# Rank states within each date by tests per capita (1 = most tests; ties share the lowest rank).
# groupby().rank() returns values on us_testing's own index, so each rank lands on the row it
# was computed from regardless of row order. The earlier version built a list in a Python loop
# and glued it back on by position, which is only right when rows are stored in contiguous
# per-date blocks, and it appended a duplicate column every time the cell was re-run.
us_testing['Tests per Capita Daily Ranking'] = (
    us_testing
    .groupby('Date')['Tests per 1000 Capita']
    .rank(ascending=False, method='min')
)

In [7]:
# Confirm that, for every state, each increment column's running sum equals its cumulative column.
# Any (state, increment, cumulative) triple that still disagrees is printed and counted.
check = 0
for state in df_testing['Province_State'].unique():
    state_rows = df_testing.loc[df_testing['Province_State'] == state].sort_values(by='Date')
    for increment_col, cumulative_col in zip(to_fix, actual):
        matches = state_rows[increment_col].cumsum().eq(state_rows[cumulative_col])
        if (~matches).any():
            print(state, increment_col, cumulative_col)
            check += 1
if check == 0:
    print('All States checked')

All States checked


In [8]:
# Write both prepared frames to CSV without the row index.
exports = {
    r'X:\AC\Documents\Datasets\CA_daily_pivot.csv': ca_testing,
    r'X:\AC\Documents\Datasets\US_testing_daily.csv': us_testing,
}
for csv_path, frame in exports.items():
    frame.to_csv(csv_path, index=False)

In [9]:
# Preview the first five rows of the engineered frame.
us_testing.head(n=5)

Unnamed: 0,Province_State,Date,Population,positiveIncrease,totalTestResultsIncrease,totalTestResults,positive,negative,Tests per 1000 Capita,Total Positivity Rate,Tests per Capita Daily Ranking
0,Alaska,2020-09-07,731545,34.0,1564.0,388621.0,6690.0,381931.0,531.233212,1.721472,2.0
1,Alabama,2020-09-07,4903185,659.0,3687.0,987995.0,132973.0,855022.0,201.500657,13.458874,36.0
2,Arkansas,2020-09-07,3017804,1054.0,6866.0,772813.0,66280.0,706533.0,256.084557,8.57646,18.0
3,American Samoa,2020-09-07,55312,0.0,0.0,1514.0,0.0,1514.0,27.371999,0.0,56.0
4,Arizona,2020-09-07,7278717,198.0,4379.0,1263397.0,205964.0,1057433.0,173.574134,16.302397,45.0


# Data Exploration

In [10]:
# Latest California report date, and the 14 calendar days ending on it.
california_rows = df_testing['Province_State']=='California'
latest_date = df_testing.loc[california_rows, 'Date'].max()
window_dates = [latest_date - timedelta(days=x) for x in range(14)]

# most_recent: the row(s) for the latest date; last14: rows inside the 14-day window.
most_recent = df_testing.loc[california_rows & (df_testing['Date']==latest_date)]
last14 = df_testing.loc[california_rows & (df_testing['Date'].isin(window_dates))]

# Pull scalars out explicitly (first matching row) instead of formatting one-row Series.
current_hosp = most_recent['hospitalizedCurrently'].iloc[0].astype(int)
total_positive = most_recent['positive'].iloc[0]
total_tests = most_recent['totalTestResults'].iloc[0]
perc_pos = total_positive/total_tests * 100
population = statepop_2019est.statepop_2019est['California']
testsper1000 = total_tests/population*1000

last14tests = last14['totalTestResultsIncrease'].sum()
last14positives = last14['positiveIncrease'].sum()
last14perc_pos = last14positives/last14tests * 100

# Timestamp.strftime replaces pd.datetime.strftime (pd.datetime no longer exists in pandas 2.x).
print('California Stats as of: ', latest_date.strftime('%m/%d/%y'))
print('Total population in CA: %d' %population)
print('\n')
print('Total tests conducted: %d' %total_tests)
print('Total tests conducted per 1000 people in CA: %.2f' %testsper1000)
print('Total cases per 1000 people in CA: %.2f' %(total_positive/population*1000))
print('Percentage of total tests confirmed positive: %.2f' %perc_pos,'%')
print('Percentage of total tests confirmed negative: %.2f' %(100 - perc_pos),'%')
print(f'Currently hospitalized: {current_hosp}')
print('\n')
print('Last 14 days:')
print('Total tests conducted in last 14 days: %d' %last14tests)
print('Percentage of tests in last 14 days confirmed positive: %.2f' %last14perc_pos,'%')
# %d truncates this percentage to a whole number (unlike the %.2f lines around it).
print('Tests conducted in last 14 days as percentage of totals tests conducted: %d' %(last14tests/total_tests*100), '%')
print('Positives in last 14 days as percentage of total positives: %.2f' %(last14positives/total_positive*100),'%')

California Stats as of:  09/07/20
Total population in CA: 39512223


Total tests conducted: 12158292
Total tests conducted per 1000 people in CA: 307.71
Total cases per 1000 people in CA: 18.61
Percentage of total tests confirmed positive: 6.05 %
Percentage of total tests confirmed negative: 93.95 %
Currently hospitalized: 4285


Last 14 days:
Total tests conducted in last 14 days: 1505805
Percentage of tests in last 14 days confirmed positive: 4.42 %
Tests conducted in last 14 days as percentage of totals tests conducted: 12 %
Positives in last 14 days as percentage of total positives: 9.06 %


In [11]:
# Row (as a Series) for the California date with the largest single-day rise in positives.
california_daily = us_testing.loc[us_testing['Province_State']=='California']
CA_greatest_daily = california_daily.sort_values('positiveIncrease', ascending = False).iloc[0]

# The 'Date' entry is a pandas Timestamp, so call strftime on it directly;
# pd.datetime (used previously) was removed in pandas 2.0.
print('Highest Daily Increase for California on: ', CA_greatest_daily['Date'].strftime('%m/%d/%y'))
print('Positive cases: %d' %(CA_greatest_daily['positiveIncrease']))
# Same-day positivity: that day's new positives over that day's new test results.
print('Positivity rate: %.2f' %(CA_greatest_daily['positiveIncrease']/CA_greatest_daily['totalTestResultsIncrease']*100),'%')

Highest Daily Increase for California on:  07/22/20
Positive cases: 12807
Positivity rate: 10.05 %


In [12]:
# Rows for the most recent date in the frame, largest population first.
on_latest_date = us_testing['Date'].eq(us_testing['Date'].max())
us_testing[on_latest_date].sort_values(by='Population', ascending=False)

Unnamed: 0,Province_State,Date,Population,positiveIncrease,totalTestResultsIncrease,totalTestResults,positive,negative,Tests per 1000 Capita,Total Positivity Rate,Tests per Capita Daily Ranking
5,California,2020-09-07,39512223,3091.0,111101.0,12158292.0,735235.0,11423057.0,307.709642,6.04719,12.0
47,Texas,2020-09-07,28995881,2060.0,38420.0,5320999.0,640370.0,4680629.0,183.508789,12.03477,43.0
10,Florida,2020-09-07,21477737,1838.0,16914.0,4801684.0,648269.0,4153415.0,223.565639,13.500868,28.0
37,New York,2020-09-07,19453561,520.0,58865.0,8855109.0,440021.0,8415088.0,455.192188,4.96912,4.0
41,Pennsylvania,2020-09-07,12801989,547.0,9626.0,1748241.0,139863.0,1608378.0,136.560108,8.000213,53.0
16,Illinois,2020-09-07,12671821,1381.0,28975.0,4449275.0,252889.0,4196386.0,351.115676,5.683825,8.0
38,Ohio,2020-09-07,11689100,778.0,21779.0,2426014.0,131336.0,2294678.0,207.544978,5.413654,33.0
11,Georgia,2020-09-07,10617423,608.0,9273.0,2488841.0,283807.0,2205034.0,234.411024,11.403179,23.0
30,North Carolina,2020-09-07,10488084,1018.0,16632.0,2459582.0,177919.0,2281663.0,234.512042,7.233709,22.0
24,Michigan,2020-09-07,9986857,1212.0,43676.0,2962678.0,118403.0,2844275.0,296.657697,3.996486,13.0


# Forecasting

In [28]:
from statsmodels.tsa.ar_model import AR
from statsmodels.tsa.arima_model import ARMA
from statsmodels.tsa.arima_model import ARIMA

from datetime import datetime

In [34]:
# California's 'Date' and cumulative 'positive' columns, as the starting series for forecasting.
california_only = us_testing['Province_State'].eq('California')
ca_cases = us_testing.loc[california_only, ['Date', 'positive']]

# Inspect the element type stored in the Date column.
first_date = ca_cases['Date'].iloc[0]
type(first_date)
# ca_log = np.log(ca_cases)
# plt.plot(ca_cases)

pandas._libs.tslibs.timestamps.Timestamp

# Appendix

In [None]:
# Reshape the JHU confirmed / deaths frames into one long per-state frame.
# Work on copies so the raw downloads stay untouched.
us_conf_daily = df_usconf.copy()
us_dead_daily = df_usdead.copy()

# Turn the columns from position 11 (confirmed) / 12 (deaths) onward into column-to-column
# differences. The first differenced column has no predecessor, so fillna(0) sets it to 0.
# NOTE(review): presumably these positions are where the per-date columns start, with the
# deaths file carrying one extra leading column (Population) — confirm against the CSV headers.
us_conf_daily.iloc[:,11:] = us_conf_daily.iloc[:,11:].diff(axis=1).fillna(0).astype('int')
us_dead_daily.iloc[:,12:] = us_dead_daily.iloc[:,12:].diff(axis=1).fillna(0).astype('int')

# Sum every column from position 11 onward per Province_State. For the deaths frame that
# slice starts one column before the differenced range, so that column is summed as-is.
us_conf_daily = us_conf_daily.groupby(['Province_State'])[us_conf_daily.columns.to_list()[11:]].agg('sum').reset_index()
us_dead_daily = us_dead_daily.groupby(['Province_State'])[us_dead_daily.columns.to_list()[11:]].agg('sum').reset_index()

# Wide -> long: one row per (state, Date). The first column (confirmed) or first two columns
# (deaths) are kept as identifiers. 'Date' still holds the original column labels here, so
# this sort is on those labels, not on parsed dates.
us_conf_daily = us_conf_daily.melt(id_vars = us_conf_daily.columns[:1], var_name = 'Date', value_name = 'Daily Confirmed Cases').sort_values(by = ['Province_State','Date']).reset_index(drop=True)
us_dead_daily = us_dead_daily.melt(id_vars = us_dead_daily.columns[:2], var_name = 'Date', value_name = 'Daily Dead').sort_values(by = ['Province_State','Date']).reset_index(drop=True)

# Left-join Population from the deaths frame onto the confirmed frame, then keep only
# Province_State / Population / Date (the daily case counts are dropped at this point).
us_daily = us_conf_daily.merge(us_dead_daily[['Province_State','Population','Date']], on = ['Province_State','Date'], how = 'left', suffixes = (False, False))
us_daily = us_daily[['Province_State','Population','Date']]
# Parse Date and re-sort, now chronologically within each state.
us_daily['Date'] = pd.to_datetime(us_daily['Date'])
us_daily = us_daily.sort_values(by = ['Province_State','Date']).reset_index(drop=True)

In [None]:
#there will be discrepancies between the daily confirmed cases from JHU and from CDHP.
#Should probably use all CDHP metrics for CA analysis and only population from JHU
# California rows of the JHU-derived frame, left-joined with the CA testing metrics.
ca_metric_cols = [
    'Province_State', 'Date',
    'totalTestResultsIncrease', 'positiveIncrease', 'negativeIncrease',
    'totalTestResults', 'positive', 'negative',
    'hospitalizedCurrently', 'inIcuCurrently',
]
ca_daily = us_daily.loc[us_daily['Province_State']=='California'].copy().reset_index(drop=True)
ca_daily = ca_daily.merge(
    ca_testing[ca_metric_cols],
    on=['Province_State', 'Date'],
    how='left',
    suffixes=(False, False),
)

# From here on us_daily refers to the testing-feed columns instead of the JHU-derived frame.
us_metric_cols = [
    'Province_State', 'Date',
    'positiveIncrease', 'negativeIncrease', 'totalTestResultsIncrease',
    'totalTestResults', 'positive', 'negative',
]
us_daily = df_testing[us_metric_cols]

In [None]:
# Local copy of the 2019 state population estimates (comma separated, ISO-8859-1 encoded).
df_pop = pd.read_csv(
    r'X:\AC\Documents\Datasets\US Census and OMB Data\2019 State Population Estimates.csv',
    sep=',',
    encoding="ISO-8859-1",
)

In [None]:
# Pair each population-file state with the Population value carried in us_testing (inner join on name).
testing_population = us_testing[['Province_State', 'Population']]
pop_check = df_pop.merge(
    testing_population,
    how='inner',
    left_on='State Name',
    right_on='Province_State',
)

In [None]:
# Collapse repeated rows, then list the rows where the two population figures disagree.
pop_check.drop_duplicates(inplace=True)
pop_check['delta'] = pop_check['2019 Total Population Estimate'].sub(pop_check['Population'])
pop_check.loc[pop_check['delta'].ne(0)]

In [None]:
#apply date format and convert to string
# NOTE(review): this is a fragment — the line starts with `.apply` and has no receiver, so
# the cell cannot run on its own. It is presumably meant to be chained onto a Series of
# datetimes to produce 'mm/dd/yy' strings (TODO confirm which Series). It also relies on
# `pd.datetime`, which is not available in pandas 2.x.
.apply(lambda x: pd.datetime.strftime(x, '%m/%d/%y'))

In [None]:
# Scratch check on test_df: overwrite one increment at a fixed row position, then test whether
# Texas' running total of increments equals its cumulative test count.
columns = test_df.columns
test_df.iloc[7694, columns.get_loc('totalTestResultsIncrease')] = 1

tx_by_date = test_df.loc[test_df['Province_State']=='TX'].sort_values(by='Date')
a = np.cumsum(tx_by_date['totalTestResultsIncrease'].values, dtype=int)
b = tx_by_date['totalTestResults'].values
np.array_equal(a,b)

In [None]:
#filling missing starting values. May want to do this as function for all states
# For CA on 20200304, copy the cumulative values into the matching increment columns.
ca_first_day = (df_testing['Province_State']=='CA') & (df_testing['Date']==20200304)
increment_cols = ['positiveIncrease','negativeIncrease','totalTestResultsIncrease']
cumulative_cols = ['positive','negative','totalTestResults']
first_day_totals = df_testing.loc[ca_first_day, cumulative_cols].values.tolist()
df_testing.loc[ca_first_day, increment_cols] = first_day_totals

In [None]:
#Update function with secondary check
# Daily-increment columns and, position by position, the cumulative column each should sum to.
to_fix = ['totalTestResultsIncrease','positiveIncrease','negativeIncrease']
actual = ['totalTestResults','positive','negative']
# Kept as a module-level name; fill_func below addresses columns by name instead.
columns = df_testing.columns

def _compare_running_total(state, increment_col, cumulative_col):
    """Return (running sum of increments, cumulative series, element-wise equality) for one state, sorted by Date."""
    state_rows = df_testing.loc[df_testing['Province_State']==state].sort_values(by='Date')
    cumsum = state_rows[increment_col].cumsum()
    actualsum = state_rows[cumulative_col]
    return cumsum, actualsum, cumsum.eq(actualsum)

def fill_func(states):
    """Verbose variant of the first-increment fill: patch, then re-check and report.

    For each state and each (increment, cumulative) pair from ``to_fix`` /
    ``actual``: print the first running-total / cumulative values, overwrite the
    earliest mismatching increment with that row's cumulative value if the two
    series differ, then recompute the comparison and print whether it now holds.

    Parameters
    ----------
    states : iterable
        Values of ``df_testing['Province_State']`` to process.

    Returns
    -------
    None
        ``df_testing`` (module-level) is mutated; progress is printed.

    Notes
    -----
    The row to patch is selected by index label with ``.loc``. Passing that
    label to ``.iloc`` (as before) is only correct on a default 0..n-1 index.
    """
    for state in states:
        print(state)
        for increment_col, cumulative_col in zip(to_fix, actual):
            cumsum, actualsum, comparison = _compare_running_total(state, increment_col, cumulative_col)
            print(cumsum.iloc[0], actualsum.iloc[0], comparison.iloc[0])
            if not comparison.all():
                # Earliest mismatch in date order; its label identifies the row in df_testing.
                earliest_label = comparison[~comparison].index[0]
                df_testing.loc[earliest_label, increment_col] = df_testing.loc[earliest_label, cumulative_col]
                print('Fixed in first: ', state,increment_col,cumulative_col)
            else:
                print('Arrays are equal')
            # Secondary check: recompute from the (possibly patched) frame.
            _, _, comparison = _compare_running_total(state, increment_col, cumulative_col)
            if comparison.all():
                print('Confirmed arrays are fixed')
            else:
                print('Failed check: ', state,increment_col,cumulative_col ,'\n')

In [None]:
#['Province_State', 'Date','totalTestResultsIncrease','positiveIncrease','negativeIncrease','totalTestResults','positive','negative','hospitalizedCurrently','inIcuCurrently']
#hospitalizedCurrently is missing values for American Samoa, Hawaii, Kansas, Northern Mariana Islands, Virginia
#inIcuCurrently is missing values for 28 states/provinces
# Single-day snapshot of the testing and hospital columns for every state.
snapshot_cols = [
    'Province_State', 'Date',
    'totalTestResultsIncrease', 'positiveIncrease', 'negativeIncrease',
    'totalTestResults', 'positive', 'negative',
    'hospitalizedCurrently', 'inIcuCurrently',
]
on_snapshot_day = df_testing['Date']=='2020-07-21'
df_testing.loc[on_snapshot_day, snapshot_cols]

In [None]:
# Spot check for NV: does the running sum of daily test increments match the cumulative total?
nv_by_date = df_testing.loc[df_testing['Province_State']=='NV'].sort_values(by='Date')
cumsum = nv_by_date['totalTestResultsIncrease'].cumsum()
actualsum = nv_by_date['totalTestResults']
check = cumsum.eq(actualsum)
# Number of dates where the two disagree. The earlier form, check[check==False].sum(),
# added up only False values and therefore always showed 0.
(~check).sum()

In [None]:
#Feature creation, rankings
#Difference between daily rankings
# For each state, append a column holding the 'Total Cases Daily Ranking' values of the rows
# whose Date is in dates[1:], re-indexed from 0 so they sit beside the state's rows by position.
# NOTE(review): despite its name, the new column holds shifted ranking values, not a
# difference between consecutive days — confirm intent before relying on it.
dates = stategrp['Date'].unique().tolist()
state_frames = []
for state in stategrp['Province_State'].unique():
    statetemp = stategrp.loc[stategrp['Province_State']==state].copy().reset_index(drop=True)
    yest = (
        statetemp.loc[statetemp['Date'].isin(dates[1:]), 'Total Cases Daily Ranking']
        .reset_index(drop=True)
        .rename('Total Cases Ranking Daily Change')
    )
    state_frames.append(pd.concat([statetemp, yest], axis=1))
# Concatenate once at the end instead of growing ustemp inside the loop (which re-copies every
# accumulated row on each iteration). An empty input still yields an empty DataFrame.
ustemp = pd.concat(state_frames) if state_frames else pd.DataFrame()

In [4]:
# Two-letter code -> full name for the 50 states, DC, the territories,
# plus an 'NA' -> 'National' entry.
state_abbrev = dict(
    AK='Alaska',
    AL='Alabama',
    AR='Arkansas',
    AS='American Samoa',
    AZ='Arizona',
    CA='California',
    CO='Colorado',
    CT='Connecticut',
    DC='District of Columbia',
    DE='Delaware',
    FL='Florida',
    GA='Georgia',
    GU='Guam',
    HI='Hawaii',
    IA='Iowa',
    ID='Idaho',
    IL='Illinois',
    IN='Indiana',
    KS='Kansas',
    KY='Kentucky',
    LA='Louisiana',
    MA='Massachusetts',
    MD='Maryland',
    ME='Maine',
    MI='Michigan',
    MN='Minnesota',
    MO='Missouri',
    MP='Northern Mariana Islands',
    MS='Mississippi',
    MT='Montana',
    NA='National',
    NC='North Carolina',
    ND='North Dakota',
    NE='Nebraska',
    NH='New Hampshire',
    NJ='New Jersey',
    NM='New Mexico',
    NV='Nevada',
    NY='New York',
    OH='Ohio',
    OK='Oklahoma',
    OR='Oregon',
    PA='Pennsylvania',
    PR='Puerto Rico',
    RI='Rhode Island',
    SC='South Carolina',
    SD='South Dakota',
    TN='Tennessee',
    TX='Texas',
    UT='Utah',
    VA='Virginia',
    VI='Virgin Islands',
    VT='Vermont',
    WA='Washington',
    WI='Wisconsin',
    WV='West Virginia',
    WY='Wyoming',
)

In [5]:
# Full state / territory name -> population estimate (per the variable name, 2019 estimates).
statepop_2019est = dict([
    ('Alaska', 731545),
    ('Alabama', 4903185),
    ('Arkansas', 3017804),
    ('American Samoa', 55312),
    ('Arizona', 7278717),
    ('California', 39512223),
    ('Colorado', 5758736),
    ('Connecticut', 3565287),
    ('District of Columbia', 705749),
    ('Delaware', 973764),
    ('Florida', 21477737),
    ('Georgia', 10617423),
    ('Guam', 167294),
    ('Hawaii', 1415872),
    ('Iowa', 3155070),
    ('Idaho', 1787065),
    ('Illinois', 12671821),
    ('Indiana', 6732219),
    ('Kansas', 2913314),
    ('Kentucky', 4467673),
    ('Louisiana', 4648794),
    ('Massachusetts', 6892503),
    ('Maryland', 6045680),
    ('Maine', 1344212),
    ('Michigan', 9986857),
    ('Minnesota', 5639632),
    ('Missouri', 6137428),
    ('Northern Mariana Islands', 57216),
    ('Mississippi', 2976149),
    ('Montana', 1068778),
    ('North Carolina', 10488084),
    ('North Dakota', 762062),
    ('Nebraska', 1934408),
    ('New Hampshire', 1359711),
    ('New Jersey', 8882190),
    ('New Mexico', 2096829),
    ('Nevada', 3080156),
    ('New York', 19453561),
    ('Ohio', 11689100),
    ('Oklahoma', 3956971),
    ('Oregon', 4217737),
    ('Pennsylvania', 12801989),
    ('Puerto Rico', 2933408),
    ('Rhode Island', 1059361),
    ('South Carolina', 5148714),
    ('South Dakota', 884659),
    ('Tennessee', 6829174),
    ('Texas', 28995881),
    ('Utah', 3205958),
    ('Virginia', 8535519),
    # NOTE(review): verify this figure — it looks low for the U.S. Virgin Islands
    # (possibly a different territory's number).
    ('Virgin Islands', 30030),
    ('Vermont', 623989),
    ('Washington', 7614893),
    ('Wisconsin', 5822434),
    ('West Virginia', 1792147),
    ('Wyoming', 578759),
])