In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from collections import Counter

pd.options.display.max_columns = None

In [2]:
url = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_US.csv'
url2 = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_US.csv'
df_usconf = pd.read_csv(url,error_bad_lines = False)
df_usdead = pd.read_csv(url2,error_bad_lines = False)

df_counties = pd.read_csv(r'X:\AC\Documents\Datasets\US Census and OMB Data\2020 Counties UID State CBSA CSA.csv', delimiter = ',', encoding = "ISO-8859-1")
df_counties = df_counties[['UID', 'CBSA Code', 'CBSA Title', 'CSA Code', 'CSA Title']]

In [3]:
#Add CBSA/CSA titles and codes to df
df_usconf = df_usconf.merge(df_counties, on = 'UID', how = 'left', suffixes=(False,False))
df_usconf = df_usconf.merge(df_usdead[['UID','Population']], on = 'UID', how = 'left', suffixes=(False,False))
cols = df_usconf.columns.tolist()
df_usconf = df_usconf[cols[0:1] + cols[4:6] + cols[8:10] + cols[-1:] + cols[6:8] + cols[-5:-1] + cols[11:-5]]

#convert date columns into a single column
df_usconf_pivot = df_usconf.melt(id_vars = df_usconf.columns[:12], var_name = 'Date', value_name = 'Total Confirmed Cases').sort_values(by = ['UID','Date']).reset_index(drop=True)
df_usdead_pivot = df_usdead.melt(id_vars = df_usdead.columns[:12], var_name = 'Date', value_name = 'Total Dead').sort_values(by = ['UID','Date']).reset_index(drop=True)

#df for daily increments
df_usconf_daily = df_usconf.copy()
df_usdead_daily = df_usdead.copy()
df_usconf_daily.iloc[:,12:] = df_usconf_daily.iloc[:,12:].diff(axis=1).fillna(0).astype('int')
df_usdead_daily.iloc[:,12:] = df_usdead_daily.iloc[:,12:].diff(axis=1).fillna(0).astype('int')

#convert date columns into a single column
df_usconf_daily = df_usconf_daily.melt(id_vars = df_usconf_daily.columns[:12], var_name = 'Date', value_name = 'Daily Confirmed Cases').sort_values(by = ['UID','Date']).reset_index(drop=True)
df_usdead_daily = df_usdead_daily.melt(id_vars = df_usdead_daily.columns[:12], var_name = 'Date', value_name = 'Daily Dead').sort_values(by = ['UID','Date']).reset_index(drop=True)

In [4]:
us_totals = df_usconf_pivot.merge(df_usdead_pivot[['UID','Date','Total Dead']], on = ['UID','Date'], how = 'left', suffixes = (False, False))
us_totals['Date'] = pd.to_datetime(us_totals['Date']).apply(lambda x: pd.datetime.strftime(x, '%m/%d/%y'))
us_totals = us_totals.sort_values(by = ['UID','Date']).reset_index(drop=True)

us_daily = df_usconf_daily.merge(df_usdead_daily[['UID','Date','Daily Dead']], on = ['UID','Date'], how = 'left', suffixes = (False, False))
us_daily['Date'] = pd.to_datetime(us_daily['Date']).apply(lambda x: pd.datetime.strftime(x, '%m/%d/%y'))
us_daily = us_daily.sort_values(by = ['UID','Date']).reset_index(drop=True)

In [5]:
us_daily.to_csv(r'X:\AC\Documents\Datasets\US_daily_pivot.csv')
us_totals.to_csv(r'X:\AC\Documents\Datasets\US_totals_pivot.csv')

# Data Exploration

In [6]:
last14 = us_daily['Date'].values.tolist()[-14:]
conf_total = us_daily['Daily Confirmed Cases'].sum()
conf_dead = us_daily['Daily Dead'].sum()
US_pop = 329943320 #as of 1/1/20


print('US Stats As of: ', us_totals['Date'].sort_values(ascending = True).to_list()[-1])
print('Total Confirmed Cases To Date: ', conf_total)
print('Confirmed Cases Percentage of US population: %.2f' %((conf_total/ US_pop) * 100),'%')
print('\n')
print('Total Deaths To Date: ', (conf_dead))
print('Confirmed Cases Percentage of US population: %.2f' %((conf_dead/ US_pop) * 100),'%')
print('Percentage of deaths from confirmed cases : %.2f' %((conf_dead/ US_pop)/(conf_total/ US_pop)*100),'%')
print('\n')
print('Last 14 days:')
print('Total Confirmed Cases: ', us_daily.loc[(us_daily['Date'].isin(last14))]['Daily Confirmed Cases'].sum())
print('Cases in Last 14 days as Percentage of Total Cases: %.2f' %((us_daily.loc[(us_daily['Date'].isin(last14))]['Daily Confirmed Cases'].sum()/(conf_total))*100),'%')

US Stats As of:  07/28/20
Total Confirmed Cases To Date:  4351996
Confirmed Cases Percentage of US population: 1.32 %


Total Deaths To Date:  149256
Confirmed Cases Percentage of US population: 0.05 %
Percentage of deaths from confirmed cases : 3.43 %


Last 14 days:
Total Confirmed Cases:  920423
Cases in Last 14 days as Percentage of Total Cases: 21.15 %


In [7]:
#Top daily increases by county
us_daily.sort_values(by=['Daily Confirmed Cases'], ascending = False)[['Admin2','Province_State','Date','Daily Confirmed Cases','Daily Dead']][0:5]

Unnamed: 0,Admin2,Province_State,Date,Daily Confirmed Cases,Daily Dead
367122,New York,New York,04/15/20,7837,561
54597,Los Angeles,California,07/05/20,7198,30
367111,New York,New York,04/04/20,6147,696
495545,Bexar,Texas,07/16/20,5980,28
367118,New York,New York,04/11/20,5924,714


In [8]:
#Top daily increases by state
stategrp = us_daily.groupby(['Province_State','Date'], as_index=False)['Daily Confirmed Cases','Daily Dead'].sum()
stategrp.sort_values(by='Daily Confirmed Cases', ascending = False)[0:10]

Unnamed: 0,Province_State,Date,Daily Confirmed Cases,Daily Dead
2251,Florida,07/12/20,15300,45
9437,Texas,07/16/20,14962,151
2255,Florida,07/16/20,13965,156
1112,California,07/07/20,12977,132
1119,California,07/14/20,12854,161
1133,California,07/28/20,12641,185
2252,Florida,07/13/20,12624,35
9443,Texas,07/22/20,12544,240
2258,Florida,07/19/20,12478,87
2263,Florida,07/24/20,12444,135


In [16]:
stategrp.loc[stategrp['Province_State']=='California'].sort_values(by=['Daily Confirmed Cases'], ascending = False)[0:9]

Unnamed: 0,Province_State,Date,Daily Confirmed Cases,Daily Dead
1112,California,07/07/20,12977,132
1119,California,07/14/20,12854,161
1133,California,07/28/20,12641,185
1127,California,07/22/20,11981,159
1110,California,07/05/20,11786,39
1126,California,07/21/20,11435,118
1125,California,07/20/20,10964,55
1114,California,07/09/20,9924,141
1121,California,07/16/20,9821,114


In [17]:
#Top 10 daily increases in CA counties
us_daily.loc[us_daily['Province_State']=='California'].sort_values(by=['Daily Confirmed Cases'], ascending = False)[['Admin2','Province_State','Date','Daily Confirmed Cases','Daily Dead']][0:9]

Unnamed: 0,Admin2,Province_State,Date,Daily Confirmed Cases,Daily Dead
54597,Los Angeles,California,07/05/20,7198,30
54608,Los Angeles,California,07/16/20,4471,46
54606,Los Angeles,California,07/14/20,4219,73
54599,Los Angeles,California,07/07/20,4194,48
54617,Los Angeles,California,07/25/20,3390,51
54604,Los Angeles,California,07/12/20,3155,14
54612,Los Angeles,California,07/20/20,3128,8
54614,Los Angeles,California,07/22/20,3109,60
54591,Los Angeles,California,06/29/20,3040,26


# APPENDIX

In [None]:
url3 = 'https://covidtracking.com/api/v1/states/daily.csv'
df_testing = pd.read_csv(url3,error_bad_lines = False)

df_testing = df_testing.rename(columns = {'date':'Date','state':'Province_State'})
df_testing[['totalTestResultsIncrease','positiveIncrease','negativeIncrease','totalTestResults','positive','negative']] = df_testing[['totalTestResultsIncrease','positiveIncrease','negativeIncrease','totalTestResults','positive','negative']].fillna(0)

to_fix = ['totalTestResultsIncrease','positiveIncrease','negativeIncrease']
actual = ['totalTestResults','positive','negative']
columns = df_testing.columns
def fill_func(states):
    for state in states:
        for col in range(0,len(to_fix)):
            cumsum = df_testing.loc[df_testing['Province_State']==state].sort_values(by='Date')[to_fix[col]].cumsum()
            actualsum = df_testing.loc[df_testing['Province_State']==state,['Date',actual[col]]].sort_values(by='Date')[actual[col]]
            comparison = cumsum.eq(actualsum)
            if comparison[comparison == False].count() > 0:
                Earliest_index = comparison[comparison == False].index[0]
                df_testing.iloc[Earliest_index,columns.get_loc(to_fix[col])] = df_testing.iloc[Earliest_index,columns.get_loc(actual[col])]
                
fill_func(df_testing['Province_State'].unique())
df_testing = df_testing.replace({'Province_State':states})
df_testing['Date'] = pd.to_datetime(df_testing['Date'], format='%Y%m%d').apply(lambda x: pd.datetime.strftime(x, '%m/%d/%y'))



In [None]:
#missing CBSA codes after merge: 41980, 10380, 38660, 11640, 41900, 49500, 32420, 25020, 27580, 17620, 17640, 42180
#They're all in Puerto Rico. Span across multiple CBSA and CSA. Should just ignore
res = Counter(df_counties['CBSA Code'].value_counts().to_dict()) - Counter(df_usconf['CBSA Code'].value_counts().to_dict())
pd.set_option('display.max_rows', None)
df_counties[df_counties['CBSA Code'].isin(list(res.keys()))]

In [None]:
df_usconf.loc[df_usconf['UID']==630,['CBSA Code', 'CBSA Title', 'CSA Code', 'CSA Title']] = [41980,'San Juan-Bayamón-Caguas, PR', 490.0, 'San Juan-Bayamón, PR']
df_usconf.loc[df_usconf['UID']==630][['CBSA Code', 'CBSA Title', 'CSA Code', 'CSA Title']]

In [None]:
columns_to_remove = ['iso2', 'iso3', 'code3', 'Combined_Key']
df_usconf = df_usconf[Counter(df_usconf.columns.tolist()) - Counter(columns_to_remove)]
df_usdead = df_usdead[Counter(df_usdead.columns.tolist()) - Counter(columns_to_remove)]

In [None]:
#Add additional column for CBSAs and CSAs before melting?

'''BA_counties = ['Alameda','Contra Costa','Marin','Napa','San Francisco','San Mateo','Santa Clara','Solano','Sonoma']
LA_counties =['Ventura','San Bernadio', 'Riverside', 'Los Angeles', 'Orange']'''

In [None]:
#California and Texas total confirmed cases differ by 2000+. Stick to COVIDTESTING data for testing dataframe
totalgrp = us_totals.groupby(['Province_State','Date'], as_index=False)['Total Confirmed Cases','Total Dead'].sum()
comparison = df_testing.loc[df_testing['Date']=='07/27/20', ['Province_State','Date','positive']].merge(totalgrp.loc[totalgrp['Date']=='07/27/20',['Province_State','Date','Total Confirmed Cases']], on=['Province_State','Date'], how = 'left', suffixes = (False, False))
comparison['Delta'] = comparison['positive'] - comparison['Total Confirmed Cases']
comparison