In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from collections import Counter

pd.options.display.max_columns = None

In [2]:
url = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_US.csv'
url2 = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_US.csv'
df_usconf = pd.read_csv(url,error_bad_lines = False)
df_usdead = pd.read_csv(url2,error_bad_lines = False)

df_counties = pd.read_csv(r'X:\AC\Documents\Datasets\US Census and OMB Data\2020 Counties UID State CBSA CSA.csv', delimiter = ',', encoding = "ISO-8859-1")
df_counties = df_counties[['UID', 'CBSA Code', 'CBSA Title', 'CSA Code', 'CSA Title']]

df_usdead = df_usdead.merge(df_counties, on = 'UID', how = 'left', suffixes=(False,False))
cols = df_usdead.columns.tolist()
df_usdead = df_usdead[cols[0:1] + cols[4:6] + cols[8:10] + cols[11:12] + cols[6:8] + cols[-4:] + cols[12:-4]]

df_usconf = df_usconf.merge(df_counties, on = 'UID', how = 'left', suffixes=(False,False))
df_usconf = df_usconf.merge(df_usdead[['UID','Population']], on = 'UID', how = 'left', suffixes=(False,False))
cols = df_usconf.columns.tolist()
df_usconf = df_usconf[cols[0:1] + cols[4:6] + cols[8:10] + cols[-1:] + cols[6:8] + cols[-5:-1] + cols[11:-5]]

In [3]:
us_conf_pivot = df_usconf.melt(id_vars = df_usconf.columns[:12], var_name = 'Date', value_name = 'Total Confirmed Cases').sort_values(by = ['UID','Date']).reset_index(drop=True)
us_dead_pivot = df_usdead.melt(id_vars = df_usdead.columns[:12], var_name = 'Date', value_name = 'Total Dead').sort_values(by = ['UID','Date']).reset_index(drop=True)

us_totals = us_conf_pivot.merge(us_dead_pivot[['UID','Date','Total Dead']], on = ['UID','Date'], how = 'left', suffixes = (False, False))
us_totals['Date'] = pd.to_datetime(us_totals['Date']).apply(lambda x: pd.datetime.strftime(x, '%m/%d/%y'))
us_totals = us_totals.sort_values(by = ['UID','Date']).reset_index(drop=True)

In [4]:
US_confirmed_daily_increase = df_usconf.copy()
US_dead_daily_increase = df_usdead.copy()

US_confirmed_daily_increase.iloc[:,12:] = US_confirmed_daily_increase.iloc[:,12:].diff(axis=1).fillna(0).astype('int')
US_dead_daily_increase.iloc[:,12:] = US_dead_daily_increase.iloc[:,12:].diff(axis=1).fillna(0).astype('int')

us_conf_daily = US_confirmed_daily_increase.melt(id_vars = US_confirmed_daily_increase.columns[:12], var_name = 'Date', value_name = 'Daily Confirmed Cases').sort_values(by = ['UID','Date']).reset_index(drop=True)
us_dead_daily = US_dead_daily_increase.melt(id_vars = US_dead_daily_increase.columns[:12], var_name = 'Date', value_name = 'Daily Dead').sort_values(by = ['UID','Date']).reset_index(drop=True)

us_daily = us_conf_daily.merge(us_dead_daily[['UID','Date','Daily Dead']], on = ['UID','Date'], how = 'left', suffixes = (False, False))
us_daily['Date'] = pd.to_datetime(us_daily['Date']).apply(lambda x: pd.datetime.strftime(x, '%m/%d/%y'))
us_daily = us_daily.sort_values(by = ['UID','Date']).reset_index(drop=True)

In [5]:
us_daily.to_csv(r'X:\AC\Documents\Datasets\US_daily_pivot.csv')
us_totals.to_csv(r'X:\AC\Documents\Datasets\US_totals_pivot.csv')

# Data Exploration

In [6]:
last14 = us_daily['Date'].values.tolist()[-14:]
conf_total = us_conf_daily['Daily Confirmed Cases'].sum()
conf_dead = us_dead_daily['Daily Dead'].sum()
US_pop = 329943320 #as of 1/1/20


print('US Stats As of: ', us_totals['Date'].sort_values(ascending = True).to_list()[-1])
print('Total Confirmed Cases To Date: ', us_conf_daily['Daily Confirmed Cases'].sum())
print('Confirmed Cases Percentage of US population: %.2f' %((us_conf_daily['Daily Confirmed Cases'].sum()/ 332639102) * 100),'%')
print('\n')
print('Total Deaths To Date: ', us_dead_daily['Daily Dead'].sum())
print('Confirmed Cases Percentage of US population: %.2f' %((us_dead_daily['Daily Dead'].sum()/ 332639102) * 100),'%')
print('Percentage of deaths from confirmed cases : %.2f' %(us_dead_daily['Daily Dead'].sum()/us_conf_daily['Daily Confirmed Cases'].sum()*100),'%')
print('\n')
print('Last 14 days:')
print('Total Confirmed Cases: ', us_daily.loc[(us_daily['Date'].isin(last14))]['Daily Confirmed Cases'].sum())
print('Cases in Last 14 days as Percentage of Total Cases: %.2f' %((us_daily.loc[(us_daily['Date'].isin(last14))]['Daily Confirmed Cases'].sum()/us_conf_daily['Daily Confirmed Cases'].sum())*100),'%')

US Stats As of:  07/14/20
Total Confirmed Cases To Date:  3431573
Confirmed Cases Percentage of US population: 1.03 %


Total Deaths To Date:  136466
Confirmed Cases Percentage of US population: 0.04 %
Percentage of deaths from confirmed cases : 3.98 %


Last 14 days:
Total Confirmed Cases:  795160
Cases in Last 14 days as Percentage of Total Cases: 23.17 %


In [27]:
#Top daily increases by county
us_daily.sort_values(by=['Daily Confirmed Cases'], ascending = False)[['Admin2','Province_State','Date','Daily Confirmed Cases','Daily Dead']][0:5]

Unnamed: 0,Admin2,Province_State,Date,Daily Confirmed Cases,Daily Dead
327972,New York,New York,04/15/20,7837,561
36949,Los Angeles,California,07/05/20,7198,30
327961,New York,New York,04/04/20,6147,696
327968,New York,New York,04/11/20,5924,714
327957,New York,New York,03/31/20,5666,469


In [52]:
#Top daily increases by state
stategrp = us_daily.groupby(['Province_State','Date'], as_index=False)['Daily Confirmed Cases','Daily Dead'].sum()
stategrp.sort_values(by='Daily Confirmed Cases', ascending = False)[0:10]

Unnamed: 0,Province_State,Date,Daily Confirmed Cases,Daily Dead
2108,Florida,07/12/20,15300,45
1047,California,07/07/20,12977,132
1054,California,07/14/20,12854,161
2109,Florida,07/13/20,12624,35
1045,California,07/05/20,11786,39
8793,Texas,07/09/20,11612,131
2100,Florida,07/04/20,11458,18
6420,New York,04/15/20,11434,786
2106,Florida,07/10/20,11433,93
6413,New York,04/08/20,11186,985


In [58]:
stategrp.loc[stategrp['Province_State']=='California'].tail(15)

Unnamed: 0,Province_State,Date,Daily Confirmed Cases,Daily Dead
1041,California,07/01/20,7263,87
1042,California,07/02/20,7869,96
1043,California,07/03/20,3964,50
1044,California,07/04/20,2381,19
1045,California,07/05/20,11786,39
1046,California,07/06/20,6354,68
1047,California,07/07/20,12977,132
1048,California,07/08/20,8548,145
1049,California,07/09/20,9924,141
1050,California,07/10/20,8401,96


# APPENDIX

In [104]:
#missing CBSA codes after merge: 41980, 10380, 38660, 11640, 41900, 49500, 32420, 25020, 27580, 17620, 17640, 42180
#They're all in Puerto Rico. Span across multiple CBSA and CSA. Should just ignore
res = Counter(df_counties['CBSA Code'].value_counts().to_dict()) - Counter(df_usconf['CBSA Code'].value_counts().to_dict())
pd.set_option('display.max_rows', None)
df_counties[df_counties['CBSA Code'].isin(list(res.keys()))]

Unnamed: 0,UID,CBSA Code,CBSA Title,CSA Code,CSA Title
1288,84072001,38660,"Ponce, PR",434.0,"Ponce-Yauco-Coamo, PR"
1289,84072003,10380,"Aguadilla-Isabela, PR",,
1290,84072005,10380,"Aguadilla-Isabela, PR",,
1291,84072007,41980,"San Juan-Bayamón-Caguas, PR",490.0,"San Juan-Bayamón, PR"
1292,84072009,41980,"San Juan-Bayamón-Caguas, PR",490.0,"San Juan-Bayamón, PR"
1293,84072011,10380,"Aguadilla-Isabela, PR",,
1294,84072013,11640,"Arecibo, PR",490.0,"San Juan-Bayamón, PR"
1295,84072015,25020,"Guayama, PR",490.0,"San Juan-Bayamón, PR"
1296,84072017,41980,"San Juan-Bayamón-Caguas, PR",490.0,"San Juan-Bayamón, PR"
1297,84072019,41980,"San Juan-Bayamón-Caguas, PR",490.0,"San Juan-Bayamón, PR"


In [None]:
df_usconf.loc[df_usconf['UID']==630,['CBSA Code', 'CBSA Title', 'CSA Code', 'CSA Title']] = [41980,'San Juan-Bayamón-Caguas, PR', 490.0, 'San Juan-Bayamón, PR']
df_usconf.loc[df_usconf['UID']==630][['CBSA Code', 'CBSA Title', 'CSA Code', 'CSA Title']]

In [115]:
columns_to_remove = ['iso2', 'iso3', 'code3', 'Combined_Key']
df_usconf = df_usconf[Counter(df_usconf.columns.tolist()) - Counter(columns_to_remove)]
df_usdead = df_usdead[Counter(df_usdead.columns.tolist()) - Counter(columns_to_remove)]

In [None]:
#Add additional column for CBSAs and CSAs before melting?

'''BA_counties = ['Alameda','Contra Costa','Marin','Napa','San Francisco','San Mateo','Santa Clara','Solano','Sonoma']
LA_counties =['Ventura','San Bernadio', 'Riverside', 'Los Angeles', 'Orange']'''