In [90]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from collections import Counter

pd.options.display.max_columns = None

In [91]:
url = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_US.csv'
url2 = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_US.csv'
df_usconf = pd.read_csv(url,error_bad_lines = False)
df_usdead = pd.read_csv(url2,error_bad_lines = False)

df_counties = pd.read_csv(r'X:\AC\Documents\Datasets\US Census and OMB Data\2020 Counties UID State CBSA CSA.csv', delimiter = ',', encoding = "ISO-8859-1")
df_counties = df_counties[['UID', 'CBSA Code', 'CBSA Title', 'CSA Code', 'CSA Title']]

df_usdead = df_usdead.merge(df_counties, on = 'UID', how = 'left', suffixes=(False,False))
cols = df_usdead.columns.tolist()
df_usdead = df_usdead[cols[0:1] + cols[4:6] + cols[8:10] + cols[11:12] + cols[6:8] + cols[-4:] + cols[12:-4]]

df_usconf = df_usconf.merge(df_counties, on = 'UID', how = 'left', suffixes=(False,False))
df_usconf = df_usconf.merge(df_usdead[['UID','Population']], on = 'UID', how = 'left', suffixes=(False,False))
cols = df_usconf.columns.tolist()
df_usconf = df_usconf[cols[0:1] + cols[4:6] + cols[8:10] + cols[-1:] + cols[6:8] + cols[-5:-1] + cols[11:-5]]

In [92]:
us_conf_pivot = df_usconf.melt(id_vars = df_usconf.columns[:12], var_name = 'Date', value_name = 'Total Confirmed Cases').sort_values(by = ['UID','Date']).reset_index(drop=True)
us_dead_pivot = df_usdead.melt(id_vars = df_usdead.columns[:12], var_name = 'Date', value_name = 'Total Dead').sort_values(by = ['UID','Date']).reset_index(drop=True)

us_totals = us_conf_pivot.merge(us_dead_pivot[['UID','Date','Total Dead']], on = ['UID','Date'], how = 'left', suffixes = (False, False))
us_totals['Date'] = pd.to_datetime(us_totals['Date']).apply(lambda x: pd.datetime.strftime(x, '%m/%d/%y'))
us_totals = us_totals.sort_values(by = ['UID','Date']).reset_index(drop=True)

In [93]:
US_confirmed_daily_increase = df_usconf.copy()
US_dead_daily_increase = df_usdead.copy()

US_confirmed_daily_increase.iloc[:,12:] = US_confirmed_daily_increase.iloc[:,12:].diff(axis=1).fillna(0).astype('int')
US_dead_daily_increase.iloc[:,12:] = US_dead_daily_increase.iloc[:,12:].diff(axis=1).fillna(0).astype('int')

us_conf_daily = US_confirmed_daily_increase.melt(id_vars = US_confirmed_daily_increase.columns[:12], var_name = 'Date', value_name = 'Daily Confirmed Cases').sort_values(by = ['UID','Date']).reset_index(drop=True)
us_dead_daily = US_dead_daily_increase.melt(id_vars = US_dead_daily_increase.columns[:12], var_name = 'Date', value_name = 'Daily Dead').sort_values(by = ['UID','Date']).reset_index(drop=True)

us_daily = us_conf_daily.merge(us_dead_daily[['UID','Date','Daily Dead']], on = ['UID','Date'], how = 'left', suffixes = (False, False))
us_daily['Date'] = pd.to_datetime(us_daily['Date']).apply(lambda x: pd.datetime.strftime(x, '%m/%d/%y'))
us_daily = us_daily.sort_values(by = ['UID','Date']).reset_index(drop=True)

In [94]:
us_daily.to_csv(r'X:\AC\Documents\Datasets\US_daily_pivot.csv')
us_totals.to_csv(r'X:\AC\Documents\Datasets\US_totals_pivot.csv')

# Data Exploration

In [95]:
conf_total = us_conf_daily['Daily Confirmed Cases'].sum()
conf_dead = us_dead_daily['Daily Dead'].sum()
US_pop = 332639102


print('As of: ', us_totals['Date'].sort_values(ascending = True).to_list()[-1])
print('Total Confirmed Cases To Date: ', us_conf_daily['Daily Confirmed Cases'].sum())
print('Confirmed Cases Percentage of US population: %.2f' %((us_conf_daily['Daily Confirmed Cases'].sum()/ 332639102) * 100),'%')
print('\n')
print('Total Deaths To Date: ', us_dead_daily['Daily Dead'].sum())
print('Confirmed Cases Percentage of US population: %.2f' %((us_dead_daily['Daily Dead'].sum()/ 332639102) * 100),'%')
print('Percentage of deaths from confirmed cases : %.2f' %(us_dead_daily['Daily Dead'].sum()/us_conf_daily['Daily Confirmed Cases'].sum()*100),'%')


As of:  07/13/20
Total Confirmed Cases To Date:  3363055
Confirmed Cases Percentage of US population: 1.01 %


Total Deaths To Date:  135605
Confirmed Cases Percentage of US population: 0.04 %
Percentage of deaths from confirmed cases : 4.03 %


In [6]:
us_conf_daily.head()

Unnamed: 0,UID,FIPS,Admin2,Lat,Long_,Province_State,Country_Region,CBSA Code,CBSA Title,CSA Code,CSA Title,Date,Daily Confirmed Cases
0,16,60.0,,-14.271,-170.132,American Samoa,US,,,,,1/22/20,0
1,16,60.0,,-14.271,-170.132,American Samoa,US,,,,,1/23/20,0
2,16,60.0,,-14.271,-170.132,American Samoa,US,,,,,1/24/20,0
3,16,60.0,,-14.271,-170.132,American Samoa,US,,,,,1/25/20,0
4,16,60.0,,-14.271,-170.132,American Samoa,US,,,,,1/26/20,0


'7/1/20'

In [8]:
us_conf_daily.loc[(us_conf_daily['Province_State'] == 'Maine') & (us_conf_daily['Admin2'] == 'Washington')]

Unnamed: 0,UID,FIPS,Admin2,Lat,Long_,Province_State,Country_Region,CBSA Code,CBSA Title,CSA Code,CSA Title,Date,Daily Confirmed Cases
197340,84023029,23029.0,Washington,45.016071,-67.628135,Maine,US,,,,,1/22/20,0
197341,84023029,23029.0,Washington,45.016071,-67.628135,Maine,US,,,,,1/23/20,0
197342,84023029,23029.0,Washington,45.016071,-67.628135,Maine,US,,,,,1/24/20,0
197343,84023029,23029.0,Washington,45.016071,-67.628135,Maine,US,,,,,1/25/20,0
197344,84023029,23029.0,Washington,45.016071,-67.628135,Maine,US,,,,,1/26/20,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
197500,84023029,23029.0,Washington,45.016071,-67.628135,Maine,US,,,,,6/9/20,0
197501,84023029,23029.0,Washington,45.016071,-67.628135,Maine,US,,,,,7/1/20,1
197502,84023029,23029.0,Washington,45.016071,-67.628135,Maine,US,,,,,7/2/20,0
197503,84023029,23029.0,Washington,45.016071,-67.628135,Maine,US,,,,,7/3/20,0


In [7]:
us_conf_daily

Unnamed: 0,UID,FIPS,Admin2,Lat,Long_,Province_State,Country_Region,CBSA Code,CBSA Title,CSA Code,CSA Title,Date,Daily Confirmed Cases
0,16,60.0,,-14.271,-170.132,American Samoa,US,,,,,1/22/20,0
1,16,60.0,,-14.271,-170.132,American Samoa,US,,,,,1/23/20,0
2,16,60.0,,-14.271,-170.132,American Samoa,US,,,,,1/24/20,0
3,16,60.0,,-14.271,-170.132,American Samoa,US,,,,,1/25/20,0
4,16,60.0,,-14.271,-170.132,American Samoa,US,,,,,1/26/20,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
521755,84099999,99999.0,,0.000,0.000,Grand Princess,US,,,,,6/5/20,0
521756,84099999,99999.0,,0.000,0.000,Grand Princess,US,,,,,6/6/20,0
521757,84099999,99999.0,,0.000,0.000,Grand Princess,US,,,,,6/7/20,0
521758,84099999,99999.0,,0.000,0.000,Grand Princess,US,,,,,6/8/20,0


In [64]:
a = us_conf_daily[(us_conf_daily['Province_State']=='New York') & (us_conf_daily['Date']=='4/25/20')]['Daily Confirmed Cases'].sum()
c = us_conf_daily[(us_conf_daily['Province_State']=='New York')]['Daily Confirmed Cases'].sum()
c

393454

In [63]:
b = us_conf_daily[(us_conf_daily['Date']=='4/25/20')]['Daily Confirmed Cases'].sum()
b

32921

In [66]:
a/c

0.02682143274690307

# APPENDIX

In [104]:
#missing CBSA codes after merge: 41980, 10380, 38660, 11640, 41900, 49500, 32420, 25020, 27580, 17620, 17640, 42180
#They're all in Puerto Rico. Span across multiple CBSA and CSA. Should just ignore
res = Counter(df_counties['CBSA Code'].value_counts().to_dict()) - Counter(df_usconf['CBSA Code'].value_counts().to_dict())
pd.set_option('display.max_rows', None)
df_counties[df_counties['CBSA Code'].isin(list(res.keys()))]

Unnamed: 0,UID,CBSA Code,CBSA Title,CSA Code,CSA Title
1288,84072001,38660,"Ponce, PR",434.0,"Ponce-Yauco-Coamo, PR"
1289,84072003,10380,"Aguadilla-Isabela, PR",,
1290,84072005,10380,"Aguadilla-Isabela, PR",,
1291,84072007,41980,"San Juan-Bayamón-Caguas, PR",490.0,"San Juan-Bayamón, PR"
1292,84072009,41980,"San Juan-Bayamón-Caguas, PR",490.0,"San Juan-Bayamón, PR"
1293,84072011,10380,"Aguadilla-Isabela, PR",,
1294,84072013,11640,"Arecibo, PR",490.0,"San Juan-Bayamón, PR"
1295,84072015,25020,"Guayama, PR",490.0,"San Juan-Bayamón, PR"
1296,84072017,41980,"San Juan-Bayamón-Caguas, PR",490.0,"San Juan-Bayamón, PR"
1297,84072019,41980,"San Juan-Bayamón-Caguas, PR",490.0,"San Juan-Bayamón, PR"


In [None]:
df_usconf.loc[df_usconf['UID']==630,['CBSA Code', 'CBSA Title', 'CSA Code', 'CSA Title']] = [41980,'San Juan-Bayamón-Caguas, PR', 490.0, 'San Juan-Bayamón, PR']
df_usconf.loc[df_usconf['UID']==630][['CBSA Code', 'CBSA Title', 'CSA Code', 'CSA Title']]

In [115]:
columns_to_remove = ['iso2', 'iso3', 'code3', 'Combined_Key']
df_usconf = df_usconf[Counter(df_usconf.columns.tolist()) - Counter(columns_to_remove)]
df_usdead = df_usdead[Counter(df_usdead.columns.tolist()) - Counter(columns_to_remove)]

In [None]:
#Add additional column for CBSAs and CSAs before melting?

'''BA_counties = ['Alameda','Contra Costa','Marin','Napa','San Francisco','San Mateo','Santa Clara','Solano','Sonoma']
LA_counties =['Ventura','San Bernadio', 'Riverside', 'Los Angeles', 'Orange']'''