In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from collections import Counter

pd.options.display.max_columns = None

In [2]:
# JHU CSSE US time series: cumulative confirmed cases and cumulative deaths.
url = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_US.csv'
url2 = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_US.csv'
# `error_bad_lines` was removed in pandas 2.0. on_bad_lines='warn' keeps the old
# behaviour of error_bad_lines=False (malformed rows are dropped with a warning).
df_usconf = pd.read_csv(url, on_bad_lines='warn')
df_usdead = pd.read_csv(url2, on_bad_lines='warn')

# County -> CBSA/CSA lookup, keyed by UID.
# NOTE(review): absolute local path; the notebook cannot run on another machine
# until this is made configurable.
df_counties = pd.read_csv(r'X:\AC\Documents\Datasets\US Census and OMB Data\2020 Counties UID State CBSA CSA.csv', delimiter=',', encoding="ISO-8859-1")
df_counties = df_counties[['UID', 'CBSA Code', 'CBSA Title', 'CSA Code', 'CSA Title']]

In [3]:
#Add CBSA/CSA titles and codes to df
# suffixes=(False, False) makes the merge raise if any non-key column overlaps.
# NOTE: this cell is not idempotent; re-running it merges the same columns again.
df_usconf = df_usconf.merge(df_counties, on='UID', how='left', suffixes=(False, False))
df_usconf = df_usconf.merge(df_usdead[['UID', 'Population']], on='UID', how='left', suffixes=(False, False))
cols = df_usconf.columns.tolist()
# Positional reorder: 1 + 2 + 2 + 1 + 2 + 4 = 12 leading id columns, then cols[11:-5]
# (presumably the per-date columns; verify against the CSV header).
df_usconf = df_usconf[cols[0:1] + cols[4:6] + cols[8:10] + cols[-1:] + cols[6:8] + cols[-5:-1] + cols[11:-5]]

#convert date columns into a single column
df_usconf_pivot = (
    df_usconf
    .melt(id_vars=df_usconf.columns[:12], var_name='Date', value_name='Total Confirmed Cases')
    .sort_values(by=['UID', 'Date'])
    .reset_index(drop=True)
)
df_usdead_pivot = (
    df_usdead
    .melt(id_vars=df_usdead.columns[:12], var_name='Date', value_name='Total Dead')
    .sort_values(by=['UID', 'Date'])
    .reset_index(drop=True)
)


def _daily_increments(frame, n_id_cols=12):
    """Return a copy of `frame` whose count columns hold per-column increments.

    Columns from position `n_id_cols` onward are treated as cumulative counts
    ordered left to right. Each becomes the difference from the column before it.
    The first count column keeps its own value: zero-filling it (as the previous
    code did) silently drops whatever was already reported on the first date,
    so sums of the daily values under-count the cumulative total.
    """
    daily = frame.copy()
    counts = frame.iloc[:, n_id_cols:]
    steps = counts.diff(axis=1)
    if steps.shape[1] > 0:
        # diff() leaves the first column NaN; seed it with the first cumulative value.
        steps.iloc[:, 0] = counts.iloc[:, 0]
    daily.iloc[:, n_id_cols:] = steps.fillna(0).astype('int')
    return daily


#df for daily increments
df_usconf_daily = _daily_increments(df_usconf)
df_usdead_daily = _daily_increments(df_usdead)

#convert date columns into a single column
df_usconf_daily = (
    df_usconf_daily
    .melt(id_vars=df_usconf_daily.columns[:12], var_name='Date', value_name='Daily Confirmed Cases')
    .sort_values(by=['UID', 'Date'])
    .reset_index(drop=True)
)
df_usdead_daily = (
    df_usdead_daily
    .melt(id_vars=df_usdead_daily.columns[:12], var_name='Date', value_name='Daily Dead')
    .sort_values(by=['UID', 'Date'])
    .reset_index(drop=True)
)

In [4]:
# Join deaths onto confirmed cases per (UID, Date), then normalise the date labels.
# `pd.datetime` was removed in pandas 2.0, so formatting goes through `.dt.strftime`.
# Rows are sorted while 'Date' is still a real datetime: sorting the '%m/%d/%y'
# strings is lexical and puts e.g. 01/05/21 before 03/01/20 once data spans years.
us_totals = df_usconf_pivot.merge(df_usdead_pivot[['UID', 'Date', 'Total Dead']], on=['UID', 'Date'], how='left', suffixes=(False, False))
us_totals['Date'] = pd.to_datetime(us_totals['Date'])
us_totals = us_totals.sort_values(by=['UID', 'Date']).reset_index(drop=True)
us_totals['Date'] = us_totals['Date'].dt.strftime('%m/%d/%y')

us_daily = df_usconf_daily.merge(df_usdead_daily[['UID', 'Date', 'Daily Dead']], on=['UID', 'Date'], how='left', suffixes=(False, False))
us_daily['Date'] = pd.to_datetime(us_daily['Date'])
us_daily = us_daily.sort_values(by=['UID', 'Date']).reset_index(drop=True)
us_daily['Date'] = us_daily['Date'].dt.strftime('%m/%d/%y')

In [5]:
# Write both long-format tables to disk (to_csv defaults are used unchanged).
for frame, csv_path in (
    (us_daily, r'X:\AC\Documents\Datasets\US_daily_pivot.csv'),
    (us_totals, r'X:\AC\Documents\Datasets\US_totals_pivot.csv'),
):
    frame.to_csv(csv_path)

# Data Exploration

In [6]:
US_pop = 329943320 #as of 1/1/20

# 'Date' holds '%m/%d/%y' strings. Parse the distinct labels so "latest" and
# "last 14 days" are chronological instead of depending on row order or on
# lexical string comparison.
report_dates = pd.to_datetime(pd.Series(us_daily['Date'].unique()), format='%m/%d/%y').sort_values()
last14 = report_dates.iloc[-14:].dt.strftime('%m/%d/%y').tolist()
as_of = pd.to_datetime(pd.Series(us_totals['Date'].unique()), format='%m/%d/%y').max().strftime('%m/%d/%y')

conf_total = us_daily['Daily Confirmed Cases'].sum()
conf_dead = us_daily['Daily Dead'].sum()
last14_cases = us_daily.loc[us_daily['Date'].isin(last14), 'Daily Confirmed Cases'].sum()


print('US Stats As of: ', as_of)
print('Total Confirmed Cases To Date: ', conf_total)
print('Confirmed Cases Percentage of US population: %.2f' %((conf_total/ US_pop) * 100),'%')
print('\n')
print('Total Deaths To Date: ', (conf_dead))
# This line reports deaths; it was previously mislabelled as "Confirmed Cases".
print('Deaths Percentage of US population: %.2f' %((conf_dead/ US_pop) * 100),'%')
print('Percentage of deaths from confirmed cases : %.2f' %((conf_dead/ conf_total)*100),'%')
print('\n')
print('Last 14 days:')
print('Total Confirmed Cases: ', last14_cases)
print('Cases in Last 14 days as Percentage of Total Cases: %.2f' %((last14_cases/(conf_total))*100),'%')

US Stats As of:  07/29/20
Total Confirmed Cases To Date:  4426981
Confirmed Cases Percentage of US population: 1.34 %


Total Deaths To Date:  150713
Confirmed Cases Percentage of US population: 0.05 %
Percentage of deaths from confirmed cases : 3.40 %


Last 14 days:
Total Confirmed Cases:  928080
Cases in Last 14 days as Percentage of Total Cases: 20.96 %


In [178]:
# State-level daily and cumulative figures, one row per (state, date).
# Column subsets after groupby must be a list: the bare `['a','b']` tuple form
# was removed in pandas 2.0.
cruise_ships = ['Diamond Princess', 'Grand Princess']
statedaily = (
    us_daily.loc[~us_daily['Province_State'].isin(cruise_ships)]
    .groupby(['Province_State', 'Date'], as_index=False)[['Population', 'Daily Confirmed Cases']]
    .sum()
)
# Newest date first, states alphabetical within a date. Sort on the parsed date
# so the order stays chronological when the '%m/%d/%y' labels span several years.
statedaily = (
    statedaily
    .assign(_date_key=pd.to_datetime(statedaily['Date'], format='%m/%d/%y'))
    .sort_values(['_date_key', 'Province_State'], ascending=[False, True])
    .drop(columns='_date_key')
    .reset_index(drop=True)
)
statecumsum = us_totals.groupby(['Province_State', 'Date'], as_index=False)[['Total Confirmed Cases', 'Total Dead']].sum()
# Left merge: cruise-ship rows in statecumsum have no match in statedaily and are dropped.
stategrp = pd.merge(statedaily, statecumsum, on=['Province_State', 'Date'], how='left', suffixes=(False, False))
stategrp['Total Cases per 1000 capita'] = stategrp['Total Confirmed Cases'] / stategrp['Population'] * 1000

In [179]:
def _rank_within_date(frame, column):
    """Rank `column` among the rows sharing a Date.

    The largest value gets rank 1 and ties share the lowest rank (method='min').
    The result is aligned on `frame`'s index, so it can be assigned directly.
    """
    return frame.groupby('Date')[column].rank(ascending=False, method='min')


# Newest date first, states alphabetical within a date. The sort uses the parsed
# date because the '%m/%d/%y' labels do not sort chronologically across years.
stategrp = (
    stategrp
    .assign(_date_key=pd.to_datetime(stategrp['Date'], format='%m/%d/%y'))
    .sort_values(['_date_key', 'Province_State'], ascending=[False, True])
    .drop(columns='_date_key')
    .reset_index(drop=True)
)
dates = stategrp['Date'].unique().tolist()

# Column assignment replaces the earlier list-building loops and repeated
# pd.concat calls; re-running the cell no longer appends duplicate columns.
stategrp['Total Cases Daily Ranking'] = _rank_within_date(stategrp, 'Total Confirmed Cases')

# Rows are newest-first, so the next row within a state is the previous day.
# Change = yesterday's rank - today's rank (positive means the state moved up);
# the earliest date has no predecessor and stays NaN.
previous_rank = stategrp.groupby('Province_State')['Total Cases Daily Ranking'].shift(-1)
stategrp['Total Cases Ranking Daily Change'] = previous_rank - stategrp['Total Cases Daily Ranking']

stategrp['Daily Cases Ranking'] = _rank_within_date(stategrp, 'Daily Confirmed Cases')
stategrp['Cases per Capita Ranking'] = _rank_within_date(stategrp, 'Total Cases per 1000 capita')

In [181]:
# Date descending, then total-case rank ascending; display the first 25 rows.
(
    stategrp
    .sort_values(by=['Date', 'Total Cases Daily Ranking'], ascending=[False, True])
    .head(25)
)

Unnamed: 0,Province_State,Date,Population,Daily Confirmed Cases,Total Confirmed Cases,Total Dead,Total Cases per 1000 capita,Total Cases Daily Ranking,Total Cases Ranking Daily Change,Daily Cases Ranking,Cases per Capita Ranking
5,California,07/29/20,39512223,14151,484913,8908,12.272481,1.0,0.0,1.0,23.0
10,Florida,07/29/20,21477737,9446,451423,6333,21.018183,2.0,0.0,3.0,3.0
47,Texas,07/29/20,28995881,10502,418995,6193,14.450156,3.0,1.0,2.0,16.0
34,New York,07/29/20,26161672,715,413593,32658,15.80912,4.0,-1.0,24.0,12.0
32,New Jersey,07/29/20,8882190,305,180600,15798,20.332823,5.0,0.0,37.0,4.0
11,Georgia,07/29/20,10617423,3271,178323,3642,16.795318,6.0,0.0,4.0,10.0
15,Illinois,07/29/20,12671821,1395,176363,7654,13.917731,7.0,0.0,14.0,18.0
3,Arizona,07/29/20,7278717,2339,168273,3454,23.118497,8.0,0.0,5.0,2.0
35,North Carolina,07/29/20,10488084,1687,118387,1888,11.287762,9.0,0.0,11.0,25.0
23,Massachusetts,07/29/20,6892503,502,116684,8580,16.929118,10.0,0.0,30.0,9.0


In [182]:
# Largest single-day case increases across all state/date rows (first 9)
(
    stategrp
    .sort_values(by='Daily Confirmed Cases', ascending=False)
    .head(9)
)

Unnamed: 0,Province_State,Date,Population,Daily Confirmed Cases,Total Confirmed Cases,Total Dead,Total Cases per 1000 capita,Total Cases Daily Ranking,Total Cases Ranking Daily Change,Daily Cases Ranking,Cases per Capita Ranking
962,Florida,07/12/20,21477737,15300,269811,4242,12.562357,3.0,0.0,1.0,10.0
775,Texas,07/16/20,28995881,14962,305854,3657,10.548188,4.0,0.0,1.0,19.0
5,California,07/29/20,39512223,14151,484913,8908,12.272481,1.0,0.0,1.0,23.0
738,Florida,07/16/20,21477737,13965,315775,4677,14.702434,3.0,0.0,2.0,8.0
1237,California,07/07/20,39512223,12977,284012,6573,7.187953,2.0,0.0,1.0,30.0
845,California,07/14/20,39512223,12854,346211,7250,8.762124,2.0,0.0,1.0,24.0
61,California,07/28/20,39512223,12641,470762,8679,11.914339,1.0,0.0,1.0,24.0
906,Florida,07/13/20,21477737,12624,282435,4277,13.150128,3.0,0.0,1.0,10.0
439,Texas,07/22/20,28995881,12544,363615,4453,12.540229,4.0,0.0,1.0,19.0


In [7]:
# Largest single-day case increases across all county/date rows (first 5)
county_view_cols = ['Admin2', 'Province_State', 'Date', 'Daily Confirmed Cases', 'Daily Dead']
(
    us_daily
    .sort_values(by=['Daily Confirmed Cases'], ascending=False)
    [county_view_cols]
    .head(5)
)

Unnamed: 0,Admin2,Province_State,Date,Daily Confirmed Cases,Daily Dead
369064,New York,New York,04/15/20,7837,561
54885,Los Angeles,California,07/05/20,7198,30
369053,New York,New York,04/04/20,6147,696
498166,Bexar,Texas,07/16/20,5980,28
369060,New York,New York,04/11/20,5924,714


# CA Data Exploration

In [188]:
# California's largest single-day increases at state level (first 4)
is_california = stategrp['Province_State'] == 'California'
(
    stategrp
    .loc[is_california]
    .sort_values(by=['Daily Confirmed Cases'], ascending=False)
    .head(4)
)

Unnamed: 0,Province_State,Date,Population,Daily Confirmed Cases,Total Confirmed Cases,Total Dead,Total Cases per 1000 capita,Total Cases Daily Ranking,Total Cases Ranking Daily Change,Daily Cases Ranking,Cases per Capita Ranking
5,California,07/29/20,39512223,14151,484913,8908,12.272481,1.0,0.0,1.0,23.0
1237,California,07/07/20,39512223,12977,284012,6573,7.187953,2.0,0.0,1.0,30.0
845,California,07/14/20,39512223,12854,346211,7250,8.762124,2.0,0.0,1.0,24.0
61,California,07/28/20,39512223,12641,470762,8679,11.914339,1.0,0.0,1.0,24.0


In [187]:
# Largest single-day increases among California county/date rows (first 4)
ca_county_cols = ['Admin2', 'Province_State', 'Date', 'Daily Confirmed Cases', 'Daily Dead']
(
    us_daily
    .loc[us_daily['Province_State'] == 'California']
    .sort_values(by=['Daily Confirmed Cases'], ascending=False)
    [ca_county_cols]
    .head(4)
)

Unnamed: 0,Admin2,Province_State,Date,Daily Confirmed Cases,Daily Dead
54885,Los Angeles,California,07/05/20,7198,30
54909,Los Angeles,California,07/29/20,4814,92
54896,Los Angeles,California,07/16/20,4471,46
54894,Los Angeles,California,07/14/20,4219,73


In [189]:
# California counties on 07/29/20, ordered by that day's new cases (first 4)
ca_on_0729 = (us_daily['Province_State'] == 'California') & (us_daily['Date'] == '07/29/20')
ca_0729_cols = ['Admin2', 'Province_State', 'Date', 'Daily Confirmed Cases', 'Daily Dead']
(
    us_daily
    .loc[ca_on_0729, ca_0729_cols]
    .sort_values(by='Daily Confirmed Cases', ascending=False)
    .head(4)
)

Unnamed: 0,Admin2,Province_State,Date,Daily Confirmed Cases,Daily Dead
54909,Los Angeles,California,07/29/20,4814,92
58139,San Bernardino,California,07/29/20,2347,24
57569,Riverside,California,07/29/20,972,11
58329,San Diego,California,07/29/20,780,19


In [70]:
# Daily cases and deaths for California, summed per CBSA and date.
# The column subset after groupby must be a list: the bare `['a','b']` tuple
# form was removed in pandas 2.0.
CACBSAdaily = (
    us_daily.loc[us_daily['Province_State'] == 'California']
    .groupby(['Province_State', 'CBSA Title', 'Date'], as_index=False)[['Daily Confirmed Cases', 'Daily Dead']]
    .sum()
)
CACBSAdaily.sort_values(['Date', 'Daily Confirmed Cases'], ascending=[False, False]).reset_index(drop=True)

Unnamed: 0,Province_State,CBSA Title,Date,Daily Confirmed Cases,Daily Dead
0,California,"Los Angeles-Long Beach-Anaheim, CA",07/29/20,5253,98
1,California,"Riverside-San Bernardino-Ontario, CA",07/29/20,3319,35
2,California,"San Francisco-Oakland-Berkeley, CA",07/29/20,838,4
3,California,"San Diego-Chula Vista-Carlsbad, CA",07/29/20,780,19
4,California,"Bakersfield, CA",07/29/20,448,0
...,...,...,...,...,...
6455,California,"Truckee-Grass Valley, CA",01/22/20,0,0
6456,California,"Ukiah, CA",01/22/20,0,0
6457,California,"Vallejo, CA",01/22/20,0,0
6458,California,"Visalia, CA",01/22/20,0,0


# APPENDIX

In [None]:
url3 = 'https://covidtracking.com/api/v1/states/daily.csv'
# `error_bad_lines` was removed in pandas 2.0. on_bad_lines='warn' keeps the old
# behaviour of error_bad_lines=False (malformed rows are dropped with a warning).
df_testing = pd.read_csv(url3, on_bad_lines='warn')

# Use the same key column names as the other frames in this notebook.
df_testing = df_testing.rename(columns={'date': 'Date', 'state': 'Province_State'})

# Daily-increase columns and the cumulative columns they should add up to,
# paired by position.
to_fix = ['totalTestResultsIncrease', 'positiveIncrease', 'negativeIncrease']
actual = ['totalTestResults', 'positive', 'negative']

# Missing counts are treated as zero in all six columns.
count_cols = to_fix + actual
df_testing[count_cols] = df_testing[count_cols].fillna(0)

columns = df_testing.columns
def fill_func(states, frame=None, increase_cols=None, total_cols=None):
    """Patch the earliest daily increase that disagrees with its running total.

    For each state and each (increase, total) column pair, the state's rows are
    ordered by 'Date' and the cumulative sum of the increase column is compared
    with the total column. If they differ anywhere, the increase at the earliest
    differing row is overwritten with that row's total. Only that one row is
    changed per state and column pair.

    Parameters
    ----------
    states : iterable
        Values of 'Province_State' to process.
    frame : DataFrame, optional
        Frame to patch in place. Defaults to the notebook-level `df_testing`.
    increase_cols, total_cols : list of str, optional
        Column names paired by position. Default to the notebook-level
        `to_fix` and `actual`.

    Returns
    -------
    None. `frame` is modified in place.
    """
    frame = df_testing if frame is None else frame
    increase_cols = to_fix if increase_cols is None else increase_cols
    total_cols = actual if total_cols is None else total_cols

    for state in states:
        state_rows = frame.loc[frame['Province_State'] == state].sort_values(by='Date')
        for inc_col, total_col in zip(increase_cols, total_cols):
            running = state_rows[inc_col].cumsum()
            mismatched = running.ne(state_rows[total_col])
            if mismatched.any():
                # `first_bad` is an index label, so address the row with .loc.
                # The old code passed the label to .iloc, which is only right
                # when the frame has a default 0..n-1 index.
                first_bad = mismatched[mismatched].index[0]
                frame.loc[first_bad, inc_col] = frame.loc[first_bad, total_col]
                
fill_func(df_testing['Province_State'].unique())
# NOTE(review): `states` is not defined in any visible cell; it has to come from
# earlier kernel state (presumably a mapping used to relabel Province_State).
# Define it before this line or the cell fails on a fresh kernel.
df_testing = df_testing.replace({'Province_State':states})
# `pd.datetime` was removed in pandas 2.0; format through the .dt accessor instead.
df_testing['Date'] = pd.to_datetime(df_testing['Date'], format='%Y%m%d').dt.strftime('%m/%d/%y')



In [None]:
# CBSA codes missing after the merge: 41980, 10380, 38660, 11640, 41900, 49500, 32420, 25020, 27580, 17620, 17640, 42180
# All of them are in Puerto Rico and span multiple CBSAs and CSAs, so they can be ignored.
census_code_counts = Counter(df_counties['CBSA Code'].value_counts().to_dict())
merged_code_counts = Counter(df_usconf['CBSA Code'].value_counts().to_dict())
# Counter subtraction keeps only codes that occur more often in df_counties.
res = census_code_counts - merged_code_counts
pd.set_option('display.max_rows', None)
df_counties.loc[df_counties['CBSA Code'].isin(list(res.keys()))]

In [None]:
# Set the CBSA/CSA fields by hand for UID 630, then display the patched row.
cbsa_fields = ['CBSA Code', 'CBSA Title', 'CSA Code', 'CSA Title']
is_uid_630 = df_usconf['UID'] == 630
df_usconf.loc[is_uid_630, cbsa_fields] = [41980, 'San Juan-Bayamón-Caguas, PR', 490.0, 'San Juan-Bayamón, PR']
df_usconf.loc[is_uid_630, cbsa_fields]

In [None]:
columns_to_remove = ['iso2', 'iso3', 'code3', 'Combined_Key']
# drop(..., errors='ignore') removes every listed column that is present and
# skips the ones that are not. The earlier Counter subtraction had multiset
# semantics, so a column name occurring twice would have survived the removal.
df_usconf = df_usconf.drop(columns=columns_to_remove, errors='ignore')
df_usdead = df_usdead.drop(columns=columns_to_remove, errors='ignore')

In [None]:
#Add additional column for CBSAs and CSAs before melting?
# The string below is an inert draft: nothing is assigned or executed, and the
# cell just displays the text. BA/LA presumably mean Bay Area and Los Angeles
# area; confirm before use.
# NOTE(review): 'San Bernadio' does not match the 'San Bernardino' spelling that
# Admin2 uses in the outputs above; correct it before turning this into real code.

'''BA_counties = ['Alameda','Contra Costa','Marin','Napa','San Francisco','San Mateo','Santa Clara','Solano','Sonoma']
LA_counties =['Ventura','San Bernadio', 'Riverside', 'Los Angeles', 'Orange']'''

In [None]:
#California and Texas total confirmed cases differ by 2000+. Stick to COVIDTESTING data for testing dataframe
# The column subset after groupby must be a list: the bare `['a','b']` tuple
# form was removed in pandas 2.0.
totalgrp = us_totals.groupby(['Province_State', 'Date'], as_index=False)[['Total Confirmed Cases', 'Total Dead']].sum()

# Compare the two sources state by state on a single date.
check_date = '07/27/20'
testing_on_date = df_testing.loc[df_testing['Date'] == check_date, ['Province_State', 'Date', 'positive']]
jhu_on_date = totalgrp.loc[totalgrp['Date'] == check_date, ['Province_State', 'Date', 'Total Confirmed Cases']]
comparison = testing_on_date.merge(jhu_on_date, on=['Province_State', 'Date'], how='left', suffixes=(False, False))
# Delta > 0 means the testing source reports more positives than the JHU total.
comparison['Delta'] = comparison['positive'] - comparison['Total Confirmed Cases']
comparison