In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from collections import Counter

pd.options.display.max_columns = None

In [2]:
# Remote time-series CSVs (confirmed cases and deaths) read below.
url = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_US.csv'
url2 = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_US.csv'


def _read_csv_skipping_bad_lines(path, **kwargs):
    """Read a CSV, warning about and skipping malformed rows.

    The `on_bad_lines` keyword is tried first. If the installed pandas does
    not know that keyword (it raises TypeError), the legacy
    `error_bad_lines=False` spelling is used instead, so the cell runs on
    both old and new pandas releases.

    Parameters
    ----------
    path : str or file-like
        Anything `pd.read_csv` accepts as its first argument.
    **kwargs
        Forwarded unchanged to `pd.read_csv`.

    Returns
    -------
    pandas.DataFrame
    """
    try:
        return pd.read_csv(path, on_bad_lines='warn', **kwargs)
    except TypeError:
        return pd.read_csv(path, error_bad_lines=False, **kwargs)


df_usconf = _read_csv_skipping_bad_lines(url)
df_usdead = _read_csv_skipping_bad_lines(url2)

# NOTE(review): absolute local path -- this file must exist on the machine running the notebook.
df_counties = pd.read_csv(r'X:\AC\Documents\Datasets\US Census and OMB Data\2020 Counties UID State CBSA CSA.csv', delimiter = ',', encoding = "ISO-8859-1")
# Keep only the join key and the CBSA/CSA attributes that get merged onto the case data.
df_counties = df_counties[['UID', 'CBSA Code', 'CBSA Title', 'CSA Code', 'CSA Title']]

In [3]:
N_ID_COLS = 12  # leading identifier columns that precede the per-date columns in both frames


def _daily_increments(frame, n_id_cols=N_ID_COLS):
    """Return a copy of a wide cumulative frame holding day-over-day increments.

    Parameters
    ----------
    frame : pandas.DataFrame
        Wide frame whose first `n_id_cols` columns are identifiers and whose
        remaining columns are cumulative counts, one column per date.
    n_id_cols : int
        Number of leading identifier columns to leave untouched.

    Returns
    -------
    pandas.DataFrame
        Same shape as `frame`; the date columns hold integer increments.
    """
    daily = frame.copy()
    cumulative = frame.iloc[:, n_id_cols:]
    increments = cumulative.diff(axis=1)
    if increments.shape[1] > 0:
        # diff() leaves the first date column as NaN. Seed it with that day's
        # cumulative count so the increments add up to the latest total
        # instead of silently dropping everything reported on day one.
        increments.iloc[:, 0] = cumulative.iloc[:, 0]
    daily.iloc[:, n_id_cols:] = increments.fillna(0).astype('int')
    return daily


def _melt_dates(frame, value_name, n_id_cols=N_ID_COLS):
    """Unpivot the per-date columns into 'Date'/`value_name` rows.

    Rows are ordered by UID and then chronologically. The 'Date' labels are
    left exactly as they appear in the column headers; they are parsed only
    to build the sort key, because sorting the raw labels as text is
    lexicographic rather than chronological.
    """
    long_frame = frame.melt(id_vars = frame.columns[:n_id_cols], var_name = 'Date', value_name = value_name)
    date_key = pd.to_datetime(long_frame['Date'])
    return (
        long_frame.assign(_date_key = date_key)
        .sort_values(by = ['UID', '_date_key'])
        .drop(columns = '_date_key')
        .reset_index(drop=True)
    )


#Add CBSA/CSA titles and codes to df
df_usconf = df_usconf.merge(df_counties, on = 'UID', how = 'left', suffixes=(False,False))
df_usconf = df_usconf.merge(df_usdead[['UID','Population']], on = 'UID', how = 'left', suffixes=(False,False))
cols = df_usconf.columns.tolist()
# Positional reorder: 12 identifier columns first, then the date columns.
# NOTE(review): depends on the exact column layout of the downloaded CSV -- confirm if the source changes.
df_usconf = df_usconf[cols[0:1] + cols[4:6] + cols[8:10] + cols[-1:] + cols[6:8] + cols[-5:-1] + cols[11:-5]]

#convert date columns into a single column
df_usconf_pivot = _melt_dates(df_usconf, 'Total Confirmed Cases')
df_usdead_pivot = _melt_dates(df_usdead, 'Total Dead')

#df for daily increments, then convert date columns into a single column
df_usconf_daily = _melt_dates(_daily_increments(df_usconf), 'Daily Confirmed Cases')
df_usdead_daily = _melt_dates(_daily_increments(df_usdead), 'Daily Dead')

In [4]:
def _merge_cases_with_deaths(cases_long, deaths_long, deaths_col):
    """Attach a deaths column to a long-format cases frame and order it by date.

    The frames are joined on ('UID', 'Date') using the raw date labels. The
    result is sorted by UID and by the parsed date, and only then is 'Date'
    rendered as zero-padded 'MM/DD/YY' text. Sorting before formatting keeps
    the row order chronological even when the data spans more than one year.

    Parameters
    ----------
    cases_long : pandas.DataFrame
        Long-format frame containing 'UID' and 'Date'.
    deaths_long : pandas.DataFrame
        Long-format frame containing 'UID', 'Date' and `deaths_col`.
    deaths_col : str
        Name of the column to bring over from `deaths_long`.

    Returns
    -------
    pandas.DataFrame
        Merged frame with a fresh RangeIndex and 'Date' as 'MM/DD/YY' text.
    """
    merged = cases_long.merge(deaths_long[['UID', 'Date', deaths_col]], on = ['UID', 'Date'], how = 'left', suffixes = (False, False))
    merged['Date'] = pd.to_datetime(merged['Date'])
    merged = merged.sort_values(by = ['UID', 'Date']).reset_index(drop=True)
    # Series.dt.strftime replaces the removed pd.datetime alias and yields the same text.
    merged['Date'] = merged['Date'].dt.strftime('%m/%d/%y')
    return merged


us_totals = _merge_cases_with_deaths(df_usconf_pivot, df_usdead_pivot, 'Total Dead')
us_daily = _merge_cases_with_deaths(df_usconf_daily, df_usdead_daily, 'Daily Dead')

In [5]:
US_pop = 329943320 #as of 1/1/20

# Nationwide totals obtained by summing every county's daily increments.
conf_total = us_daily['Daily Confirmed Cases'].sum()
conf_dead = us_daily['Daily Dead'].sum()

# 'Date' is stored as 'MM/DD/YY' text, which does not sort chronologically
# across years, so the reporting days are parsed before being ordered.
report_days = pd.to_datetime(us_daily['Date'], format='%m/%d/%y').drop_duplicates().sort_values()
last14 = report_days.iloc[-14:].dt.strftime('%m/%d/%y').tolist()
latest_day = last14[-1] if last14 else None

as_of = pd.to_datetime(us_totals['Date'], format='%m/%d/%y').max()
as_of_label = as_of.strftime('%m/%d/%y') if pd.notna(as_of) else 'n/a'

# Daily increments restricted to the most recent day and to the most recent 14 days.
cases_latest_day = us_daily.loc[us_daily['Date'] == latest_day, 'Daily Confirmed Cases'].sum()
cases_last14 = us_daily.loc[us_daily['Date'].isin(last14), 'Daily Confirmed Cases'].sum()

print('US Stats As of: ', as_of_label)
print('Total Confirmed Cases To Date: ', conf_total)
print('Confirmed Cases Percentage of US population: %.2f' %((conf_total/ US_pop) * 100),'%')
print('\n')
print('Total Deaths To Date: ', (conf_dead))
# This line reports deaths; the label previously said "Confirmed Cases".
print('Deaths Percentage of US population: %.2f' %((conf_dead/ US_pop) * 100),'%')
print('Percentage of deaths from confirmed cases : %.2f' %((conf_dead/ conf_total)*100),'%')
print('\n')
print('Since Yesterday:')
# Only the latest reporting day belongs here; the 14-day sum is printed in the next section.
print('Increase in Total Confirmed Cases: ', cases_latest_day)

print('\n')
print('Last 14 days:')
print('Total Confirmed Cases: ', cases_last14)
print('Cases in Last 14 days as Percentage of Total Cases: %.2f' %((cases_last14/(conf_total))*100),'%')

US Stats As of:  07/30/20
Total Confirmed Cases To Date:  4495014
Confirmed Cases Percentage of US population: 1.36 %


Total Deaths To Date:  152055
Confirmed Cases Percentage of US population: 0.05 %
Percentage of deaths from confirmed cases : 3.38 %


Last 14 days:
Total Confirmed Cases:  918858
Cases in Last 14 days as Percentage of Total Cases: 20.44 %


In [6]:
# Persist both long-format frames as CSV, row index included.
# NOTE(review): absolute local paths -- these only resolve on the original author's machine.
us_daily.to_csv(path_or_buf=r'X:\AC\Documents\Datasets\US_daily_pivot.csv', index=True)
us_totals.to_csv(path_or_buf=r'X:\AC\Documents\Datasets\US_totals_pivot.csv', index=True)

# Data Exploration

In [7]:
# Per-state, per-day sums of population and new cases, excluding the two cruise-ship entries.
# Columns are selected with a list; tuple-style selection after groupby is not accepted by current pandas.
statedaily = (
    us_daily.loc[~us_daily['Province_State'].isin(['Diamond Princess','Grand Princess'])]
    .groupby(['Province_State','Date'], as_index=False)[['Population','Daily Confirmed Cases']]
    .sum()
)
# Newest day first, states alphabetical within a day. 'Date' is 'MM/DD/YY' text,
# so a parsed key is used for ordering to stay chronological across years.
statedaily = (
    statedaily.assign(_date_key = pd.to_datetime(statedaily['Date'], format='%m/%d/%y'))
    .sort_values(['_date_key','Province_State'], ascending = [False,True])
    .drop(columns = '_date_key')
    .reset_index(drop=True)
)
# Per-state, per-day cumulative cases and deaths.
statecumsum = us_totals.groupby(['Province_State','Date'], as_index=False)[['Total Confirmed Cases','Total Dead']].sum()
stategrp = pd.merge(statedaily, statecumsum, on = ['Province_State','Date'], how = 'left', suffixes = (False, False))

In [8]:
#Feature creation, rankings
# Every ranking below is computed per day, largest value = rank 1, ties sharing the lowest rank.
stategrp['Total Cases per 1000 capita'] = stategrp['Total Confirmed Cases']/stategrp['Population']*1000

# 'Date' is 'MM/DD/YY' text; a parsed copy is used wherever chronological order matters.
date_key = pd.to_datetime(stategrp['Date'], format='%m/%d/%y')

# groupby().rank() is index-aligned, so the result does not depend on the rows
# of each day being contiguous or in any particular order.
stategrp['Total Cases Daily Ranking'] = stategrp.groupby('Date')['Total Confirmed Cases'].rank(ascending = False, method='min')

# Rank change = the state's rank on its previous reporting day minus its rank today
# (positive means the state moved up the table). The earliest day per state has no
# predecessor and is left as NaN.
chronological = stategrp.assign(_date_key = date_key).sort_values(['Province_State','_date_key'])
previous_rank = chronological.groupby('Province_State')['Total Cases Daily Ranking'].shift(1)
stategrp['Total Cases Ranking Daily Change'] = previous_rank - stategrp['Total Cases Daily Ranking']

stategrp['Daily Cases Ranking'] = stategrp.groupby('Date')['Daily Confirmed Cases'].rank(ascending = False, method='min')
stategrp['Cases per Capita Ranking'] = stategrp.groupby('Date')['Total Cases per 1000 capita'].rank(ascending = False, method='min')

# Final layout: newest day first, best-ranked state first within each day.
stategrp = (
    stategrp.assign(_date_key = date_key)
    .sort_values(['_date_key','Total Cases Daily Ranking'], ascending = [False,True])
    .drop(columns = '_date_key')
    .reset_index(drop=True)
)

In [9]:
# First 25 rows of the ranked state table.
stategrp.iloc[:25]

Unnamed: 0,Province_State,Date,Population,Daily Confirmed Cases,Total Confirmed Cases,Total Dead,Total Cases per 1000 capita,Total Cases Daily Ranking,Total Cases Ranking Daily Change,Daily Cases Ranking,Cases per Capita Ranking
0,California,07/30/20,39512223,8021,492934,9026,12.475481,1.0,0.0,3.0,23.0
1,Florida,07/30/20,21477737,9956,461379,6586,21.481732,2.0,0.0,1.0,3.0
2,Texas,07/30/20,28995881,9234,428229,6442,14.768615,3.0,0.0,2.0,16.0
3,New York,07/30/20,26161672,777,414370,32683,15.838819,4.0,0.0,25.0,12.0
4,Georgia,07/30/20,10617423,3963,182286,3671,17.168573,5.0,1.0,4.0,8.0
5,New Jersey,07/30/20,8882190,370,180970,15809,20.37448,6.0,-1.0,35.0,4.0
6,Illinois,07/30/20,12671821,1772,178135,7670,14.057569,7.0,0.0,11.0,18.0
7,Arizona,07/30/20,7278717,2525,170798,3626,23.465399,8.0,0.0,5.0,2.0
8,North Carolina,07/30/20,10488084,2145,120532,1922,11.49228,9.0,0.0,7.0,25.0
9,Massachusetts,07/30/20,6892503,414,117098,8580,16.989184,10.0,0.0,32.0,11.0


In [10]:
# Nine largest single-day case counts over all states and days.
stategrp.sort_values('Daily Confirmed Cases', ascending=False).iloc[:9]

Unnamed: 0,Province_State,Date,Population,Daily Confirmed Cases,Total Confirmed Cases,Total Dead,Total Cases per 1000 capita,Total Cases Daily Ranking,Total Cases Ranking Daily Change,Daily Cases Ranking,Cases per Capita Ranking
1010,Florida,07/12/20,21477737,15300,269811,4242,12.562357,3.0,0.0,1.0,10.0
787,Texas,07/16/20,28995881,14962,305854,3657,10.548188,4.0,0.0,1.0,19.0
56,California,07/29/20,39512223,14151,484913,8908,12.272481,1.0,0.0,1.0,23.0
786,Florida,07/16/20,21477737,13965,315775,4677,14.702434,3.0,0.0,2.0,8.0
1289,California,07/07/20,39512223,12977,284012,6573,7.187953,2.0,0.0,1.0,30.0
897,California,07/14/20,39512223,12854,346211,7250,8.762124,2.0,0.0,1.0,24.0
112,California,07/28/20,39512223,12641,470762,8679,11.914339,1.0,0.0,1.0,24.0
954,Florida,07/13/20,21477737,12624,282435,4277,13.150128,3.0,0.0,1.0,10.0
451,Texas,07/22/20,28995881,12544,363615,4453,12.540229,4.0,0.0,1.0,19.0


In [11]:
# Five largest single-day case counts over all counties and days.
(
    us_daily
    .sort_values(by=['Daily Confirmed Cases'], ascending=False)
    .loc[:, ['Admin2','Province_State','Date','Daily Confirmed Cases','Daily Dead']]
    .iloc[:5]
)

Unnamed: 0,Admin2,Province_State,Date,Daily Confirmed Cases,Daily Dead
371006,New York,New York,04/15/20,7837,561
55173,Los Angeles,California,07/05/20,7198,30
370995,New York,New York,04/04/20,6147,696
500787,Bexar,Texas,07/16/20,5980,28
371002,New York,New York,04/11/20,5924,714


# CA Data Exploration

In [15]:
# Last five rows of daily confirmed cases for the San Francisco, California rows.
us_daily.query("Province_State == 'California' and Admin2 == 'San Francisco'")[['Date','Daily Confirmed Cases']].tail()

Unnamed: 0,Date,Daily Confirmed Cases
58823,07/26/20,145
58824,07/27/20,90
58825,07/28/20,43
58826,07/29/20,132
58827,07/30/20,226


In [16]:
# Summary statistics of daily confirmed cases for the San Francisco, California rows.
us_daily.query("Province_State == 'California' and Admin2 == 'San Francisco'")[['Date','Daily Confirmed Cases']].describe()

Unnamed: 0,Daily Confirmed Cases
count,191.0
mean,33.628272
std,41.038147
min,-18.0
25%,0.0
50%,23.0
75%,48.0
max,226.0


In [213]:
# Summary statistics of the California rows of the state table.
stategrp[stategrp['Province_State'].eq('California')].describe()

Unnamed: 0,Population,Daily Confirmed Cases,Total Confirmed Cases,Total Dead,Total Cases per 1000 capita,Total Cases Daily Ranking,Total Cases Ranking Daily Change,Daily Cases Ranking,Cases per Capita Ranking
count,190.0,190.0,190.0,190.0,190.0,190.0,189.0,190.0,190.0
mean,39512223.0,2552.173684,97759.842105,2625.536842,2.474167,2.921053,0.005291,2.873684,22.4
std,0.0,3217.239677,127598.172994,2790.400526,3.229334,1.546144,0.334177,2.52078,12.754831
min,39512223.0,0.0,0.0,0.0,0.0,1.0,-2.0,1.0,1.0
25%,39512223.0,17.5,111.75,1.25,0.002828,2.0,0.0,1.0,7.0
50%,39512223.0,1448.0,42963.5,1707.0,1.087347,3.0,0.0,2.0,30.0
75%,39512223.0,3231.25,145854.0,4967.0,3.691364,4.0,0.0,3.0,32.0
max,39512223.0,14151.0,484913.0,8908.0,12.272481,6.0,2.0,18.0,34.0


In [188]:
# California's four largest single-day case counts.
(
    stategrp[stategrp['Province_State'].eq('California')]
    .sort_values(by=['Daily Confirmed Cases'], ascending=False)
    .iloc[:4]
)

Unnamed: 0,Province_State,Date,Population,Daily Confirmed Cases,Total Confirmed Cases,Total Dead,Total Cases per 1000 capita,Total Cases Daily Ranking,Total Cases Ranking Daily Change,Daily Cases Ranking,Cases per Capita Ranking
5,California,07/29/20,39512223,14151,484913,8908,12.272481,1.0,0.0,1.0,23.0
1237,California,07/07/20,39512223,12977,284012,6573,7.187953,2.0,0.0,1.0,30.0
845,California,07/14/20,39512223,12854,346211,7250,8.762124,2.0,0.0,1.0,24.0
61,California,07/28/20,39512223,12641,470762,8679,11.914339,1.0,0.0,1.0,24.0


In [201]:
#Create df for CA's CBSAs
# Per-CBSA, per-day sums of population and new cases for California rows.
# Columns are selected with a list; tuple-style selection after groupby is not accepted by current pandas.
cacbsa_daily = (
    us_daily.loc[(us_daily['Province_State']=='California')]
    .groupby(['Province_State','CBSA Title','Date'], as_index=False)[['Population','Daily Confirmed Cases']]
    .sum()
)
# Newest day first, largest daily count first within a day. 'Date' is 'MM/DD/YY'
# text, so a parsed key is used for ordering to stay chronological across years.
cacbsa_daily = (
    cacbsa_daily.assign(_date_key = pd.to_datetime(cacbsa_daily['Date'], format='%m/%d/%y'))
    .sort_values(['_date_key','Daily Confirmed Cases'], ascending = [False,False])
    .drop(columns = '_date_key')
    .reset_index(drop=True)
)
# Per-CBSA, per-day cumulative cases and deaths.
cacbsa_cumsum = (
    us_totals.loc[us_totals['Province_State']=='California']
    .groupby(['Province_State','CBSA Title','Date'], as_index=False)[['Total Confirmed Cases','Total Dead']]
    .sum()
)
cacbsa = pd.merge(cacbsa_daily, cacbsa_cumsum, on = ['Province_State','CBSA Title','Date'], how = 'left', suffixes = (False, False))

In [202]:
#Feature creation, rankings
cacbsa['Total Cases per 1000 capita'] = cacbsa['Total Confirmed Cases']/cacbsa['Population']*1000

# 'Date' is 'MM/DD/YY' text; a parsed copy is used wherever chronological order matters.
date_key = pd.to_datetime(cacbsa['Date'], format='%m/%d/%y')

# Rank CBSAs within each day by cumulative cases (largest = 1, ties share the lowest rank).
# groupby().rank() is index-aligned, so it does not rely on row order.
cacbsa['Total Cases Daily Ranking'] = cacbsa.groupby('Date')['Total Confirmed Cases'].rank(ascending = False, method='min')

# Rank change = the CBSA's rank on its previous reporting day minus its rank today
# (positive means it moved up the table). The earliest day per CBSA is left as NaN.
chronological = cacbsa.assign(_date_key = date_key).sort_values(['CBSA Title','_date_key'])
previous_rank = chronological.groupby('CBSA Title')['Total Cases Daily Ranking'].shift(1)
cacbsa['Total Cases Ranking Daily Change'] = previous_rank - cacbsa['Total Cases Daily Ranking']

# Final layout: newest day first, best-ranked CBSA first within each day.
cacbsa = (
    cacbsa.assign(_date_key = date_key)
    .sort_values(['_date_key','Total Cases Daily Ranking'], ascending = [False,True])
    .drop(columns = '_date_key')
    .reset_index(drop=True)
)

In [206]:
# All rows for the Los Angeles-Long Beach-Anaheim CBSA.
cacbsa[cacbsa['CBSA Title'].eq('Los Angeles-Long Beach-Anaheim, CA')]

Unnamed: 0,Province_State,CBSA Title,Date,Population,Daily Confirmed Cases,Total Confirmed Cases,Total Dead,Total Cases per 1000 capita,Total Cases Daily Ranking,Total Cases Ranking Daily Change
0,California,"Los Angeles-Long Beach-Anaheim, CA",07/29/20,13214799,5253,218728,5105,16.551746,1.0,0.0
34,California,"Los Angeles-Long Beach-Anaheim, CA",07/28/20,13214799,2801,213475,5007,16.154237,1.0,0.0
68,California,"Los Angeles-Long Beach-Anaheim, CA",07/27/20,13214799,2284,210674,4941,15.942278,1.0,0.0
102,California,"Los Angeles-Long Beach-Anaheim, CA",07/26/20,13214799,2112,208390,4924,15.769442,1.0,0.0
136,California,"Los Angeles-Long Beach-Anaheim, CA",07/25/20,13214799,3985,206278,4913,15.609621,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...
6290,California,"Los Angeles-Long Beach-Anaheim, CA",01/26/20,13214799,2,2,0,0.000151,1.0,0.0
6324,California,"Los Angeles-Long Beach-Anaheim, CA",01/25/20,13214799,0,0,0,0.000000,1.0,0.0
6358,California,"Los Angeles-Long Beach-Anaheim, CA",01/24/20,13214799,0,0,0,0.000000,1.0,0.0
6392,California,"Los Angeles-Long Beach-Anaheim, CA",01/23/20,13214799,0,0,0,0.000000,1.0,0.0


In [208]:
# Every CBSA row that shares the date of the row labelled 0.
cacbsa[cacbsa['Date'].eq(cacbsa.loc[0, 'Date'])]

Unnamed: 0,Province_State,CBSA Title,Date,Population,Daily Confirmed Cases,Total Confirmed Cases,Total Dead,Total Cases per 1000 capita,Total Cases Daily Ranking,Total Cases Ranking Daily Change
0,California,"Los Angeles-Long Beach-Anaheim, CA",07/29/20,13214799,5253,218728,5105,16.551746,1.0,0.0
1,California,"Riverside-San Bernardino-Ontario, CA",07/29/20,4650631,3319,68468,1101,14.722303,2.0,0.0
2,California,"San Francisco-Oakland-Berkeley, CA",07/29/20,4731803,838,34807,526,7.35597,3.0,0.0
3,California,"San Diego-Chula Vista-Carlsbad, CA",07/29/20,3338330,780,28287,552,8.473398,4.0,0.0
4,California,"Bakersfield, CA",07/29/20,900202,448,17344,135,19.266787,5.0,0.0
5,California,"Sacramento-Roseville-Folsom, CA",07/29/20,2363730,198,13370,183,5.656314,6.0,1.0
6,California,"Fresno, CA",07/29/20,999101,127,13336,120,13.348,7.0,-1.0
7,California,"Stockton, CA",07/29/20,762148,286,11209,146,14.707117,8.0,0.0
8,California,"San Jose-Sunnyvale-Santa Clara, CA",07/29/20,1990660,445,10213,191,5.130459,9.0,0.0
9,California,"El Centro, CA",07/29/20,181215,52,9241,188,50.994675,10.0,0.0


In [214]:
# Summary statistics for the San Francisco-Oakland-Berkeley CBSA rows.
cacbsa[cacbsa['CBSA Title'].eq('San Francisco-Oakland-Berkeley, CA')].describe()

Unnamed: 0,Population,Daily Confirmed Cases,Total Confirmed Cases,Total Dead,Total Cases per 1000 capita,Total Cases Daily Ranking,Total Cases Ranking Daily Change
count,190.0,190.0,190.0,190.0,190.0,190.0,189.0
mean,4731803.0,183.194737,7996.242105,174.757895,1.689893,2.889474,-0.010582
std,0.0,212.696443,9380.230627,169.899299,1.98238,0.983135,0.371733
min,4731803.0,0.0,0.0,0.0,0.0,1.0,-2.0
25%,4731803.0,6.5,25.25,0.0,0.005336,2.0,0.0
50%,4731803.0,119.5,4883.0,152.5,1.031953,3.0,0.0
75%,4731803.0,223.5,12137.5,311.5,2.56509,3.0,0.0
max,4731803.0,875.0,34807.0,526.0,7.35597,6.0,3.0


In [187]:
# Four largest single-day case counts among California counties.
(
    us_daily[us_daily['Province_State'].eq('California')]
    .sort_values(by=['Daily Confirmed Cases'], ascending=False)
    .loc[:, ['Admin2','Province_State','Date','Daily Confirmed Cases','Daily Dead']]
    .iloc[:4]
)

Unnamed: 0,Admin2,Province_State,Date,Daily Confirmed Cases,Daily Dead
54885,Los Angeles,California,07/05/20,7198,30
54909,Los Angeles,California,07/29/20,4814,92
54896,Los Angeles,California,07/16/20,4471,46
54894,Los Angeles,California,07/14/20,4219,73


In [189]:
# California counties on 07/29/20, four largest daily case counts first.
(
    us_daily.query("Province_State == 'California' and Date == '07/29/20'")
    [['Admin2','Province_State','Date','Daily Confirmed Cases','Daily Dead']]
    .sort_values(by='Daily Confirmed Cases', ascending=False)
    .iloc[:4]
)

Unnamed: 0,Admin2,Province_State,Date,Daily Confirmed Cases,Daily Dead
54909,Los Angeles,California,07/29/20,4814,92
58139,San Bernardino,California,07/29/20,2347,24
57569,Riverside,California,07/29/20,972,11
58329,San Diego,California,07/29/20,780,19


# APPENDIX

In [None]:
url3 = 'https://covidtracking.com/api/v1/states/daily.csv'
# Warn about and skip malformed rows. `on_bad_lines` is tried first; pandas builds
# that reject the keyword (TypeError) get the legacy `error_bad_lines=False` spelling.
try:
    df_testing = pd.read_csv(url3, on_bad_lines = 'warn')
except TypeError:
    df_testing = pd.read_csv(url3, error_bad_lines = False)

df_testing = df_testing.rename(columns = {'date':'Date','state':'Province_State'})
count_cols = ['totalTestResultsIncrease','positiveIncrease','negativeIncrease','totalTestResults','positive','negative']
df_testing[count_cols] = df_testing[count_cols].fillna(0)

# Two-letter codes -> full names, so 'Province_State' can be joined against the
# case data. The original cell referenced an undefined name `states` for this.
# NOTE(review): assumes the feed's 'state' column holds USPS abbreviations -- confirm.
STATE_NAMES = {
    'AL': 'Alabama', 'AK': 'Alaska', 'AZ': 'Arizona', 'AR': 'Arkansas', 'CA': 'California',
    'CO': 'Colorado', 'CT': 'Connecticut', 'DE': 'Delaware', 'DC': 'District of Columbia',
    'FL': 'Florida', 'GA': 'Georgia', 'HI': 'Hawaii', 'ID': 'Idaho', 'IL': 'Illinois',
    'IN': 'Indiana', 'IA': 'Iowa', 'KS': 'Kansas', 'KY': 'Kentucky', 'LA': 'Louisiana',
    'ME': 'Maine', 'MD': 'Maryland', 'MA': 'Massachusetts', 'MI': 'Michigan', 'MN': 'Minnesota',
    'MS': 'Mississippi', 'MO': 'Missouri', 'MT': 'Montana', 'NE': 'Nebraska', 'NV': 'Nevada',
    'NH': 'New Hampshire', 'NJ': 'New Jersey', 'NM': 'New Mexico', 'NY': 'New York',
    'NC': 'North Carolina', 'ND': 'North Dakota', 'OH': 'Ohio', 'OK': 'Oklahoma', 'OR': 'Oregon',
    'PA': 'Pennsylvania', 'RI': 'Rhode Island', 'SC': 'South Carolina', 'SD': 'South Dakota',
    'TN': 'Tennessee', 'TX': 'Texas', 'UT': 'Utah', 'VT': 'Vermont', 'VA': 'Virginia',
    'WA': 'Washington', 'WV': 'West Virginia', 'WI': 'Wisconsin', 'WY': 'Wyoming',
    'AS': 'American Samoa', 'GU': 'Guam', 'MP': 'Northern Mariana Islands',
    'PR': 'Puerto Rico', 'VI': 'Virgin Islands',
}

to_fix = ['totalTestResultsIncrease','positiveIncrease','negativeIncrease']
actual = ['totalTestResults','positive','negative']
columns = df_testing.columns
def fill_func(states):
    """Patch the first inconsistent increment of each state in `df_testing`, in place.

    For every state and every (increment, cumulative) column pair taken from
    `to_fix` / `actual`, the running sum of the increment column (rows ordered
    by 'Date') is compared with the reported cumulative column. At the
    earliest row where the two differ, the increment is overwritten with the
    reported cumulative value of that row. Later mismatches are not touched.

    Parameters
    ----------
    states : iterable
        Values of 'Province_State' to process.

    Returns
    -------
    None
        `df_testing` is modified in place.
    """
    for state in states:
        in_state = df_testing['Province_State'] == state
        for increase_col, total_col in zip(to_fix, actual):
            ordered = df_testing.loc[in_state].sort_values(by='Date')
            agrees = ordered[increase_col].cumsum().eq(ordered[total_col])
            mismatched = agrees.index[~agrees]
            if len(mismatched) > 0:
                earliest = mismatched[0]
                # `earliest` is an index label, so it is used with .loc rather than .iloc.
                df_testing.loc[earliest, increase_col] = df_testing.loc[earliest, total_col]

fill_func(df_testing['Province_State'].unique())
df_testing = df_testing.replace({'Province_State':STATE_NAMES})
# Series.dt.strftime replaces the removed pd.datetime alias and yields the same text.
df_testing['Date'] = pd.to_datetime(df_testing['Date'], format='%Y%m%d').dt.strftime('%m/%d/%y')



In [None]:
# CBSA codes that occur more often in the county lookup than in the merged case data.
# Codes found missing after the merge: 41980, 10380, 38660, 11640, 41900, 49500, 32420, 25020, 27580, 17620, 17640, 42180
# They are all in Puerto Rico and span multiple CBSAs and CSAs, so they can be ignored.
res = (
    Counter(df_counties['CBSA Code'].value_counts().to_dict())
    - Counter(df_usconf['CBSA Code'].value_counts().to_dict())
)
# Lift the row display limit so the full list of affected counties is shown.
pd.set_option('display.max_rows', None)
df_counties[df_counties['CBSA Code'].isin(list(res))]

In [None]:
# Assign the San Juan CBSA/CSA codes and titles to the row(s) with UID 630, then show the result.
df_usconf.loc[
    df_usconf['UID'].eq(630),
    ['CBSA Code', 'CBSA Title', 'CSA Code', 'CSA Title'],
] = [41980,'San Juan-Bayamón-Caguas, PR', 490.0, 'San Juan-Bayamón, PR']
df_usconf.loc[df_usconf['UID'].eq(630), ['CBSA Code', 'CBSA Title', 'CSA Code', 'CSA Title']]

In [None]:
# Drop identifier columns that the analysis does not use.
# drop(columns=...) replaces indexing the frame with a Counter (a dict), which
# current pandas rejects. errors='ignore' keeps the old tolerance for columns
# that are already absent, and the remaining column order is unchanged.
columns_to_remove = ['iso2', 'iso3', 'code3', 'Combined_Key']
df_usconf = df_usconf.drop(columns=columns_to_remove, errors='ignore')
df_usdead = df_usdead.drop(columns=columns_to_remove, errors='ignore')

In [None]:
#Add additional column for CBSAs and CSAs before melting?

'''BA_counties = ['Alameda','Contra Costa','Marin','Napa','San Francisco','San Mateo','Santa Clara','Solano','Sonoma']
LA_counties =['Ventura','San Bernardino', 'Riverside', 'Los Angeles', 'Orange']'''

In [None]:
#California and Texas total confirmed cases differ by 2000+ between the two sources.
#Stick to the df_testing data for the testing dataframe.
# Columns are selected with a list; tuple-style selection after groupby is not accepted by current pandas.
totalgrp = us_totals.groupby(['Province_State','Date'], as_index=False)[['Total Confirmed Cases','Total Dead']].sum()
# Side-by-side positives for 07/27/20: df_testing 'positive' vs. summed 'Total Confirmed Cases'.
comparison = df_testing.loc[df_testing['Date']=='07/27/20', ['Province_State','Date','positive']].merge(
    totalgrp.loc[totalgrp['Date']=='07/27/20', ['Province_State','Date','Total Confirmed Cases']],
    on=['Province_State','Date'], how = 'left', suffixes = (False, False))
comparison['Delta'] = comparison['positive'] - comparison['Total Confirmed Cases']
comparison