In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from collections import Counter

pd.options.display.max_columns = None

In [2]:
url = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_US.csv'
url2 = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_US.csv'
df_usconf = pd.read_csv(url,error_bad_lines = False)
df_usdead = pd.read_csv(url2,error_bad_lines = False)

df_counties = pd.read_csv(r'X:\AC\Documents\Datasets\US Census and OMB Data\2020 Counties UID State CBSA CSA.csv', delimiter = ',', encoding = "ISO-8859-1")
df_counties = df_counties[['UID', 'CBSA Code', 'CBSA Title', 'CSA Code', 'CSA Title']]

In [3]:
#Add CBSA/CSA titles and codes to df
df_usconf = df_usconf.merge(df_counties, on = 'UID', how = 'left', suffixes=(False,False))
df_usconf = df_usconf.merge(df_usdead[['UID','Population']], on = 'UID', how = 'left', suffixes=(False,False))
cols = df_usconf.columns.tolist()
df_usconf = df_usconf[cols[0:1] + cols[4:6] + cols[8:10] + cols[-1:] + cols[6:8] + cols[-5:-1] + cols[11:-5]]

#convert date columns into a single column
df_usconf_pivot = df_usconf.melt(id_vars = df_usconf.columns[:12], var_name = 'Date', value_name = 'Total Confirmed Cases').sort_values(by = ['UID','Date']).reset_index(drop=True)
df_usdead_pivot = df_usdead.melt(id_vars = df_usdead.columns[:12], var_name = 'Date', value_name = 'Total Dead').sort_values(by = ['UID','Date']).reset_index(drop=True)

#df for daily increments
df_usconf_daily = df_usconf.copy()
df_usdead_daily = df_usdead.copy()
df_usconf_daily.iloc[:,12:] = df_usconf_daily.iloc[:,12:].diff(axis=1).fillna(0).astype('int')
df_usdead_daily.iloc[:,12:] = df_usdead_daily.iloc[:,12:].diff(axis=1).fillna(0).astype('int')

#convert date columns into a single column
df_usconf_daily = df_usconf_daily.melt(id_vars = df_usconf_daily.columns[:12], var_name = 'Date', value_name = 'Daily Confirmed Cases').sort_values(by = ['UID','Date']).reset_index(drop=True)
df_usdead_daily = df_usdead_daily.melt(id_vars = df_usdead_daily.columns[:12], var_name = 'Date', value_name = 'Daily Dead').sort_values(by = ['UID','Date']).reset_index(drop=True)

In [4]:
us_totals = df_usconf_pivot.merge(df_usdead_pivot[['UID','Date','Total Dead']], on = ['UID','Date'], how = 'left', suffixes = (False, False))
us_totals['Date'] = pd.to_datetime(us_totals['Date']).apply(lambda x: pd.datetime.strftime(x, '%m/%d/%y'))
us_totals = us_totals.sort_values(by = ['UID','Date']).reset_index(drop=True)

us_daily = df_usconf_daily.merge(df_usdead_daily[['UID','Date','Daily Dead']], on = ['UID','Date'], how = 'left', suffixes = (False, False))
us_daily['Date'] = pd.to_datetime(us_daily['Date']).apply(lambda x: pd.datetime.strftime(x, '%m/%d/%y'))
us_daily = us_daily.sort_values(by = ['UID','Date']).reset_index(drop=True)

In [5]:
statedaily = us_daily.loc[~us_daily['Province_State'].isin(['Diamond Princess','Grand Princess'])].groupby(['Province_State','Date'], as_index=False)['Population','Daily Confirmed Cases'].sum()
statedaily = statedaily.sort_values(['Date','Province_State'], ascending = [False,True]).reset_index(drop=True)
statecumsum = us_totals.groupby(['Province_State','Date'], as_index=False)['Total Confirmed Cases','Total Dead'].sum()
stategrp = pd.merge(statedaily, statecumsum, on = ['Province_State','Date'], how = 'left', suffixes = (False, False))

In [6]:
#Feature creation, rankings
stategrp['Total Cases per 1000 capita'] = stategrp['Total Confirmed Cases']/stategrp['Population']*1000

ranks = []
for date in stategrp['Date'].unique():
    for ranking in stategrp.copy().loc[stategrp['Date']==date,'Total Confirmed Cases'].rank(ascending = False, method='min'):
        ranks.append(ranking)
rankings = pd.DataFrame({'Total Cases Daily Ranking': ranks})
stategrp = pd.concat([stategrp, rankings], axis=1)

dates = stategrp['Date'].unique().tolist()
ustemp = pd.DataFrame()
for state in stategrp['Province_State'].unique():
    statetemp = stategrp.loc[stategrp['Province_State']==state].copy().reset_index(drop=True)
    yest = pd.Series(statetemp.loc[statetemp['Date'].isin(dates[1:]),'Total Cases Daily Ranking'].reset_index(drop=True)).rename('Total Cases Ranking Daily Change')
    statetemp = pd.concat([statetemp,yest], axis=1)
    ustemp = pd.concat([ustemp, statetemp])

stategrp = ustemp.sort_values(['Date','Total Cases Daily Ranking'], ascending = [False,True]).reset_index(drop=True)
stategrp['Total Cases Ranking Daily Change'] = stategrp['Total Cases Ranking Daily Change'] - stategrp['Total Cases Daily Ranking']

ranks = []
for date in stategrp['Date'].unique():
    for ranking in stategrp.copy().loc[stategrp['Date']==date,'Daily Confirmed Cases'].rank(ascending = False, method='min'):
        ranks.append(ranking)
rankings = pd.DataFrame({'Daily Cases Ranking': ranks})
stategrp = pd.concat([stategrp, rankings], axis=1)

ranks = []
for date in stategrp['Date'].unique():
    for ranking in stategrp.copy().loc[stategrp['Date']==date,'Total Cases per 1000 capita'].rank(ascending = False, method='min'):
        ranks.append(ranking)
rankings = pd.DataFrame({'Cases per Capita Ranking': ranks})
stategrp = pd.concat([stategrp, rankings], axis=1)

In [7]:
conf_total = us_daily['Daily Confirmed Cases'].sum()
conf_dead = us_daily['Daily Dead'].sum()
US_pop = 329943320 #as of 1/1/20


print('US Stats As of: ', us_totals['Date'].sort_values(ascending = True).to_list()[-1])
print('Total Confirmed Cases To Date: ', conf_total)
print('Confirmed Cases Percentage of US population: %.2f' %((conf_total/ US_pop) * 100),'%')
print('\n')
print('Total Deaths To Date: ', (conf_dead))
print('Confirmed Cases Percentage of US population: %.2f' %((conf_dead/ US_pop) * 100),'%')
print('Percentage of deaths from confirmed cases : %.2f' %((conf_dead/ US_pop)/(conf_total/ US_pop)*100),'%')
print('\n')

US Stats As of:  08/06/20
Total Confirmed Cases To Date:  4883581
Confirmed Cases Percentage of US population: 1.48 %


Total Deaths To Date:  160104
Confirmed Cases Percentage of US population: 0.05 %
Percentage of deaths from confirmed cases : 3.28 %




In [8]:
last14 = us_daily['Date'].values.tolist()[-14:]
prev2weekavg = us_daily.loc[(us_daily['Date'].isin(us_daily['Date'].values.tolist()[-15:-1]))].groupby('Date')['Daily Confirmed Cases'].sum().mean()
yestsum = us_daily.loc[(us_daily['Date']==us_daily['Date'].unique()[-1])]['Daily Confirmed Cases'].sum()

print('For Yesterday ({}):'.format(us_daily['Date'].unique()[-1]))
print('Increase in Total Confirmed Cases: ', yestsum)
print('Percentage increase from average of last two weeks: ', ((yestsum - prev2weekavg)/prev2weekavg * 100), '%')
print('\n')
print('Last 14 days:')
print('Total Confirmed Cases: ', us_daily.loc[(us_daily['Date'].isin(last14))]['Daily Confirmed Cases'].sum())
print('Average Cases per day: %.1f' %prev2weekavg)
print('Cases in Last 14 days as Percentage of Total Cases: %.2f' %((us_daily.loc[(us_daily['Date'].isin(last14))]['Daily Confirmed Cases'].sum()/(conf_total))*100),'%')

For Yesterday (08/06/20):
Increase in Total Confirmed Cases:  59692
Percentage increase from average of last two weeks:  -2.1177859584969703 %


Last 14 days:
Total Confirmed Cases:  844766
Average Cases per day: 60983.5
Cases in Last 14 days as Percentage of Total Cases: 17.30 %


In [9]:
us_daily.to_csv(r'X:\AC\Documents\Datasets\US_daily_pivot.csv')
us_totals.to_csv(r'X:\AC\Documents\Datasets\US_totals_pivot.csv')

# Data Exploration

In [10]:
stategrp[0:9]

Unnamed: 0,Province_State,Date,Population,Daily Confirmed Cases,Total Confirmed Cases,Total Dead,Total Cases per 1000 capita,Total Cases Daily Ranking,Total Cases Ranking Daily Change,Daily Cases Ranking,Cases per Capita Ranking
0,California,08/06/20,39512223,10733,541339,10021,13.700545,1.0,0.0,1.0,23.0
1,Florida,08/06/20,21477737,7650,510389,7747,23.76363,2.0,0.0,2.0,3.0
2,Texas,08/06/20,28995881,6921,483920,8569,16.689267,3.0,0.0,3.0,14.0
3,New York,08/06/20,26161672,703,418928,32756,16.013044,4.0,0.0,26.0,15.0
4,Georgia,08/06/20,10617423,3182,204895,4026,19.297997,5.0,0.0,4.0,7.0
5,Illinois,08/06/20,12671821,1953,189705,7791,14.970619,6.0,0.0,7.0,20.0
6,New Jersey,08/06/20,8882190,374,183701,15849,20.681949,7.0,0.0,32.0,5.0
7,Arizona,08/06/20,7278717,1453,183656,4002,25.231919,8.0,0.0,9.0,2.0
8,North Carolina,08/06/20,10488084,2069,131802,2126,12.566833,9.0,0.0,6.0,26.0


In [11]:
#Top daily increases by state
stategrp.sort_values(by='Daily Confirmed Cases', ascending = False)[0:9]

Unnamed: 0,Province_State,Date,Population,Daily Confirmed Cases,Total Confirmed Cases,Total Dead,Total Cases per 1000 capita,Total Cases Daily Ranking,Total Cases Ranking Daily Change,Daily Cases Ranking,Cases per Capita Ranking
1402,Florida,07/12/20,21477737,15300,269811,4242,12.562357,3.0,0.0,1.0,10.0
1179,Texas,07/16/20,28995881,14962,305854,4265,10.548188,4.0,0.0,1.0,19.0
448,California,07/29/20,39512223,14151,484913,8908,12.272481,1.0,0.0,1.0,23.0
1178,Florida,07/16/20,21477737,13965,315775,4677,14.702434,3.0,0.0,2.0,8.0
1681,California,07/07/20,39512223,12977,284012,6573,7.187953,2.0,0.0,1.0,30.0
1289,California,07/14/20,39512223,12854,346211,7250,8.762124,2.0,0.0,1.0,24.0
504,California,07/28/20,39512223,12641,470762,8679,11.914339,1.0,0.0,1.0,24.0
1346,Florida,07/13/20,21477737,12624,282435,4277,13.150128,3.0,0.0,1.0,10.0
843,Texas,07/22/20,28995881,12544,363615,5172,12.540229,4.0,0.0,1.0,19.0


In [12]:
#top daily increases of cases per capita
stategrp.assign(x = stategrp['Daily Confirmed Cases']/stategrp['Population']).sort_values(by='x', ascending = False).drop('x',axis=1)[0:9]

Unnamed: 0,Province_State,Date,Population,Daily Confirmed Cases,Total Confirmed Cases,Total Dead,Total Cases per 1000 capita,Total Cases Daily Ranking,Total Cases Ranking Daily Change,Daily Cases Ranking,Cases per Capita Ranking
627,Louisiana,07/26/20,4648794,3840,107574,3763,23.140195,12.0,0.0,3.0,1.0
121,Louisiana,08/04/20,4648794,3615,124461,4051,26.77275,10.0,0.0,4.0,1.0
233,Louisiana,08/02/20,4648794,3467,119747,4007,25.758724,10.0,2.0,3.0,1.0
5826,Massachusetts,04/24/20,6892503,4973,51700,2556,7.500904,3.0,0.0,2.0,3.0
1402,Florida,07/12/20,21477737,15300,269811,4242,12.562357,3.0,0.0,1.0,10.0
963,Louisiana,07/20/20,4648794,3186,94892,3574,20.412176,12.0,0.0,4.0,1.0
1019,Louisiana,07/19/20,4648794,3116,91706,3543,19.726837,12.0,0.0,5.0,3.0
2025,Arizona,07/01/20,7278717,4877,84105,1725,11.554921,10.0,0.0,4.0,9.0
1178,Florida,07/16/20,21477737,13965,315775,4677,14.702434,3.0,0.0,2.0,8.0


In [13]:
#Top daily increases by county
us_daily.sort_values(by=['Daily Confirmed Cases'], ascending = False)[['Admin2','Province_State','Date','Daily Confirmed Cases','Daily Dead']][0:5]

Unnamed: 0,Admin2,Province_State,Date,Daily Confirmed Cases,Daily Dead
384600,New York,New York,04/15/20,7837,561
57189,Los Angeles,California,07/05/20,7198,30
384589,New York,New York,04/04/20,6147,696
519134,Bexar,Texas,07/16/20,5980,28
384596,New York,New York,04/11/20,5924,714


# CA Data Exploration

In [14]:
us_daily.loc[(us_daily['Province_State']=='California')&(us_daily['Admin2']=='San Francisco'),['Date','Daily Confirmed Cases']].tail()

Unnamed: 0,Date,Daily Confirmed Cases
60979,08/02/20,88
60980,08/03/20,105
60981,08/04/20,73
60982,08/05/20,92
60983,08/06/20,147


In [15]:
#Top daily increases for CA
stategrp.loc[stategrp['Province_State']=='California'].sort_values(by=['Daily Confirmed Cases'], ascending = False)[0:4]

Unnamed: 0,Province_State,Date,Population,Daily Confirmed Cases,Total Confirmed Cases,Total Dead,Total Cases per 1000 capita,Total Cases Daily Ranking,Total Cases Ranking Daily Change,Daily Cases Ranking,Cases per Capita Ranking
448,California,07/29/20,39512223,14151,484913,8908,12.272481,1.0,0.0,1.0,23.0
1681,California,07/07/20,39512223,12977,284012,6573,7.187953,2.0,0.0,1.0,30.0
1289,California,07/14/20,39512223,12854,346211,7250,8.762124,2.0,0.0,1.0,24.0
504,California,07/28/20,39512223,12641,470762,8679,11.914339,1.0,0.0,1.0,24.0


In [16]:
#Create df for CA's CBSAs
cacbsa_daily = us_daily.loc[(us_daily['Province_State']=='California')].groupby(['Province_State','CBSA Title','Date'], as_index=False)['Population','Daily Confirmed Cases'].sum()
cacbsa_daily = cacbsa_daily.sort_values(['Date','Daily Confirmed Cases'], ascending = [False,False]).reset_index(drop=True)
cacbsa_cumsum = us_totals.loc[us_totals['Province_State']=='California'].groupby(['Province_State','CBSA Title','Date'], as_index=False)['Total Confirmed Cases','Total Dead'].sum()
cacbsa = pd.merge(cacbsa_daily, cacbsa_cumsum, on = ['Province_State','CBSA Title','Date'], how = 'left', suffixes = (False, False))

In [17]:
#Feature creation, rankings
cacbsa['Total Cases per 1000 capita'] = cacbsa['Total Confirmed Cases']/cacbsa['Population']*1000

ranks = []
for date in cacbsa['Date'].unique():
    for ranking in cacbsa.copy().loc[cacbsa['Date']==date,'Total Confirmed Cases'].rank(ascending = False, method='min'):
        ranks.append(ranking)
rankings = pd.DataFrame({'Total Cases Daily Ranking': ranks})
cacbsa = pd.concat([cacbsa, rankings], axis=1)

dates = cacbsa['Date'].unique().tolist()
cacbsatemp = pd.DataFrame()
for state in cacbsa['CBSA Title'].unique():
    cbsatemp = cacbsa.loc[cacbsa['CBSA Title']==state].copy().reset_index(drop=True)
    yest = pd.Series(cbsatemp.loc[cbsatemp['Date'].isin(dates[1:]),'Total Cases Daily Ranking'].reset_index(drop=True)).rename('Total Cases Ranking Daily Change')
    cbsatemp = pd.concat([cbsatemp,yest], axis=1)
    cacbsatemp = pd.concat([cacbsatemp, cbsatemp])
    
cacbsa = cacbsatemp.sort_values(['Date','Total Cases Daily Ranking'], ascending = [False,True]).reset_index(drop=True)
cacbsa['Total Cases Ranking Daily Change'] = cacbsa['Total Cases Ranking Daily Change'] - cacbsa['Total Cases Daily Ranking']

In [20]:
cacbsa.sort_values('Daily Confirmed Cases', ascending = False)

Unnamed: 0,Province_State,CBSA Title,Date,Population,Daily Confirmed Cases,Total Confirmed Cases,Total Dead,Total Cases per 1000 capita,Total Cases Daily Ranking,Total Cases Ranking Daily Change
1088,California,"Los Angeles-Long Beach-Anaheim, CA",07/05/20,13214799,7861,131878,3853,9.979569,1.0,0.0
714,California,"Los Angeles-Long Beach-Anaheim, CA",07/16/20,13214799,5344,175582,4455,13.286770,1.0,0.0
272,California,"Los Angeles-Long Beach-Anaheim, CA",07/29/20,13214799,5253,218728,5105,16.551746,1.0,0.0
1020,California,"Los Angeles-Long Beach-Anaheim, CA",07/07/20,13214799,5204,139656,3951,10.568152,1.0,0.0
782,California,"Los Angeles-Long Beach-Anaheim, CA",07/14/20,13214799,5084,166696,4330,12.614342,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...
1371,California,"Santa Maria-Santa Barbara, CA",06/27/20,446499,-30,2712,28,6.073922,12.0,0.0
1519,California,"Yuba City, CA",06/23/20,175639,-36,168,4,0.956507,24.0,0.0
2155,California,"Stockton, CA",06/04/20,762148,-56,1024,36,1.343571,14.0,0.0
1541,California,"Stockton, CA",06/22/20,762148,-119,2400,48,3.148995,12.0,-1.0


In [22]:
cacbsa.loc[cacbsa['Date']==cacbsa['Date'][0]][0:5]

Unnamed: 0,Province_State,CBSA Title,Date,Population,Daily Confirmed Cases,Total Confirmed Cases,Total Dead,Total Cases per 1000 capita,Total Cases Daily Ranking,Total Cases Ranking Daily Change
0,California,"Los Angeles-Long Beach-Anaheim, CA",08/06/20,13214799,3615,239911,5566,18.154722,1.0,0.0
1,California,"Riverside-San Bernardino-Ontario, CA",08/06/20,4650631,1652,74376,1260,15.992669,2.0,0.0
2,California,"San Francisco-Oakland-Berkeley, CA",08/06/20,4731803,1255,39734,601,8.397222,3.0,0.0
3,California,"San Diego-Chula Vista-Carlsbad, CA",08/06/20,3338330,611,31127,583,9.324123,4.0,0.0
4,California,"Bakersfield, CA",08/06/20,900202,291,21724,152,24.132361,5.0,0.0


In [34]:
#Most recent day in SF Oakland Berkeley CBSA
us_daily.loc[(us_daily['CBSA Title']=='San Francisco-Oakland-Berkeley, CA')&(us_daily['Date']==us_daily['Date'].max())][['CBSA Title','Admin2','Date','Population','Daily Confirmed Cases']]

Unnamed: 0,CBSA Title,Admin2,Date,Population,Daily Confirmed Cases
53657,"San Francisco-Oakland-Berkeley, CA",Alameda,08/06/20,1671329,748
54845,"San Francisco-Oakland-Berkeley, CA",Contra Costa,08/06/20,1153526,208
60983,"San Francisco-Oakland-Berkeley, CA",San Francisco,08/06/20,881549,147
61577,"San Francisco-Oakland-Berkeley, CA",San Mateo,08/06/20,766573,133
57617,"San Francisco-Oakland-Berkeley, CA",Marin,08/06/20,258826,19


In [61]:
#county totals summary for CA
us_daily.loc[us_daily['Province_State']=='California'].groupby(['Admin2'])[['Population','Daily Confirmed Cases', 'Daily Dead']].agg({'Population':'mean','Daily Confirmed Cases':'sum', 'Daily Dead':'sum'}).sort_values('Daily Confirmed Cases', ascending = False)[0:4]

Unnamed: 0_level_0,Population,Daily Confirmed Cases,Daily Dead
Admin2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Los Angeles,10039107,201200,4869
Riverside,2470546,39741,770
Orange,3175692,38711,697
San Bernardino,2180085,34635,490


In [46]:
us_daily['Date'].unique()[-30:]

array(['07/08/20', '07/09/20', '07/10/20', '07/11/20', '07/12/20',
       '07/13/20', '07/14/20', '07/15/20', '07/16/20', '07/17/20',
       '07/18/20', '07/19/20', '07/20/20', '07/21/20', '07/22/20',
       '07/23/20', '07/24/20', '07/25/20', '07/26/20', '07/27/20',
       '07/28/20', '07/29/20', '07/30/20', '07/31/20', '08/01/20',
       '08/02/20', '08/03/20', '08/04/20', '08/05/20', '08/06/20'],
      dtype=object)

In [48]:
sf_lastmonth = us_daily.loc[(us_daily['Admin2']=='San Francisco')&(us_daily['Date'].isin(us_daily['Date'].unique()[-30:]))]

In [49]:
#3208 cases in SF county in the last month
sf_lastmonth['Daily Confirmed Cases'].sum()

3208

In [52]:
#US census estimate for SF county population in 2019
sf_lastmonth['Population'].values[0]

881549

In [53]:
# .4 cases per 100 capita. Less than one in 100 are actively transmissible
sf_lastmonth['Daily Confirmed Cases'].sum()/sf_lastmonth['Population'].values[0]*100

0.3639048992171734

In [55]:
# .82 cases per 100 capita. Less than one in 100 have tested positive
us_daily.loc[(us_daily['Admin2']=='San Francisco')]['Daily Confirmed Cases'].sum()/sf_lastmonth['Population'].values[0]*100

0.8199203901314617

# APPENDIX

In [None]:
url3 = 'https://covidtracking.com/api/v1/states/daily.csv'
df_testing = pd.read_csv(url3,error_bad_lines = False)

df_testing = df_testing.rename(columns = {'date':'Date','state':'Province_State'})
df_testing[['totalTestResultsIncrease','positiveIncrease','negativeIncrease','totalTestResults','positive','negative']] = df_testing[['totalTestResultsIncrease','positiveIncrease','negativeIncrease','totalTestResults','positive','negative']].fillna(0)

to_fix = ['totalTestResultsIncrease','positiveIncrease','negativeIncrease']
actual = ['totalTestResults','positive','negative']
columns = df_testing.columns
def fill_func(states):
    for state in states:
        for col in range(0,len(to_fix)):
            cumsum = df_testing.loc[df_testing['Province_State']==state].sort_values(by='Date')[to_fix[col]].cumsum()
            actualsum = df_testing.loc[df_testing['Province_State']==state,['Date',actual[col]]].sort_values(by='Date')[actual[col]]
            comparison = cumsum.eq(actualsum)
            if comparison[comparison == False].count() > 0:
                Earliest_index = comparison[comparison == False].index[0]
                df_testing.iloc[Earliest_index,columns.get_loc(to_fix[col])] = df_testing.iloc[Earliest_index,columns.get_loc(actual[col])]
                
fill_func(df_testing['Province_State'].unique())
df_testing = df_testing.replace({'Province_State':states})
df_testing['Date'] = pd.to_datetime(df_testing['Date'], format='%Y%m%d').apply(lambda x: pd.datetime.strftime(x, '%m/%d/%y'))



In [None]:
#missing CBSA codes after merge: 41980, 10380, 38660, 11640, 41900, 49500, 32420, 25020, 27580, 17620, 17640, 42180
#They're all in Puerto Rico. Span across multiple CBSA and CSA. Should just ignore
res = Counter(df_counties['CBSA Code'].value_counts().to_dict()) - Counter(df_usconf['CBSA Code'].value_counts().to_dict())
pd.set_option('display.max_rows', None)
df_counties[df_counties['CBSA Code'].isin(list(res.keys()))]

In [None]:
df_usconf.loc[df_usconf['UID']==630,['CBSA Code', 'CBSA Title', 'CSA Code', 'CSA Title']] = [41980,'San Juan-Bayamón-Caguas, PR', 490.0, 'San Juan-Bayamón, PR']
df_usconf.loc[df_usconf['UID']==630][['CBSA Code', 'CBSA Title', 'CSA Code', 'CSA Title']]

In [None]:
columns_to_remove = ['iso2', 'iso3', 'code3', 'Combined_Key']
df_usconf = df_usconf[Counter(df_usconf.columns.tolist()) - Counter(columns_to_remove)]
df_usdead = df_usdead[Counter(df_usdead.columns.tolist()) - Counter(columns_to_remove)]

In [None]:
#Add additional column for CBSAs and CSAs before melting?

'''BA_counties = ['Alameda','Contra Costa','Marin','Napa','San Francisco','San Mateo','Santa Clara','Solano','Sonoma']
LA_counties =['Ventura','San Bernadio', 'Riverside', 'Los Angeles', 'Orange']'''

In [None]:
#California and Texas total confirmed cases differ by 2000+. Stick to COVIDTESTING data for testing dataframe
totalgrp = us_totals.groupby(['Province_State','Date'], as_index=False)['Total Confirmed Cases','Total Dead'].sum()
comparison = df_testing.loc[df_testing['Date']=='07/27/20', ['Province_State','Date','positive']].merge(totalgrp.loc[totalgrp['Date']=='07/27/20',['Province_State','Date','Total Confirmed Cases']], on=['Province_State','Date'], how = 'left', suffixes = (False, False))
comparison['Delta'] = comparison['positive'] - comparison['Total Confirmed Cases']
comparison