In [1]:
%run GeoInfo_functions.ipynb

In [2]:
import os
import datetime as dt
from sodapy import Socrata
from functions import Import_VDH_COVID_Data, Update_VDH_COVID_Data

# **<font color = 'DarkRed'> County Level </font>**

## **<font color='Black'> Obtain COVID data </font>**

## **<font color='Black'> Obtain COVID data from the Virginia Health Department Website </font>**

<font color='red'>First check whether the JSON file "VDH-COVID-19-PublicUseDataset-Cases.json" exists.<br>
If it exists and is current, read the file; if it exists but is out of date, request only the missing report dates and append them.<br>Otherwise, import the full dataset from the Virginia Health Department Website and save it as a JSON file.</font>

In [3]:
def Get_VA_COVID_data(ByPass_Update = False):
    """Load the Virginia county-level COVID-19 table, refreshing the local cache when stale.

    The raw records are cached in 'data/VDH-COVID-19-PublicUseDataset-Cases.json':

    * cache missing: the dataset is obtained from Import_VDH_COVID_Data() and saved;
    * cache last modified today, or ByPass_Update is True: the cache is read as is;
    * otherwise: every date from the day after the newest cached 'report_date'
      through today is requested from Update_VDH_COVID_Data(), appended to the
      cached records and written back.

    Parameters
    ----------
    ByPass_Update : bool, default False
        When True, a stale cache file is used without contacting the remote source.
        It has no effect when the cache file does not exist.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: 'Report Date' (string formatted 'yy/mm/dd'), 'fips' (int),
        'County Name', 'Health District', 'Total Cases', 'Hospitalizations', 'Deaths'
        (numeric, unparseable values become NaN) and 'County Code' (the fips string
        with its first two characters removed).

    Raises
    ------
    KeyError
        If a county code is missing from Get_CountyNames_Dict('51') or an expected
        source column is absent.
    """
    VDH_filename = 'data/VDH-COVID-19-PublicUseDataset-Cases.json'

    if os.path.isfile(VDH_filename):
        print('Yay! VDH file exists')
        # The file's modification date is the freshness marker.
        file_date = dt.datetime.fromtimestamp(os.path.getmtime(VDH_filename)).date()
        print("VDH file was last modified on ", file_date)
        today = dt.datetime.now().date()
        print("Today's date is: ", today)

        needs_update = False
        if file_date == today:
            print('Yay, VDH file is current!')
        else:
            print('VDH file is not current')
            if ByPass_Update:
                print('Reading existing file without updating')
            else:
                print('Updating VDH file from the Virginia Health Department Website')
                needs_update = True

        # Every branch starts from the cached records.
        COVID_data_Virginia = pd.read_json(VDH_filename, orient = 'table')

        if needs_update:
            COVID_data_Virginia = COVID_data_Virginia.sort_values(by = 'report_date', ignore_index = True)
            # Request each day after the newest cached report date, up to and including today.
            first_missing = (pd.to_datetime(COVID_data_Virginia['report_date'].iloc[-1])
                             + dt.timedelta(days = 1))
            missing_dates = pd.date_range(first_missing, today, freq = 'd')
            dates_list = [d.strftime('%Y-%m-%dT%X') + '.000' for d in missing_dates]

            print(dates_list)
            df = Update_VDH_COVID_Data(dates_list)
            COVID_data_Virginia = pd.concat([COVID_data_Virginia, df], ignore_index = True)
            COVID_data_Virginia.to_json(VDH_filename, orient = 'table')
    else:
        print('VDH file does not exist')
        print('Importing VDH file from the Virginia Health Department Website')
        COVID_data_Virginia = Import_VDH_COVID_Data()
        COVID_data_Virginia.to_json(VDH_filename, orient = 'table')

    COVID_data_Virginia['report_date'] = pd.to_datetime(COVID_data_Virginia['report_date']).dt.strftime('%y/%m/%d')
    # The county code is the fips string without its two leading (state) characters.
    COVID_data_Virginia['County Code'] = COVID_data_Virginia['fips'].astype(str).str[2:]
    COVID_data_Virginia['fips'] = COVID_data_Virginia['fips'].astype(int)

    cols = ['total_cases', 'hospitalizations', 'deaths']
    COVID_data_Virginia[cols] = COVID_data_Virginia[cols].apply(pd.to_numeric, errors='coerce')

    # The Thomas Jefferson health district was later renamed Blue Ridge
    COVID_data_Virginia['vdh_health_district'] = np.where(COVID_data_Virginia['vdh_health_district'] == 'Thomas Jefferson',
                                                     'Blue Ridge', COVID_data_Virginia['vdh_health_district'])

    # In one instance "c" was entered for the vdh_health_district instead of Rappahannock Rapidan
    COVID_data_Virginia['vdh_health_district'] = np.where(COVID_data_Virginia['vdh_health_district'] == 'c',
                                                     'Rappahannock Rapidan', COVID_data_Virginia['vdh_health_district'])

    # Build the code -> name lookup once instead of once per row.
    county_names = Get_CountyNames_Dict('51')
    COVID_data_Virginia['locality'] = [county_names[code] for code in COVID_data_Virginia['County Code']]

    # Rename by name (not by position) so a different source column order cannot
    # silently attach the wrong label to a column.
    new_names = {'report_date': 'Report Date',
                 'fips': 'fips',
                 'locality': 'County Name',
                 'vdh_health_district': 'Health District',
                 'total_cases': 'Total Cases',
                 'hospitalizations': 'Hospitalizations',
                 'deaths': 'Deaths',
                 'County Code': 'County Code'}
    COVID_data_Virginia = COVID_data_Virginia.rename(columns = new_names)

    return COVID_data_Virginia[list(new_names.values())]

### <font color='red'> Correct for negative data after completion of quality assurance by the state's Department of Health </font>
* <font color='royalblue'> _Sometimes the total cases, hospitalizations, or deaths decrease after the Department of Health conducts quality assurance to ensure that:<br>
1) cases are not assigned to the wrong locality as some ZIP codes cross between localities <br>
2) multiple positive test results for the same infection in one person are not counted as multiple COVID-19 cases <br>
3) the case follows the criteria outlined in a national case surveillance definition by the CDC._ </font>

In [4]:
def Clean_COVID_data(df, col_list):
    """Remove downward revisions from cumulative county series and add daily increments.

    For every column in col_list, each (county, date) where the cumulative value
    dropped relative to the county's previous row is located. All earlier rows of
    that county holding a larger value are lowered to the revised value, so the
    series no longer decreases at that point. A 'New Daily <name>' column (the
    per-county first difference, 0 on each county's first row, with any leading
    'Total ' removed from the name) is then stored as int.

    The result is also written to
    'data/VDH-COVID-19-PublicUseDataset-Cases_cleaned.json' (orient='table').

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'County Code', 'Report Date' and every column in col_list.
        NOTE: the frame is sorted and modified in place; the same object is returned.
    col_list : list of str
        Names of the cumulative columns to clean.

    Returns
    -------
    pandas.DataFrame
        df, sorted by county and date, with the cleaned columns and the new
        'New Daily ...' columns.
    """
    VDH_filename = 'data/VDH-COVID-19-PublicUseDataset-Cases_cleaned.json'
    df.sort_values(by = ['County Code', 'Report Date'], inplace = True)
    # Use the earliest date of the whole frame as the lower bound of every county's
    # history slice. Taking the first row's date would skip the early rows of any
    # county whose records begin before those of the first county.
    first_date = df['Report Date'].min()
    df.set_index(['County Code', 'Report Date'], inplace = True)
    df.sort_index(inplace = True)

    for col in col_list:
        new_col = 'New Daily ' + col.replace('Total ', '')
        df[new_col] = df.groupby('County Code')[col].diff().fillna(0)

        print('Cleaning', col, 'data')
        # (county, date) keys at which the cumulative value was revised downwards.
        revised_keys = df.index[(df[new_col] < 0).to_numpy()].tolist()
        for key in revised_keys:
            ceiling = df.loc[key, col]
            # Only this county's rows up to the revision date can need lowering.
            history = df.loc[(key[0], first_date):key, col]
            too_high = history[history > ceiling].index
            if len(too_high) > 0:
                df.loc[too_high, col] = ceiling

        print(len(revised_keys), ' entries adjusted')
        df[new_col] = df.groupby('County Code')[col].diff().fillna(0).astype('int')

    df.reset_index(inplace = True)
    df.to_json(VDH_filename, orient = 'table')

    return df

In [5]:
def Get_Per_Pop(State_Code, scope, df, col):
    """Return df[col] divided by the matching 2019 population figures.

    Parameters
    ----------
    State_Code : passed unchanged to the population helper.
    scope : str
        'County' (population from Get_Counties_Pop19) or
        'Congressional District' (population from Get_Districts_Pop19).
    df : pandas.DataFrame
        Frame holding the column to normalise. The division aligns on index labels,
        so df is expected to share its index with the population table
        (assumption - confirm against the population helpers).
    col : column label in df.

    Returns
    -------
    pandas.Series
        df[col] / population['Population'].

    Raises
    ------
    ValueError
        If scope is not one of the two supported values.
    """
    if scope == 'County':
        Pop_df = Get_Counties_Pop19(State_Code)
    elif scope == 'Congressional District':
        Pop_df = Get_Districts_Pop19(State_Code)
    else:
        # Fail with a clear message instead of an unbound-variable error below.
        raise ValueError("scope must be 'County' or 'Congressional District', got " + repr(scope))

    return df.loc[:, col]/Pop_df.loc[:, 'Population']

In [6]:
def Add_Pop_data(State_Code, level, df, col_list):
    """Add a '<col> per Pop' column to df for every column in col_list.

    Parameters
    ----------
    State_Code : passed unchanged to the population helper.
    level : str
        'County' (population from Get_Counties_Pop19, rows keyed by 'County Code') or
        'Congressional District' (population from Get_Districts_Pop19, rows keyed by
        'CDistrict').
    df : pandas.DataFrame
        Must contain the key column for the chosen level and every column in col_list.
        NOTE: df is modified in place (sorted by the key column, which is moved to the
        front by the final reset_index) and the same object is returned.
    col_list : list of str
        Columns to divide by 'Population'.

    Returns
    -------
    pandas.DataFrame
        df with the added per-population columns.

    Raises
    ------
    ValueError
        If level is not one of the two supported values; df is left untouched.
    """
    if level == 'County':
        Pop_df = Get_Counties_Pop19(State_Code)
        key_col = 'County Code'
    elif level == 'Congressional District':
        Pop_df = Get_Districts_Pop19(State_Code)
        key_col = 'CDistrict'
    else:
        # Validate before touching df so a bad argument cannot leave it half-modified.
        raise ValueError("level must be 'County' or 'Congressional District', got " + repr(level))

    # Index by the key so the division below aligns each row with its population
    # (presumably the population table is indexed by the same key - confirm).
    df.set_index(key_col, inplace = True)
    df.sort_index(inplace = True)
    for col1 in col_list:
        col2 = col1 + ' per Pop'
        df[col2] = df.loc[:, col1]/Pop_df.loc[:, 'Population']

    df.reset_index(inplace = True)
    return df

In [7]:
def Get_Cleaned_VA_COVID_data(ByPass_Update = False):
    """Return the cleaned Virginia county table together with its ratio columns.

    The cleaned table is cached in
    'data/VDH-COVID-19-PublicUseDataset-Cases_cleaned.json'. The cache is read when
    it was last modified today, or when it is stale but ByPass_Update is True.
    In every other case the table is rebuilt with
    Clean_COVID_data(Get_VA_COVID_data(ByPass_Update), ...).

    Parameters
    ----------
    ByPass_Update : bool, default False
        Use a stale cache file instead of rebuilding; also forwarded to
        Get_VA_COVID_data when a rebuild happens.

    Returns
    -------
    pandas.DataFrame
        The cleaned table with 'County Code' cast to str and three extra columns:
        'Hospitalizations Ratio', 'Deaths Ratio' and 'Deaths per Hospitalizations'
        (NaN results of the divisions are replaced by 0).
    """
    cleaned_path = 'data/VDH-COVID-19-PublicUseDataset-Cases_cleaned.json'
    cumulative_cols = ['Total Cases', 'Hospitalizations', 'Deaths']

    # Decide first whether the cached file may be used, then load or rebuild once.
    read_cache = False
    if not os.path.isfile(cleaned_path):
        print('cleaned VDH file does not exist')
    else:
        print('Yay! cleaned VDH file exists')
        # check if file is current
        modified_on = dt.datetime.fromtimestamp(os.path.getmtime(cleaned_path)).date()
        print("cleaned VDH file was last modified on ", modified_on)
        today = dt.datetime.now().date()
        print("Today's date is: ", today)
        if modified_on == today:
            print('Yay, cleaned VDH file is current!')
            read_cache = True
        else:
            print('cleaned VDH file is not current')
            if ByPass_Update:
                print('Reading existing file without updating')
                read_cache = True

    if read_cache:
        cleaned = pd.read_json(cleaned_path, orient = 'table')
    else:
        raw = Get_VA_COVID_data(ByPass_Update)
        cleaned = Clean_COVID_data(raw, cumulative_cols)

    cleaned['County Code'] = cleaned['County Code'].astype(str)

    # (new column, numerator, denominator)
    ratio_specs = (
        ('Hospitalizations Ratio', 'Hospitalizations', 'Total Cases'),
        ('Deaths Ratio', 'Deaths', 'Total Cases'),
        ('Deaths per Hospitalizations', 'Deaths', 'Hospitalizations'),
    )
    for ratio_col, numerator, denominator in ratio_specs:
        quotient = cleaned[numerator].divide(cleaned[denominator])
        cleaned[ratio_col] = quotient.replace(np.nan, 0)

    return cleaned

In [8]:
def Get_CongressionalDistricts_COVID_data(State_Code, df_counties):
    """Aggregate county-level cumulative counts to congressional districts.

    Each county row is joined with the table from Get_District_County_Pop19
    (expected to provide 'County Code', 'CDistrict', 'County Name', 'Population'
    and 'Ratio in District'). Every cumulative count is multiplied by
    'Ratio in District' and rounded. The rounding surplus of each (county, date)
    group is then removed from the row(s) holding the group's largest 'Population',
    so the district pieces add up to the county value again. Finally the pieces
    are summed per (district, date).

    Parameters
    ----------
    State_Code : passed unchanged to Get_District_County_Pop19.
    df_counties : pandas.DataFrame
        County table containing 'County Code', 'Report Date', 'Total Cases',
        'Hospitalizations', 'Deaths' and a contiguous run of derived columns from
        'New Daily Cases' through 'Deaths per Hospitalizations' (these are dropped
        before the merge). It is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per ('CDistrict', 'Report Date') with 'Total Cases',
        'Hospitalizations', 'Deaths', the three 'New Daily ...' columns (int, 0 on
        each district's first date) and the three ratio columns (NaN replaced by 0).
    """
    metrics = ['Total Cases', 'Hospitalizations', 'Deaths']
    county_day = ['County Code', 'Report Date']

    # reset_index() returns a new frame, so the helper's result is not mutated.
    CD_Pop = Get_District_County_Pop19(State_Code).reset_index()
    col_list = df_counties.loc[:, 'New Daily Cases':'Deaths per Hospitalizations'].columns.to_list()
    df = pd.merge(CD_Pop.drop(columns = ['County Name']),
                  df_counties.drop(columns = col_list),
                  how = 'inner', on = 'County Code')

    df.set_index(['County Code', 'CDistrict'], inplace = True)

    # 1 on the row(s) with the largest population within each county/date group.
    # String aliases ('max', 'sum') are used instead of the Python builtins, which
    # recent pandas versions deprecate as transform arguments.
    isMax = (df.groupby(county_day)['Population'].transform('max') == df['Population']).astype(int)

    for metric in metrics:
        share_col = metric + ' in District'
        df[share_col] = df[metric].multiply(df['Ratio in District'], axis = 'index').round().astype(int)
        # Rounding can make the pieces differ from the county total; push the
        # difference onto the largest piece.
        surplus = df.groupby(county_day)[share_col].transform('sum') - df[metric]
        df[share_col] -= isMax*surplus

    share_cols = [metric + ' in District' for metric in metrics]
    df_districts = df.groupby(['CDistrict', 'Report Date'])[share_cols].sum()
    df_districts.columns = list(metrics)

    for metric in metrics:
        daily_col = 'New Daily ' + metric.replace('Total ', '')
        df_districts[daily_col] = df_districts.groupby('CDistrict')[metric].diff().fillna(0).astype('int')

    df_districts['Hospitalizations Ratio'] = (df_districts['Hospitalizations'].
                                              divide(df_districts['Total Cases'])).replace(np.nan, 0)
    df_districts['Deaths Ratio'] = (df_districts['Deaths'].divide(df_districts['Total Cases'])).replace(np.nan, 0)
    df_districts['Deaths per Hospitalizations'] = (df_districts['Deaths'].
                                                   divide(df_districts['Hospitalizations'])).replace(np.nan, 0)
    df_districts.reset_index(inplace = True)

    return df_districts

# <font color='red'>=============================================================</font>