Checks whether the most recent dates are the same for each region.

In [122]:
import os
import pandas as pd
import numpy as np

In [123]:
# Directory the notebook runs from (expected: ./ongoing/prescriptors)
current_dir = os.path.abspath('')
# Its parent directory (expected: ./ongoing)
ROOT_DIR = os.path.dirname(current_dir)
# All input files live in the sibling "data" directory
DATA_PATH = os.path.join(ROOT_DIR, 'data')
DATA_FILE_PATH = os.path.join(DATA_PATH, 'OxCGRT_latest.csv')
ADDITIONAL_CONTEXT_FILE = os.path.join(DATA_PATH, "Additional_Context_Data_Global.csv")
ADDITIONAL_US_STATES_CONTEXT = os.path.join(DATA_PATH, "US_states_populations.csv")
ADDITIONAL_UK_CONTEXT = os.path.join(DATA_PATH, "uk_populations.csv")

In [124]:
# Policy indicator columns kept from the Oxford data; missing values in
# these columns are forward-filled per GeoID by fill_missing_values
NPI_COLUMNS = [
    'C1_School closing',
    'C2_Workplace closing',
    'C3_Cancel public events',
    'C4_Restrictions on gatherings',
    'C5_Close public transport',
    'C6_Stay at home requirements',
    'C7_Restrictions on internal movement',
    'C8_International travel controls',
    'H1_Public information campaigns',
    'H2_Testing policy',
    'H3_Contact tracing',
    'H6_Facial Coverings',
]

# Identification / epidemiological columns kept alongside the NPI columns
CONTEXT_COLUMNS = [
    'CountryName',
    'RegionName',
    'GeoID',
    'Date',
    'ConfirmedCases',
    'ConfirmedDeaths',
    'Population',
]

NB_LOOKBACK_DAYS = 21
NB_TEST_DAYS = 14
# Number of days in the rolling mean used to smooth new cases / deaths
WINDOW_SIZE = 7
# Prepended to US state names so they match the "Country / Region" GeoID format
US_PREFIX = "United States / "
NUM_TRIALS = 10
LSTM_SIZE = 32
MAX_NB_COUNTRIES = 20

In [125]:
# Load the Oxford (OxCGRT) data
csv_options = dict(parse_dates=['Date'],
                   encoding="ISO-8859-1",
                   dtype={"RegionName": str,
                          "RegionCode": str})
try:
    # pandas >= 1.3: malformed rows are skipped with a warning, which is what
    # error_bad_lines=False (with the default warn_bad_lines=True) used to do
    df = pd.read_csv(DATA_FILE_PATH, on_bad_lines="warn", **csv_options)
except TypeError:
    # Older pandas does not know on_bad_lines; fall back to the legacy flag
    # (removed in pandas 2.0)
    df = pd.read_csv(DATA_FILE_PATH, error_bad_lines=False, **csv_options)

# GeoID identifies a row's geography: the country name for national rows,
# "Country / Region" when a region name is present
df["GeoID"] = np.where(df["RegionName"].isnull(),
                       df["CountryName"],
                       df["CountryName"] + ' / ' + df["RegionName"])
df

Unnamed: 0.1,Unnamed: 0,CountryName,CountryCode,RegionName,RegionCode,Jurisdiction,Date,C1_School closing,C1_Flag,C2_Workplace closing,...,StringencyIndexForDisplay,StringencyLegacyIndex,StringencyLegacyIndexForDisplay,GovernmentResponseIndex,GovernmentResponseIndexForDisplay,ContainmentHealthIndex,ContainmentHealthIndexForDisplay,EconomicSupportIndex,EconomicSupportIndexForDisplay,GeoID
0,0,Aruba,ABW,,,NAT_TOTAL,2020-01-01,0.0,,0.0,...,0.00,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.0,Aruba
1,1,Aruba,ABW,,,NAT_TOTAL,2020-01-02,0.0,,0.0,...,0.00,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.0,Aruba
2,2,Aruba,ABW,,,NAT_TOTAL,2020-01-03,0.0,,0.0,...,0.00,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.0,Aruba
3,3,Aruba,ABW,,,NAT_TOTAL,2020-01-04,0.0,,0.0,...,0.00,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.0,Aruba
4,4,Aruba,ABW,,,NAT_TOTAL,2020-01-05,0.0,,0.0,...,0.00,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.0,Aruba
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111435,111435,Zimbabwe,ZWE,,,NAT_TOTAL,2021-01-28,2.0,1.0,3.0,...,84.26,85.71,85.71,65.0,65.0,71.15,71.15,25.0,25.0,Zimbabwe
111436,111436,Zimbabwe,ZWE,,,NAT_TOTAL,2021-01-29,2.0,1.0,3.0,...,84.26,85.71,85.71,65.0,65.0,71.15,71.15,25.0,25.0,Zimbabwe
111437,111437,Zimbabwe,ZWE,,,NAT_TOTAL,2021-01-30,2.0,1.0,3.0,...,84.26,85.71,85.71,65.0,65.0,71.15,71.15,25.0,25.0,Zimbabwe
111438,111438,Zimbabwe,ZWE,,,NAT_TOTAL,2021-01-31,2.0,1.0,3.0,...,84.26,85.71,85.71,65.0,65.0,71.15,71.15,25.0,25.0,Zimbabwe


In [126]:
# Load additional context data
# File containing the population for each country
# Note: this file contains only countries population, not regions
additional_context_df = pd.read_csv(ADDITIONAL_CONTEXT_FILE,
                                    usecols=['CountryName', 'Population'])
# Countries are keyed by their name, like the national rows of the Oxford data
additional_context_df['GeoID'] = additional_context_df['CountryName']

# US states population
additional_us_states_df = pd.read_csv(ADDITIONAL_US_STATES_CONTEXT,
                                      usecols=['NAME', 'POPESTIMATE2019'])
# Rename the population column to match the countries table
additional_us_states_df = additional_us_states_df.rename(
    columns={'POPESTIMATE2019': 'Population'})
# Prefix with the country name to match the "Country / Region" GeoID format
additional_us_states_df['GeoID'] = US_PREFIX + additional_us_states_df['NAME']

# UK population
# NOTE(review): presumably already provides GeoID and Population columns - confirm
additional_uk_df = pd.read_csv(ADDITIONAL_UK_CONTEXT)

# Stack the three tables. DataFrame.append was removed in pandas 2.0;
# pd.concat with its defaults keeps the same row labels and column union.
additional_context_df = pd.concat(
    [additional_context_df, additional_us_states_df, additional_uk_df])


In [127]:
# Display the combined population lookup built from the country, US state and UK files
additional_context_df

Unnamed: 0,CountryName,Population,GeoID,NAME
0,Afghanistan,38928346,Afghanistan,
1,Albania,2877797,Albania,
2,Algeria,43851044,Algeria,
3,Andorra,77265,Andorra,
4,Angola,32866272,Angola,
5,Argentina,45195774,Argentina,
6,Aruba,106766,Aruba,
7,Australia,25499884,Australia,
8,Austria,9006398,Austria,
9,Azerbaijan,10127874,Azerbaijan,


In [128]:
def fill_missing_values(df, npi_columns=None):
    """
    Fill missing values in place: interpolate the case and death counts
    within each GeoID, drop the rows that still lack them, then forward-fill
    the NPI columns within each GeoID (remaining gaps become 0).
    :param df: Dataframe to be filled; it is modified in place. Must contain
        'GeoID', 'ConfirmedCases', 'ConfirmedDeaths' and the NPI columns
    :param npi_columns: names of the NPI columns to fill; defaults to the
        module-level NPI_COLUMNS
    :return: the same Dataframe object that was passed in
    """
    if npi_columns is None:
        npi_columns = NPI_COLUMNS

    for count_column in ('ConfirmedCases', 'ConfirmedDeaths'):
        # transform (rather than apply) always returns a result carrying the
        # original row index, so update() aligns with df whatever pandas'
        # group_keys default is. limit_area='inside' only fills gaps that are
        # surrounded by valid values; leading/trailing NaNs are left alone.
        df.update(df.groupby('GeoID')[count_column].transform(
            lambda group: group.interpolate(limit_area='inside')))
        # Drop country / regions rows for which no value is available
        df.dropna(subset=[count_column], inplace=True)

    for npi_column in npi_columns:
        # Carry the last known value forward; 0 where nothing was reported yet
        df.update(df.groupby('GeoID')[npi_column].ffill().fillna(0))
    return df

In [129]:
def prepare_dataframe(data, add_data) -> pd.DataFrame:
    """
    Merges the Oxford data with the additional context data, cleans the
    result up and derives the case / death columns used downstream.
    Neither input frame is modified.
    :param data: DataFrame with the Oxford records; must contain 'GeoID' and
        the CONTEXT_COLUMNS / NPI_COLUMNS it is expected to provide
    :param add_data: DataFrame keyed by 'GeoID' with the additional context
        (expected to supply 'Population')
    :return: a Pandas DataFrame with the historical data, restricted to
        CONTEXT_COLUMNS + NPI_COLUMNS plus the derived columns NewCases,
        NewDeaths, SmoothNewCases, SmoothNewDeaths, CaseRatio, DeathRatio,
        ProportionInfected and PredictionRatio
    """
    # Left-join so every Oxford row is kept; a column present in both frames
    # keeps its Oxford name and the context copy gets the '_y' suffix
    df = data.merge(add_data, on=['GeoID'], how='left', suffixes=('', '_y'))

    # Drop countries with no population data
    df = df.dropna(subset=['Population'])

    # Keep only needed columns. The explicit copy makes the frame independent
    # of the merge result, so the in-place edits below do not trigger
    # SettingWithCopyWarning.
    df = df[CONTEXT_COLUMNS + NPI_COLUMNS].copy()

    # Fill in missing values
    df = fill_missing_values(df)

    # Compute number of new cases and deaths each day
    df['NewCases'] = df.groupby('GeoID').ConfirmedCases.diff().fillna(0)
    df['NewDeaths'] = df.groupby('GeoID').ConfirmedDeaths.diff().fillna(0)

    # Replace negative values (which do not make sense for these columns) with 0
    df['NewCases'] = df['NewCases'].clip(lower=0)
    df['NewDeaths'] = df['NewDeaths'].clip(lower=0)

    # Compute smoothed versions of new cases and deaths each day.
    # The grouped rolling result is indexed by (GeoID, row label); dropping
    # level 0 restores the row labels so the assignment aligns with df.
    df['SmoothNewCases'] = df.groupby('GeoID')['NewCases'].rolling(
        WINDOW_SIZE, center=False).mean().fillna(0).reset_index(0, drop=True)
    df['SmoothNewDeaths'] = df.groupby('GeoID')['NewDeaths'].rolling(
        WINDOW_SIZE, center=False).mean().fillna(0).reset_index(0, drop=True)

    # Compute percent change in new cases and deaths each day, expressed as a
    # ratio (1 + change); undefined (NaN) and infinite changes count as ratio 1
    df['CaseRatio'] = df.groupby('GeoID').SmoothNewCases.pct_change(
    ).fillna(0).replace(np.inf, 0) + 1
    df['DeathRatio'] = df.groupby('GeoID').SmoothNewDeaths.pct_change(
    ).fillna(0).replace(np.inf, 0) + 1

    # Add column for proportion of population infected
    df['ProportionInfected'] = df['ConfirmedCases'] / df['Population']

    # Create column of value to predict
    df['PredictionRatio'] = df['CaseRatio'] / (1 - df['ProportionInfected'])

    return df

In [130]:
# Merge the Oxford data with the population context and derive the per-GeoID columns
d = prepare_dataframe(df, additional_context_df)  # preprocess

In [131]:
# Raise the row display limit so the per-region listing is not truncated
pd.set_option("display.max_rows", 4000)
# Most recent date available for each region
d.groupby("GeoID")["Date"].max()

GeoID
Afghanistan                         2021-01-31
Albania                             2021-01-31
Algeria                             2021-01-31
Andorra                             2021-01-31
Angola                              2021-01-31
Argentina                           2021-01-31
Aruba                               2021-01-31
Australia                           2021-01-31
Austria                             2021-01-31
Azerbaijan                          2021-01-31
Bahamas                             2021-01-31
Bahrain                             2021-01-31
Bangladesh                          2021-01-31
Barbados                            2021-01-31
Belarus                             2021-01-31
Belgium                             2021-01-31
Belize                              2021-01-31
Benin                               2021-01-31
Bermuda                             2021-01-31
Bhutan                              2021-01-31
Bolivia                             2021-01-31
Bosnia 

In [132]:
# The earliest of the per-region latest dates: no region's data ends before this date
d.groupby(["GeoID"])["Date"].max().min()  # get minimum of the maximum dates for each region

Timestamp('2021-01-31 00:00:00')