In [1]:
import os
import pandas as pd
import numpy as np
import time

# Load data set

In [2]:
LATEST_DATA_URL = 'https://raw.githubusercontent.com/OxCGRT/covid-policy-tracker/master/data/OxCGRT_latest.csv'

In [3]:
def load_dataset(url):
    latest_df = pd.read_csv(url,
                            parse_dates=['Date'],
                            encoding="ISO-8859-1",
                            dtype={"RegionName": str,
                                   "RegionCode": str},
                            error_bad_lines=False)
    latest_df["RegionName"] = latest_df["RegionName"].fillna("")
    return latest_df

In [4]:
latest_df = load_dataset(LATEST_DATA_URL)

In [5]:
latest_df.sample(3)

Unnamed: 0,CountryName,CountryCode,RegionName,RegionCode,Jurisdiction,Date,C1_School closing,C1_Flag,C2_Workplace closing,C2_Flag,...,StringencyIndex,StringencyIndexForDisplay,StringencyLegacyIndex,StringencyLegacyIndexForDisplay,GovernmentResponseIndex,GovernmentResponseIndexForDisplay,ContainmentHealthIndex,ContainmentHealthIndexForDisplay,EconomicSupportIndex,EconomicSupportIndexForDisplay
52246,Panama,PAN,,,NAT_TOTAL,2020-05-02,3.0,1.0,3.0,1.0,...,93.52,93.52,100.0,100.0,77.38,77.38,79.86,79.86,62.5,62.5
40110,Kyrgyz Republic,KGZ,,,NAT_TOTAL,2020-09-27,2.0,1.0,2.0,0.0,...,67.13,67.13,66.67,66.67,61.61,61.61,63.54,63.54,50.0,50.0
11073,Brazil,BRA,Minas Gerais,BR_MG,STATE_TOTAL,2020-04-27,3.0,1.0,2.0,1.0,...,81.94,81.94,91.67,91.67,,,77.78,77.78,,


# Get NPIs

In [6]:
NPI_COLUMNS = ['C1_School closing',
               'C2_Workplace closing',
               'C3_Cancel public events',
               'C4_Restrictions on gatherings',
               'C5_Close public transport',
               'C6_Stay at home requirements',
               'C7_Restrictions on internal movement',
               'C8_International travel controls',
               'H1_Public information campaigns',
               'H2_Testing policy',
               'H3_Contact tracing',
               'H6_Facial Coverings']

In [7]:
npis_df = latest_df[["CountryName", "RegionName", "Date"] + NPI_COLUMNS]

In [8]:
npis_df.sample(3)

Unnamed: 0,CountryName,RegionName,Date,C1_School closing,C2_Workplace closing,C3_Cancel public events,C4_Restrictions on gatherings,C5_Close public transport,C6_Stay at home requirements,C7_Restrictions on internal movement,C8_International travel controls,H1_Public information campaigns,H2_Testing policy,H3_Contact tracing,H6_Facial Coverings
39186,Kazakhstan,,2020-01-11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2030,Argentina,,2020-02-08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0
56826,Saudi Arabia,,2020-02-24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,2.0,0.0


# Dates

In [9]:
# start_date_str = "2020-08-01"
# end_date_str = "2020-08-04"
start_date_str = "2020-05-07"
end_date_str = "2020-05-20"

In [10]:
start_date = pd.to_datetime(start_date_str, format='%Y-%m-%d')
end_date = pd.to_datetime(end_date_str, format='%Y-%m-%d')

In [11]:
actual_npis_df = npis_df[(npis_df.Date >= start_date) & (npis_df.Date <= end_date)]
actual_npis_df.sample(3)

Unnamed: 0,CountryName,RegionName,Date,C1_School closing,C2_Workplace closing,C3_Cancel public events,C4_Restrictions on gatherings,C5_Close public transport,C6_Stay at home requirements,C7_Restrictions on internal movement,C8_International travel controls,H1_Public information campaigns,H2_Testing policy,H3_Contact tracing,H6_Facial Coverings
1132,Albania,,2020-05-16,3.0,2.0,2.0,4.0,2.0,2.0,2.0,4.0,2.0,2.0,1.0,0.0
78159,United States,New Jersey,2020-05-19,3.0,2.0,2.0,4.0,1.0,2.0,1.0,3.0,2.0,3.0,1.0,2.0
11425,Brazil,Mato Grosso do Sul,2020-05-17,3.0,2.0,2.0,4.0,1.0,2.0,2.0,4.0,2.0,2.0,2.0,2.0


# Get actual cases between these dates

In [12]:
NUM_PREV_DAYS_TO_INCLUDE = 6
WINDOW_SIZE = 7

In [13]:
def get_actual_cases(df, start_date, end_date):
    # 1 day earlier to compute the daily diff
    start_date_for_diff = start_date - pd.offsets.Day(WINDOW_SIZE)
    actual_df = df[["CountryName", "RegionName", "Date", "ConfirmedCases"]]
    # Filter out the data set to include all the data needed to compute the diff
    actual_df = actual_df[(actual_df.Date >= start_date_for_diff) & (actual_df.Date <= end_date)]
    # Add GeoID column that combines CountryName and RegionName for easier manipulation of data
    # np.where usage: if A then B else C
    actual_df["GeoID"] = np.where(actual_df["RegionName"].isnull(),
                                  actual_df["CountryName"],
                                  actual_df["CountryName"] + ' / ' + actual_df["RegionName"])
    actual_df.sort_values(by=["GeoID","Date"], inplace=True)
    # Compute the diff
    actual_df["ActualDailyNewCases"] = actual_df.groupby("GeoID")["ConfirmedCases"].diff().fillna(0)
    # Compute the 7 day moving average
    actual_df["ActualDailyNewCases7DMA"] = actual_df.groupby(
        "GeoID")['ActualDailyNewCases'].rolling(
        WINDOW_SIZE, center=False).mean().reset_index(0, drop=True)
    return actual_df

In [14]:
actual_df = get_actual_cases(latest_df, start_date, end_date)

In [15]:
actual_df.head(12)

Unnamed: 0,CountryName,RegionName,Date,ConfirmedCases,GeoID,ActualDailyNewCases,ActualDailyNewCases7DMA
452,Afghanistan,,2020-04-30,1949.0,Afghanistan /,0.0,
453,Afghanistan,,2020-05-01,2171.0,Afghanistan /,222.0,
454,Afghanistan,,2020-05-02,2335.0,Afghanistan /,164.0,
455,Afghanistan,,2020-05-03,2469.0,Afghanistan /,134.0,
456,Afghanistan,,2020-05-04,2704.0,Afghanistan /,235.0,
457,Afghanistan,,2020-05-05,2894.0,Afghanistan /,190.0,
458,Afghanistan,,2020-05-06,3224.0,Afghanistan /,330.0,182.142857
459,Afghanistan,,2020-05-07,3392.0,Afghanistan /,168.0,206.142857
460,Afghanistan,,2020-05-08,3563.0,Afghanistan /,171.0,198.857143
461,Afghanistan,,2020-05-09,3778.0,Afghanistan /,215.0,206.142857


# Get historical data for 7 days moving average calculation
In order to compute the 7 days moving average, we need to get the historical true new cases for the last 7 days before start date

In [16]:
ma_df = actual_df[actual_df["Date"] < start_date]
ma_df = ma_df[["CountryName", "RegionName", "Date", "ActualDailyNewCases"]]
ma_df = ma_df.rename(columns={"ActualDailyNewCases": "PredictedDailyNewCases"})
ma_df.head()

Unnamed: 0,CountryName,RegionName,Date,PredictedDailyNewCases
452,Afghanistan,,2020-04-30,0.0
453,Afghanistan,,2020-05-01,222.0
454,Afghanistan,,2020-05-02,164.0
455,Afghanistan,,2020-05-03,134.0
456,Afghanistan,,2020-05-04,235.0


# Run the predictions
Evaluate some example submissions.  
__NOTE: Please run the corresponding example notebooks first in order to train the models that are used in this section.__

In [17]:
IP_FILE = "covid_xprize/validation/data/2020-09-30_historical_ip.csv"
predictions = {}

## Linear

In [None]:
# Check a model has been trained
if not os.path.isfile("covid_xprize/examples/predictors/linear/models/model.pkl"):
    print("ERROR: Please run the notebook in 'covid_xprize/examples/predictors/linear' in order to train a model!")

In [None]:
linear_output_file = "covid_xprize/examples/predictors/linear/predictions/val_4_days.csv"

In [None]:
!python covid_xprize/examples/predictors/linear/predict.py -s {start_date_str} -e {end_date_str} -ip {IP_FILE} -o {linear_output_file}

In [None]:
predictions["Linear"] = linear_output_file

## LSTM

In [None]:
# Check a model has been trained
#if not os.path.isfile("covid_xprize/examples/predictors/lstm/models/trained_model_weights.h5"):
#    print("ERROR: Please run the notebook in 'covid_xprize/examples/predictors/lstm' in order to train a model!")

LSTM_MODEL_FILE = "covid_xprize/examples/predictors/lstm/models/lstm_2020-05-06.h5"
if not os.path.isfile(LSTM_MODEL_FILE):
    print("ERROR: Please run the notebook in 'covid_xprize/examples/predictors/lstm' in order to train a model!")

model_weights = "covid_xprize/examples/predictors/lstm/models/trained_model_weights.h5"     
!ln -sf {os.path.join(os.getcwd(), LSTM_MODEL_FILE)} {os.path.join(os.getcwd(), model_weights)}

In [18]:
#lstm_output_file = "covid_xprize/examples/predictors/lstm/predictions/val_4_days.csv"
lstm_output_file = "covid_xprize/examples/predictors/lstm/predictions/val_"+start_date_str+"_"+end_date_str+".csv"

In [None]:
t0 = time.time()
!python covid_xprize/examples/predictors/lstm/predict.py -s {start_date_str} -e {end_date_str} -ip {IP_FILE} -o {lstm_output_file}
elapsed_time = time.time() - t0
time.strftime("Predictions took %M:%S", time.gmtime(elapsed_time))

In [19]:
predictions["LSTM"] = lstm_output_file

## geoLSTM

In [None]:
geoLSTM_MODEL_FILE = "covid_xprize/examples/predictors/geolstm/models/geolstm_2020-05-06.h5"
if not os.path.isfile(geoLSTM_MODEL_FILE):
    print("ERROR: Please run the notebook in 'covid_xprize/examples/predictors/geolstm' in order to train a model!")

model_weights = "covid_xprize/examples/predictors/geolstm/models/trained_model_weights.h5"     
!ln -sf {os.path.join(os.getcwd(), geoLSTM_MODEL_FILE)} {os.path.join(os.getcwd(), model_weights)}

In [20]:
geolstm_output_file = "covid_xprize/examples/predictors/geolstm/predictions/val_geolstm_"+start_date_str+"_"+end_date_str+".csv"

In [None]:
t0 = time.time()
!python covid_xprize/examples/predictors/geolstm/predict.py -s {start_date_str} -e {end_date_str} -ip {IP_FILE} -o {geolstm_output_file}
elapsed_time = time.time() - t0
time.strftime("Predictions took %M:%S", time.gmtime(elapsed_time))

In [21]:
predictions["geoLSTM"] = geolstm_output_file

## tempGeoLSTM

In [None]:
tempGeoLSTM_MODEL_FILE = "covid_xprize/examples/predictors/tempgeolstm/models/tempgeolstm_2020-05-06.h5"
if not os.path.isfile(tempGeoLSTM_MODEL_FILE):
    print("ERROR: Please run the notebook in 'covid_xprize/examples/predictors/tempgeolstm' in order to train a model!")

model_weights = "covid_xprize/examples/predictors/tempgeolstm/models/trained_model_weights.h5"     
!ln -sf {os.path.join(os.getcwd(), tempGeoLSTM_MODEL_FILE)} {os.path.join(os.getcwd(), model_weights)}

In [22]:
tempgeolstm_output_file = "covid_xprize/examples/predictors/tempgeolstm/predictions/val_geolstm_"+start_date_str+"_"+end_date_str+".csv"

In [None]:
t0 = time.time()
!python covid_xprize/examples/predictors/tempgeolstm/predict.py -s {start_date_str} -e {end_date_str} -ip {IP_FILE} -o {tempgeolstm_output_file}
elapsed_time = time.time() - t0
time.strftime("Predictions took %M:%S", time.gmtime(elapsed_time))

In [23]:
predictions["tempGeoLSTM"] = tempgeolstm_output_file

# Get predictions from submissions

In [24]:
def get_predictions_from_file(predictor_name, predictions_file, ma_df):
    preds_df = pd.read_csv(predictions_file,
                           parse_dates=['Date'],
                           encoding="ISO-8859-1",
                           error_bad_lines=False)
    preds_df["RegionName"] = preds_df["RegionName"].fillna("")
    preds_df["PredictorName"] = predictor_name
    preds_df["Prediction"] = True
    
    # Append the true number of cases before start date
    ma_df["PredictorName"] = predictor_name
    ma_df["Prediction"] = False
    preds_df = ma_df.append(preds_df, ignore_index=True)

    # Add GeoID column that combines CountryName and RegionName for easier manipulation of data
    # np.where usage: if A then B else C
    preds_df["GeoID"] = np.where(preds_df["RegionName"].isnull(),
                                 preds_df["CountryName"],
                                 preds_df["CountryName"] + ' / ' + preds_df["RegionName"])
    # Sort
    preds_df.sort_values(by=["GeoID","Date"], inplace=True)
    # Compute the 7 days moving average for PredictedDailyNewCases
    preds_df["PredictedDailyNewCases7DMA"] = preds_df.groupby(
        "GeoID")['PredictedDailyNewCases'].rolling(
        WINDOW_SIZE, center=False).mean().reset_index(0, drop=True)

    # Put PredictorName first
    preds_df = preds_df[["PredictorName"] + [col for col in preds_df.columns if col != "PredictorName"]]
    return preds_df

In [25]:
test_predictor_name = "Linear"
temp_df = get_predictions_from_file(test_predictor_name, predictions[test_predictor_name], ma_df.copy())
temp_df.head(12)

KeyError: 'Linear'

In [26]:
actual_df.head(8)

Unnamed: 0,CountryName,RegionName,Date,ConfirmedCases,GeoID,ActualDailyNewCases,ActualDailyNewCases7DMA
452,Afghanistan,,2020-04-30,1949.0,Afghanistan /,0.0,
453,Afghanistan,,2020-05-01,2171.0,Afghanistan /,222.0,
454,Afghanistan,,2020-05-02,2335.0,Afghanistan /,164.0,
455,Afghanistan,,2020-05-03,2469.0,Afghanistan /,134.0,
456,Afghanistan,,2020-05-04,2704.0,Afghanistan /,235.0,
457,Afghanistan,,2020-05-05,2894.0,Afghanistan /,190.0,
458,Afghanistan,,2020-05-06,3224.0,Afghanistan /,330.0,182.142857
459,Afghanistan,,2020-05-07,3392.0,Afghanistan /,168.0,206.142857


In [27]:
from covid_xprize.validation.predictor_validation import validate_submission

ranking_df = pd.DataFrame()
for predictor_name, predictions_file in predictions.items():
    print(f"Getting {predictor_name}'s predictions from: {predictions_file}")
    errors = validate_submission(start_date_str, end_date_str, IP_FILE, predictions_file)
    if not errors:
        preds_df = get_predictions_from_file(predictor_name, predictions_file, ma_df)
        merged_df = actual_df.merge(preds_df, on=['CountryName', 'RegionName', 'Date', 'GeoID'], how='left')
        ranking_df = ranking_df.append(merged_df)
    else:
        print(f"Predictor {predictor_name} did not submit valid predictions! Please check its errors:")
        print(errors)

Getting LSTM's predictions from: covid_xprize/examples/predictors/lstm/predictions/val_2020-05-07_2020-05-20.csv
Getting geoLSTM's predictions from: covid_xprize/examples/predictors/geolstm/predictions/val_geolstm_2020-05-07_2020-05-20.csv
Getting tempGeoLSTM's predictions from: covid_xprize/examples/predictors/tempgeolstm/predictions/val_geolstm_2020-05-07_2020-05-20.csv


In [28]:
ranking_df['DiffDaily'] = (ranking_df["ActualDailyNewCases"] - ranking_df["PredictedDailyNewCases"]).abs()

In [29]:
ranking_df['Diff7DMA'] = (ranking_df["ActualDailyNewCases7DMA"] - ranking_df["PredictedDailyNewCases7DMA"]).abs()

In [30]:
# Compute the cumulative sum of 7DMA errors
ranking_df['CumulDiff7DMA'] = ranking_df.groupby(["GeoID", "PredictorName"])['Diff7DMA'].cumsum()

In [31]:
# Keep only predictions (either Prediction == True) or on or after start_date
ranking_df = ranking_df[ranking_df["Date"] >= start_date]

In [32]:
# Sort by 7 days moving average diff
ranking_df.sort_values(by=["CountryName","RegionName","Date","Diff7DMA"], inplace=True)

In [33]:
ranking_df.head(4*2)

Unnamed: 0,CountryName,RegionName,Date,ConfirmedCases,GeoID,ActualDailyNewCases,ActualDailyNewCases7DMA,PredictorName,PredictedDailyNewCases,Prediction,PredictedDailyNewCases7DMA,DiffDaily,Diff7DMA,CumulDiff7DMA
7,Afghanistan,,2020-05-07,3392.0,Afghanistan /,168.0,206.142857,LSTM,174.593546,True,207.084792,6.593546,0.941935,0.941935
7,Afghanistan,,2020-05-07,3392.0,Afghanistan /,168.0,206.142857,geoLSTM,196.922925,True,210.274704,28.922925,4.131846,4.131846
7,Afghanistan,,2020-05-07,3392.0,Afghanistan /,168.0,206.142857,tempGeoLSTM,0.0,True,182.142857,168.0,24.0,24.0
8,Afghanistan,,2020-05-08,3563.0,Afghanistan /,171.0,198.857143,LSTM,264.197474,True,213.113003,93.197474,14.25586,15.197795
8,Afghanistan,,2020-05-08,3563.0,Afghanistan /,171.0,198.857143,geoLSTM,309.327944,True,222.750124,138.327944,23.892981,28.024828
8,Afghanistan,,2020-05-08,3563.0,Afghanistan /,171.0,198.857143,tempGeoLSTM,0.0,True,150.428571,171.0,48.428571,72.428571
9,Afghanistan,,2020-05-09,3778.0,Afghanistan /,215.0,206.142857,LSTM,197.179111,True,217.852876,17.820889,11.710019,26.907814
9,Afghanistan,,2020-05-09,3778.0,Afghanistan /,215.0,206.142857,geoLSTM,139.69082,True,219.277384,75.30918,13.134527,41.159355


In [34]:
ranking_df[(ranking_df.CountryName == "United States") &
           (ranking_df.Date == '2020-08-02')]

Unnamed: 0,CountryName,RegionName,Date,ConfirmedCases,GeoID,ActualDailyNewCases,ActualDailyNewCases7DMA,PredictorName,PredictedDailyNewCases,Prediction,PredictedDailyNewCases7DMA,DiffDaily,Diff7DMA,CumulDiff7DMA


In [35]:
# Save to file
# ranking_df.to_csv("/Users/m_754337/workspace/esp-demo/xprize/tests/fixtures/ranking.csv", index=False)

# Ranking

## Global

In [36]:
ranking_df.groupby('PredictorName').Diff7DMA.sum().sort_values()

PredictorName
geoLSTM        2.821157e+05
LSTM           3.008707e+05
tempGeoLSTM    1.216442e+06
Name: Diff7DMA, dtype: float64

## Countries

In [37]:
countries_ranking_df = ranking_df.groupby(["CountryName", "RegionName", "PredictorName"])[["CountryName", "RegionName", "PredictorName", "Diff7DMA"]].sum().sort_values(by=["CountryName", "RegionName", "Diff7DMA"])


In [38]:
countries_ranking_df.head(12)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Diff7DMA
CountryName,RegionName,PredictorName,Unnamed: 3_level_1
Afghanistan,,geoLSTM,778.096144
Afghanistan,,LSTM,827.622561
Afghanistan,,tempGeoLSTM,3170.0
Albania,,LSTM,25.239115
Albania,,tempGeoLSTM,27.678302
Albania,,geoLSTM,33.802741
Algeria,,LSTM,191.200801
Algeria,,geoLSTM,404.371846
Algeria,,tempGeoLSTM,1982.285714
Andorra,,LSTM,2.425602


## Specific country

In [39]:
cr_df = countries_ranking_df.reset_index()

In [40]:
cr_df[(cr_df.CountryName == "Italy") & (cr_df.RegionName == "")]

Unnamed: 0,CountryName,RegionName,PredictorName,Diff7DMA
243,Italy,,LSTM,3760.249984
244,Italy,,geoLSTM,4649.462916
245,Italy,,tempGeoLSTM,11539.142857


In [41]:
ranking_df[ranking_df.CountryName == "Italy"]

Unnamed: 0,CountryName,RegionName,Date,ConfirmedCases,GeoID,ActualDailyNewCases,ActualDailyNewCases7DMA,PredictorName,PredictedDailyNewCases,Prediction,PredictedDailyNewCases7DMA,DiffDaily,Diff7DMA,CumulDiff7DMA
2317,Italy,,2020-05-07,214457.0,Italy /,1444.0,1552.285714,geoLSTM,1590.960609,True,1573.280087,146.960609,20.994373,20.994373
2317,Italy,,2020-05-07,214457.0,Italy /,1444.0,1552.285714,LSTM,1723.044154,True,1592.149165,279.044154,39.863451,39.863451
2317,Italy,,2020-05-07,214457.0,Italy /,1444.0,1552.285714,tempGeoLSTM,0.0,True,1346.0,1444.0,206.285714,206.285714
2318,Italy,,2020-05-08,215858.0,Italy /,1401.0,1485.0,geoLSTM,1579.398137,True,1531.479821,178.398137,46.479821,67.474194
2318,Italy,,2020-05-08,215858.0,Italy /,1401.0,1485.0,LSTM,1565.332961,True,1548.339588,164.332961,63.339588,103.203038
2318,Italy,,2020-05-08,215858.0,Italy /,1401.0,1485.0,tempGeoLSTM,0.0,True,1078.571429,1401.0,406.428571,612.714286
2319,Italy,,2020-05-09,217185.0,Italy /,1327.0,1393.857143,geoLSTM,1814.049123,True,1509.91541,487.049123,116.058267,183.532461
2319,Italy,,2020-05-09,217185.0,Italy /,1327.0,1393.857143,LSTM,1701.03865,True,1510.630824,374.03865,116.773681,219.976719
2319,Italy,,2020-05-09,217185.0,Italy /,1327.0,1393.857143,tempGeoLSTM,0.0,True,797.857143,1327.0,596.0,1208.714286
2320,Italy,,2020-05-10,218268.0,Italy /,1083.0,1277.142857,LSTM,1667.703372,True,1477.445591,584.703372,200.302734,420.279453


## Specific country (group by)

In [42]:
ranking_df[(ranking_df.CountryName == "United States") & (ranking_df.RegionName == "")].groupby(["PredictorName"]).Diff7DMA.sum().sort_values()

PredictorName
geoLSTM         10704.685283
LSTM            11905.894076
tempGeoLSTM    258930.571429
Name: Diff7DMA, dtype: float64

## Specific region

In [43]:
cr_df[(cr_df.CountryName == "United States") & (cr_df.RegionName == "California")]

Unnamed: 0,CountryName,RegionName,PredictorName,Diff7DMA
534,United States,California,LSTM,1064.275534
535,United States,California,geoLSTM,1566.138828
536,United States,California,tempGeoLSTM,19634.714286


## Continent

In [44]:
NORTH_AMERICA = ["Canada", "United States", "Mexico"]

In [45]:
cr_df[(cr_df.CountryName.isin(NORTH_AMERICA)) & (cr_df.RegionName == "")].groupby('PredictorName').Diff7DMA.sum().sort_values().reset_index()

Unnamed: 0,PredictorName,Diff7DMA
0,geoLSTM,20714.702524
1,LSTM,26559.779795
2,tempGeoLSTM,300178.428571


In [46]:
cr_df[(cr_df.CountryName.isin(NORTH_AMERICA)) & (cr_df.RegionName == "")]

Unnamed: 0,CountryName,RegionName,PredictorName,Diff7DMA
87,Canada,,geoLSTM,2195.034963
88,Canada,,LSTM,4787.491987
89,Canada,,tempGeoLSTM,13709.285714
315,Mexico,,geoLSTM,7814.982277
316,Mexico,,LSTM,9866.393732
317,Mexico,,tempGeoLSTM,27538.571429
519,United States,,geoLSTM,10704.685283
520,United States,,LSTM,11905.894076
521,United States,,tempGeoLSTM,258930.571429


## Regions with temperature data

In [65]:
with open('covid_xprize/examples/predictors/tempgeolstm/models/countries.txt', 'r') as f:
    countries_with_temp = [g.rstrip() for g in f.readlines()]
geoid_with_temp = [c+' / ' for c in countries_with_temp if ' / ' not in countries_with_temp]

In [66]:
cr_df[(cr_df.CountryName.isin(countries_with_temp)) & (cr_df.RegionName == "")].groupby('PredictorName').Diff7DMA.sum().sort_values().reset_index()

Unnamed: 0,PredictorName,Diff7DMA
0,tempGeoLSTM,18770.626019
1,LSTM,22247.027734
2,geoLSTM,23095.475907


In [67]:
cr_df[(cr_df.CountryName.isin(countries_with_temp)) & (cr_df.RegionName == "")]

Unnamed: 0,CountryName,RegionName,PredictorName,Diff7DMA
3,Albania,,LSTM,25.239115
4,Albania,,tempGeoLSTM,27.678302
5,Albania,,geoLSTM,33.802741
9,Andorra,,LSTM,2.425602
10,Andorra,,geoLSTM,3.768259
...,...,...,...,...
682,Uruguay,,LSTM,19.583854
683,Uruguay,,geoLSTM,19.828474
699,Zambia,,tempGeoLSTM,392.197670
700,Zambia,,LSTM,406.406212


# Plots

In [78]:
ALL_GEO = "Overall"
DEFAULT_GEO = ALL_GEO

## Prediction vs actual

In [79]:
predictor_names = list(ranking_df.PredictorName.dropna().unique())
# geoid_names = list(ranking_df.GeoID.unique())
geoid_names = geoid_with_temp

## Filter by country

In [80]:
all_df = ranking_df[ranking_df.GeoID.isin(geoid_names)].groupby(["PredictorName", "Date"])[["GeoID", "PredictorName", "PredictedDailyNewCases7DMA"]].sum(). \
    sort_values(by=["PredictorName", "Date"]).reset_index()
all_df

Unnamed: 0,PredictorName,Date,PredictedDailyNewCases7DMA
0,LSTM,2020-05-07,3526.858328
1,LSTM,2020-05-08,3478.604954
2,LSTM,2020-05-09,3434.547411
3,LSTM,2020-05-10,3391.238107
4,LSTM,2020-05-11,3349.557258
5,LSTM,2020-05-12,3308.208584
6,LSTM,2020-05-13,3267.340234
7,LSTM,2020-05-14,3226.408542
8,LSTM,2020-05-15,3186.98866
9,LSTM,2020-05-16,3147.8355


In [81]:
import plotly.graph_objects as go

fig = go.Figure(layout=dict(title=dict(text=f"{DEFAULT_GEO} Daily New Cases 7-day Average ",
                                       y=0.9,
                                       x=0.5,
                                       xanchor='center',
                                       yanchor='top'
                                       ),
                             plot_bgcolor='#f2f2f2',
                             xaxis_title="Date",
                             yaxis_title="Daily new cases 7-day average"
                             ))

# Keep track of trace visibility by geo ID name
geoid_plot_names = []

all_df = ranking_df[ranking_df.GeoID.isin(geoid_names)].groupby(["PredictorName", "Date"])[["GeoID", "PredictorName", "PredictedDailyNewCases7DMA"]].sum(). \
    sort_values(by=["PredictorName", "Date"]).reset_index()

# Add 1 trace per predictor, for all geos
for predictor_name in predictor_names:
    all_geo_df = all_df[all_df.PredictorName == predictor_name]
    fig.add_trace(go.Scatter(x=all_geo_df.Date,
                             y=all_geo_df.PredictedDailyNewCases7DMA,
                             name=predictor_name,
                             visible=(ALL_GEO == DEFAULT_GEO))
                 )
    geoid_plot_names.append(ALL_GEO)

# Add 1 trace per predictor, per geo id
for predictor_name in predictor_names:
    for geoid_name in geoid_names:
        pred_geoid_df = ranking_df[(ranking_df.GeoID == geoid_name) &
                                   (ranking_df.PredictorName == predictor_name)]
        fig.add_trace(go.Scatter(x=pred_geoid_df.Date,
                                 y=pred_geoid_df.PredictedDailyNewCases7DMA,
                                 name=predictor_name,
                                 visible=(geoid_name == DEFAULT_GEO))
                     )
        geoid_plot_names.append(geoid_name)

# For each geo
# Add 1 trace for the true number of cases
for geoid_name in geoid_names:
    geo_actual_df = actual_df[(actual_df.GeoID == geoid_name) &
                                  (actual_df.Date >= start_date)]
    fig.add_trace(go.Scatter(x=geo_actual_df.Date,
                             y=geo_actual_df.ActualDailyNewCases7DMA,
                             name="Ground Truth",
                             visible= (geoid_name == DEFAULT_GEO),
                             line=dict(color='orange', width=4, dash='dash'))
                  )
    geoid_plot_names.append(geoid_name)
    
# Add 1 trace for the overall ground truth
overall_actual_df = actual_df[(actual_df.Date >= start_date) & (actual_df.GeoID.isin(geoid_names))].groupby(["Date"])[["GeoID", "ActualDailyNewCases7DMA"]].sum(). \
    sort_values(by=["Date"]).reset_index()
fig.add_trace(go.Scatter(x=overall_actual_df.Date,
                         y=overall_actual_df.ActualDailyNewCases7DMA,
                         name="Ground Truth",
                         visible= (ALL_GEO == DEFAULT_GEO),
                         line=dict(color='orange', width=4, dash='dash'))
                  )
geoid_plot_names.append(geoid_name)

# Format x axis
fig.update_xaxes(
dtick="D1",  # Means 1 day
tickformat="%d\n%b")

# Filter
buttons=[]
for geoid_name in ([ALL_GEO] + geoid_names):
    buttons.append(dict(method='update',
                        label=geoid_name,
                        args = [{'visible': [geoid_name==r for r in geoid_plot_names]},
                                {'title': f"{geoid_name} Daily New Cases 7-day Average "}]))
fig.update_layout(showlegend=True,
                  updatemenus=[{"buttons": buttons,
                                "direction": "down",
                                "active": ([ALL_GEO] + geoid_names).index(DEFAULT_GEO),
                                "showactive": True,
                                "x": 0.1,
                                "y": 1.15}])

fig.show()

## Rankings: by cumulative 7DMA error

In [74]:
ranking_fig = go.Figure(layout=dict(title=dict(text=f'{DEFAULT_GEO} submission rankings',
                                               y=0.9,
                                               x=0.5,
                                               xanchor='center',
                                               yanchor='top'
                                               ),
                                    plot_bgcolor='#f2f2f2',
                                    xaxis_title="Date",
                                    yaxis_title="Cumulative 7DMA error"
                                    ))

# Keep track of trace visibility by geo name
ranking_geoid_plot_names = []

all_df = ranking_df[ranking_df.GeoID.isin(geoid_names)].groupby(["PredictorName", "Date"])[["GeoID", "PredictorName", "CumulDiff7DMA"]].sum(). \
    sort_values(by=["PredictorName", "Date"]).reset_index()

# Add 1 trace per predictor, for all geos
for predictor_name in predictor_names:
    ranking_geoid_df = all_df[all_df.PredictorName == predictor_name]
    ranking_fig.add_trace(go.Scatter(x=ranking_geoid_df.Date,
                             y=ranking_geoid_df.CumulDiff7DMA,
                             name=predictor_name,
                             visible=(ALL_GEO == DEFAULT_GEO))
                 )
    ranking_geoid_plot_names.append(ALL_GEO)


# Add 1 trace per predictor, per country
for predictor_name in predictor_names:
    for geoid_name in geoid_names:
        ranking_geoid_df = ranking_df[(ranking_df.GeoID == geoid_name) &
                                        (ranking_df.PredictorName == predictor_name)]
        ranking_fig.add_trace(go.Scatter(x=ranking_geoid_df.Date,
                                 y=ranking_geoid_df.CumulDiff7DMA,
                                 name=predictor_name,
                                 visible= (geoid_name == DEFAULT_GEO))
                     )
        ranking_geoid_plot_names.append(geoid_name)

# Format x axis
ranking_fig.update_xaxes(
dtick="D1",  # Means 1 day
tickformat="%d\n%b")

# Filter
buttons=[]
for geoid_name in ([ALL_GEO] + geoid_names):
    buttons.append(dict(method='update',
                        label=geoid_name,
                        args = [{'visible': [geoid_name==r for r in ranking_geoid_plot_names]},
                                {'title': f'{geoid_name} submission rankings'}]))
ranking_fig.update_layout(showlegend=True,
                          updatemenus=[{"buttons": buttons,
                                        "direction": "down",
                                        "active": ([ALL_GEO] + geoid_names).index(DEFAULT_GEO),
                                        "showactive": True,
                                        "x": 0.1,
                                        "y": 1.15}])

ranking_fig.show()